# Dataset Visualization
This notebook visualizes the cleaned dataset for role prediction.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned dataset from disk, report its dimensions,
# and preview the first five rows as the cell output
df = pd.read_csv(filepath_or_buffer='documents/cleaned_dataset.csv')
print(f"Dataset shape: {df.shape}")
df.head(5)

In [None]:
# Tally how many rows carry each distinct value of the 'Role' column
role_counts = df['Role'].value_counts()
print(f"Number of unique roles: {len(role_counts)}")

# Bar chart of the role frequencies, drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(12, 8))
role_counts.plot.bar(ax=ax, rot=90)  # vertical tick labels
ax.set_title('Distribution of Roles')
ax.set_xlabel('Role')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Create a heatmap of skill-to-skill correlations.
# Skill levels are text labels, so they are first mapped onto an ordinal
# 0-10 scale; labels that carry no level information become missing values.
skill_mapping = {
    'No Experience': 0,
    'Poor': 1,
    'Novice': 2,
    'Beginner': 3,
    'Entry-level': 4,
    'Average': 5,
    'Intermediate': 6,
    'Advanced': 7,
    'Expert': 8,
    'Professional': 9,
    'Excellent': 10,
    'N/A': None,
    'Not Interested': None,
    'None': None,
    '': None
}

# Select the skill columns by name instead of by position, so the 'Role'
# label column is excluded wherever it sits in the file.
skill_columns = df.columns.drop('Role')

# Create a copy to avoid modifying the original
df_numeric = df.copy()

# Convert every skill column to numeric, remembering any label that the
# mapping does not know about (Series.map turns those into NaN silently).
unmapped_labels = {}
for col in skill_columns:
    unknown = set(df[col].dropna().unique()) - set(skill_mapping)
    if unknown:
        unmapped_labels[col] = sorted(str(label) for label in unknown)
    df_numeric[col] = df[col].map(skill_mapping)

# Surface unmapped labels so missing data in the heatmap is not a surprise
if unmapped_labels:
    print("Labels missing from skill_mapping (treated as missing values):")
    for col, labels in unmapped_labels.items():
        print(f"  {col}: {labels}")

# Calculate correlation matrix over the skill columns only
corr_matrix = df_numeric[skill_columns].corr()

# Plot heatmap; the colour scale is pinned to the full correlation range
# [-1, 1] and centred on 0 so the neutral colour means "no correlation".
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Between Skills')
fig.tight_layout()
plt.show()

In [None]:
# Pie chart of the first ten entries of role_counts, showing each
# role's share of that ten-role subset
top_10_roles = role_counts.iloc[:10]
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(
    top_10_roles,
    labels=top_10_roles.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax.axis('equal')  # equal aspect ratio keeps the pie circular
ax.set_title('Top 10 Roles Distribution')
fig.tight_layout()
plt.show()