In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling shared by every figure in the notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Load the dataset.
# Every later cell reads `df`, so a missing file is raised here together with
# the export instructions. Printing a hint and carrying on would leave `df`
# unbound and make the next cell fail with an unrelated NameError.
dataset_path = Path('future_skills_dataset.csv')
if not dataset_path.exists():
    raise FileNotFoundError(
        f"Dataset not found at '{dataset_path}'. "
        "Please run: python ../manage.py export_future_skills_dataset"
    )

df = pd.read_csv(dataset_path)
print(f"‚úÖ Dataset loaded: {len(df)} rows, {len(df.columns)} columns")
print(f"\nColumns: {', '.join(df.columns)}")

## 1️⃣ Basic Dataset Information

In [None]:
# Report the frame's dimensions, then preview its first five rows
print(f"Dataset Shape: {df.shape}")
print("\nFirst 5 rows:")
df.head(5)

In [None]:
# Per-column overview: dtype, null count, and nulls as a percentage of all rows
print("Data Types and Missing Values:")
null_counts = df.isnull().sum()
info_df = pd.DataFrame({
    'Type': df.dtypes,
    'Missing': null_counts,
    'Missing %': (null_counts / len(df) * 100).round(2),
})
info_df

In [None]:
# Summary table produced by describe() with its default arguments
print("\nStatistical Summary:")
numeric_summary = df.describe()
numeric_summary

## 2️⃣ Target Variable Analysis - Class Distribution

In [None]:
# Target distribution: bar chart of counts beside a pie chart of shares
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

class_counts = df['future_need_level'].value_counts()

# Left panel: one bar per class
sns.barplot(x=class_counts.index, y=class_counts.values, ax=bar_ax, palette='viridis')
bar_ax.set_title('Distribution of Future Need Level', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Need Level')
bar_ax.set_ylabel('Count')

# Write each count just above its bar
for position, total in enumerate(class_counts.values):
    bar_ax.text(position, total + 0.5, str(total), ha='center', fontweight='bold')

# Right panel: share of each class
colors = ['#ff6b6b', '#ffd93d', '#6bcf7f']
pie_ax.pie(
    class_counts.values,
    labels=class_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
pie_ax.set_title('Percentage Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Text version of the same distribution
print("\nClass Distribution:")
for level, count in class_counts.items():
    print(f"  {level}: {count} ({count / len(df) * 100:.1f}%)")

# Largest class divided by smallest class; a ratio above 3 is reported as imbalanced
max_class = class_counts.max()
min_class = class_counts.min()
imbalance_ratio = max_class / min_class
print(f"\n‚öñÔ∏è Imbalance Ratio: {imbalance_ratio:.2f}")
balance_message = (
    "‚ö†Ô∏è WARNING: Significant class imbalance detected!"
    if imbalance_ratio > 3
    else "‚úÖ Classes are reasonably balanced."
)
print(balance_message)

## 3️⃣ Categorical Features Analysis

In [None]:
# Ten most frequent job roles: printed as a table, then drawn as horizontal bars
print("Top 10 Job Roles by Frequency:")
job_role_counts = df['job_role_name'].value_counts().iloc[:10]
print(job_role_counts)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=job_role_counts.values, y=job_role_counts.index, palette='coolwarm', ax=ax)
ax.set_title('Top 10 Job Roles', fontsize=14, fontweight='bold')
ax.set_xlabel('Count')
ax.set_ylabel('Job Role')
plt.tight_layout()
plt.show()

In [None]:
# Ten most frequent skills: printed as a table, then drawn as horizontal bars
print("\nTop 10 Skills by Frequency:")
skill_counts = df['skill_name'].value_counts().iloc[:10]
print(skill_counts)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=skill_counts.values, y=skill_counts.index, palette='plasma', ax=ax)
ax.set_title('Top 10 Skills', fontsize=14, fontweight='bold')
ax.set_xlabel('Count')
ax.set_ylabel('Skill')
plt.tight_layout()
plt.show()

In [None]:
# Skill categories and job departments, drawn side by side.
# Both columns are treated as optional: a panel is filled only when its column
# exists in df, and is hidden otherwise so no empty frame is left in the figure.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Skill categories (left panel)
if 'skill_category' in df.columns:
    skill_cat_counts = df['skill_category'].value_counts()
    sns.barplot(x=skill_cat_counts.values, y=skill_cat_counts.index, ax=axes[0], palette='viridis')
    axes[0].set_title('Skill Categories', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Count')
else:
    # Column absent: switch the panel off instead of showing blank axes
    axes[0].axis('off')

# Job departments (right panel)
if 'job_department' in df.columns:
    dept_counts = df['job_department'].value_counts()
    sns.barplot(x=dept_counts.values, y=dept_counts.index, ax=axes[1], palette='magma')
    axes[1].set_title('Job Departments', fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Count')
else:
    # Column absent: switch the panel off instead of showing blank axes
    axes[1].axis('off')

plt.tight_layout()
plt.show()

## 4️⃣ Numeric Features Analysis

In [None]:
# Select numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric features: {', '.join(numeric_cols)}")

# Histogram grid, three panels per row. n_cols and n_rows stay at cell level
# because the box-plot cell further down sizes its own grid from them.
n_cols = len(numeric_cols)
# Keep at least one row: with no numeric columns the ceiling division gives 0,
# and matplotlib refuses to build a grid with zero rows.
n_rows = max(1, (n_cols + 2) // 3)
fig, axes = plt.subplots(n_rows, 3, figsize=(15, n_rows * 4))
axes = axes.flatten()

for i, col in enumerate(numeric_cols):
    # Drop NaNs explicitly so the binning never sees missing values,
    # whatever the installed matplotlib/numpy would do with them.
    values = df[col].dropna()
    axes[i].hist(values, bins=30, edgecolor='black', color='steelblue', alpha=0.7)
    axes[i].set_title(f'Distribution of {col}', fontweight='bold')
    axes[i].set_xlabel(col)
    axes[i].set_ylabel('Frequency')

    # Mark the mean and median so skew is visible at a glance
    mean_val = values.mean()
    median_val = values.median()
    axes[i].axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')
    axes[i].axvline(median_val, color='green', linestyle='--', linewidth=2, label=f'Median: {median_val:.2f}')
    axes[i].legend()

# Hide unused subplots (all of them when there are no numeric columns)
for i in range(n_cols, len(axes)):
    axes[i].axis('off')

plt.tight_layout()
plt.show()

## 5️⃣ Outlier Detection

In [None]:
# One box plot per numeric feature, each annotated with its IQR outlier count.
# The grid is sized from n_rows / n_cols set by the histogram cell above.
fig, axes = plt.subplots(n_rows, 3, figsize=(15, n_rows * 4))
axes = axes.flatten()

for idx, col in enumerate(numeric_cols):
    panel = axes[idx]
    series = df[col]

    sns.boxplot(y=series, ax=panel, color='lightblue')
    panel.set_title(f'Box Plot - {col}', fontweight='bold')
    panel.set_ylabel(col)

    # Values beyond 1.5 * IQR from the quartiles count as outliers
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    is_outlier = (series < q1 - 1.5 * spread) | (series > q3 + 1.5 * spread)
    panel.text(
        0.05, 0.95, f'Outliers: {int(is_outlier.sum())}',
        transform=panel.transAxes,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
    )

# Switch off the panels that received no feature
for spare in axes[n_cols:]:
    spare.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Per-feature outlier count and share of rows, using the 1.5 * IQR fences
print("\nOutlier Statistics (using IQR method):")
for col in numeric_cols:
    series = df[col]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    lower_bound = q1 - 1.5 * spread
    upper_bound = q3 + 1.5 * spread
    n_outliers = int(((series < lower_bound) | (series > upper_bound)).sum())
    print(f"  {col}: {n_outliers} outliers ({n_outliers / len(df) * 100:.2f}%)")

## 6️⃣ Feature Correlations

In [None]:
# Pairwise correlations between the numeric features (corr() defaults), as a heatmap
plt.figure(figsize=(12, 10))
correlation_matrix = df[numeric_cols].corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# List each distinct pair once (upper triangle) when |correlation| exceeds 0.5
print("\nStrong Correlations (|correlation| > 0.5):")
feature_names = correlation_matrix.columns
for i, first in enumerate(feature_names):
    for j, second in enumerate(feature_names[i + 1:], start=i + 1):
        strength = correlation_matrix.iloc[i, j]
        if abs(strength) > 0.5:
            print(f"  {first} ‚Üî {second}: {strength:.3f}")

## 7️⃣ Feature Distribution by Target Class

In [None]:
# Box plots of the numeric features, split by target class.
# The grid is fixed at 2x3, so at most the first six numeric features are drawn.
n_cols_show = min(6, len(numeric_cols))
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for i, col in enumerate(numeric_cols[:n_cols_show]):
    # Fixed class order keeps the x-axis comparable across panels
    sns.boxplot(x='future_need_level', y=col, data=df, ax=axes[i],
                palette='Set2', order=['LOW', 'MEDIUM', 'HIGH'])
    axes[i].set_title(f'{col} by Need Level', fontweight='bold')
    axes[i].set_xlabel('Future Need Level')

# Hide the panels left empty when there are fewer than six numeric features
for i in range(n_cols_show, len(axes)):
    axes[i].axis('off')

plt.tight_layout()
plt.show()

## 8️⃣ Data Quality Checks

In [None]:
# Five-part text report: missing values, duplicate rows, target balance,
# numeric value ranges, and uniqueness of (job role, skill) pairs.
print("\nüìã DATA QUALITY REPORT\n" + "="*50)

# 1. Missing values
null_counts = df.isnull().sum()
missing_count = null_counts.sum()
print(f"\n1. Missing Values: {missing_count}")
if missing_count != 0:
    print("   ‚ö†Ô∏è Missing values found:")
    print(null_counts[null_counts > 0])
else:
    print("   ‚úÖ No missing values detected")

# 2. Duplicate rows
duplicates = df.duplicated().sum()
print(f"\n2. Duplicate Rows: {duplicates}")
if duplicates != 0:
    print("   ‚ö†Ô∏è Duplicate rows found")
else:
    print("   ‚úÖ No duplicate rows")

# 3. Target class distribution: a class share within 20-50% gets the OK marker
print("\n3. Target Class Balance:")
class_dist = df['future_need_level'].value_counts(normalize=True) * 100
for level, pct in class_dist.items():
    if 20 <= pct <= 50:
        status = "‚úÖ"
    else:
        status = "‚ö†Ô∏è"
    print(f"   {status} {level}: {pct:.1f}%")

# 4. Feature value ranges: avg_salary_k is checked against (0, 200),
#    every other numeric feature against [0, 1]
print("\n4. Feature Value Ranges:")
for col in numeric_cols:
    column_values = df[col]
    min_val = column_values.min()
    max_val = column_values.max()
    if col == 'avg_salary_k':
        status = "‚úÖ" if min_val > 0 and max_val < 200 else "‚ö†Ô∏è"
        print(f"   {status} {col}: [{min_val:.2f}, {max_val:.2f}] K‚Ç¨")
    else:
        status = "‚úÖ" if 0 <= min_val and max_val <= 1 else "‚ö†Ô∏è"
        print(f"   {status} {col}: [{min_val:.3f}, {max_val:.3f}]")

# 5. Unique combinations of job role and skill
pair_frame = df[['job_role_name', 'skill_name']].drop_duplicates()
unique_combos = pair_frame.shape[0]
print(f"\n5. Unique (Job Role, Skill) Combinations: {unique_combos}")
print(f"   Total rows: {len(df)}")
if unique_combos != len(df):
    print("   ‚ö†Ô∏è Some combinations appear multiple times")
else:
    print("   ‚úÖ All combinations are unique")

print("\n" + "="*50)

## 9️⃣ Summary and Recommendations

In [None]:
# Summary header, dataset size, target distribution, and the first two
# recommendations (class balance and sample count).
print("\nüéØ DATASET SUMMARY AND RECOMMENDATIONS\n" + "="*60)

n_samples = len(df)

print("\nüìä Dataset Size:")
print(f"   ‚Ä¢ Total samples: {n_samples}")
print(f"   ‚Ä¢ Features: {len(df.columns)}")
print(f"   ‚Ä¢ Unique job roles: {df['job_role_name'].nunique()}")
print(f"   ‚Ä¢ Unique skills: {df['skill_name'].nunique()}")

print("\nüéØ Target Distribution:")
class_counts = df['future_need_level'].value_counts()
for level in ('LOW', 'MEDIUM', 'HIGH'):
    # Levels that do not occur in the data are skipped
    if level not in class_counts:
        continue
    count = class_counts[level]
    pct = (count / n_samples) * 100
    print(f"   ‚Ä¢ {level}: {count} ({pct:.1f}%)")

print("\nüí° Recommendations:")

# Class imbalance: largest class divided by smallest, flagged above 3
imbalance_ratio = class_counts.max() / class_counts.min()
if imbalance_ratio > 3:
    for line in (
        "   ‚ö†Ô∏è CLASS IMBALANCE DETECTED (ratio > 3)",
        "      ‚Üí Consider using class_weight='balanced' in the model",
        "      ‚Üí Consider SMOTE or other resampling techniques",
        "      ‚Üí Use stratified train/test split",
    ):
        print(line)
else:
    print("   ‚úÖ Classes are reasonably balanced")

# Dataset size: under 100 rows is small, under 500 is moderate
if n_samples >= 500:
    print("   ‚úÖ Good dataset size for training")
elif n_samples >= 100:
    print("   ‚ö†Ô∏è MODERATE DATASET SIZE (< 500 samples)")
    print("      ‚Üí Use cross-validation for better model evaluation")
    print("      ‚Üí Consider data augmentation if possible")
else:
    print("   ‚ö†Ô∏è SMALL DATASET (< 100 samples)")
    print("      ‚Üí Add more job roles and skills to increase diversity")
    print("      ‚Üí Consider using cross-validation instead of train/test split")

# Check for outliers
# A row counts once if any numeric feature falls outside its 1.5 * IQR fences.
# Summing the per-column counts instead would compare (rows x features) flags
# against 5% of the rows, so the warning would fire on wide datasets no matter
# how few rows are actually affected.
outlier_rows = None
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    col_outliers = (df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)
    # OR the masks together so a row flagged in several columns is counted once
    outlier_rows = col_outliers if outlier_rows is None else (outlier_rows | col_outliers)

# No numeric columns means no mask was built and nothing to flag
total_outliers = 0 if outlier_rows is None else int(outlier_rows.sum())

if total_outliers > len(df) * 0.05:  # More than 5% of rows contain an outlier
    print("   ‚ö†Ô∏è MANY OUTLIERS DETECTED (> 5% of data)")
    print("      ‚Üí Review outliers to ensure they're realistic")
    print("      ‚Üí Consider robust scaling or outlier removal")
else:
    print("   ‚úÖ Acceptable number of outliers")

# Closing checklist, printed line by line, followed by the report's bottom rule
closing_lines = (
    "\n‚ú® Next Steps:",
    "   1. If satisfied with the data quality, proceed with model training",
    "   2. Update the training script to use new features",
    "   3. Run: python ../manage.py export_future_skills_dataset",
    "   4. Run: python train_future_skills_model.py",
    "   5. Evaluate model performance and iterate",
    "\n" + "="*60,
)
for line in closing_lines:
    print(line)