#  Career Guidance ML Data Preprocessing & Model Training

**Author:** Kunal Ramesh Pawar  
**Project:** Hackgen - AI Career Guidance Platform

### Objectives:
1. Data Cleaning & Feature Engineering
2. ML Model Training & Evaluation
3. Performance Analysis & Visualization
4. Export Clean Data for Web Application

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

print(" Libraries imported successfully!")

 Libraries imported successfully!


##  Load and Clean Datasets

In [2]:
# Load datasets
print(" Loading datasets...")

# Dataset 1: Career Path Data
df_career = pd.read_csv('dataset/career_path_in_all_field.csv')
print(f" Career Path Dataset: {df_career.shape}")

# Dataset 2: Skill-Career Mapping
df_skills = pd.read_csv('dataset/skill-career.csv')
print(f" Skill-Career Dataset: {df_skills.shape}")

# Display basic info
print("\n Career Dataset Columns:")
print(df_career.columns.tolist())
print("\n Skills Dataset Columns:")
print(df_skills.columns.tolist())

 Loading datasets...
 Career Path Dataset: (9000, 17)
 Skill-Career Dataset: (3999, 21)

 Career Dataset Columns:
['Field', 'Career', 'GPA', 'Extracurricular_Activities', 'Internships', 'Projects', 'Leadership_Positions', 'Field_Specific_Courses', 'Research_Experience', 'Coding_Skills', 'Communication_Skills', 'Problem_Solving_Skills', 'Teamwork_Skills', 'Analytical_Skills', 'Presentation_Skills', 'Networking_Skills', 'Industry_Certifications']

 Skills Dataset Columns:
['Sr.No.', 'Course', 'Job profession', 'Student', 'Linguistic', 'Musical', 'Bodily', 'Logical - Mathematical', 'Spatial-Visualization', 'Interpersonal', 'Intrapersonal', 'Naturalist', 's/p', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8']


In [3]:
# Data Cleaning
print("🧹 CLEANING DATASETS")

# Clean Career Dataset
df_career_clean = df_career.copy()
df_career_clean = df_career_clean.drop_duplicates()

# Handle missing values
numerical_cols = df_career_clean.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='median')
df_career_clean[numerical_cols] = imputer.fit_transform(df_career_clean[numerical_cols])

# Clean categorical columns
if 'Field' in df_career_clean.columns:
    df_career_clean['Field'] = df_career_clean['Field'].fillna('Other')
if 'Career' in df_career_clean.columns:
    df_career_clean['Career'] = df_career_clean['Career'].fillna('General')

# Clean Skills Dataset
df_skills_clean = df_skills.copy()
df_skills_clean = df_skills_clean.drop_duplicates()

# Handle skill scores
skill_cols = ['Linguistic', 'Musical', 'Bodily', 'Logical - Mathematical', 
              'Spatial-Visualization', 'Interpersonal', 'Intrapersonal', 'Naturalist']
skill_cols = [col for col in skill_cols if col in df_skills_clean.columns]

if skill_cols:
    df_skills_clean[skill_cols] = df_skills_clean[skill_cols].fillna(df_skills_clean[skill_cols].median())

print(f"Career dataset cleaned: {df_career_clean.shape}")
print(f" Skills dataset cleaned: {df_skills_clean.shape}")

🧹 CLEANING DATASETS
Career dataset cleaned: (9000, 17)
 Skills dataset cleaned: (3602, 21)


##  Feature Engineering

In [4]:
# Feature Engineering for Career Dataset
print("🔧 FEATURE ENGINEERING")

# Create skill scores
skill_columns = [col for col in df_career_clean.columns if 'Skills' in col]
if skill_columns:
    df_career_clean['Overall_Skills'] = df_career_clean[skill_columns].mean(axis=1)
    print(f"   Created Overall_Skills from: {skill_columns}")

# Performance categories
if 'GPA' in df_career_clean.columns:
    df_career_clean['Performance_Level'] = pd.cut(df_career_clean['GPA'], 
                                                   bins=[0, 2.5, 3.0, 3.5, 4.0], 
                                                   labels=['Low', 'Average', 'Good', 'Excellent'])

# Create RIASEC scores for Skills Dataset
if skill_cols:
    # Map to RIASEC categories
    df_skills_clean['RIASEC_R'] = df_skills_clean[['Bodily', 'Spatial-Visualization']].mean(axis=1) if all(col in skill_cols for col in ['Bodily', 'Spatial-Visualization']) else 0
    df_skills_clean['RIASEC_I'] = df_skills_clean[['Logical - Mathematical', 'Naturalist']].mean(axis=1) if all(col in skill_cols for col in ['Logical - Mathematical', 'Naturalist']) else 0
    df_skills_clean['RIASEC_A'] = df_skills_clean[['Musical', 'Linguistic']].mean(axis=1) if all(col in skill_cols for col in ['Musical', 'Linguistic']) else 0
    df_skills_clean['RIASEC_S'] = df_skills_clean['Interpersonal'] if 'Interpersonal' in skill_cols else 0
    df_skills_clean['RIASEC_E'] = df_skills_clean[['Interpersonal', 'Linguistic']].mean(axis=1) * 0.8 if all(col in skill_cols for col in ['Interpersonal', 'Linguistic']) else 0
    
    print("   Created RIASEC personality scores")

print(" Feature engineering completed!")

🔧 FEATURE ENGINEERING
   Created Overall_Skills from: ['Coding_Skills', 'Communication_Skills', 'Problem_Solving_Skills', 'Teamwork_Skills', 'Analytical_Skills', 'Presentation_Skills', 'Networking_Skills']
   Created RIASEC personality scores
 Feature engineering completed!


##  Data Visualization & Analysis

In [5]:
# Create visualizations
print(" CREATING VISUALIZATIONS")

# Career Field Distribution
if 'Field' in df_career_clean.columns:
    fig = px.bar(df_career_clean['Field'].value_counts().head(10), 
                 title="🎯 Top 10 Career Fields Distribution",
                 labels={'index': 'Field', 'value': 'Count'})
    fig.show()

# Skills Distribution
if skill_cols:
    avg_skills = df_skills_clean[skill_cols].mean()
    fig = px.bar(x=avg_skills.index, y=avg_skills.values,
                 title=" Average Intelligence Scores",
                 labels={'x': 'Intelligence Type', 'y': 'Average Score'})
    fig.show()

# RIASEC Radar Chart
riasec_cols = [col for col in df_skills_clean.columns if col.startswith('RIASEC_')]
if riasec_cols:
    avg_riasec = df_skills_clean[riasec_cols].mean()
    
    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=avg_riasec.values,
        theta=[col.replace('RIASEC_', '') for col in avg_riasec.index],
        fill='toself',
        name='RIASEC Profile'
    ))
    
    fig.update_layout(title=" RIASEC Personality Profile", title_x=0.5)
    fig.show()

print(" Visualizations created!")

 CREATING VISUALIZATIONS


 Visualizations created!


##  Machine Learning Model Training

In [6]:
# Prepare data for ML
print(" PREPARING ML MODELS")

# Model 1: Career Field Prediction
if 'Field' in df_career_clean.columns and numerical_cols.size > 0:
    # Prepare features and target
    feature_cols = [col for col in numerical_cols if col != 'Sr.No.' and df_career_clean[col].notna().all()]
    X = df_career_clean[feature_cols]
    y = df_career_clean['Field']
    
    # Encode target variable
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    print(f"   Training set: {X_train.shape}")
    print(f"   Test set: {X_test.shape}")
    print(f"   Features used: {feature_cols}")
    
    # Train multiple models
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
    }
    
    model_results = {}
    
    for name, model in models.items():
        print(f"\n Training {name}...")
        
        # Use scaled data for Logistic Regression, original for tree-based models
        if 'Logistic' in name:
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
            # Cross-validation
            cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # Cross-validation
            cv_scores = cross_val_score(model, X_train, y_train, cv=5)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()
        
        model_results[name] = {
            'accuracy': accuracy,
            'cv_mean': cv_mean,
            'cv_std': cv_std,
            'model': model
        }
        
        print(f"   Accuracy: {accuracy:.4f}")
        print(f"   CV Score: {cv_mean:.4f} (+/- {cv_std*2:.4f})")
    
    print("\n Model training completed!")
else:
    print(" Insufficient data for ML model training")

 PREPARING ML MODELS
   Training set: (7200, 15)
   Test set: (1800, 15)
   Features used: ['GPA', 'Extracurricular_Activities', 'Internships', 'Projects', 'Leadership_Positions', 'Field_Specific_Courses', 'Research_Experience', 'Coding_Skills', 'Communication_Skills', 'Problem_Solving_Skills', 'Teamwork_Skills', 'Analytical_Skills', 'Presentation_Skills', 'Networking_Skills', 'Industry_Certifications']

🔄 Training Random Forest...
   Accuracy: 0.0650
   CV Score: 0.0653 (+/- 0.0108)

🔄 Training Gradient Boosting...
   Accuracy: 0.0617
   CV Score: 0.0710 (+/- 0.0159)

🔄 Training Logistic Regression...
   Accuracy: 0.0628
   CV Score: 0.0706 (+/- 0.0100)

 Model training completed!


##  Model Performance Analysis

In [7]:
# Analyze model performance
if 'model_results' in locals():
    print(" MODEL PERFORMANCE ANALYSIS")
    
    # Create performance comparison
    performance_df = pd.DataFrame({
        'Model': list(model_results.keys()),
        'Accuracy': [results['accuracy'] for results in model_results.values()],
        'CV Mean': [results['cv_mean'] for results in model_results.values()],
        'CV Std': [results['cv_std'] for results in model_results.values()]
    })
    
    print("\n Model Performance Summary:")
    display(performance_df.round(4))
    
    # Best model
    best_model_name = performance_df.loc[performance_df['Accuracy'].idxmax(), 'Model']
    best_accuracy = performance_df['Accuracy'].max()
    
    print(f"\n Best Model: {best_model_name} (Accuracy: {best_accuracy:.4f})")
    
    # Visualize performance
    fig = px.bar(performance_df, x='Model', y='Accuracy', 
                 title=" Model Accuracy Comparison",
                 color='Accuracy', color_continuous_scale='viridis')
    fig.show()
    
    # Feature importance for best model
    best_model = model_results[best_model_name]['model']
    if hasattr(best_model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'Feature': feature_cols,
            'Importance': best_model.feature_importances_
        }).sort_values('Importance', ascending=False)
        
        print(f"\n Top 10 Important Features ({best_model_name}):")
        display(feature_importance.head(10))
        
        # Plot feature importance
        fig = px.bar(feature_importance.head(10), x='Importance', y='Feature',
                     orientation='h', title=f" Feature Importance - {best_model_name}")
        fig.show()

    print(" Performance analysis completed!")

 MODEL PERFORMANCE ANALYSIS

 Model Performance Summary:


Unnamed: 0,Model,Accuracy,CV Mean,CV Std
0,Random Forest,0.065,0.0653,0.0054
1,Gradient Boosting,0.0617,0.071,0.0079
2,Logistic Regression,0.0628,0.0706,0.005



 Best Model: Random Forest (Accuracy: 0.0650)



 Top 10 Important Features (Random Forest):


Unnamed: 0,Feature,Importance
0,GPA,0.13614
5,Field_Specific_Courses,0.088749
1,Extracurricular_Activities,0.086491
8,Communication_Skills,0.072268
7,Coding_Skills,0.069705
11,Analytical_Skills,0.069603
3,Projects,0.068815
9,Problem_Solving_Skills,0.06746
12,Presentation_Skills,0.067183
13,Networking_Skills,0.066699


 Performance analysis completed!


##  Export Clean Data

In [8]:
# Export processed data for web application
print(" EXPORTING CLEAN DATA")

# Save cleaned datasets
df_career_clean.to_csv('dataset/career_path_cleaned.csv', index=False)
df_skills_clean.to_csv('dataset/skill_career_cleaned.csv', index=False)

print(" Cleaned datasets exported:")
print(f"    career_path_cleaned.csv ({df_career_clean.shape})")
print(f"    skill_career_cleaned.csv ({df_skills_clean.shape})")

# Export model for web application
if 'model_results' in locals():
    import pickle
    
    # Save best model and preprocessing objects
    model_package = {
        'model': model_results[best_model_name]['model'],
        'scaler': scaler if best_model_name == 'Logistic Regression' else None,
        'label_encoder': le,
        'feature_columns': feature_cols,
        'model_name': best_model_name,
        'accuracy': best_accuracy
    }
    
    with open('dataset/career_prediction_model.pkl', 'wb') as f:
        pickle.dump(model_package, f)
    
    print(f"\n Model exported: career_prediction_model.pkl")
    print(f"    Best Model: {best_model_name}")
    print(f"    Accuracy: {best_accuracy:.4f}")

# Create summary report
summary = {
    'analysis_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'career_dataset': {
        'original_shape': df_career.shape,
        'cleaned_shape': df_career_clean.shape,
        'unique_fields': df_career_clean['Field'].nunique() if 'Field' in df_career_clean.columns else 0,
        'unique_careers': df_career_clean['Career'].nunique() if 'Career' in df_career_clean.columns else 0
    },
    'skills_dataset': {
        'original_shape': df_skills.shape,
        'cleaned_shape': df_skills_clean.shape,
        'intelligence_types': len(skill_cols) if skill_cols else 0
    },
    'ml_results': {
        'best_model': best_model_name if 'model_results' in locals() else 'None',
        'best_accuracy': best_accuracy if 'model_results' in locals() else 0,
        'models_trained': len(model_results) if 'model_results' in locals() else 0
    }
}

with open('dataset/analysis_summary.json', 'w') as f:
    import json
    json.dump(summary, f, indent=2)

print("\n Analysis summary exported: analysis_summary.json")
print("\n ML Data Preprocessing & Model Training Completed Successfully!")
print("\n" + "="*60)
print(" FINAL SUMMARY")
print("="*60)
print(f" Career records processed: {df_career_clean.shape[0]:,}")
print(f" Skill records processed: {df_skills_clean.shape[0]:,}")
if 'model_results' in locals():
    print(f" Best ML Model: {best_model_name} ({best_accuracy:.1%} accuracy)")
print(f" Files ready for web application integration")

 EXPORTING CLEAN DATA
 Cleaned datasets exported:
    career_path_cleaned.csv ((9000, 19))
    skill_career_cleaned.csv ((3602, 26))

 Model exported: career_prediction_model.pkl
    Best Model: Random Forest
    Accuracy: 0.0650

 Analysis summary exported: analysis_summary.json

 ML Data Preprocessing & Model Training Completed Successfully!

 FINAL SUMMARY
 Career records processed: 9,000
 Skill records processed: 3,602
 Best ML Model: Random Forest (6.5% accuracy)
 Files ready for web application integration
