<a href="https://colab.research.google.com/github/Joy-Terer/BSE-05-0254/blob/main/Employment_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# TASK 1: DATA LOADING AND EXPLORATION

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults used by every figure below:
# seaborn's white grid theme and a wide 12x6 inch default canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("="*80)
print("EMPLOYMENT PREDICTION USING DECISION TREE CLASSIFIER")
print("="*80)

# Build `df`, the modelling table: six candidate features plus the
# `suitable_for_employment` target.
#
# Preferred source is the IBM HR Analytics Attrition dataset:
#   https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset
# If neither of the known CSV names is present in the working directory, a
# seeded random sample with the same columns is generated instead so the rest
# of the script can still run end to end.
print("\n[1] LOADING IBM HR ANALYTICS DATASET...")
try:
    # Only a *missing file* should trigger the second attempt / the fallback.
    # Any other failure (malformed CSV, permission error, Ctrl-C) propagates
    # instead of being silently swallowed.
    try:
        df_original = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
    except FileNotFoundError:
        df_original = pd.read_csv('HR-Employee-Attrition.csv')
except FileNotFoundError:
    print("⚠ IBM HR Dataset file not found!")
    print("   Please download from: https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset")
    print("   Expected filename: 'WA_Fn-UseC_-HR-Employee-Attrition.csv'")
    print("\n   Creating sample data for demonstration...")

    # Synthetic stand-in data. The seed makes the sample reproducible; the
    # columns are drawn in a fixed order, so do not reorder the dict entries
    # if identical values across runs matter.
    np.random.seed(42)
    n_samples = 500
    df = pd.DataFrame({
        'age': np.random.randint(22, 60, n_samples),
        'education_level': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples),
        'years_of_experience': np.random.randint(0, 30, n_samples),
        'technical_test_score': np.random.randint(40, 100, n_samples),
        'interview_score': np.random.uniform(3, 10, n_samples).round(1),
        'previous_employment': np.random.choice(['Yes', 'No'], n_samples),
        'suitable_for_employment': np.random.choice(['Yes', 'No'], n_samples, p=[0.6, 0.4])
    })
    print("✓ Sample dataset created!")
else:
    print("✓ IBM HR Dataset loaded successfully!")
    print(f"   Original dataset shape: {df_original.shape}")

    # Map IBM HR dataset to assignment requirements
    print("\n[2] MAPPING IBM HR DATASET TO ASSIGNMENT REQUIREMENTS...")

    df = pd.DataFrame()

    # age - direct copy
    df['age'] = df_original['Age']

    # education_level - ordinal Education codes 1-5 collapsed to four labels
    # (codes 2 and 3 both become 'Bachelor'); unmapped codes become NaN.
    education_mapping = {
        1: 'High School',
        2: 'Bachelor',
        3: 'Bachelor',
        4: 'Master',
        5: 'PhD'
    }
    df['education_level'] = df_original['Education'].map(education_mapping)

    # years_of_experience - direct copy of TotalWorkingYears
    df['years_of_experience'] = df_original['TotalWorkingYears']

    # technical_test_score - proxy built from PercentSalaryHike and
    # PerformanceRating, then clipped into the [40, 100] band.
    df['technical_test_score'] = (
        (df_original['PercentSalaryHike'] * 3) +
        (df_original['PerformanceRating'] * 15)
    ).clip(40, 100)

    # interview_score - proxy built from the two satisfaction ratings,
    # clipped into [3, 10] and rounded to one decimal.
    df['interview_score'] = (
        (df_original['JobSatisfaction'] + df_original['EnvironmentSatisfaction']) * 1.25
    ).clip(3, 10).round(1)

    # previous_employment - 'Yes' when the person worked at one or more
    # companies before, otherwise 'No'.
    df['previous_employment'] = np.where(
        df_original['NumCompaniesWorked'] > 0, 'Yes', 'No'
    )

    # suitable_for_employment - inverse of Attrition: anyone who left
    # (Attrition == 'Yes') is labelled 'No', everyone else 'Yes'.
    df['suitable_for_employment'] = np.where(
        df_original['Attrition'] == 'Yes', 'No', 'Yes'
    )

    print("✓ Dataset mapped successfully!")
    print(f"   Mapped features: {', '.join(df.columns)}")
# Quick textual exploration of `df`: preview, schema, summary statistics,
# missing-value audit and target balance.

# Display first few rows
print("\n[2] DATASET PREVIEW:")
print(df.head(10))

# Dataset Information
print("\n[3] DATASET INFORMATION:")
print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")
print("\nColumn Details:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# Basic Statistics (numeric columns only, per DataFrame.describe defaults)
print("\n[4] STATISTICAL SUMMARY:")
print(df.describe())

# Check for null values
print("\n[5] MISSING VALUES CHECK:")
null_counts = df.isnull().sum()
print(null_counts)
if null_counts.sum() == 0:
    print("✓ No missing values found!")
else:
    print("⚠ Missing values detected. Handling required.")

# Distribution of target variable
print("\n[6] TARGET VARIABLE DISTRIBUTION:")
target_dist = df['suitable_for_employment'].value_counts()
print(target_dist)
# Percentage share of every class, most frequent first. Joining over all
# classes avoids an IndexError when the target happens to contain one class.
class_shares = target_dist / target_dist.sum() * 100
print("\nClass Balance: " + " vs ".join(f"{share:.1f}%" for share in class_shares))

# =============================================================================
# EXPLORATORY DATA ANALYSIS (EDA)
# =============================================================================

print("\n" + "=" * 80)
print("EXPLORATORY DATA ANALYSIS")
print("=" * 80)

# One 2x3 grid: four histograms for the numeric features and two bar charts
# for the categorical columns (education level and the target).
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Exploratory Data Analysis - Feature Distributions', fontsize=16, y=1.02)

# The numeric panels share one recipe; only data and labelling differ.
# (grid position, column, bin count, bar colour, title, x-axis label)
histogram_specs = [
    ((0, 0), 'age', 20, 'skyblue', 'Age Distribution', 'Age'),
    ((0, 2), 'years_of_experience', 15, 'lightgreen', 'Years of Experience Distribution', 'Years'),
    ((1, 0), 'technical_test_score', 20, 'gold', 'Technical Test Score Distribution', 'Score'),
    ((1, 1), 'interview_score', 15, 'plum', 'Interview Score Distribution', 'Score'),
]
for grid_pos, column, n_bins, colour, title, x_label in histogram_specs:
    panel = axes[grid_pos]
    panel.hist(df[column], bins=n_bins, color=colour, edgecolor='black')
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

# Education level counts (top row, middle panel)
education_panel = axes[0, 1]
education_counts = df['education_level'].value_counts()
education_counts.plot(kind='bar', ax=education_panel, color='lightcoral')
education_panel.set_title('Education Level Distribution')
education_panel.set_xlabel('Education Level')
education_panel.set_ylabel('Count')
education_panel.tick_params(axis='x', rotation=45)

# Target counts (bottom row, right panel); colours follow frequency order,
# i.e. the most common class is drawn in green.
target_panel = axes[1, 2]
target_counts = df['suitable_for_employment'].value_counts()
target_counts.plot(kind='bar', ax=target_panel, color=['green', 'red'])
target_panel.set_title('Suitability for Employment')
target_panel.set_xlabel('Suitable')
target_panel.set_ylabel('Count')
target_panel.tick_params(axis='x', rotation=0)

plt.tight_layout()
fig.savefig('eda_distributions.png', dpi=300, bbox_inches='tight')
print("\n✓ EDA visualizations saved as 'eda_distributions.png'")

# =============================================================================
# TASK 2: DATA PREPROCESSING
# =============================================================================

print("\n" + "=" * 80)
print("DATA PREPROCESSING")
print("=" * 80)

# All encoding happens on a copy so the raw `df` stays available for reporting.
df_processed = df.copy()

print("\n[1] ENCODING CATEGORICAL VARIABLES...")

# previous_employment: label-encode the Yes/No column to integers.
le_prev_emp = LabelEncoder()
prev_emp_encoded = le_prev_emp.fit_transform(df_processed['previous_employment'])
df_processed['previous_employment'] = prev_emp_encoded
prev_emp_codes = dict(zip(le_prev_emp.classes_, le_prev_emp.transform(le_prev_emp.classes_)))
print(f"   Previous Employment: {prev_emp_codes}")

# suitable_for_employment (target): label-encode the same way.
le_target = LabelEncoder()
target_encoded = le_target.fit_transform(df_processed['suitable_for_employment'])
df_processed['suitable_for_employment'] = target_encoded
target_codes = dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))
print(f"   Target Variable: {target_codes}")

# education_level: expand into one indicator column per level, named edu_<level>.
df_processed = pd.get_dummies(df_processed, columns=['education_level'], prefix='edu')
edu_column_count = sum(1 for name in df_processed.columns if name.startswith('edu_'))
print(f"   Education Level: One-hot encoded into {edu_column_count} columns")

print("\n[2] PROCESSED DATASET:")
print(df_processed.head())

# Separate predictors (X) from the encoded target (y).
y = df_processed['suitable_for_employment']
X = df_processed.drop(columns='suitable_for_employment')

print(f"\n[3] FEATURE MATRIX SHAPE: {X.shape}")
print(f"    Target VECTOR SHAPE: {y.shape}")

# 80/20 split, stratified on the target so both partitions keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

total_rows = len(X)
print("\n[4] DATA SPLIT COMPLETE:")
print(f"    Training Set: {len(X_train)} samples ({len(X_train)/total_rows*100:.1f}%)")
print(f"    Testing Set:  {len(X_test)} samples ({len(X_test)/total_rows*100:.1f}%)")

# =============================================================================
# TASK 3: MODEL BUILDING
# =============================================================================

print("\n" + "=" * 80)
print("MODEL BUILDING")
print("=" * 80)

# Hyperparameters for the tree: Gini impurity, depth capped at 5, and minimum
# sample counts for splitting (10) and for leaves (5); seeded for
# reproducibility.
tree_settings = {
    'criterion': 'gini',
    'max_depth': 5,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'random_state': 42,
}

print("\n[1] TRAINING DECISION TREE CLASSIFIER...")
dt_classifier = DecisionTreeClassifier(**tree_settings)
dt_classifier.fit(X_train, y_train)
print("✓ Model training complete!")

# Echo the configuration back from the fitted estimator itself.
print("\n[2] MODEL PARAMETERS:")
fitted_summary = [
    ("Criterion", dt_classifier.criterion),
    ("Max Depth", dt_classifier.max_depth),
    ("Min Samples Split", dt_classifier.min_samples_split),
    ("Min Samples Leaf", dt_classifier.min_samples_leaf),
    ("Number of Features", dt_classifier.n_features_in_),
    ("Number of Classes", dt_classifier.n_classes_),
]
for label, value in fitted_summary:
    print(f"    {label}: {value}")

# =============================================================================
# TASK 4: MODEL VISUALIZATION
# =============================================================================

print("\n" + "="*80)
print("MODEL VISUALIZATION")
print("="*80)

# Render the fitted tree on a large canvas and save it as a PNG.
plt.figure(figsize=(20, 12))
plot_tree(
    dt_classifier,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(X.columns),
    # Names are matched to classes in ascending order, i.e. encoded 0 then 1;
    # this follows the `pred == 1` -> SUITABLE convention used for predictions.
    class_names=['Not Suitable', 'Suitable'],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title('Decision Tree for Employment Prediction', fontsize=16, pad=20)
plt.savefig('decision_tree_visualization.png', dpi=300, bbox_inches='tight')
print("\n✓ Decision tree visualization saved as 'decision_tree_visualization.png'")

# =============================================================================
# TASK 5: MODEL TESTING AND PREDICTION
# =============================================================================

print("\n" + "="*80)
print("MODEL TESTING AND PREDICTION")
print("="*80)

# Predictions on the held-out test set (reused later for evaluation).
y_pred = dt_classifier.predict(X_test)
print(f"\n[1] PREDICTIONS ON TEST SET COMPLETE")
print(f"    Total predictions: {len(y_pred)}")

# Test with 3 hypothetical candidates
print("\n[2] TESTING WITH HYPOTHETICAL CANDIDATES:")
print("-" * 80)


def _evaluate_candidate(number, profile_name, profile, education_column, education_label):
    """Score one hypothetical candidate with the fitted tree and print a report.

    Args:
        number: Ordinal shown in the report heading.
        profile_name: Short descriptor for the heading (e.g. "Strong").
        profile: Dict with the keys 'age', 'years_of_experience',
            'technical_test_score', 'interview_score' and the already-encoded
            'previous_employment' (1 = Yes, 0 = No).
        education_column: Name of the one-hot column to switch on
            (e.g. 'edu_Master'); every other 'edu_' column is set to 0.
        education_label: Human-readable education text for the report.

    Returns:
        Tuple of (single-row feature DataFrame, predicted class,
        class-probability array).
    """
    row = dict(profile)
    for col in X.columns:
        if col.startswith('edu_'):
            row[col] = 1 if col == education_column else 0

    # Reindex to the training column order so the model sees features in the
    # same positions it was fitted on.
    candidate_df = pd.DataFrame([row], columns=X.columns)
    prediction = dt_classifier.predict(candidate_df)[0]
    probabilities = dt_classifier.predict_proba(candidate_df)[0]

    # The report is built from the values actually fed to the model, so the
    # description cannot drift away from the inputs.
    previous = 'Yes' if profile['previous_employment'] == 1 else 'No'
    print(f"\nCANDIDATE {number} - {profile_name} Profile:")
    print(f"  Age: {profile['age']}, Experience: {profile['years_of_experience']} years, "
          f"Education: {education_label}")
    print(f"  Technical Score: {profile['technical_test_score']}/100, "
          f"Interview: {profile['interview_score']}/10, Previous Employment: {previous}")
    print(f"  → PREDICTION: {'SUITABLE' if prediction == 1 else 'NOT SUITABLE'}")
    print(f"  → CONFIDENCE: {max(probabilities)*100:.1f}%")
    return candidate_df, prediction, probabilities


# Candidate 1: Strong profile
candidate1 = {
    'age': 28,
    'years_of_experience': 5,
    'technical_test_score': 85,
    'interview_score': 8.5,
    'previous_employment': 1,  # Yes
}
candidate1_df, pred1, prob1 = _evaluate_candidate(
    1, "Strong", candidate1, 'edu_Bachelor', "Bachelor's"
)

# Candidate 2: Weak profile
candidate2 = {
    'age': 35,
    'years_of_experience': 2,
    'technical_test_score': 55,
    'interview_score': 5.0,
    'previous_employment': 0,  # No
}
candidate2_df, pred2, prob2 = _evaluate_candidate(
    2, "Weak", candidate2, 'edu_High School', "High School"
)

# Candidate 3: Moderate profile
candidate3 = {
    'age': 30,
    'years_of_experience': 7,
    'technical_test_score': 72,
    'interview_score': 7.0,
    'previous_employment': 1,  # Yes
}
candidate3_df, pred3, prob3 = _evaluate_candidate(
    3, "Moderate", candidate3, 'edu_Master', "Master's"
)

# =============================================================================
# TASK 6: MODEL EVALUATION
# =============================================================================

print("\n" + "=" * 80)
print("MODEL EVALUATION")
print("=" * 80)

# Display names for the two encoded classes, in the order 0 then 1.
class_labels = ['Not Suitable', 'Suitable']

# Overall accuracy on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
print(f"\n[1] ACCURACY SCORE: {accuracy*100:.2f}%")

# Confusion matrix: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("\n[2] CONFUSION MATRIX:")
print(cm)

# Heatmap rendering of the same matrix, saved to disk.
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix', fontsize=14, pad=15)
cm_ax.set_ylabel('Actual', fontsize=12)
cm_ax.set_xlabel('Predicted', fontsize=12)
cm_fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
print("✓ Confusion matrix saved as 'confusion_matrix.png'")

# Per-class precision / recall / F1 table.
print("\n[3] CLASSIFICATION REPORT:")
report_text = classification_report(y_test, y_pred, target_names=class_labels)
print(report_text)

# =============================================================================
# BONUS TASK: FEATURE IMPORTANCE ANALYSIS
# =============================================================================

print("\n" + "=" * 80)
print("BONUS: FEATURE IMPORTANCE ANALYSIS")
print("=" * 80)

# Pair every model input column with the importance the fitted tree assigned
# to it, most important first.
importance_columns = {
    'Feature': X.columns,
    'Importance': dt_classifier.feature_importances_,
}
feature_importance = pd.DataFrame(importance_columns).sort_values('Importance', ascending=False)

print("\n[1] FEATURE IMPORTANCE RANKING:")
print(feature_importance.to_string(index=False))

# Horizontal bar chart of (at most) the ten highest-ranked features; the
# y-axis is flipped so the top-ranked feature is drawn at the top.
top_features = feature_importance.head(10)
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(top_features['Feature'], top_features['Importance'], color='steelblue')
importance_ax.set_xlabel('Importance Score', fontsize=12)
importance_ax.set_ylabel('Feature', fontsize=12)
importance_ax.set_title('Top 10 Most Important Features', fontsize=14, pad=15)
importance_ax.invert_yaxis()
importance_fig.tight_layout()
importance_fig.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
print("\n✓ Feature importance plot saved as 'feature_importance.png'")

# Summary: recap of dataset size, model accuracy and the strongest predictors.
print("\n" + "="*80)
print("ANALYSIS SUMMARY")
print("="*80)
# `df` still contains the target column, so it is excluded from the feature
# count to avoid over-reporting by one.
n_input_features = df.shape[1] - 1
print(f"\n✓ Dataset processed: {df.shape[0]} samples, {n_input_features} features")
print(f"✓ Model trained: Decision Tree Classifier")
print(f"✓ Model accuracy: {accuracy*100:.2f}%")
print(f"✓ Top 3 important features:")
# `feature_importance` is already sorted descending; head(3) also copes with
# fewer than three rows.
for rank, entry in enumerate(feature_importance.head(3).itertuples(index=False), start=1):
    print(f"   {rank}. {entry.Feature}: {entry.Importance:.4f}")
print("\n✓ All visualizations saved successfully!")
print("="*80)


