<a href="https://colab.research.google.com/github/Mcdtronix/Todo/blob/master/Employee_perfomance_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Name of the column that the models in this notebook are trained to predict.
target = 'PerformanceRating'

# Read the HR dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

# Report what was loaded: the frame's dimensions and how often each target value occurs.
print("✅ Data loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Target variable '{target}' values: {df[target].value_counts().to_dict()}")

✅ Data loaded successfully!
Dataset shape: (1470, 35)
Target variable 'PerformanceRating' values: {3: 1244, 4: 226}


In [3]:
# Essential features for performance prediction, grouped by theme (order is unchanged).
# NOTE(review): no later cell visible in this notebook reads `primary_features`; the
# model inputs are derived from the encoded frame's columns instead — confirm whether
# this list is still needed.
primary_features = [
    # Demographics and role
    'Age', 'Gender', 'MaritalStatus', 'DistanceFromHome',
    'Education', 'JobRole', 'Department', 'JobLevel',
    # Tenure
    'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
    # Compensation
    'MonthlyIncome', 'PercentSalaryHike', 'StockOptionLevel',
    # Survey scores
    'JobSatisfaction', 'EnvironmentSatisfaction',
    'RelationshipSatisfaction', 'WorkLifeBalance',
    # Work pattern
    'TrainingTimesLastYear', 'JobInvolvement', 'BusinessTravel',
    'OverTime', 'NumCompaniesWorked',
]

In [4]:
def engineer_features(df):
    """Return a copy of ``df`` with six derived columns appended.

    Added columns, in order:
        TenureRatio        YearsAtCompany / (TotalWorkingYears + 1)
        RoleStability      YearsInCurrentRole / (YearsAtCompany + 1)
        PromotionVelocity  YearsAtCompany / (YearsSinceLastPromotion + 1)
        SatisfactionScore  row-wise mean of the four satisfaction columns
        CareerMomentum     PercentSalaryHike * StockOptionLevel * PromotionVelocity
        TrainingIntensity  TrainingTimesLastYear / (YearsAtCompany + 1)

    The "+ 1" in each denominator keeps the ratios defined when the
    underlying year count is zero. The input frame is not modified.
    """
    satisfaction_cols = ['JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance']
    company_years_plus_one = df['YearsAtCompany'] + 1

    return df.assign(
        TenureRatio=df['YearsAtCompany'] / (df['TotalWorkingYears'] + 1),
        RoleStability=df['YearsInCurrentRole'] / company_years_plus_one,
        PromotionVelocity=df['YearsAtCompany'] / (df['YearsSinceLastPromotion'] + 1),
        SatisfactionScore=df[satisfaction_cols].mean(axis=1),
        # Evaluated against the partially-built frame so it can read PromotionVelocity.
        CareerMomentum=lambda built: (
            built['PercentSalaryHike'] * built['StockOptionLevel'] * built['PromotionVelocity']
        ),
        TrainingIntensity=df['TrainingTimesLastYear'] / company_years_plus_one,
    )

# Derive the engineered columns for the whole dataset; `df` itself is left as loaded.
df_processed = df.pipe(engineer_features)

In [5]:
# Columns expanded into indicator (dummy) columns, in the order they are encoded.
# NOTE(review): a non-numeric column that is not listed here stays un-encoded at this
# stage; a later cell encodes whatever non-numeric columns remain after the split.
categorical_features = [
    'BusinessTravel', 'Department', 'Education', 'Gender',
    'JobRole', 'MaritalStatus', 'OverTime',
]

# One-hot encode categorical variables; the first level of each column is dropped and
# acts as the implicit baseline (all of that column's indicators equal to 0).
df_encoded = pd.get_dummies(
    data=df_processed,
    columns=categorical_features,
    dtype=int,
    drop_first=True,
)

In [6]:
# Prepare final feature set: every encoded column except the target and a fixed exclusion list.
# NOTE(review): the evaluation output recorded in this notebook shows a perfect score on
# the held-out split; check whether a retained column (e.g. PercentSalaryHike, or
# CareerMomentum, which is computed from it) leaks the target — TODO confirm.
excluded_cols = {target, 'Attrition', 'EmployeeNumber', 'EmployeeCount', 'StandardHours', 'Over18'}
feature_cols = [col for col in df_encoded.columns if col not in excluded_cols]
X, y = df_encoded[feature_cols], df_encoded[target]

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Only numeric columns are scaled here; non-numeric columns are left out of these frames.
X_train_numeric, X_test_numeric = (
    part.select_dtypes(include=np.number) for part in (X_train, X_test)
)

# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train_numeric)
X_train_scaled = scaler.transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

In [8]:
# ----------------------------------------
# Train Random Forest Model
# ----------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Hyper-parameters collected in one place; class_weight="balanced" re-weights the classes.
rf_settings = {
    "n_estimators": 200,
    "max_depth": None,
    "class_weight": "balanced",
    "random_state": 42,
}

# Fit on the scaled numeric training features.
model = RandomForestClassifier(**rf_settings)
model.fit(X_train_scaled, y_train)

# ----------------------------------------
# Evaluate the model
# ----------------------------------------
y_pred = model.predict(X_test_scaled)

print("✅ Model trained successfully!\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


✅ Model trained successfully!

Confusion Matrix:
[[249   0]
 [  0  45]]

Classification Report:
              precision    recall  f1-score   support

           3       1.00      1.00      1.00       249
           4       1.00      1.00      1.00        45

    accuracy                           1.00       294
   macro avg       1.00      1.00      1.00       294
weighted avg       1.00      1.00      1.00       294



In [9]:
# Persisting the model and scaler is no longer done in this cell. The joblib.dump
# calls for 'random_forest_performance_model.pkl' and 'scaler.pkl' now live in the
# later cell that refits the scaler and retrains the model (originally noted as
# "cell eab75b67"). This cell only prints a reminder of that.

print("Model and scaler saving is now handled in the training cell.")

Model and scaler saving is now handled in the training cell.


In [10]:
# Identify the columns of the split data that are still non-numeric, i.e. that the
# earlier one-hot step did not encode.
non_numeric_cols_train = X_train.select_dtypes(exclude=np.number).columns
non_numeric_cols_test = X_test.select_dtypes(exclude=np.number).columns

# Encode the training split; its dummy columns define the reference layout.
X_train_non_numeric_encoded = pd.get_dummies(X_train[non_numeric_cols_train], drop_first=True, dtype=int)

# Encode the test split WITHOUT drop_first and then align it to the training layout.
# Encoding each split independently with drop_first=True is unsafe: if a level is absent
# from one split, a different "first" level is dropped and the two frames end up with
# different (or differently-meaning) columns. Reindexing keeps exactly the training
# columns; a level the training split never produced a column for becomes all zeros.
X_test_non_numeric_encoded = pd.get_dummies(
    X_test[non_numeric_cols_test], drop_first=False, dtype=int
).reindex(columns=X_train_non_numeric_encoded.columns, fill_value=0)

# Independent copies of the numeric feature frames (same columns and index as before).
X_train_numeric_df = X_train_numeric.copy()
X_test_numeric_df = X_test_numeric.copy()

# Concatenate numerical features with encoded non-numeric features
X_train_processed = pd.concat([X_train_numeric_df, X_train_non_numeric_encoded], axis=1)
X_test_processed = pd.concat([X_test_numeric_df, X_test_non_numeric_encoded], axis=1)

# The scaler and the model both rely on the two frames sharing one column layout.
assert list(X_train_processed.columns) == list(X_test_processed.columns), \
    "train/test feature columns are misaligned"

# Scale the *entire* processed training data.
# NOTE: this rebinds `scaler`, `X_train_scaled` and `X_test_scaled` from the earlier
# scaling cell, so any cell run after this one sees the all-features versions.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)

# Convert scaled arrays back to DataFrames to preserve column names
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train_processed.columns, index=X_train_processed.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test_processed.columns, index=X_test_processed.index)

# Initialize and train the RandomForestClassifier (this rebinds `model` as well)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled_df, y_train)  # Use scaled data for training

# Save the trained model and scaler *after* fitting
joblib.dump(model, 'random_forest_performance_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("Model training complete.")
print("Model and scaler saved.")

Model training complete.
Model and scaler saved.


In [11]:
# Train model: a regularised, class-weighted forest fitted on whatever
# `X_train_scaled` currently holds (it is rebound by more than one earlier cell).
rf_settings_tuned = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**rf_settings_tuned)
rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out split
y_pred = rf_model.predict(X_test_scaled)

# Evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Save the model; as the cell's last expression, the written path list is displayed.
joblib.dump(rf_model, 'employee_performance_model.pkl')


Classification Report:
              precision    recall  f1-score   support

           3       1.00      1.00      1.00       249
           4       1.00      1.00      1.00        45

    accuracy                           1.00       294
   macro avg       1.00      1.00      1.00       294
weighted avg       1.00      1.00      1.00       294


Confusion Matrix:
[[249   0]
 [  0  45]]


['employee_performance_model.pkl']

In [12]:
def preprocess_new_employee_data(employee_data, training_columns, categorical_features):
    """
    Preprocesses new employee data to match the format of the training data.

    The record is turned into a one-row frame, passed through
    ``engineer_features``, one-hot encoded, and finally reindexed to
    ``training_columns`` (missing columns are filled with 0, extra columns
    are dropped, column order follows ``training_columns``).

    Args:
        employee_data (dict): A dictionary containing the new employee's data.
        training_columns (pd.Index): The columns of the training data.
        categorical_features (list): A list of categorical feature names.

    Returns:
        pd.DataFrame: The preprocessed and aligned one-row DataFrame.

    Raises:
        KeyError: If a column needed by ``engineer_features`` or listed in
            ``categorical_features`` is absent from ``employee_data``.
    """
    employee_df = pd.DataFrame([employee_data])

    # Apply feature engineering
    employee_df_processed = engineer_features(employee_df)

    # Besides the declared categorical columns, encode any remaining non-numeric
    # column (e.g. a free-text field) so it can line up with training dummies
    # instead of being silently discarded by the reindex below.
    leftover_non_numeric = [
        col for col in employee_df_processed.columns
        if col not in categorical_features
        and not pd.api.types.is_numeric_dtype(employee_df_processed[col])
    ]
    columns_to_encode = list(categorical_features) + leftover_non_numeric

    # drop_first must be False here: a single row holds exactly one level per column,
    # so drop_first=True would drop that only level and erase every categorical value.
    # The baseline level is removed by the reindex instead, because the training
    # columns do not contain it.
    employee_df_encoded = pd.get_dummies(
        employee_df_processed, columns=columns_to_encode, drop_first=False, dtype=int
    )

    # A code supplied as a float (3.0) yields a label such as 'Education_3.0'. When only
    # the integer spelling exists among the training columns, map the label onto it.
    known_columns = set(training_columns)
    float_label_renames = {
        col: col[:-2]
        for col in employee_df_encoded.columns
        if isinstance(col, str)
        and col.endswith('.0')
        and col not in known_columns
        and col[:-2] in known_columns
    }
    if float_label_renames:
        employee_df_encoded = employee_df_encoded.rename(columns=float_label_renames)

    # Align to the training layout: same columns, same order, absent ones set to 0.
    # NOTE(review): numeric features the caller did not supply are also filled with 0
    # before scaling, which may be far from their training range — confirm acceptable.
    return employee_df_encoded.reindex(columns=training_columns, fill_value=0)


def predict_performance(employee_data, model_path='random_forest_performance_model.pkl', scaler_path='scaler.pkl'):
    """Predict the performance rating for one employee record.

    Args:
        employee_data (dict): Raw feature values for a single employee.
        model_path (str): joblib file holding the fitted classifier.
        scaler_path (str): joblib file holding the fitted scaler.

    Returns:
        The first (only) element of the classifier's prediction array.

    Notes:
        Reads the module-level ``categorical_features`` list. joblib files are
        pickles, so only load artefacts written by this notebook or another
        trusted source.
    """
    # Load the model and scaler
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)

    # Take the column layout from the scaler itself when it recorded one (scikit-learn
    # does so when fitted on a DataFrame with string column names). This keeps the
    # function usable in a fresh kernel; otherwise fall back to the in-memory frame.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is not None:
        training_cols = pd.Index(fitted_names)
    else:
        training_cols = X_train_processed.columns

    # Preprocess the new employee data using the helper function
    aligned_employee_df = preprocess_new_employee_data(employee_data, training_cols, categorical_features)

    # Apply scaler to the aligned and preprocessed input data
    employee_scaled = scaler.transform(aligned_employee_df)

    # Convert scaled array back to DataFrame to preserve column names
    employee_processed_scaled_df = pd.DataFrame(
        employee_scaled, columns=aligned_employee_df.columns, index=aligned_employee_df.index
    )

    # Make prediction
    prediction = model.predict(employee_processed_scaled_df)
    return prediction[0]

# Example usage with multiple inputs.
# Each record is written as one row of values; `employee_fields` supplies the keys.
# NOTE(review): 'Education' is given as a text label here, whereas the interactive
# input helper in this notebook collects it as a number — confirm which form matches
# the encoding used for training.
employee_fields = [
    'Age', 'Gender', 'MaritalStatus', 'DistanceFromHome', 'Education',
    'JobRole', 'Department', 'JobLevel', 'TotalWorkingYears', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'MonthlyIncome', 'PercentSalaryHike', 'StockOptionLevel', 'JobSatisfaction',
    'EnvironmentSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance',
    'TrainingTimesLastYear', 'JobInvolvement', 'BusinessTravel', 'OverTime',
    'NumCompaniesWorked',
]
employee_rows = [
    (30, 'Male', 'Single', 5, 'Bachelor',
     'Sales Executive', 'Sales', 1, 5, 3,
     2, 1, 2,
     5000, 5, 1, 4,
     3, 4, 3,
     2, 3, 'Travel_Rarely', 'No',
     1),
    (45, 'Female', 'Married', 15, 'Master',
     'Research Scientist', 'Research & Development', 3, 20, 15,
     10, 5, 10,
     10000, 10, 2, 5,
     5, 5, 4,
     3, 4, 'Travel_Frequently', 'Yes',
     2),
    (22, 'Male', 'Single', 2, 'High School',
     'Laboratory Technician', 'Research & Development', 1, 1, 1,
     0, 0, 0,
     3000, 15, 0, 2,
     2, 3, 2,
     1, 2, 'Non-Travel', 'No',
     1),
]
employee_inputs = [dict(zip(employee_fields, row)) for row in employee_rows]

# Score every example record and print its predicted rating with a derived risk label.
for i, employee_input in enumerate(employee_inputs):
    performance = predict_performance(employee_input)
    # Only a predicted rating of 4 is labelled low risk; any other rating is labelled high.
    if performance == 4:
        risk_level = 'Low'
    else:
        risk_level = 'High'
    print(f'--- Employee Input {i+1} ---')
    print(f'Predicted Performance: {performance}, Risk Level: {risk_level}')
    print("-" * 20)

--- Employee Input 1 ---
Predicted Performance: 3, Risk Level: High
--------------------
--- Employee Input 2 ---
Predicted Performance: 3, Risk Level: High
--------------------
--- Employee Input 3 ---
Predicted Performance: 3, Risk Level: High
--------------------


In [13]:
def engineer_features(df):
    """Return a copy of ``df`` with six derived columns appended.

    NOTE: this cell re-defines ``engineer_features``; an earlier cell of this
    notebook defines a function of the same name performing the same
    computation, and this later definition shadows it once the cell runs.

    Added columns, in order:
        TenureRatio        YearsAtCompany / (TotalWorkingYears + 1)
        RoleStability      YearsInCurrentRole / (YearsAtCompany + 1)
        PromotionVelocity  YearsAtCompany / (YearsSinceLastPromotion + 1)
        SatisfactionScore  row-wise mean of the four satisfaction columns
        CareerMomentum     PercentSalaryHike * StockOptionLevel * PromotionVelocity
        TrainingIntensity  TrainingTimesLastYear / (YearsAtCompany + 1)

    The "+ 1" in each denominator keeps the ratios defined when the
    underlying year count is zero. The input frame is not modified.
    """
    satisfaction_cols = ['JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance']
    company_years_plus_one = df['YearsAtCompany'] + 1

    return df.assign(
        TenureRatio=df['YearsAtCompany'] / (df['TotalWorkingYears'] + 1),
        RoleStability=df['YearsInCurrentRole'] / company_years_plus_one,
        PromotionVelocity=df['YearsAtCompany'] / (df['YearsSinceLastPromotion'] + 1),
        SatisfactionScore=df[satisfaction_cols].mean(axis=1),
        # Evaluated against the partially-built frame so it can read PromotionVelocity.
        CareerMomentum=lambda built: (
            built['PercentSalaryHike'] * built['StockOptionLevel'] * built['PromotionVelocity']
        ),
        TrainingIntensity=df['TrainingTimesLastYear'] / company_years_plus_one,
    )

def preprocess_new_employee_data(employee_data, training_columns, categorical_features):
    """
    Preprocesses new employee data to match the format of the training data.

    The record is turned into a one-row frame, passed through
    ``engineer_features``, one-hot encoded, and finally reindexed to
    ``training_columns`` (missing columns are filled with 0, extra columns
    are dropped, column order follows ``training_columns``).

    Args:
        employee_data (dict): A dictionary containing the new employee's data.
        training_columns (pd.Index): The columns of the training data.
        categorical_features (list): A list of categorical feature names.

    Returns:
        pd.DataFrame: The preprocessed and aligned one-row DataFrame.

    Raises:
        KeyError: If a column needed by ``engineer_features`` or listed in
            ``categorical_features`` is absent from ``employee_data``.
    """
    employee_df = pd.DataFrame([employee_data])

    # Apply feature engineering
    employee_df_processed = engineer_features(employee_df)

    # Besides the declared categorical columns, encode any remaining non-numeric
    # column (e.g. a free-text field) so it can line up with training dummies
    # instead of being silently discarded by the reindex below.
    leftover_non_numeric = [
        col for col in employee_df_processed.columns
        if col not in categorical_features
        and not pd.api.types.is_numeric_dtype(employee_df_processed[col])
    ]
    columns_to_encode = list(categorical_features) + leftover_non_numeric

    # drop_first must be False here: a single row holds exactly one level per column,
    # so drop_first=True would drop that only level and erase every categorical value.
    # The baseline level is removed by the reindex instead, because the training
    # columns do not contain it.
    employee_df_encoded = pd.get_dummies(
        employee_df_processed, columns=columns_to_encode, drop_first=False, dtype=int
    )

    # A code supplied as a float (3.0) yields a label such as 'Education_3.0'. When only
    # the integer spelling exists among the training columns, map the label onto it.
    known_columns = set(training_columns)
    float_label_renames = {
        col: col[:-2]
        for col in employee_df_encoded.columns
        if isinstance(col, str)
        and col.endswith('.0')
        and col not in known_columns
        and col[:-2] in known_columns
    }
    if float_label_renames:
        employee_df_encoded = employee_df_encoded.rename(columns=float_label_renames)

    # Align to the training layout: same columns, same order, absent ones set to 0.
    # NOTE(review): numeric features the caller did not supply are also filled with 0
    # before scaling, which may be far from their training range — confirm acceptable.
    return employee_df_encoded.reindex(columns=training_columns, fill_value=0)


def predict_performance(employee_data, model_path='random_forest_performance_model.pkl', scaler_path='scaler.pkl'):
    """Predict the performance rating for one employee record.

    Args:
        employee_data (dict): Raw feature values for a single employee.
        model_path (str): joblib file holding the fitted classifier.
        scaler_path (str): joblib file holding the fitted scaler.

    Returns:
        The first (only) element of the classifier's prediction array.

    Notes:
        Reads the module-level ``categorical_features`` list. joblib files are
        pickles, so only load artefacts written by this notebook or another
        trusted source.
    """
    # Load the model and scaler
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)

    # Take the column layout from the scaler itself when it recorded one (scikit-learn
    # does so when fitted on a DataFrame with string column names). This keeps the
    # function usable in a fresh kernel; otherwise fall back to the in-memory frame.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is not None:
        training_cols = pd.Index(fitted_names)
    else:
        training_cols = X_train_processed.columns

    # Preprocess the new employee data using the helper function
    aligned_employee_df = preprocess_new_employee_data(employee_data, training_cols, categorical_features)

    # Apply scaler to the aligned and preprocessed input data
    employee_scaled = scaler.transform(aligned_employee_df)

    # Convert scaled array back to DataFrame to preserve column names
    employee_processed_scaled_df = pd.DataFrame(
        employee_scaled, columns=aligned_employee_df.columns, index=aligned_employee_df.index
    )

    # Make prediction
    prediction = model.predict(employee_processed_scaled_df)
    return prediction[0]

# Example usage: a single hand-written employee record, keyed by feature name.
employee_input = dict(
    Age=30,
    Gender='Male',
    MaritalStatus='Single',
    DistanceFromHome=5,
    Education='Bachelor',
    JobRole='Sales Executive',
    Department='Sales',
    JobLevel=1,
    TotalWorkingYears=5,
    YearsAtCompany=3,
    YearsInCurrentRole=2,
    YearsSinceLastPromotion=1,
    YearsWithCurrManager=2,
    MonthlyIncome=5000,
    PercentSalaryHike=5,
    StockOptionLevel=1,
    JobSatisfaction=4,
    EnvironmentSatisfaction=3,
    RelationshipSatisfaction=4,
    WorkLifeBalance=3,
    TrainingTimesLastYear=2,
    JobInvolvement=3,
    BusinessTravel='Travel_Rarely',
    OverTime='No',
    NumCompaniesWorked=1,
)

# Score the example record.
performance = predict_performance(employee_input)

# Only a predicted rating of 4 is labelled low risk; any other rating is labelled high.
if performance == 4:
    risk_level = 'Low'
else:
    risk_level = 'High'
print(f'Predicted Performance: {performance}, Risk Level: {risk_level}')

Predicted Performance: 3, Risk Level: High


In [14]:
# Function to get employee data input from the user
def get_employee_input(input_fn=None):
    """Interactively collect one employee record and return it as a dict.

    Numeric fields are re-prompted until a finite number is entered; whole
    numbers are stored as ``int`` so that integer codes keep their integer
    form. Categorical fields are matched case-insensitively (surrounding
    whitespace ignored) and stored with their canonical spelling.

    Args:
        input_fn: Callable taking a prompt string and returning the reply.
            Defaults to the built-in ``input``; pass a stub to drive the
            function without a keyboard.

    Returns:
        dict: Feature name -> value, including the fixed defaults added at the
        end for fields that are not prompted for.
    """
    if input_fn is None:
        input_fn = input  # resolved at call time, not at definition time

    employee_data = {}
    print("Please enter employee data:")

    # Get numerical inputs
    numerical_features = [
        'Age', 'DistanceFromHome', 'Education', 'JobLevel', 'TotalWorkingYears',
        'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
        'YearsWithCurrManager', 'MonthlyIncome', 'PercentSalaryHike',
        'StockOptionLevel', 'JobSatisfaction', 'EnvironmentSatisfaction',
        'RelationshipSatisfaction', 'WorkLifeBalance', 'TrainingTimesLastYear',
        'JobInvolvement', 'NumCompaniesWorked'
    ]
    for feature in numerical_features:
        while True:
            try:
                value = float(input_fn(f"Enter {feature}: "))
            except ValueError:
                print("Invalid input. Please enter a number.")
                continue
            # float() also parses "nan" and "inf"; neither is a usable feature value.
            if value != value or value in (float('inf'), float('-inf')):
                print("Invalid input. Please enter a number.")
                continue
            employee_data[feature] = int(value) if value.is_integer() else value
            break

    # Get categorical inputs
    categorical_features_input = {
        'Gender': ['Male', 'Female'],
        'MaritalStatus': ['Single', 'Married', 'Divorced'],
        'JobRole': ['Sales Executive', 'Research Scientist', 'Laboratory Technician',
                    'Manufacturing Director', 'Healthcare Representative', 'Manager',
                    'Sales Representative', 'Research Director', 'Human Resources'],
        'Department': ['Sales', 'Research & Development', 'Human Resources'],
        'BusinessTravel': ['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'],
        'OverTime': ['Yes', 'No']
    }
    for feature, categories in categorical_features_input.items():
        # Case-folded lookup so "male" or " Male " resolves to the canonical 'Male'.
        canonical_by_key = {category.casefold(): category for category in categories}
        while True:
            reply = input_fn(f"Enter {feature} {categories}: ")
            matched = canonical_by_key.get(reply.strip().casefold())
            if matched is not None:
                employee_data[feature] = matched
                break
            print(f"Invalid input. Please enter one of: {categories}")

    # Fixed defaults for fields that are not prompted for. JobInvolvement and
    # JobSatisfaction need no default: the numeric loop above always sets them.
    employee_data.setdefault('EducationField', 'Life Sciences')
    employee_data.setdefault('DailyRate', 800)
    employee_data.setdefault('HourlyRate', 65)
    # Placeholder for the prediction target, kept so the returned keys are unchanged.
    # NOTE(review): it is presumably ignored downstream because the target is excluded
    # from the training feature columns — confirm before relying on it.
    employee_data.setdefault('PerformanceRating', 3)

    return employee_data

# Collect one record interactively, then score it with the saved model.
# NOTE: this cell blocks on keyboard input, so it cannot complete in an unattended run.
new_employee_data = get_employee_input()
predicted_performance = predict_performance(new_employee_data)

# Determine risk level: only a predicted rating of 4 is labelled low risk; any other
# rating is labelled high.
if predicted_performance == 4:
    risk_level = 'Low'
else:
    risk_level = 'High'

# Display result
print(f"\nPredicted Performance Rating: {predicted_performance}")
print(f"Predicted Risk Level: {risk_level}")

Please enter employee data:
Enter Age: 34
Enter DistanceFromHome: 5
Enter Education: tertiary
Invalid input. Please enter a number.
Enter Education: 3
Enter JobLevel: 3
Enter TotalWorkingYears: 5
Enter YearsAtCompany: 5
Enter YearsInCurrentRole: 2
Enter YearsSinceLastPromotion: 1
Enter YearsWithCurrManager: 1
Enter MonthlyIncome: 1000
Enter PercentSalaryHike: 20
Enter StockOptionLevel: 2
Enter JobSatisfaction: 1
Enter EnvironmentSatisfaction: 0
Enter RelationshipSatisfaction: 1
Enter WorkLifeBalance: 0
Enter TrainingTimesLastYear: 1
Enter JobInvolvement: 1
Enter NumCompaniesWorked: 2
Enter Gender ['Male', 'Female']: male
Invalid input. Please enter one of: ['Male', 'Female']
Enter Gender ['Male', 'Female']: Male
Enter MaritalStatus ['Single', 'Married', 'Divorced']: Married
Enter JobRole ['Sales Executive', 'Research Scientist', 'Laboratory Technician', 'Manufacturing Director', 'Healthcare Representative', 'Manager', 'Sales Representative', 'Research Director', 'Human Resources']: Hum

In [15]:
def generate_risk_alert(performance):
    """Map a predicted performance rating to a risk label.

    Uses the same rule as the other prediction cells in this notebook: a
    rating of 4 (or higher) is low risk and any lower rating is high risk.
    The previous mapping was inverted: it returned 'Low Risk' for the lowest
    rating and 'High Risk' for every rating above 2, so with the ratings this
    notebook's dataset contains (3 and 4) it could never report low risk.

    Args:
        performance: Predicted rating (a plain or NumPy integer).

    Returns:
        str: 'Low Risk' or 'High Risk'.
    """
    if performance >= 4:  # better rating -> lower risk
        return 'Low Risk'
    return 'High Risk'

# NOTE: `performance` is not assigned in this cell; it holds whatever value an earlier
# prediction cell last stored, so this result depends on cell execution order.
risk_alert = generate_risk_alert(performance)
print(f'Risk Alert: {risk_alert}')

Risk Alert: High Risk


In [16]:
import joblib

# Reload both saved classifiers and print their configurations for comparison.
# joblib files are pickles: only load files this notebook (or a trusted source) wrote.
model1, model2 = (
    joblib.load(saved_path)
    for saved_path in ('employee_performance_model.pkl', 'random_forest_performance_model.pkl')
)

print(model1, model2, sep='\n')


RandomForestClassifier(class_weight='balanced', max_depth=15,
                       min_samples_leaf=5, min_samples_split=10,
                       n_estimators=200, n_jobs=-1, random_state=42)
RandomForestClassifier(random_state=42)
