In [None]:
# 'Numpy' is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np

# 'Pandas' is used for data manipulation and analysis
import pandas as pd

# 'Seaborn' is based on matplotlib; used for plotting statistical graphics
import seaborn as sns

# 'Matplotlib' is a data visualization library for 2D and 3D plots, built on numpy
import matplotlib.pyplot as plt
%matplotlib inline
import sys
import joblib


from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier



The script builds a complete machine learning pipeline that loads employee performance data, preprocesses it, trains a Random Forest model, and saves all required artifacts for deployment.

### **1. Data Loading**
- Loads the CSV file containing employee data.
- Uses a fixed path inside Google Colab.

### **2. Preprocessing**
- Separates features (X) and target (y).
- Encodes the target label using `LabelEncoder` when it is categorical.
- Converts all categorical features into numerical form using **one-hot encoding** (`pd.get_dummies`).
- Scales all features, including the one-hot encoded columns, using **StandardScaler**.
- Stores the list of processed feature columns for future inference.
- Returns: scaled features, encoded target, scaler, column names, and label encoder.

### **3. Model Training**
- Splits the dataset into training and testing sets using **stratified sampling**.
- Trains a **Random Forest Classifier** with class balancing enabled.
- Evaluates accuracy and prints a full classification report.

### **4. Saving Artifacts**
- Saves:
  - The trained ML model (`employee_model.pkl`)
  - The scaler (`scaler.pkl`)
  - The list of feature columns (`training_columns.pkl`)
  - The label encoder (if used)
- These files make the model ready for deployment or inference in a production system.

### **5. Main Workflow**
- Loads data → preprocesses → trains the model → saves all artifacts.
- Provides console outputs for each step to track progress during execution.



In [6]:


def load_data(file_path: str) -> pd.DataFrame:
    """Load the dataset from a CSV file.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The file contents as a DataFrame.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Read the path the caller passed in rather than a hard-coded location,
    # so the function works outside the original Colab session.
    return pd.read_csv(file_path)

def preprocess_data(df: pd.DataFrame, target_col: str):
    """Split features from the target, encode categoricals, and scale features.

    Args:
        df: Dataset containing the feature columns and ``target_col``.
        target_col: Name of the label column.

    Returns:
        A tuple ``(X_scaled, y, scaler, feature_columns, label_encoder)``:
        the standardized feature array, the target (label-encoded when it is
        not numeric), the fitted ``StandardScaler``, the list of one-hot
        encoded feature column names, and the fitted ``LabelEncoder`` or
        ``None`` when the target was already numeric.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Encode any non-numeric target. Checking "not numeric" instead of
    # "dtype == object" also covers pandas string and categorical dtypes,
    # which would otherwise reach the classifier unencoded.
    label_encoder = None
    if not pd.api.types.is_numeric_dtype(y):
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)

    # One-hot encode categorical features; drop_first removes one level per
    # feature so the remaining dummies are not perfectly collinear.
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Remember the encoded column layout so inference data can be aligned to it
    feature_columns = X_encoded.columns.tolist()

    # Standardize every column (numeric and one-hot alike).
    # NOTE(review): the scaler is fitted on the whole dataset, i.e. before any
    # train/test split performed later, so held-out rows influence the scaling.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_encoded)

    return X_scaled, y, scaler, feature_columns, label_encoder

def train_model(X, y):
    """Fit a Random Forest classifier and report its hold-out performance.

    A stratified 80/20 split is made from ``X`` and ``y``; the classifier is
    fitted on the larger part, and accuracy plus a classification report for
    the smaller part are printed.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,  # keep class proportions equal in both parts
    )

    # class_weight="balanced" reweights classes inversely to their frequency
    forest = RandomForestClassifier(
        n_estimators=100, random_state=42, class_weight="balanced"
    )
    forest.fit(features_train, labels_train)

    # Score on the held-out part only
    predictions = forest.predict(features_test)
    accuracy = accuracy_score(labels_test, predictions)
    print("Model Accuracy:", accuracy)
    report = classification_report(labels_test, predictions)
    print("\nClassification Report:\n", report)

    return forest

def save_model(model, scaler, feature_columns, label_encoder=None,
               model_path="employee_model.pkl",
               scaler_path="scaler.pkl",
               columns_path="training_columns.pkl",
               encoder_path="label_encoder.pkl"):
    """Persist the model and its preprocessing artifacts with joblib.

    Args:
        model: Trained estimator to store at ``model_path``.
        scaler: Fitted scaler to store at ``scaler_path``.
        feature_columns: Encoded feature column names, stored at
            ``columns_path``.
        label_encoder: Optional fitted label encoder; written to
            ``encoder_path`` only when it is not ``None``.
        model_path: Destination file for the model.
        scaler_path: Destination file for the scaler.
        columns_path: Destination file for the column list.
        encoder_path: Destination file for the label encoder.
    """
    has_encoder = label_encoder is not None

    # Collect (object, destination) pairs, then write them all before reporting
    artifacts = [
        (model, model_path),
        (scaler, scaler_path),
        (feature_columns, columns_path),
    ]
    if has_encoder:
        artifacts.append((label_encoder, encoder_path))

    for artifact, destination in artifacts:
        joblib.dump(artifact, destination)

    print(f"Model saved to {model_path}")
    print(f"Scaler saved to {scaler_path}")
    print(f"Feature columns saved to {columns_path}")
    if has_encoder:
        print(f"Label encoder saved to {encoder_path}")

def main():
    """Run the training workflow: load, preprocess, train, and save."""
    # Dataset location and label column; adjust both for a different dataset
    target_col = "PerformanceRating"
    file_path = "/content/INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.csv"

    print("Loading data...")
    df = load_data(file_path)
    row_count, column_count = df.shape
    print(f"Data loaded: {row_count} rows, {column_count} columns")

    print("\nPreprocessing data...")
    prepared = preprocess_data(df, target_col)
    X, y, scaler, feature_columns, label_encoder = prepared

    print("\nTraining model...")
    model = train_model(X, y)

    print("\nSaving model artifacts...")
    save_model(model, scaler, feature_columns, label_encoder)

    print("\nTraining complete!")

if __name__ == "__main__":
    main()

Loading data...
Data loaded: 1200 rows, 28 columns

Preprocessing data...

Training model...
Model Accuracy: 0.7708333333333334

Classification Report:
               precision    recall  f1-score   support

           2       1.00      0.26      0.41        39
           3       0.76      1.00      0.86       175
           4       0.00      0.00      0.00        26

    accuracy                           0.77       240
   macro avg       0.59      0.42      0.42       240
weighted avg       0.72      0.77      0.70       240


Saving model artifacts...
Model saved to employee_model.pkl
Scaler saved to scaler.pkl
Feature columns saved to training_columns.pkl

Training complete!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Test the model by User Input


This section provides an interactive, command-line based system to predict performance ratings for new employees based on user input.

###  Loading Saved Artifacts
- `load_artifacts(...)`:
  - Loads the trained model (`employee_model.pkl`), scaler (`scaler.pkl`), training columns (`training_columns.pkl`), and label encoder (`label_encoder.pkl`) using `joblib`.
  - Handles missing files by printing an error and exiting the program.
  - Returns: `model`, `scaler`, `training_columns`, and `label_encoder` (if available).

###  Collecting User Input
- `get_user_input()`:
  - Asks the user to manually enter employee-related information via `input()`:
    - Personal: Age, Gender
    - Education: Education level
    - Work: Years at company, years in current role, department, job role
    - Performance-related: Job involvement, environment satisfaction, job satisfaction, work-life balance
    - Training/Progression: Training times last year, years since last promotion
    - Additional: Attrition (Yes/No), OverTime (Yes/No)
  - Stores these values in a dictionary and converts it into a one-row `DataFrame`.
  - This simulates a new employee profile for prediction.

###  Preprocessing User Data
- `preprocess_user_data(user_data, scaler, training_columns)`:
  - Applies one-hot encoding to categorical variables using `pd.get_dummies`.
  - Reindexes the encoded data to match the training columns, filling any missing columns with 0.
  - Uses the loaded `scaler` to scale the features so they match the training-time preprocessing.
  - Returns the scaled feature array ready for prediction.

###  Making Predictions and Displaying Results
- `predict_and_display(user_data, model, scaler, training_columns, label_encoder)`:
  - Preprocesses user input with `preprocess_user_data`.
  - Uses the loaded model to:
    - Predict the performance class (`model.predict`).
    - Get class probabilities (`model.predict_proba`).
  - If a label encoder exists, it converts the numeric prediction back to the original class label.
  - Prints:
    - Predicted performance rating.
    - Confidence level (highest class probability).
    - A probability breakdown for each class with a simple text-based bar visualization.
  - Calls `interpret_performance(rating)` to provide a human-readable interpretation.

###  Interpreting the Performance Rating
- `interpret_performance(rating)`:
  - Maps numeric ratings (1–5) to interpretation messages:
    - 1: Needs improvement
    - 2: Below average
    - 3: Meets expectations
    - 4: Exceeds expectations
    - 5: Outstanding
  - If the rating is non-numeric (e.g., string label), it prints a generic message with the rating.
  - Helps convert raw model output into understandable HR language.

###  Main Execution Flow
- `main()`:
  - Prints the system title and status messages.
  - Loads the model and preprocessing artifacts using `load_artifacts`.
  - Enters a loop:
    - Collects new employee data using `get_user_input`.
    - Predicts and displays performance using `predict_and_display`.
    - Asks the user whether they want to test another employee.
  - Exits the loop and ends the program when the user answers anything other than "yes"/"y".

###  Overall Purpose
- This script forms the **inference/prediction interface** for the trained employee performance model.
- It is intended to be used by HR or recruiters to input candidate/employee details and receive:
  - A predicted performance rating.
  - Confidence and probability distribution.
  - A simple interpretation of what that rating means in practical terms.


In [7]:
def load_artifacts(model_path="employee_model.pkl",
                   scaler_path="scaler.pkl",
                   columns_path="training_columns.pkl",
                   encoder_path="label_encoder.pkl"):
    """Load the trained model and its preprocessing artifacts from disk.

    The model, scaler, and column list are mandatory: if any of those files
    is missing, an error is printed and the process exits with status 1.
    The label encoder is optional and comes back as ``None`` when its file
    does not exist.

    Returns:
        Tuple ``(model, scaler, training_columns, label_encoder)``.
    """
    # Mandatory artifacts: a missing file here is fatal
    try:
        model = joblib.load(model_path)
        scaler = joblib.load(scaler_path)
        training_columns = joblib.load(columns_path)
    except FileNotFoundError as e:
        print("Error: Could not find model files. Please train the model first.")
        print(f"Missing file: {e.filename}")
        sys.exit(1)

    # Optional artifact: only written when the target needed encoding
    try:
        label_encoder = joblib.load(encoder_path)
    except FileNotFoundError:
        label_encoder = None

    return model, scaler, training_columns, label_encoder

def _prompt_int(prompt, minimum=None, maximum=None):
    """Ask for a whole number until one within the optional bounds is given."""
    while True:
        raw = input(prompt).strip()
        try:
            value = int(raw)
        except ValueError:
            print(f"  Invalid input {raw!r}: please enter a whole number.")
            continue
        if minimum is not None and value < minimum:
            print(f"  Value must be at least {minimum}.")
            continue
        if maximum is not None and value > maximum:
            print(f"  Value must be at most {maximum}.")
            continue
        return value


def get_user_input():
    """Collect one employee record interactively from standard input.

    Numeric fields are re-prompted until a valid whole number is entered
    (within 1-4 for the satisfaction-style scales, non-negative for ages,
    years, and counts) instead of aborting on the first typo. Text fields
    are stripped of surrounding whitespace and otherwise taken as typed.

    Returns:
        A one-row DataFrame whose columns are the field names used below.

    Raises:
        EOFError: If the input stream ends before all fields are answered.
    """

    print("EMPLOYEE PERFORMANCE PREDICTION - DATA INPUT")


    employee_data = {}

    # Customize these fields based on your actual dataset columns.
    # NOTE(review): the keys below presumably need to match the training
    # column names (and category spellings) to have any effect - confirm
    # against the dataset.
    print("\nPlease enter the following information:")
    print("-" * 60)

    # Personal Information
    employee_data['Age'] = _prompt_int("Age: ", minimum=0)
    employee_data['Gender'] = input("Gender (Male/Female): ").strip()

    # Education
    print("\nEducation Options: Below Secondary, Secondary, Bachelors, Masters, PhD")
    employee_data['Education'] = input("Education Level: ").strip()

    # Work Information
    employee_data['ExperienceYearsAtThisCompany'] = _prompt_int("Years at Company: ", minimum=0)
    employee_data['ExperienceYearsInCurrentRole'] = _prompt_int("Years in Current Role: ", minimum=0)

    print("\nDepartment Options: Sales, HR, IT, Finance, Marketing, Operations")
    employee_data['Department'] = input("Department: ").strip()

    print("\nJob Role Options: Manager, Developer, Analyst, Sales Rep, etc.")
    employee_data['JobRole'] = input("Job Role: ").strip()

    # Performance Metrics (all on the 1-4 scale stated in the prompts)
    employee_data['JobInvolvement'] = _prompt_int("Job Involvement (1-4): ", 1, 4)
    employee_data['EnvironmentSatisfaction'] = _prompt_int("Environment Satisfaction (1-4): ", 1, 4)
    employee_data['JobSatisfaction'] = _prompt_int("Job Satisfaction (1-4): ", 1, 4)
    employee_data['WorkLifeBalance'] = _prompt_int("Work-Life Balance (1-4): ", 1, 4)

    # Skills and Training
    employee_data['TrainingTimesLastYear'] = _prompt_int("Training Times Last Year: ", minimum=0)
    employee_data['YearsSinceLastPromotion'] = _prompt_int("Years Since Last Promotion: ", minimum=0)

    # Additional Metrics
    employee_data['Attrition'] = input("Attrition (Yes/No): ").strip()
    employee_data['OverTime'] = input("Over Time (Yes/No): ").strip()

    return pd.DataFrame([employee_data])

def preprocess_user_data(user_data, scaler, training_columns):
    """Encode, align, and scale user input so it matches the training format.

    Args:
        user_data: DataFrame of raw (un-encoded) employee records.
        scaler: Fitted scaler exposing ``transform``.
        training_columns: Encoded feature column names, in training order.

    Returns:
        The scaled feature array as returned by ``scaler.transform``.
    """
    # One-hot encode WITHOUT drop_first. With a single input row every
    # categorical column has exactly one level, so drop_first=True would
    # discard all dummy columns and each categorical answer would be lost.
    user_data_encoded = pd.get_dummies(user_data)

    # Align with the training layout: dummies unknown to training (including
    # a feature's dropped baseline level) are discarded, and training columns
    # absent from the input are filled with 0.
    user_data_encoded = user_data_encoded.reindex(columns=training_columns, fill_value=0)

    # Scale features
    user_data_scaled = scaler.transform(user_data_encoded)

    return user_data_scaled

def predict_and_display(user_data, model, scaler, training_columns, label_encoder):
    """Predict the performance rating for ``user_data`` and print the result.

    Args:
        user_data: DataFrame with the raw employee record.
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``.
        scaler: Fitted scaler passed on to the preprocessing step.
        training_columns: Encoded feature column names from training.
        label_encoder: Fitted label encoder, or ``None`` when the model was
            trained on the original labels.
    """
    # Preprocess
    X_processed = preprocess_user_data(user_data, scaler, training_columns)

    # Predict
    prediction = model.predict(X_processed)[0]
    probabilities = model.predict_proba(X_processed)[0]

    # predict_proba columns follow model.classes_, so the display labels are
    # derived from that attribute to stay aligned with the probabilities.
    class_names = model.classes_
    if label_encoder is not None:
        # Map encoded class ids back to the original labels
        prediction = label_encoder.inverse_transform([prediction])[0]
        class_names = label_encoder.inverse_transform(model.classes_)

    # Display results
    print("\n" + "="*60)
    print("PREDICTION RESULTS")
    print("="*60)

    print(f"\n Predicted Performance Rating: {prediction}")
    print(f" Confidence Level: {max(probabilities)*100:.2f}%")

    print("\n Detailed Probability Breakdown:")
    print("-" * 60)
    for class_name, prob in zip(class_names, probabilities):
        # Text bar: 50 characters correspond to 100 %
        bar_length = int(prob * 50)
        bar = "█" * bar_length
        print(f"Rating {class_name}: {bar} {prob*100:5.2f}%")

    print("="*60)

    # Performance interpretation
    interpret_performance(prediction)

def interpret_performance(rating):
    """Print a human-readable interpretation of a performance rating.

    Ratings convertible to an integer 1-5 get their specific message; other
    integers get a generic message; anything that cannot be converted to an
    integer is echoed back as-is.
    """
    messages_by_rating = {
        1: "!  Needs Improvement - Requires immediate attention and support",
        2: "^ Below Average - Additional training and guidance recommended",
        3: "= Meets Expectations - Satisfactory performance",
        4: "$ Exceeds Expectations - Strong performer, ready for growth",
        5: "* Outstanding - Top performer, leadership potential"
    }

    # Ratings may arrive as numbers or as strings
    try:
        numeric_rating = int(rating)
    except (ValueError, TypeError):
        message = f"Performance Rating: {rating}"
    else:
        if numeric_rating in messages_by_rating:
            message = messages_by_rating[numeric_rating]
        else:
            message = "Performance level assessed"

    print(f"\n Interpretation: {message}\n")

def main():
    """Run the interactive prediction loop.

    Loads the saved artifacts once, then repeatedly collects an employee
    record, prints its predicted rating, and asks whether to continue. Any
    answer other than "yes" or "y" ends the loop.
    """
    print("\n EMPLOYEE PERFORMANCE PREDICTION SYSTEM")
    print("="*60)

    # Load model artifacts
    print("\n Loading model artifacts...")
    model, scaler, training_columns, label_encoder = load_artifacts()
    print(" Model loaded successfully!")

    while True:
        # Get user input
        user_data = get_user_input()

        # Make prediction
        predict_and_display(user_data, model, scaler, training_columns, label_encoder)

        # Ask for another prediction
        another = input("\n Would you like to test another employee? (yes/no): ").strip().lower()
        if another not in ['yes', 'y']:
            # The waving-hand emoji was stored as mis-decoded bytes; restored here
            print("\n👋 Thank you for using the Employee Performance Prediction System!")
            break

if __name__ == "__main__":
    main()


🤖 EMPLOYEE PERFORMANCE PREDICTION SYSTEM

📦 Loading model artifacts...
✅ Model loaded successfully!

EMPLOYEE PERFORMANCE PREDICTION - DATA INPUT

Please enter the following information:
------------------------------------------------------------
Age: 20
Gender (Male/Female): Male

Education Options: Below Secondary, Secondary, Bachelors, Masters, PhD
Education Level: Bachelors
Years at Company: 1
Years in Current Role: 1

Department Options: Sales, HR, IT, Finance, Marketing, Operations
Department: IT

Job Role Options: Manager, Developer, Analyst, Sales Rep, etc.
Job Role: Analyst
Job Involvement (1-4): 1
Environment Satisfaction (1-4): 1
Job Satisfaction (1-4): 3
Work-Life Balance (1-4): 2
Training Times Last Year: 1
Years Since Last Promotion: 0
Attrition (Yes/No): yes
Over Time (Yes/No): no

PREDICTION RESULTS

🎯 Predicted Performance Rating: 3
📊 Confidence Level: 87.00%

📈 Detailed Probability Breakdown:
---------------------------------------------------------