Model Creation

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('enriched_training_sessions.csv')

# Step 1: Prepare the Data
features = [
    'mean_participant_score', 
    'status_pending', 
    'status_ongoing', 
    'status_completed',
    'designation_trainer_promotion_encoded',
    'designation_trainer_retention_encoded',
    'designation_participant_promotion_encoded',
    'designation_participant_retention_encoded',
    'training_category_promotion_encoded',
    'training_category_retention_encoded'
]

# Targets for predicting promotion and retention
target_promotion = 'promotion'
target_retention = 'retention'

X = df[features]
y_promotion = df[target_promotion]
y_retention = df[target_retention]

# Step 2: Split the Data
X_train, X_test, y_train_promotion, y_test_promotion = train_test_split(X, y_promotion, test_size=0.2, random_state=42)
X_train, X_test, y_train_retention, y_test_retention = train_test_split(X, y_retention, test_size=0.2, random_state=42)

# Step 3: Data Normalization/Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Hyperparameter Tuning with Grid Search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Random Forest for Promotion
grid_search_promotion = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search_promotion.fit(X_train_scaled, y_train_promotion)

best_model_promotion = grid_search_promotion.best_estimator_

# Random Forest for Retention
grid_search_retention = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search_retention.fit(X_train_scaled, y_train_retention)

best_model_retention = grid_search_retention.best_estimator_

# Step 5: Cross-Validation
cv_scores_promotion = cross_val_score(best_model_promotion, X_train_scaled, y_train_promotion, cv=5)
cv_scores_retention = cross_val_score(best_model_retention, X_train_scaled, y_train_retention, cv=5)

print(f"Promotion Model CV Average Score: {np.mean(cv_scores_promotion)}")
print(f"Retention Model CV Average Score: {np.mean(cv_scores_retention)}")

# Step 6: Evaluate the Model
# Predictions for promotion
y_pred_promotion = best_model_promotion.predict(X_test_scaled)
print("\nPromotion Model Evaluation:")
print(classification_report(y_test_promotion, y_pred_promotion))
print("Accuracy:", accuracy_score(y_test_promotion, y_pred_promotion))

# Predictions for retention
y_pred_retention = best_model_retention.predict(X_test_scaled)
print("\nRetention Model Evaluation:")
print(classification_report(y_test_retention, y_pred_retention))
print("Accuracy:", accuracy_score(y_test_retention, y_pred_retention))


Promotion Model CV Average Score: 1.0
Retention Model CV Average Score: 1.0

Promotion Model Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       303
           1       1.00      1.00      1.00        20

    accuracy                           1.00       323
   macro avg       1.00      1.00      1.00       323
weighted avg       1.00      1.00      1.00       323

Accuracy: 1.0

Retention Model Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       105
           1       1.00      1.00      1.00       218

    accuracy                           1.00       323
   macro avg       1.00      1.00      1.00       323
weighted avg       1.00      1.00      1.00       323

Accuracy: 1.0


Checking Model for Random Samples

In [20]:
# Step 9: Prepare a random sample for predictions
random_sample = df.sample(n=10, random_state=20)  # Change n for the desired sample size

# Prepare features for predictions
random_features = [
    'mean_participant_score', 
    'status_pending', 
    'status_ongoing', 
    'status_completed',
    'designation_trainer_promotion_encoded',
    'designation_trainer_retention_encoded',
    'designation_participant_promotion_encoded',
    'designation_participant_retention_encoded',
    'training_category_promotion_encoded',
    'training_category_retention_encoded'
]

# Step 10: Scale the random sample
random_sample_scaled = scaler.transform(random_sample[random_features])

# Step 11: Make predictions using the trained models
predictions_promotion = best_model_promotion.predict(random_sample_scaled)
predictions_retention = best_model_retention.predict(random_sample_scaled)

# Step 12: Add predictions to the random sample DataFrame
random_sample['predicted_promotion'] = predictions_promotion
random_sample['predicted_retention'] = predictions_retention

# Display the random sample with predictions
print(random_sample[['mean_participant_score', 'predicted_promotion', 'predicted_retention']])


      mean_participant_score  predicted_promotion  predicted_retention
1580                    3.50                    0                    0
1551                    6.25                    0                    1
1382                    3.75                    0                    0
910                     4.00                    0                    0
1472                    6.50                    0                    1
313                     7.25                    1                    1
1301                    6.25                    0                    1
602                     5.00                    0                    1
1243                    5.50                    0                    1
745                     4.50                    0                    1
