# Final Code: Machine Learning for Corporate Training Effectiveness

In [None]:
import pandas as pd

# Read the four source CSV exports, in the same order as they are unpacked.
# recruitment_df is loaded here but is not referenced by the cells visible in
# this notebook.
employee_df, training_df, engagement_df, recruitment_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'employee_data.csv',
        'training_and_development_data.csv',
        'employee_engagement_survey_data.csv',
        'recruitment_data.csv',
    )
)

# Left-join employee attributes, then survey responses, onto the training
# records so that every training row is kept even when it has no match.
df = (
    training_df
    .merge(employee_df, how='left', left_on='Employee ID', right_on='EmpID')
    .merge(engagement_df, how='left', on='Employee ID')
)

print("Final merged dataset shape:", df.shape)
df.head()

In [None]:
# Preprocessing and feature engineering
import matplotlib.pyplot as plt
import seaborn as sns

# Parse both date columns as day-first; values that cannot be parsed become NaT.
for date_col in ('Training Date', 'Survey Date'):
    df[date_col] = pd.to_datetime(df[date_col], dayfirst=True, errors='coerce')

# Ordinal encoding of the outcome label; labels absent from the map become NaN.
outcome_map = {
    'Failed': 0,
    'Incomplete': 1,
    'Completed': 2,
    'Passed': 3,
}
df['Training Outcome Score'] = df['Training Outcome'].map(outcome_map)

# Fill gaps in each survey score with that column's own median.
survey_cols = ['Engagement Score', 'Satisfaction Score', 'Work-Life Balance Score']
survey_medians = {name: df[name].median() for name in survey_cols}
df = df.fillna(value=survey_medians)

# Pairwise correlations over the int64/float64 columns, drawn as an annotated
# heatmap that is both saved to disk and shown inline.
numeric_df = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = numeric_df.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title("Correlation Matrix of Numeric Features")
plt.tight_layout()
plt.savefig("correlation_heatmap.png")
plt.show()

In [None]:
# Modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Keep only rows that have an outcome score; rows whose label did not map to a
# score (NaN) cannot be used as training targets.
df = df[df['Training Outcome Score'].notnull()]

# Binary target: 1 when the ordinal outcome score is >= 2, otherwise 0.
# Vectorised comparison instead of a per-row lambda; NaNs were dropped above.
y = (df['Training Outcome Score'] >= 2).astype(int)

feature_cols = [
    'Training Duration(Days)', 'Training Cost',
    'Engagement Score', 'Satisfaction Score', 'Work-Life Balance Score',
    'Current Employee Rating',
]
# Any remaining missing feature values are replaced with 0.
X = df[feature_cols].fillna(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The features are not scaled, so the solver may need more than the default
# 100 iterations to converge; give it more headroom.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Seed the forest so the reported metrics and the SHAP values derived from
# this model are reproducible, matching the seeded train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases;
# confirm against the installed version before removing it.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

def evaluate_model(name, y_test, y_pred):
    """Summarise binary-classification quality for one model.

    Args:
        name: Label stored under the ``'Model'`` key of the result.
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        A dict with keys ``'Model'``, ``'Accuracy'``, ``'Precision'``,
        ``'Recall'`` and ``'F1 Score'``; each metric is rounded to three
        decimal places.
    """
    # zero_division=0 makes the undefined case explicit (e.g. a model that
    # never predicts the positive class): the metric is reported as 0 without
    # emitting an UndefinedMetricWarning.
    return {
        'Model': name,
        'Accuracy': round(accuracy_score(y_test, y_pred), 3),
        'Precision': round(precision_score(y_test, y_pred, zero_division=0), 3),
        'Recall': round(recall_score(y_test, y_pred, zero_division=0), 3),
        'F1 Score': round(f1_score(y_test, y_pred, zero_division=0), 3),
    }

# Score every fitted classifier on the held-out split; one summary row per
# model, in the order listed here.
named_models = (
    ("Logistic Regression", lr),
    ("Random Forest", rf),
    ("XGBoost", xgb),
)
results = [
    evaluate_model(label, y_test, model.predict(X_test))
    for label, model in named_models
]

pd.DataFrame(results)

In [None]:
# SHAP analysis for interpretability
import shap
import numpy as np

explainer = shap.TreeExplainer(rf)

# Explain at most 100 test rows. The cap is clamped to the size of the test
# split because sampling without replacement raises ValueError when n exceeds
# the number of available rows.
X_sample = X_test.sample(n=min(100, len(X_test)), random_state=1)
shap_vals = explainer.shap_values(X_sample)

shap.summary_plot(shap_vals, X_sample, plot_type="bar")

In [None]:
# Mean SHAP values and bar plot
# Pick out the SHAP values for the positive class (index 1).
# Depending on the SHAP release, shap_values() for a classifier returns either
# a list with one (n_samples, n_features) array per class, or a single array
# with a trailing class axis (n_samples, n_features, n_classes). Plain
# shap_vals[1] is only correct for the list form; on the array form it would
# select the second sample instead of the second class.
if isinstance(shap_vals, list):
    shap_vals_class1 = shap_vals[1]
else:
    shap_array = np.asarray(shap_vals)
    # A 2-D result already holds a single output, so it is used as-is.
    shap_vals_class1 = shap_array[:, :, 1] if shap_array.ndim == 3 else shap_array

# Mean absolute contribution of each feature across the sampled rows.
shap_mean = np.abs(shap_vals_class1).mean(axis=0)

shap_df = pd.DataFrame({
    'Feature': X_sample.columns,
    'Mean |SHAP Value|': shap_mean
}).sort_values(by='Mean |SHAP Value|', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=shap_df, x='Mean |SHAP Value|', y='Feature')
plt.title("SHAP Feature Importance (Random Forest)")
plt.tight_layout()
plt.savefig("shap_feature_importance.png")
plt.show()