In [None]:
# Final Python Script for Employee Performance Prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# End-to-end training cell: load data, split, scale, select features, tune a
# SMOTE + LightGBM pipeline with cross-validation, then evaluate on the
# held-out test split at a custom probability threshold.

# 2. Load Data
data = pd.read_csv("your_data.csv")  # Replace with your actual path

# 3. Feature & Target Split
X = data.drop("is_high_performer", axis=1)
y = data["is_high_performer"]

# 4. Train-Test Split (stratified on y, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 5. Scale Features (scaler is fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Feature Selection (RFE, fitted on the training split only)
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=20)
rfe_selector.fit(X_train_scaled, y_train)
X_train_selected = rfe_selector.transform(X_train_scaled)
X_test_selected = rfe_selector.transform(X_test_scaled)

# 7. SMOTE + LightGBM as a single pipeline
# SMOTE must run INSIDE cross-validation. Resampling the whole training set
# first puts synthetic rows into the validation folds and lets points
# interpolated from validation rows into the training folds, so the CV score
# used to choose hyperparameters is optimistic. The imbalanced-learn pipeline
# re-fits SMOTE on the training part of each fold only.
smote = SMOTE(random_state=42)
lgbm = lgb.LGBMClassifier(random_state=42)
model_pipeline = ImbPipeline(steps=[("smote", smote), ("clf", lgbm)])

# 8. LightGBM Model Training with Grid Search
# Keys carry the "clf__" prefix so they are routed to the LightGBM step.
params = {
    'clf__learning_rate': [0.05],
    'clf__max_depth': [4, 6, 8],
    'clf__num_leaves': [31, 63],
    'clf__n_estimators': [100, 200]
}
# Explicit stratified, shuffled, seeded folds: the split does not depend on
# the row order of the CSV and is reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(model_pipeline, param_grid=params, cv=cv, scoring='f1', verbose=1)
grid.fit(X_train_selected, y_train)
# After the refit, the "clf" step is the LightGBM model trained on the
# SMOTE-balanced full training split; expose it under the original name.
best_model = grid.best_estimator_.named_steps["clf"]

# 9. Predict Probabilities & Tune Threshold
# The test split is never resampled; only column 1 (positive class) is used.
y_pred_probs = best_model.predict_proba(X_test_selected)[:, 1]
threshold = 0.65
y_pred_final = (y_pred_probs >= threshold).astype(int)

# 10. Evaluation
# The header is built from `threshold` so it cannot drift from the value used.
print(f"Classification Report at threshold = {threshold}")
print(classification_report(y_test, y_pred_final))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_final))
# ROC AUC is threshold-independent, so it is computed from the probabilities.
print("ROC AUC:", roc_auc_score(y_test, y_pred_probs))

In [None]:
# 11. Precision-Recall vs Threshold Plot
# precision_recall_curve returns one more precision/recall value than
# thresholds, so the final point of each curve is dropped to align the arrays.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_probs)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(thresholds, precision[:-1], label="Precision", linestyle='--', color='blue')
ax.plot(thresholds, recall[:-1], label="Recall", color='green')
# Reference marker at the conventional 0.5 cut-off.
ax.axvline(x=0.5, linestyle='--', color='red', label='Default threshold (0.5)')
ax.set_title("Precision-Recall vs Threshold")
ax.set_xlabel("Threshold")
ax.set_ylabel("Score")
ax.legend()
ax.grid(True)
plt.show()