In [43]:
# Import necessary libraries
import pandas as pd

# Load the dataset
df = pd.read_csv("Attrition.csv")

# Display the first few rows and column names
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()

Dataset shape: (1470, 35)
Columns: ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [45]:
# Import additional libraries
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Create a RetentionScore (proxy for retention likelihood)
# We'll use JobSatisfaction, WorkLifeBalance, PerformanceRating, and JobInvolvement
# Normalize each to 0-1 and take a weighted average
df['RetentionScore'] = (
    0.3 * (df['JobSatisfaction'] / 4) +  # Max value is 4
    0.3 * (df['WorkLifeBalance'] / 4) +  # Max value is 4
    0.2 * (df['PerformanceRating'] / 5) +  # Max value is 5
    0.2 * (df['JobInvolvement'] / 4)  # Max value is 4
)

# Verify the new column
print("RetentionScore sample:")
print(df[['JobSatisfaction', 'WorkLifeBalance', 'PerformanceRating', 'JobInvolvement', 'RetentionScore']].head())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

RetentionScore sample:
   JobSatisfaction  WorkLifeBalance  PerformanceRating  JobInvolvement  \
0                4                1                  3               3   
1                2                3                  4               2   
2                3                3                  3               2   
3                3                3                  3               3   
4                2                3                  3               3   

   RetentionScore  
0           0.645  
1           0.635  
2           0.670  
3           0.720  
4           0.645  

Missing values:
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobIn

In [51]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Define features and target
X_performance = df[["Age", "Gender", "Department", "JobRole", "MonthlyIncome", "YearsAtCompany", 
                   "OverTime", "JobSatisfaction", "WorkLifeBalance", "TotalWorkingYears", 
                   "TrainingTimesLastYear", "JobInvolvement", "EnvironmentSatisfaction", 
                   "RelationshipSatisfaction"]]
y_performance_binary = df["PerformanceRating"].map({3: 0, 4: 1})

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ["Age", "MonthlyIncome", "YearsAtCompany", "TotalWorkingYears", 
                                   "TrainingTimesLastYear", "JobSatisfaction", "WorkLifeBalance", 
                                   "JobInvolvement", "EnvironmentSatisfaction", "RelationshipSatisfaction"]),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ["Gender", "Department", "JobRole", "OverTime"])
    ]
)

# Define the models to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)  # Removed use_label_encoder
}

# Evaluate each model with SMOTE using cross-validation
print("Evaluating models with SMOTE...")
results = {}
for name, model in models.items():
    # Create a pipeline with SMOTE
    pipeline = ImbPipeline([
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("model", model)
    ])
    
    # Cross-validation for accuracy
    accuracy_scores = cross_val_score(pipeline, X_performance, y_performance_binary, cv=5, scoring="accuracy")
    # Cross-validation for F1-score
    f1_scores = cross_val_score(pipeline, X_performance, y_performance_binary, cv=5, 
                                 scoring=make_scorer(f1_score, pos_label=1))
    
    results[name] = {
        "Accuracy (Mean)": np.mean(accuracy_scores),
        "Accuracy (Std)": np.std(accuracy_scores),
        "F1-Score (Mean)": np.mean(f1_scores),
        "F1-Score (Std)": np.std(f1_scores)
    }
    print(f"{name}:")
    print(f"  Accuracy: {results[name]['Accuracy (Mean)']:.4f} (+/- {results[name]['Accuracy (Std)']:.4f})")
    print(f"  F1-Score: {results[name]['F1-Score (Mean)']:.4f} (+/- {results[name]['F1-Score (Std)']:.4f})\n")

Evaluating models with SMOTE...
Logistic Regression:
  Accuracy: 0.5054 (+/- 0.0208)
  F1-Score: 0.2027 (+/- 0.0178)

Random Forest:
  Accuracy: 0.8354 (+/- 0.0041)
  F1-Score: 0.0000 (+/- 0.0000)

SVM:
  Accuracy: 0.6381 (+/- 0.0163)
  F1-Score: 0.1781 (+/- 0.0505)

XGBoost:
  Accuracy: 0.8190 (+/- 0.0142)
  F1-Score: 0.0618 (+/- 0.0321)



In [53]:
# Import necessary libraries for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, f1_score

# Define features and target
X_performance = df[["Age", "Gender", "Department", "JobRole", "MonthlyIncome", "YearsAtCompany", 
                   "OverTime", "JobSatisfaction", "WorkLifeBalance", "TotalWorkingYears", 
                   "TrainingTimesLastYear", "JobInvolvement", "EnvironmentSatisfaction", 
                   "RelationshipSatisfaction"]]
y_performance_binary = df["PerformanceRating"].map({3: 0, 4: 1})

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ["Age", "MonthlyIncome", "YearsAtCompany", "TotalWorkingYears", 
                                   "TrainingTimesLastYear", "JobSatisfaction", "WorkLifeBalance", 
                                   "JobInvolvement", "EnvironmentSatisfaction", "RelationshipSatisfaction"]),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ["Gender", "Department", "JobRole", "OverTime"])
    ]
)

# Hyperparameter tuning for Logistic Regression
print("Tuning Logistic Regression...")
logistic_pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", LogisticRegression(max_iter=1000, random_state=42))
])

logistic_param_grid = {
    "model__C": [0.1, 1, 10],
    "model__solver": ["lbfgs", "liblinear"],
    "model__class_weight": [None, "balanced"]
}

logistic_grid = GridSearchCV(
    logistic_pipeline,
    logistic_param_grid,
    cv=5,
    scoring=make_scorer(f1_score, pos_label=1),
    n_jobs=-1
)
logistic_grid.fit(X_performance, y_performance_binary)

print(f"Best parameters for Logistic Regression: {logistic_grid.best_params_}")
print(f"Best F1-Score for Logistic Regression: {logistic_grid.best_score_:.4f}\n")

# Hyperparameter tuning for SVM
print("Tuning SVM...")
svm_pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", SVC(random_state=42))
])

svm_param_grid = {
    "model__C": [0.1, 1, 10],
    "model__kernel": ["rbf", "linear"],
    "model__class_weight": [None, "balanced"]
}

svm_grid = GridSearchCV(
    svm_pipeline,
    svm_param_grid,
    cv=5,
    scoring=make_scorer(f1_score, pos_label=1),
    n_jobs=-1
)
svm_grid.fit(X_performance, y_performance_binary)

print(f"Best parameters for SVM: {svm_grid.best_params_}")
print(f"Best F1-Score for SVM: {svm_grid.best_score_:.4f}\n")

# Compare the best models
best_model = logistic_grid if logistic_grid.best_score_ > svm_grid.best_score_ else svm_grid
best_model_name = "Logistic Regression" if logistic_grid.best_score_ > svm_grid.best_score_ else "SVM"
print(f"Best overall model: {best_model_name}")
print(f"Best F1-Score: {best_model.best_score_:.4f}")
print(f"Best parameters: {best_model.best_params_}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'model__C': 0.1, 'model__class_weight': None, 'model__solver': 'liblinear'}
Best F1-Score for Logistic Regression: 0.2114

Tuning SVM...
Best parameters for SVM: {'model__C': 0.1, 'model__class_weight': None, 'model__kernel': 'linear'}
Best F1-Score for SVM: 0.2212

Best overall model: SVM
Best F1-Score: 0.2212
Best parameters: {'model__C': 0.1, 'model__class_weight': None, 'model__kernel': 'linear'}


In [55]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import pandas as pd
import numpy as np

# Define features and target
X_performance = df[["Age", "Gender", "Department", "JobRole", "MonthlyIncome", "YearsAtCompany", 
                   "OverTime", "JobSatisfaction", "WorkLifeBalance", "TotalWorkingYears", 
                   "TrainingTimesLastYear", "JobInvolvement", "EnvironmentSatisfaction", 
                   "RelationshipSatisfaction"]]
y_performance_binary = df["PerformanceRating"].map({3: 0, 4: 1})

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ["Age", "MonthlyIncome", "YearsAtCompany", "TotalWorkingYears", 
                                   "TrainingTimesLastYear", "JobSatisfaction", "WorkLifeBalance", 
                                   "JobInvolvement", "EnvironmentSatisfaction", "RelationshipSatisfaction"]),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ["Gender", "Department", "JobRole", "OverTime"])
    ]
)

# Create a pipeline with SMOTE and Random Forest for feature importance
feature_pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", RandomForestClassifier(random_state=42))
])

# Fit the pipeline
feature_pipeline.fit(X_performance, y_performance_binary)

# Get feature importances
feature_importances = feature_pipeline.named_steps["model"].feature_importances_

# Get feature names after preprocessing
num_features = ["Age", "MonthlyIncome", "YearsAtCompany", "TotalWorkingYears", 
                "TrainingTimesLastYear", "JobSatisfaction", "WorkLifeBalance", 
                "JobInvolvement", "EnvironmentSatisfaction", "RelationshipSatisfaction"]
cat_features = feature_pipeline.named_steps["preprocessor"].named_transformers_["cat"].get_feature_names_out()
all_features = np.concatenate([num_features, cat_features])

# Create a DataFrame of feature importances
importance_df = pd.DataFrame({
    "Feature": all_features,
    "Importance": feature_importances
}).sort_values(by="Importance", ascending=False)

# Display the top 10 features
print("Top 10 most important features:")
print(importance_df.head(10))

# Select the top 5 features
top_features = importance_df["Feature"].head(5).values
print("\nSelected top 5 features:")
print(top_features)


Top 10 most important features:
                     Feature  Importance
8    EnvironmentSatisfaction    0.085966
1              MonthlyIncome    0.084426
4      TrainingTimesLastYear    0.084314
2             YearsAtCompany    0.079245
10               Gender_Male    0.077578
3          TotalWorkingYears    0.074014
9   RelationshipSatisfaction    0.073076
0                        Age    0.070789
21              OverTime_Yes    0.068003
5            JobSatisfaction    0.065383

Selected top 5 features:
['EnvironmentSatisfaction' 'MonthlyIncome' 'TrainingTimesLastYear'
 'YearsAtCompany' 'Gender_Male']


In [57]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score

# Define the top 5 features
top_features = ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 
                'YearsAtCompany', 'Gender']

# Prepare the feature set
X_performance_top = df[top_features]
y_performance_binary = df["PerformanceRating"].map({3: 0, 4: 1})

# Preprocessing
preprocessor_top = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 'YearsAtCompany']),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ['Gender'])
    ]
)

# Create the pipeline with SMOTE and XGBoost
xgb_pipeline = ImbPipeline([
    ("preprocessor", preprocessor_top),
    ("smote", SMOTE(random_state=42)),
    ("model", XGBClassifier(eval_metric="logloss", random_state=42))
])

# Define the hyperparameter grid for XGBoost
xgb_param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [3, 6, 10],
    "model__learning_rate": [0.01, 0.1],
    "model__scale_pos_weight": [1, 5, 10]  # To handle class imbalance
}

# Perform GridSearchCV
print("Tuning XGBoost with top 5 features...")
xgb_grid = GridSearchCV(
    xgb_pipeline,
    xgb_param_grid,
    cv=5,
    scoring=make_scorer(f1_score, pos_label=1),
    n_jobs=-1
)
xgb_grid.fit(X_performance_top, y_performance_binary)

print(f"Best parameters for XGBoost: {xgb_grid.best_params_}")
print(f"Best F1-Score for XGBoost: {xgb_grid.best_score_:.4f}")


Tuning XGBoost with top 5 features...
Best parameters for XGBoost: {'model__learning_rate': 0.01, 'model__max_depth': 10, 'model__n_estimators': 100, 'model__scale_pos_weight': 10}
Best F1-Score for XGBoost: 0.2670


In [59]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

# Define features for XGBoost (top 5 features)
top_features = ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 
                'YearsAtCompany', 'Gender']
X_performance_top = df[top_features]
y_performance_binary = df["PerformanceRating"].map({3: 0, 4: 1})

# Preprocessing for XGBoost
preprocessor_top = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 'YearsAtCompany']),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ['Gender'])
    ]
)

# Define features for SVM (all features)
X_performance_all = df[["Age", "Gender", "Department", "JobRole", "MonthlyIncome", "YearsAtCompany", 
                       "OverTime", "JobSatisfaction", "WorkLifeBalance", "TotalWorkingYears", 
                       "TrainingTimesLastYear", "JobInvolvement", "EnvironmentSatisfaction", 
                       "RelationshipSatisfaction"]]

# Preprocessing for SVM
preprocessor_all = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ["Age", "MonthlyIncome", "YearsAtCompany", "TotalWorkingYears", 
                                   "TrainingTimesLastYear", "JobSatisfaction", "WorkLifeBalance", 
                                   "JobInvolvement", "EnvironmentSatisfaction", "RelationshipSatisfaction"]),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ["Gender", "Department", "JobRole", "OverTime"])
    ]
)

# Define the individual models with their best parameters
xgb_pipeline = ImbPipeline([
    ("preprocessor", preprocessor_top),
    ("smote", SMOTE(random_state=42)),
    ("model", XGBClassifier(learning_rate=0.01, max_depth=10, n_estimators=100, 
                           scale_pos_weight=10, eval_metric="logloss", random_state=42))
])

svm_pipeline = ImbPipeline([
    ("preprocessor", preprocessor_all),
    ("smote", SMOTE(random_state=42)),
    ("model", SVC(C=0.1, class_weight=None, kernel="linear", probability=True, random_state=42))
])

# Create the voting classifier
voting_clf = VotingClassifier(
    estimators=[
        ("xgb", xgb_pipeline),
        ("svm", svm_pipeline)
    ],
    voting="soft"  # Use soft voting to average probabilities
)

# Evaluate the voting classifier using cross-validation
print("Evaluating Voting Classifier (XGBoost + SVM)...")
accuracy_scores = cross_val_score(voting_clf, X_performance_all, y_performance_binary, cv=5, scoring="accuracy")
f1_scores = cross_val_score(voting_clf, X_performance_all, y_performance_binary, cv=5, 
                            scoring=make_scorer(f1_score, pos_label=1))

print(f"Voting Classifier:")
print(f"  Accuracy: {np.mean(accuracy_scores):.4f} (+/- {np.std(accuracy_scores):.4f})")
print(f"  F1-Score: {np.mean(f1_scores):.4f} (+/- {np.std(f1_scores):.4f})")

Evaluating Voting Classifier (XGBoost + SVM)...
Voting Classifier:
  Accuracy: 0.2605 (+/- 0.0267)
  F1-Score: 0.2611 (+/- 0.0041)


In [61]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Create a copy of the dataset
df_engineered = df.copy()

# Create new features
# 1. Income per Year at Company (handle division by zero)
df_engineered['IncomePerYear'] = df_engineered['MonthlyIncome'] / (df_engineered['YearsAtCompany'] + 1)  # Add 1 to avoid division by zero

# 2. Satisfaction Score (average of satisfaction-related features)
df_engineered['SatisfactionScore'] = (df_engineered['EnvironmentSatisfaction'] + 
                                     df_engineered['JobSatisfaction'] + 
                                     df_engineered['RelationshipSatisfaction']) / 3

# 3. Training per Year (handle division by zero)
df_engineered['TrainingPerYear'] = df_engineered['TrainingTimesLastYear'] / (df_engineered['YearsAtCompany'] + 1)

# Define the feature set (top 5 features + new features)
engineered_features = ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 
                      'YearsAtCompany', 'Gender', 'IncomePerYear', 'SatisfactionScore', 'TrainingPerYear']
X_performance_engineered = df_engineered[engineered_features]
y_performance_binary = df_engineered["PerformanceRating"].map({3: 0, 4: 1})

# Display the first few rows of the new features
print("Sample of engineered features:")
print(X_performance_engineered.head())


Sample of engineered features:
   EnvironmentSatisfaction  MonthlyIncome  TrainingTimesLastYear  \
0                        2           5993                      0   
1                        3           5130                      3   
2                        4           2090                      3   
3                        4           2909                      3   
4                        1           3468                      3   

   YearsAtCompany  Gender  IncomePerYear  SatisfactionScore  TrainingPerYear  
0               6  Female     856.142857           2.333333         0.000000  
1              10    Male     466.363636           3.000000         0.272727  
2               0    Male    2090.000000           3.000000         3.000000  
3               8  Female     323.222222           3.333333         0.333333  
4               2    Male    1156.000000           2.333333         1.000000  


In [63]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score

# Define the feature set
engineered_features = ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 
                      'YearsAtCompany', 'Gender', 'IncomePerYear', 'SatisfactionScore', 'TrainingPerYear']
X_performance_engineered = df_engineered[engineered_features]
y_performance_binary = df_engineered["PerformanceRating"].map({3: 0, 4: 1})

# Preprocessing
preprocessor_engineered = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 
                                   'YearsAtCompany', 'IncomePerYear', 'SatisfactionScore', 'TrainingPerYear']),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ['Gender'])
    ]
)

# Create the pipeline with SMOTE and XGBoost
xgb_pipeline = ImbPipeline([
    ("preprocessor", preprocessor_engineered),
    ("smote", SMOTE(random_state=42)),
    ("model", XGBClassifier(eval_metric="logloss", random_state=42))
])

# Define the hyperparameter grid for XGBoost
xgb_param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [3, 6, 10],
    "model__learning_rate": [0.01, 0.1],
    "model__scale_pos_weight": [1, 5, 10]
}

# Perform GridSearchCV
print("Tuning XGBoost with engineered features...")
xgb_grid = GridSearchCV(
    xgb_pipeline,
    xgb_param_grid,
    cv=5,
    scoring=make_scorer(f1_score, pos_label=1),
    n_jobs=-1
)
xgb_grid.fit(X_performance_engineered, y_performance_binary)

print(f"Best parameters for XGBoost: {xgb_grid.best_params_}")
print(f"Best F1-Score for XGBoost: {xgb_grid.best_score_:.4f}")

Tuning XGBoost with engineered features...
Best parameters for XGBoost: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__scale_pos_weight': 10}
Best F1-Score for XGBoost: 0.2636


In [65]:
# Import necessary libraries
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score

# Define the top 5 features
top_features = ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 
                'YearsAtCompany', 'Gender']
X_performance_top = df[top_features]
y_performance_binary = df["PerformanceRating"].map({3: 0, 4: 1})

# Preprocessing
preprocessor_top = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 'YearsAtCompany']),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ['Gender'])
    ]
)

# Create the pipeline with ADASYN and XGBoost
xgb_pipeline = ImbPipeline([
    ("preprocessor", preprocessor_top),
    ("adasyn", ADASYN(random_state=42)),
    ("model", XGBClassifier(eval_metric="logloss", random_state=42))
])

# Define the hyperparameter grid for XGBoost
xgb_param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [3, 6, 10],
    "model__learning_rate": [0.01, 0.1],
    "model__scale_pos_weight": [1, 5, 10]
}

# Perform GridSearchCV
print("Tuning XGBoost with ADASYN and top 5 features...")
xgb_grid = GridSearchCV(
    xgb_pipeline,
    xgb_param_grid,
    cv=5,
    scoring=make_scorer(f1_score, pos_label=1),
    n_jobs=-1
)
xgb_grid.fit(X_performance_top, y_performance_binary)

print(f"Best parameters for XGBoost: {xgb_grid.best_params_}")
print(f"Best F1-Score for XGBoost: {xgb_grid.best_score_:.4f}")


Tuning XGBoost with ADASYN and top 5 features...
Best parameters for XGBoost: {'model__learning_rate': 0.01, 'model__max_depth': 6, 'model__n_estimators': 100, 'model__scale_pos_weight': 5}
Best F1-Score for XGBoost: 0.2647


In [67]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import joblib

# Define the top 5 features
top_features = ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 
                'YearsAtCompany', 'Gender']
X_performance_top = df[top_features]
y_performance_binary = df["PerformanceRating"].map({3: 0, 4: 1})

# Preprocessing
preprocessor_top = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['EnvironmentSatisfaction', 'MonthlyIncome', 'TrainingTimesLastYear', 'YearsAtCompany']),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ['Gender'])
    ]
)

# Create the pipeline with the best XGBoost model
performance_model = ImbPipeline([
    ("preprocessor", preprocessor_top),
    ("smote", SMOTE(random_state=42)),
    ("model", XGBClassifier(learning_rate=0.01, max_depth=10, n_estimators=100, 
                           scale_pos_weight=10, eval_metric="logloss", random_state=42))
])

# Train the model on the full dataset
performance_model.fit(X_performance_top, y_performance_binary)
print("Best XGBoost model trained successfully.")

# Save the model
joblib.dump(performance_model, "performance_model.pkl")
print("Performance model saved as 'performance_model.pkl'.")

Best XGBoost model trained successfully.
Performance model saved as 'performance_model.pkl'.


In [69]:
# Check the distribution of the Attrition column
print("Attrition distribution:")
print(df["Attrition"].value_counts())

Attrition distribution:
Attrition
No     1233
Yes     237
Name: count, dtype: int64


In [71]:
print("RetentionScore distribution:")
print(df["RetentionScore"].value_counts())

RetentionScore distribution:
RetentionScore
0.720    207
0.645    172
0.795    154
0.570    130
0.670    100
0.595     68
0.745     62
0.520     52
0.620     48
0.495     37
0.760     36
0.835     32
0.685     31
0.770     31
0.870     28
0.845     28
0.695     27
0.610     23
0.710     20
0.445     19
0.695     18
0.545     12
0.635     12
0.560     12
0.660     10
0.470     10
0.820     10
0.785     10
0.420      9
0.545      9
0.535      6
0.370      6
0.810      6
0.735      5
0.510      4
0.920      3
0.395      3
0.735      3
0.885      3
0.485      2
0.910      2
0.860      2
0.585      2
0.585      2
0.470      1
0.960      1
0.460      1
0.435      1
Name: count, dtype: int64


In [73]:
# Threshold RetentionScore to create a binary target
y_retention_binary = (df["RetentionScore"] < 0.5).astype(int)  # 1: High risk (likely to leave), 0: Low risk (likely to stay)

# Check the distribution of the binary target
print("Binary RetentionScore distribution (0 = Low risk, 1 = High risk):")
print(y_retention_binary.value_counts())

Binary RetentionScore distribution (0 = Low risk, 1 = High risk):
RetentionScore
0    1381
1      89
Name: count, dtype: int64


In [75]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Define features and target for the retention model
X_retention = df[["Age", "Gender", "Department", "JobRole", "MonthlyIncome", "YearsAtCompany", 
                  "OverTime", "JobSatisfaction", "WorkLifeBalance", "TotalWorkingYears", 
                  "TrainingTimesLastYear", "JobInvolvement", "EnvironmentSatisfaction", 
                  "RelationshipSatisfaction"]]
y_retention_binary = (df["RetentionScore"] < 0.5).astype(int)  # 1: High risk, 0: Low risk

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ["Age", "MonthlyIncome", "YearsAtCompany", "TotalWorkingYears", 
                                   "TrainingTimesLastYear", "JobSatisfaction", "WorkLifeBalance", 
                                   "JobInvolvement", "EnvironmentSatisfaction", "RelationshipSatisfaction"]),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ["Gender", "Department", "JobRole", "OverTime"])
    ]
)

# Define the models to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# Evaluate each model with SMOTE using cross-validation
print("Evaluating models for retention prediction...")
results = {}
for name, model in models.items():
    pipeline = ImbPipeline([
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("model", model)
    ])
    
    # Cross-validation for accuracy
    accuracy_scores = cross_val_score(pipeline, X_retention, y_retention_binary, cv=5, scoring="accuracy")
    # Cross-validation for F1-score
    f1_scores = cross_val_score(pipeline, X_retention, y_retention_binary, cv=5, 
                                 scoring=make_scorer(f1_score, pos_label=1))
    
    results[name] = {
        "Accuracy (Mean)": np.mean(accuracy_scores),
        "Accuracy (Std)": np.std(accuracy_scores),
        "F1-Score (Mean)": np.mean(f1_scores),
        "F1-Score (Std)": np.std(f1_scores)
    }
    print(f"{name}:")
    print(f"  Accuracy: {results[name]['Accuracy (Mean)']:.4f} (+/- {results[name]['Accuracy (Std)']:.4f})")
    print(f"  F1-Score: {results[name]['F1-Score (Mean)']:.4f} (+/- {results[name]['F1-Score (Std)']:.4f})\n")

Evaluating models for retention prediction...
Logistic Regression:
  Accuracy: 0.9871 (+/- 0.0079)
  F1-Score: 0.9048 (+/- 0.0569)

Random Forest:
  Accuracy: 0.9741 (+/- 0.0035)
  F1-Score: 0.7386 (+/- 0.0545)

SVM:
  Accuracy: 0.9796 (+/- 0.0061)
  F1-Score: 0.8336 (+/- 0.0465)

XGBoost:
  Accuracy: 0.9830 (+/- 0.0043)
  F1-Score: 0.8614 (+/- 0.0291)



In [77]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import pandas as pd
import numpy as np

# Define features and target
X_retention = df[["Age", "Gender", "Department", "JobRole", "MonthlyIncome", "YearsAtCompany", 
                  "OverTime", "JobSatisfaction", "WorkLifeBalance", "TotalWorkingYears", 
                  "TrainingTimesLastYear", "JobInvolvement", "EnvironmentSatisfaction", 
                  "RelationshipSatisfaction"]]
y_retention_binary = (df["RetentionScore"] < 0.5).astype(int)

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ["Age", "MonthlyIncome", "YearsAtCompany", "TotalWorkingYears", 
                                   "TrainingTimesLastYear", "JobSatisfaction", "WorkLifeBalance", 
                                   "JobInvolvement", "EnvironmentSatisfaction", "RelationshipSatisfaction"]),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ["Gender", "Department", "JobRole", "OverTime"])
    ]
)

# Create a pipeline with SMOTE and Random Forest for feature importance
feature_pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", RandomForestClassifier(random_state=42))
])

# Fit the pipeline
feature_pipeline.fit(X_retention, y_retention_binary)

# Get feature importances
feature_importances = feature_pipeline.named_steps["model"].feature_importances_

# Get feature names after preprocessing
num_features = ["Age", "MonthlyIncome", "YearsAtCompany", "TotalWorkingYears", 
                "TrainingTimesLastYear", "JobSatisfaction", "WorkLifeBalance", 
                "JobInvolvement", "EnvironmentSatisfaction", "RelationshipSatisfaction"]
cat_features = feature_pipeline.named_steps["preprocessor"].named_transformers_["cat"].get_feature_names_out()
all_features = np.concatenate([num_features, cat_features])

# Create a DataFrame of feature importances
importance_df = pd.DataFrame({
    "Feature": all_features,
    "Importance": feature_importances
}).sort_values(by="Importance", ascending=False)

# Display the top 10 features
print("Top 10 most important features for retention prediction:")
print(importance_df.head(10))

# Select the top 5 features
top_features = importance_df["Feature"].head(5).values
print("\nSelected top 5 features:")
print(top_features)

Top 10 most important features for retention prediction:
                              Feature  Importance
5                     JobSatisfaction    0.372399
6                     WorkLifeBalance    0.289887
7                      JobInvolvement    0.056490
21                       OverTime_Yes    0.039721
10                        Gender_Male    0.027402
3                   TotalWorkingYears    0.023825
4               TrainingTimesLastYear    0.022626
11  Department_Research & Development    0.017829
0                                 Age    0.017477
1                       MonthlyIncome    0.017267

Selected top 5 features:
['JobSatisfaction' 'WorkLifeBalance' 'JobInvolvement' 'OverTime_Yes'
 'Gender_Male']


In [81]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score

# Define the top 5 features (corrected OverTime_Yes to OverTime)
top_features = ['JobSatisfaction', 'WorkLifeBalance', 'JobInvolvement', 'OverTime', 'Gender']
X_retention_top = df[top_features]
y_retention_binary = (df["RetentionScore"] < 0.5).astype(int)

# Preprocessing
preprocessor_top = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['JobSatisfaction', 'WorkLifeBalance', 'JobInvolvement']),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ['OverTime', 'Gender'])
    ]
)

# Hyperparameter tuning for Logistic Regression
print("Tuning Logistic Regression with top 5 features...")
logistic_pipeline = ImbPipeline([
    ("preprocessor", preprocessor_top),
    ("smote", SMOTE(random_state=42)),
    ("model", LogisticRegression(max_iter=1000, random_state=42))
])

logistic_param_grid = {
    "model__C": [0.1, 1, 10],
    "model__solver": ["lbfgs", "liblinear"],
    "model__class_weight": [None, "balanced"]
}

logistic_grid = GridSearchCV(
    logistic_pipeline,
    logistic_param_grid,
    cv=5,
    scoring=make_scorer(f1_score, pos_label=1),
    n_jobs=-1
)
logistic_grid.fit(X_retention_top, y_retention_binary)

print(f"Best parameters for Logistic Regression: {logistic_grid.best_params_}")
print(f"Best F1-Score for Logistic Regression: {logistic_grid.best_score_:.4f}\n")

# Hyperparameter tuning for XGBoost
print("Tuning XGBoost with top 5 features...")
xgb_pipeline = ImbPipeline([
    ("preprocessor", preprocessor_top),
    ("smote", SMOTE(random_state=42)),
    ("model", XGBClassifier(eval_metric="logloss", random_state=42))
])

xgb_param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [3, 6, 10],
    "model__learning_rate": [0.01, 0.1],
    "model__scale_pos_weight": [1, 5, 10]
}

xgb_grid = GridSearchCV(
    xgb_pipeline,
    xgb_param_grid,
    cv=5,
    scoring=make_scorer(f1_score, pos_label=1),
    n_jobs=-1
)
xgb_grid.fit(X_retention_top, y_retention_binary)

print(f"Best parameters for XGBoost: {xgb_grid.best_params_}")
print(f"Best F1-Score for XGBoost: {xgb_grid.best_score_:.4f}\n")

# Compare the best models
best_model = logistic_grid if logistic_grid.best_score_ > xgb_grid.best_score_ else xgb_grid
best_model_name = "Logistic Regression" if logistic_grid.best_score_ > xgb_grid.best_score_ else "XGBoost"
print(f"Best overall model: {best_model_name}")
print(f"Best F1-Score: {best_model.best_score_:.4f}")
print(f"Best parameters: {best_model.best_params_}")

Tuning Logistic Regression with top 5 features...
Best parameters for Logistic Regression: {'model__C': 10, 'model__class_weight': None, 'model__solver': 'lbfgs'}
Best F1-Score for Logistic Regression: 0.9476

Tuning XGBoost with top 5 features...
Best parameters for XGBoost: {'model__learning_rate': 0.1, 'model__max_depth': 6, 'model__n_estimators': 200, 'model__scale_pos_weight': 5}
Best F1-Score for XGBoost: 0.9205

Best overall model: Logistic Regression
Best F1-Score: 0.9476
Best parameters: {'model__C': 10, 'model__class_weight': None, 'model__solver': 'lbfgs'}


In [83]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
import joblib

# Define the top 5 features
top_features = ['JobSatisfaction', 'WorkLifeBalance', 'JobInvolvement', 'OverTime', 'Gender']
X_retention_top = df[top_features]
y_retention_binary = (df["RetentionScore"] < 0.5).astype(int)

# Preprocessing
preprocessor_top = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['JobSatisfaction', 'WorkLifeBalance', 'JobInvolvement']),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ['OverTime', 'Gender'])
    ]
)

# Create the pipeline with the best Logistic Regression model
retention_model = ImbPipeline([
    ("preprocessor", preprocessor_top),
    ("smote", SMOTE(random_state=42)),
    ("model", LogisticRegression(C=10, class_weight=None, solver="lbfgs", max_iter=1000, random_state=42))
])

# Train the model on the full dataset
retention_model.fit(X_retention_top, y_retention_binary)
print("Best Logistic Regression model trained successfully.")

# Save the model
joblib.dump(retention_model, "retention_model.pkl")
print("Retention model saved as 'retention_model.pkl'.")

Best Logistic Regression model trained successfully.
Retention model saved as 'retention_model.pkl'.


In [85]:
import requests

# Sample employee data
employee_data = {
    "Age": 35.0,
    "Gender": "Male",
    "Department": "Sales",
    "JobRole": "Sales Executive",
    "MonthlyIncome": 5000.0,
    "YearsAtCompany": 5.0,
    "OverTime": "Yes",
    "JobSatisfaction": 3.0,
    "WorkLifeBalance": 2.0,
    "TotalWorkingYears": 10.0,
    "TrainingTimesLastYear": 2.0,
    "JobInvolvement": 3.0,
    "EnvironmentSatisfaction": 4.0,
    "RelationshipSatisfaction": 3.0
}

# Test performance prediction endpoint
response_performance = requests.post("http://localhost:8001/predict_performance", json=employee_data)
print("Performance Prediction:", response_performance.json())

# Test retention prediction endpoint
response_retention = requests.post("http://localhost:8001/predict_retention", json=employee_data)
print("Retention Prediction:", response_retention.json())

Performance Prediction: {'PerformanceRating': 1.0}
Retention Prediction: {'RetentionRisk': 0.0, 'RetentionRiskProbability': 1.6062871182033433e-10}
