In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

df = pd.read_csv('Employee-Attrition.csv')
display(df.head(5))
print(df.info())
print(df.describe())

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [2]:
display(df.columns)
display(df.shape)
display(df.info())

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

(1470, 35)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

None

In [3]:
display(df.isnull().sum())
display(df.nunique)


Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

<bound method DataFrame.nunique of       Age Attrition     BusinessTravel  DailyRate              Department  \
0      41       Yes      Travel_Rarely       1102                   Sales   
1      49        No  Travel_Frequently        279  Research & Development   
2      37       Yes      Travel_Rarely       1373  Research & Development   
3      33        No  Travel_Frequently       1392  Research & Development   
4      27        No      Travel_Rarely        591  Research & Development   
...   ...       ...                ...        ...                     ...   
1465   36        No  Travel_Frequently        884  Research & Development   
1466   39        No      Travel_Rarely        613  Research & Development   
1467   27        No      Travel_Rarely        155  Research & Development   
1468   49        No  Travel_Frequently       1023                   Sales   
1469   34        No      Travel_Rarely        628  Research & Development   

      DistanceFromHome  Education Educat

In [4]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in num_cols:
    print(f"{col}: Skewness = {df[col].skew():.2f}")


Age: Skewness = 0.41
DailyRate: Skewness = -0.00
DistanceFromHome: Skewness = 0.96
Education: Skewness = -0.29
EmployeeCount: Skewness = 0.00
EmployeeNumber: Skewness = 0.02
EnvironmentSatisfaction: Skewness = -0.32
HourlyRate: Skewness = -0.03
JobInvolvement: Skewness = -0.50
JobLevel: Skewness = 1.03
JobSatisfaction: Skewness = -0.33
MonthlyIncome: Skewness = 1.37
MonthlyRate: Skewness = 0.02
NumCompaniesWorked: Skewness = 1.03
PercentSalaryHike: Skewness = 0.82
PerformanceRating: Skewness = 1.92
RelationshipSatisfaction: Skewness = -0.30
StandardHours: Skewness = 0.00
StockOptionLevel: Skewness = 0.97
TotalWorkingYears: Skewness = 1.12
TrainingTimesLastYear: Skewness = 0.55
WorkLifeBalance: Skewness = -0.55
YearsAtCompany: Skewness = 1.76
YearsInCurrentRole: Skewness = 0.92
YearsSinceLastPromotion: Skewness = 1.98
YearsWithCurrManager: Skewness = 0.83


In [5]:
import pandas as pd
from sklearn.preprocessing import PowerTransformer

num_cols = df.select_dtypes(include=['int64', 'float64']).columns

skewed_cols = [col for col in num_cols if abs(df[col].skew()) > 0.5]

pt = PowerTransformer(method='yeo-johnson')
df[skewed_cols] = pt.fit_transform(df[skewed_cols])

for col in skewed_cols:
    print(f"{col}: Skewness (after Yeo-Johnson) = {df[col].skew():.2f}")

DistanceFromHome: Skewness (after Yeo-Johnson) = -0.01
JobLevel: Skewness (after Yeo-Johnson) = 0.10
MonthlyIncome: Skewness (after Yeo-Johnson) = 0.03
NumCompaniesWorked: Skewness (after Yeo-Johnson) = 0.01
PercentSalaryHike: Skewness (after Yeo-Johnson) = 0.12
PerformanceRating: Skewness (after Yeo-Johnson) = 0.00
StockOptionLevel: Skewness (after Yeo-Johnson) = 0.09
TotalWorkingYears: Skewness (after Yeo-Johnson) = -0.01
TrainingTimesLastYear: Skewness (after Yeo-Johnson) = 0.06
WorkLifeBalance: Skewness (after Yeo-Johnson) = -0.01
YearsAtCompany: Skewness (after Yeo-Johnson) = -0.01
YearsInCurrentRole: Skewness (after Yeo-Johnson) = -0.06
YearsSinceLastPromotion: Skewness (after Yeo-Johnson) = 0.21
YearsWithCurrManager: Skewness (after Yeo-Johnson) = -0.07


In [6]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()
cat_cols = df_encoded.select_dtypes(include=['object']).columns

encoders = {} 
for col in cat_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    encoders[col] = le  

print("Categorical columns encoded to numeric.")
display(df_encoded.head(5))


Categorical columns encoded to numeric.


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,1102,2,-1.494085,2,1,1,1,...,1,80,-1.087933,-0.244305,-2.579073,-2.144446,0.139692,0.205562,-1.094862,0.490147
1,49,0,1,279,1,0.243416,1,1,1,2,...,4,80,0.548969,0.052495,0.217384,0.276821,0.76266,0.883888,0.096854,0.909635
2,37,1,2,1373,1,-1.031215,2,4,1,4,...,2,80,-1.087933,-0.4105,0.217384,0.276821,-2.229592,-1.596434,-1.094862,-1.550156
3,33,0,1,1392,1,-0.700347,4,1,1,5,...,3,80,-1.087933,-0.244305,0.217384,0.276821,0.482432,0.883888,0.905592,-1.550156
4,27,0,2,591,1,-1.031215,1,3,1,7,...,4,80,0.548969,-0.592063,0.217384,0.276821,-0.95154,-0.446645,0.607464,-0.406881


In [7]:
for col in num_cols:
    df[col] = df[col].astype(float)  
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    before_outliers = ((df[col] < lower) | (df[col] > upper)).sum()

    df.loc[df[col] < lower, col] = lower
    df.loc[df[col] > upper, col] = upper
    print(f"{col}:= {before_outliers}")


Age:= 0
DailyRate:= 0
DistanceFromHome:= 0
Education:= 0
EmployeeCount:= 0
EmployeeNumber:= 0
EnvironmentSatisfaction:= 0
HourlyRate:= 0
JobInvolvement:= 0
JobLevel:= 0
JobSatisfaction:= 0
MonthlyIncome:= 0
MonthlyRate:= 0
NumCompaniesWorked:= 0
PercentSalaryHike:= 0
PerformanceRating:= 226
RelationshipSatisfaction:= 0
StandardHours:= 0
StockOptionLevel:= 0
TotalWorkingYears:= 11
TrainingTimesLastYear:= 238
WorkLifeBalance:= 0
YearsAtCompany:= 5
YearsInCurrentRole:= 0
YearsSinceLastPromotion:= 0
YearsWithCurrManager:= 0


In [8]:
import pandas as pd
import numpy as np

num_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    df.loc[df[col] < lower, col] = lower
    df.loc[df[col] > upper, col] = upper

    after_outliers = ((df[col] < lower) | (df[col] > upper)).sum()
    print(f"{col}: Outliers after rectification = {after_outliers}")

Age: Outliers after rectification = 0
DailyRate: Outliers after rectification = 0
DistanceFromHome: Outliers after rectification = 0
Education: Outliers after rectification = 0
EmployeeCount: Outliers after rectification = 0
EmployeeNumber: Outliers after rectification = 0
EnvironmentSatisfaction: Outliers after rectification = 0
HourlyRate: Outliers after rectification = 0
JobInvolvement: Outliers after rectification = 0
JobLevel: Outliers after rectification = 0
JobSatisfaction: Outliers after rectification = 0
MonthlyIncome: Outliers after rectification = 0
MonthlyRate: Outliers after rectification = 0
NumCompaniesWorked: Outliers after rectification = 0
PercentSalaryHike: Outliers after rectification = 0
PerformanceRating: Outliers after rectification = 0
RelationshipSatisfaction: Outliers after rectification = 0
StandardHours: Outliers after rectification = 0
StockOptionLevel: Outliers after rectification = 0
TotalWorkingYears: Outliers after rectification = 0
TrainingTimesLastYea

In [9]:
class_counts = df['Attrition'].value_counts()

print("Class counts for 'Attrition':")
print(class_counts)


Class counts for 'Attrition':
Attrition
No     1233
Yes     237
Name: count, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X = df_encoded.drop('Attrition', axis=1)
y = df_encoded['Attrition']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Training set class distribution after SMOTE:")
print(pd.Series(y_train_balanced).value_counts())

Training set class distribution after SMOTE:
Attrition
0    986
1    986
Name: count, dtype: int64


In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42, probability=True),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "XGBClassifier": XGBClassifier()
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train_balanced)
    y_pred = model.predict(X_test_scaled)
    print(f"\n{name}:")
    print(f"  Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
    print(f"  Precision: {precision_score(y_test, y_pred):.3f}")
    print(f"  Recall:    {recall_score(y_test, y_pred):.3f}")
    print(f"  F1-score:  {f1_score(y_test, y_pred):.3f}")


Logistic Regression:
  Accuracy:  0.741
  Precision: 0.307
  Recall:    0.489
  F1-score:  0.377

Decision Tree:
  Accuracy:  0.731
  Precision: 0.258
  Recall:    0.362
  F1-score:  0.301

Random Forest:
  Accuracy:  0.827
  Precision: 0.417
  Recall:    0.213
  F1-score:  0.282

SVM:
  Accuracy:  0.816
  Precision: 0.426
  Recall:    0.426
  F1-score:  0.426

KNN:
  Accuracy:  0.554
  Precision: 0.191
  Recall:    0.553
  F1-score:  0.284

Naive Bayes:
  Accuracy:  0.656
  Precision: 0.250
  Recall:    0.574
  F1-score:  0.348

XGBClassifier:
  Accuracy:  0.854
  Precision: 0.583
  Recall:    0.298
  F1-score:  0.394


In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid.fit(X_train_scaled, y_train_balanced)

print("Best parameters:", grid.best_params_)
from sklearn.ensemble import RandomForestClassifier

model1 =  RandomForestClassifier(max_depth = 20 , n_estimators = 200)

model1.fit(X_train_scaled, y_train_balanced)
y_pred = model1.predict(X_test_scaled)
print(f"  Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
print(f"  Precision: {precision_score(y_test, y_pred):.3f}")
print(f"  Recall:    {recall_score(y_test, y_pred):.3f}")
print(f"  F1-score:  {f1_score(y_test, y_pred):.3f}")



from sklearn.metrics import confusion_matrix, classification_report

print("Test set class distribution:")
print(pd.Series(y_test).value_counts())

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["No", "Yes"]))

Best parameters: {'max_depth': 20, 'n_estimators': 100}
  Accuracy:  0.820
  Precision: 0.385
  Recall:    0.213
  F1-score:  0.274
Test set class distribution:
Attrition
0    247
1     47
Name: count, dtype: int64

Confusion Matrix:
[[231  16]
 [ 37  10]]

Classification Report:
              precision    recall  f1-score   support

          No       0.86      0.94      0.90       247
         Yes       0.38      0.21      0.27        47

    accuracy                           0.82       294
   macro avg       0.62      0.57      0.59       294
weighted avg       0.79      0.82      0.80       294



In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

grid_nb = GridSearchCV(GaussianNB(), param_grid, cv=5)
grid_nb.fit(X_train_scaled, y_train_balanced)

best_nb = grid_nb.best_estimator_

y_pred_nb = best_nb.predict(X_test_scaled)

print("Naive Bayes (after tuning):")
print(f"  Accuracy:  {accuracy_score(y_test, y_pred_nb):.3f}")
print(f"  Precision: {precision_score(y_test, y_pred_nb):.3f}")
print(f"  Recall:    {recall_score(y_test, y_pred_nb):.3f}")
print(f"  F1-score:  {f1_score(y_test, y_pred_nb):.3f}")
print("Best Hyperparameters:", grid_nb.best_params_)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb, target_names=["No", "Yes"]))

Naive Bayes (after tuning):
  Accuracy:  0.656
  Precision: 0.250
  Recall:    0.574
  F1-score:  0.348
Best Hyperparameters: {'var_smoothing': 1e-09}

Confusion Matrix:
[[166  81]
 [ 20  27]]

Classification Report:
              precision    recall  f1-score   support

          No       0.89      0.67      0.77       247
         Yes       0.25      0.57      0.35        47

    accuracy                           0.66       294
   macro avg       0.57      0.62      0.56       294
weighted avg       0.79      0.66      0.70       294



In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(SVC(probability=True, random_state=42), param_grid, cv=5)
grid_svm.fit(X_train_scaled, y_train_balanced)

best_svm = grid_svm.best_estimator_

y_pred_svm = best_svm.predict(X_test_scaled)

print("SVM (after tuning):")
print(f"  Accuracy:  {accuracy_score(y_test, y_pred_svm):.3f}")
print(f"  Precision: {precision_score(y_test, y_pred_svm):.3f}")
print(f"  Recall:    {recall_score(y_test, y_pred_svm):.3f}")
print(f"  F1-score:  {f1_score(y_test, y_pred_svm):.3f}")
print("Best Hyperparameters:", grid_svm.best_params_)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm, target_names=["No", "Yes"]))

SVM (after tuning):
  Accuracy:  0.827
  Precision: 0.450
  Recall:    0.383
  F1-score:  0.414
Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

Confusion Matrix:
[[225  22]
 [ 29  18]]

Classification Report:
              precision    recall  f1-score   support

          No       0.89      0.91      0.90       247
         Yes       0.45      0.38      0.41        47

    accuracy                           0.83       294
   macro avg       0.67      0.65      0.66       294
weighted avg       0.82      0.83      0.82       294



In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_knn.fit(X_train_scaled, y_train_balanced)

best_knn = grid_knn.best_estimator_

y_pred_knn = best_knn.predict(X_test_scaled)

print("KNN (after tuning):")
print(f"  Accuracy:  {accuracy_score(y_test, y_pred_knn):.3f}")
print(f"  Precision: {precision_score(y_test, y_pred_knn):.3f}")
print(f"  Recall:    {recall_score(y_test, y_pred_knn):.3f}")
print(f"  F1-score:  {f1_score(y_test, y_pred_knn):.3f}")
print("Best Hyperparameters:", grid_knn.best_params_)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_knn, target_names=["No", "Yes"]))

KNN (after tuning):
  Accuracy:  0.680
  Precision: 0.224
  Recall:    0.404
  F1-score:  0.288
Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}

Confusion Matrix:
[[181  66]
 [ 28  19]]

Classification Report:
              precision    recall  f1-score   support

          No       0.87      0.73      0.79       247
         Yes       0.22      0.40      0.29        47

    accuracy                           0.68       294
   macro avg       0.54      0.57      0.54       294
weighted avg       0.76      0.68      0.71       294



In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

grid_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
grid_dt.fit(X_train_scaled, y_train_balanced)

best_dt = grid_dt.best_estimator_

y_pred_dt = best_dt.predict(X_test_scaled)

print("Decision Tree (after tuning):")
print(f"  Accuracy:  {accuracy_score(y_test, y_pred_dt):.3f}")
print(f"  Precision: {precision_score(y_test, y_pred_dt):.3f}")
print(f"  Recall:    {recall_score(y_test, y_pred_dt):.3f}")
print(f"  F1-score:  {f1_score(y_test, y_pred_dt):.3f}")
print("Best Hyperparameters:", grid_dt.best_params_)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt, target_names=["No", "Yes"]))

Decision Tree (after tuning):
  Accuracy:  0.728
  Precision: 0.230
  Recall:    0.298
  F1-score:  0.259
Best Hyperparameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 5}

Confusion Matrix:
[[200  47]
 [ 33  14]]

Classification Report:
              precision    recall  f1-score   support

          No       0.86      0.81      0.83       247
         Yes       0.23      0.30      0.26        47

    accuracy                           0.73       294
   macro avg       0.54      0.55      0.55       294
weighted avg       0.76      0.73      0.74       294



In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],           
    'penalty': ['l2'],                      
    'solver': ['lbfgs', 'liblinear']        
}

grid_lr = GridSearchCV(LogisticRegression(random_state=42, max_iter=1000), param_grid, cv=5)
grid_lr.fit(X_train_scaled, y_train_balanced)

best_lr = grid_lr.best_estimator_

y_pred_lr = best_lr.predict(X_test_scaled)

print("Logistic Regression (after tuning):")
print(f"  Accuracy:  {accuracy_score(y_test, y_pred_lr):.3f}")
print(f"  Precision: {precision_score(y_test, y_pred_lr):.3f}")
print(f"  Recall:    {recall_score(y_test, y_pred_lr):.3f}")
print(f"  F1-score:  {f1_score(y_test, y_pred_lr):.3f}")
print("Best Hyperparameters:", grid_lr.best_params_)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr, target_names=["No", "Yes"]))

Logistic Regression (after tuning):
  Accuracy:  0.745
  Precision: 0.311
  Recall:    0.489
  F1-score:  0.380
Best Hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}

Confusion Matrix:
[[196  51]
 [ 24  23]]

Classification Report:
              precision    recall  f1-score   support

          No       0.89      0.79      0.84       247
         Yes       0.31      0.49      0.38        47

    accuracy                           0.74       294
   macro avg       0.60      0.64      0.61       294
weighted avg       0.80      0.74      0.77       294



In [22]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

X_train_all = X_train_balanced
X_test_all = X_test

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_all)
X_test_scaled = scaler.transform(X_test_all)

param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 6, 9],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [1]
}

grid_xgb = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid_xgb,
    cv=5,
    scoring='f1',
    n_jobs=-1
)
grid_xgb.fit(X_train_scaled, y_train_balanced)
best_xgb = grid_xgb.best_estimator_

y_pred_xgb = best_xgb.predict(X_test_scaled)
print("\nXGBoost (all features, after tuning):")
print(f"  Accuracy:  {accuracy_score(y_test, y_pred_xgb):.3f}")
print(f"  Precision: {precision_score(y_test, y_pred_xgb):.3f}")
print(f"  Recall:    {recall_score(y_test, y_pred_xgb):.3f}")
print(f"  F1-score:  {f1_score(y_test, y_pred_xgb):.3f}")
print("Best Hyperparameters:", grid_xgb.best_params_)
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb, target_names=["No", "Yes"]))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost (all features, after tuning):
  Accuracy:  0.827
  Precision: 0.417
  Recall:    0.213
  F1-score:  0.282
Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 1}
[[233  14]
 [ 37  10]]
              precision    recall  f1-score   support

          No       0.86      0.94      0.90       247
         Yes       0.42      0.21      0.28        47

    accuracy                           0.83       294
   macro avg       0.64      0.58      0.59       294
weighted avg       0.79      0.83      0.80       294



In [23]:
best_rf = grid_xgb.best_estimator_
importances = best_rf.feature_importances_

feature_names = X_train.columns

feat_imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_imp_df = feat_imp_df.sort_values(by='importance', ascending=False)

print(feat_imp_df.head(10)) 

top_n = 15  
top_features = feat_imp_df['feature'].head(top_n).tolist()

                     feature  importance
26          StockOptionLevel    0.248645
13                  JobLevel    0.145521
33      YearsWithCurrManager    0.066524
15           JobSatisfaction    0.040492
1             BusinessTravel    0.029382
29           WorkLifeBalance    0.027860
24  RelationshipSatisfaction    0.026812
12            JobInvolvement    0.026324
9    EnvironmentSatisfaction    0.024487
14                   JobRole    0.023135


In [24]:
selected_features = top_features 

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_selected)

X_train_sel_scaled = scaler.transform(X_train_selected)
X_test_sel_scaled = scaler.transform(X_test_selected)

In [25]:
from sklearn.preprocessing import LabelEncoder
df_encoded = df.copy()
cat_cols = df_encoded.select_dtypes(include=['object']).columns
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    le_dict[col] = le

X = df_encoded.drop('Attrition', axis=1)
y = df_encoded['Attrition']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

selected_features = top_features

X_train_sel = X_train_bal[selected_features]
X_test_sel = X_test[selected_features]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sel) 
X_test_scaled = scaler.transform(X_test_sel)

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 6, 9],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [1]
}
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
grid = GridSearchCV(xgb, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train_scaled, y_train_bal)
best_xgb = grid.best_estimator_
print("XGBoost best params:", grid.best_params_)

y_pred = best_xgb.predict(X_test_scaled)

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


XGBoost best params: {'colsample_bytree': 1, 'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 0.8}
[[234  13]
 [ 36  11]]
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       247
           1       0.46      0.23      0.31        47

    accuracy                           0.83       294
   macro avg       0.66      0.59      0.61       294
weighted avg       0.80      0.83      0.81       294

Accuracy: 0.8333333333333334


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
import joblib

joblib.dump(best_xgb, "XGBoost.joblib")

joblib.dump(scaler, "scaler.joblib")

joblib.dump(encoders, "label_encoders.joblib")  

joblib.dump(top_features, "top_features.joblib")

['top_features.joblib']