<a href="https://colab.research.google.com/github/Infant-Joshva/Project_3-Employee_Attrition_Analysis_and_Prediction/blob/main/notebooks/ML_Model_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model selection

#### Necessary packages

In [None]:
# Importing packages for data loading

import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

In [None]:
# Machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report

#### ETL

In [None]:
# Raw GitHub link to the cleaned employee dataset (CSV)
url = "https://raw.githubusercontent.com/Infant-Joshva/Project_3-Employee_Attrition_Analysis_and_Prediction/refs/heads/main/data%20set/employee_data_cleaned.csv"

# Read the CSV, then renumber the rows from 0 and discard the previous index
emp_df = pd.read_csv(url)
emp_df = emp_df.reset_index(drop=True)
emp_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Emp_ID,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,1,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,2,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,5,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,7,1,Male,...,3,4,1,6,3,3,2,2,2,2


#### Preprocessing Pipeline

In [None]:
# Importing lib for Pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# emp_df['Attrition_binary'] = emp_df['Attrition'].map({'Yes': 1, 'No': 0})

In [None]:
# Build the feature frame (X) and the target series (Y) from the loaded data

X = emp_df[[
    # columns that show up in the numeric selection below
    'Age', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction',
    'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
    'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating',
    'RelationshipSatisfaction', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
    # columns that show up in the categorical (object) selection below
    'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
    'MaritalStatus', 'OverTime',
]]  # Features
Y = emp_df['Attrition']  # Target

In [None]:
# Separate the numerical features from the categorical ones.
# The numerical block is split again later into continuous and discrete columns.

num_feature = X.select_dtypes(include=['int64', 'float64', 'int32'])
num_feature

Unnamed: 0,Age,DistanceFromHome,Education,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,2,3,2,4,5993,8,11,3,1,8,0,1,6,4,0,5
1,49,8,1,3,2,2,2,5130,1,23,4,4,10,3,3,10,7,1,7
2,37,2,2,4,2,1,3,2090,6,15,3,2,7,3,3,0,0,0,0
3,33,3,4,4,3,1,3,2909,1,11,3,3,8,3,3,8,7,3,0
4,27,2,1,1,3,1,2,3468,9,12,3,4,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,23,2,3,4,2,4,2571,4,17,3,3,17,3,3,5,2,0,3
1466,39,6,1,4,2,3,1,9991,4,15,3,1,9,5,3,7,7,1,7
1467,27,4,3,2,4,2,2,6142,1,20,4,2,6,0,3,6,2,0,3
1468,49,2,3,4,2,2,2,5390,2,14,3,4,17,3,2,9,6,0,8


In [None]:
# The numeric block holds both continuous and discrete values, so split it once more.

# Discrete values
# TODO: still need to confirm whether "StockOptionLevel" is needed or not
num_feature_dis = num_feature.loc[:, [
    'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
    'JobSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance',
    'PerformanceRating',
]].reset_index(drop=True)

# Continuous values
num_features_con = num_feature.loc[:, [
    'Age', 'DistanceFromHome', 'MonthlyIncome', 'NumCompaniesWorked',
    'PercentSalaryHike', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]].reset_index(drop=True)

In [None]:
# Separate the categorical (object-typed) features and renumber the rows from 0

cat_feature = X.select_dtypes(include=['object']).reset_index(drop=True)
cat_feature

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,No
...,...,...,...,...,...,...,...
1465,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Married,No
1466,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,No
1467,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Married,Yes
1468,Travel_Frequently,Sales,Medical,Male,Sales Executive,Married,No


In [None]:
# Categorical features that will be one-hot encoded

cat_feature_OH = cat_feature.loc[
    :, ['BusinessTravel', 'Department', 'Gender', 'MaritalStatus', 'OverTime']
].reset_index(drop=True)
cat_feature_OH

Unnamed: 0,BusinessTravel,Department,Gender,MaritalStatus,OverTime
0,Travel_Rarely,Sales,Female,Single,Yes
1,Travel_Frequently,Research & Development,Male,Married,No
2,Travel_Rarely,Research & Development,Male,Single,Yes
3,Travel_Frequently,Research & Development,Female,Married,Yes
4,Travel_Rarely,Research & Development,Male,Married,No
...,...,...,...,...,...
1465,Travel_Frequently,Research & Development,Male,Married,No
1466,Travel_Rarely,Research & Development,Male,Married,No
1467,Travel_Rarely,Research & Development,Male,Married,Yes
1468,Travel_Frequently,Sales,Male,Married,No


In [None]:
# Categorical features set aside for the target-based (attrition-ranked) encoding

cat_feature_TR = cat_feature.loc[:, ['EducationField', 'JobRole']].reset_index(drop=True)
cat_feature_TR

Unnamed: 0,EducationField,JobRole
0,Life Sciences,Sales Executive
1,Life Sciences,Research Scientist
2,Other,Laboratory Technician
3,Life Sciences,Research Scientist
4,Medical,Laboratory Technician
...,...,...
1465,Medical,Laboratory Technician
1466,Medical,Healthcare Representative
1467,Life Sciences,Manufacturing Director
1468,Medical,Sales Executive


In [None]:
# Column labels of the attrition-ranked categorical features; iterated over in the next cell
cat_feature_TR_cols = cat_feature_TR.columns
cat_feature_TR_cols

Index(['EducationField', 'JobRole'], dtype='object')

In [None]:
# For every categorical column, rank its categories by attrition ratio

# One pandas Index of category names per column, highest attrition ratio first
cat_feature_TR_cols_cross_list = []

for feature in cat_feature_TR_cols:
    counts = pd.crosstab(emp_df[feature], emp_df['Attrition'])

    # Guarantee both outcome columns exist so the lookups below cannot raise KeyError
    for outcome in ('Yes', 'No'):
        if outcome not in counts.columns:
            counts[outcome] = 0

    # Share of rows in each category whose Attrition value is 'Yes'
    leavers = counts['Yes']
    headcount = counts['Yes'] + counts['No']
    counts['Attrition_Ratio'] = leavers / headcount

    # Keep only the category names, ordered from highest to lowest ratio
    ranked = counts.sort_values(by='Attrition_Ratio', ascending=False)
    cat_feature_TR_cols_cross_list.append(ranked.index)

cat_feature_TR_cols_cross_list

[Index(['Human Resources', 'Technical Degree', 'Marketing', 'Life Sciences',
        'Medical', 'Other'],
       dtype='object', name='EducationField'),
 Index(['Sales Representative', 'Laboratory Technician', 'Human Resources',
        'Sales Executive', 'Research Scientist', 'Manufacturing Director',
        'Healthcare Representative', 'Manager', 'Research Director'],
       dtype='object', name='JobRole')]

In [None]:
# The category orderings to use for ordinal encoding of the categorical values
# (one Index per column, sorted by descending attrition ratio in the previous cell)

cat_feature_TR_cols_cross_list

[Index(['Human Resources', 'Technical Degree', 'Marketing', 'Life Sciences',
        'Medical', 'Other'],
       dtype='object', name='EducationField'),
 Index(['Sales Representative', 'Laboratory Technician', 'Human Resources',
        'Sales Executive', 'Research Scientist', 'Manufacturing Director',
        'Healthcare Representative', 'Manager', 'Research Director'],
       dtype='object', name='JobRole')]

In [None]:
# Transformer for the continuous numerical columns:
# fill missing values with the column median, then standardise.

numerical_transformer_con = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

numerical_transformer_con

In [None]:
# Transformer for the discrete numerical columns:
# fill missing values with the most frequent value; no scaling is applied.

numerical_transformer_dis = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

numerical_transformer_dis

In [None]:
# Transformer for the one-hot encoded categorical columns:
# fill missing values with the most frequent category, then one-hot encode
# to dense int64 columns, dropping the first category of every feature.

categorical_transformer_OH = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False, dtype='int64')),
])

categorical_transformer_OH

In [None]:
# Transformer for the ordinal-encoded categorical columns:
# fill missing values with the most frequent category, then map each category
# to its position in the explicit orderings below (first entry -> 0).
# The orderings are given per column, in the order EducationField, JobRole.

categorical_transformer_TR = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[
        # EducationField
        [
            'Human Resources', 'Technical Degree', 'Marketing',
            'Life Sciences', 'Medical', 'Other',
        ],
        # JobRole
        [
            'Sales Representative', 'Laboratory Technician', 'Human Resources',
            'Sales Executive', 'Research Scientist', 'Manufacturing Director',
            'Healthcare Representative', 'Manager', 'Research Director',
        ],
    ])),
])

categorical_transformer_TR

In [None]:
# Combine all four transformers into one ColumnTransformer.
# Each entry is (name, transformer, columns that transformer is applied to).

preprocess = ColumnTransformer([
    ('num_con', numerical_transformer_con, num_features_con.columns),
    ('num_dis', numerical_transformer_dis, num_feature_dis.columns),
    ('cat_OH', categorical_transformer_OH, cat_feature_OH.columns),
    ('cat_TR', categorical_transformer_TR, cat_feature_TR.columns),
])
preprocess



*   The preprocessing pipeline is ready.
*   Next, I am going to prepare the ML models.



#### Data Balancing

In [None]:
from imblearn.pipeline import Pipeline as IMBPipeline

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Hold out 20% of the rows as the test set (seeded so the split is repeatable)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((1176, 26), (294, 26), (1176,), (294,))

In [None]:
# Fit the preprocessing on the training rows only and transform them;
# the result feeds the stand-alone resampling checks below.
X_train_trans = preprocess.fit_transform(X_train)

In [None]:
# Oversample the transformed training data with SMOTE and show the resulting class counts
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_trans, Y_train)
y_train_smote.value_counts()

Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
No,978
Yes,978


In [None]:
# Oversample the transformed training data with ADASYN and show the resulting class counts
adaysn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adaysn.fit_resample(X_train_trans, Y_train)
y_train_adasyn.value_counts()

Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
No,978
Yes,957


#### ML Pipeline

##### LogisticRegression

In [None]:
# Logistic-regression pipeline:
# preprocess -> SMOTE oversampling -> degree-2 polynomial features -> classifier

logreg_pipeline_smote = IMBPipeline(steps=[
    ('preprocess', preprocess),
    ('Sampling', SMOTE(random_state=42)),
    ('Poly', PolynomialFeatures(degree=2)),
    # max_iter is raised because the saved output of this cell shows the solver
    # stopping at its iteration limit (ConvergenceWarning), i.e. the fitted
    # coefficients were not converged. If the warning persists, scale the
    # polynomial features as the warning text suggests.
    ('logreg', LogisticRegression(class_weight="balanced", C=0.1, max_iter=5000))
])

logreg_pipeline_smote.fit(X_train, Y_train)

# Predictions on both splits so train and test can be compared for overfitting
Y_train_pred = logreg_pipeline_smote.predict(X_train)
Y_test_pred = logreg_pipeline_smote.predict(X_test)

divider = '--' * 50

for heading, actual, predicted in (
    ('Confusion Matrix Train', Y_train, Y_train_pred),
    ('Confusion Matrix Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(confusion_matrix(actual, predicted))
print('**' * 50)
for heading, actual, predicted in (
    ('Classification Report Train', Y_train, Y_train_pred),
    ('Classification Report Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(classification_report(actual, predicted))
print(divider)


Confusion Matrix Train
----------------------------------------------------------------------------------------------------
[[903  75]
 [ 19 179]]
Confusion Matrix Test
----------------------------------------------------------------------------------------------------
[[224  31]
 [ 18  21]]
****************************************************************************************************
Classification Report Train
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

          No       0.98      0.92      0.95       978
         Yes       0.70      0.90      0.79       198

    accuracy                           0.92      1176
   macro avg       0.84      0.91      0.87      1176
weighted avg       0.93      0.92      0.92      1176

Classification Report Test
----------------------------------------------------------------------------------------------------
              precisio

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


###### Exporting ML Model as Pickle file

In [None]:
import pickle

In [None]:
# Persist the fitted SMOTE + logistic-regression pipeline to disk.
# The context manager closes the file handle even if pickling raises, so the
# file is flushed and not left open for the rest of the kernel session.
with open('logreg_pipeline_smote.pkl', 'wb') as model_file:
    pickle.dump(logreg_pipeline_smote, model_file)

In [None]:
# Logistic-regression pipeline, ADASYN variant:
# preprocess -> ADASYN oversampling -> degree-2 polynomial features -> classifier

logreg_pipeline = IMBPipeline(steps=[
    ('preprocess', preprocess),
    ('Sampling', ADASYN(random_state=42)),
    ('Poly', PolynomialFeatures(degree=2)),
    # max_iter is raised because the saved output of this cell shows the solver
    # stopping at its iteration limit (ConvergenceWarning), i.e. the fitted
    # coefficients were not converged. If the warning persists, scale the
    # polynomial features as the warning text suggests.
    ('logreg', LogisticRegression(class_weight="balanced", penalty='l2', max_iter=5000))
])

logreg_pipeline.fit(X_train, Y_train)

# Predictions on both splits so train and test can be compared for overfitting
Y_train_pred = logreg_pipeline.predict(X_train)
Y_test_pred = logreg_pipeline.predict(X_test)

divider = '--' * 50

for heading, actual, predicted in (
    ('Confusion Matrix Train', Y_train, Y_train_pred),
    ('Confusion Matrix Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(confusion_matrix(actual, predicted))
print('**' * 50)
for heading, actual, predicted in (
    ('Classification Report Train', Y_train, Y_train_pred),
    ('Classification Report Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(classification_report(actual, predicted))
print(divider)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix Train
----------------------------------------------------------------------------------------------------
[[908  70]
 [ 15 183]]
Confusion Matrix Test
----------------------------------------------------------------------------------------------------
[[219  36]
 [ 21  18]]
****************************************************************************************************
Classification Report Train
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

          No       0.98      0.93      0.96       978
         Yes       0.72      0.92      0.81       198

    accuracy                           0.93      1176
   macro avg       0.85      0.93      0.88      1176
weighted avg       0.94      0.93      0.93      1176

Classification Report Test
----------------------------------------------------------------------------------------------------
              precisio

##### SVC

In [None]:
# SVC pipeline: preprocess -> SMOTE oversampling -> support-vector classifier

svc_pipeline = IMBPipeline([
    ('preprocess', preprocess),
    ('Sampling', SMOTE(random_state=42)),
    ('svc', SVC(C=0.1, class_weight="balanced")),
])

svc_pipeline.fit(X_train, Y_train)

# Predictions on both splits so train and test can be compared
Y_train_pred = svc_pipeline.predict(X_train)
Y_test_pred = svc_pipeline.predict(X_test)

divider = '--' * 50

for heading, actual, predicted in (
    ('Confusion Matrix Train', Y_train, Y_train_pred),
    ('Confusion Matrix Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(confusion_matrix(actual, predicted))
print('**' * 50)
for heading, actual, predicted in (
    ('Classification Report Train', Y_train, Y_train_pred),
    ('Classification Report Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(classification_report(actual, predicted))
print(divider)


Confusion Matrix Train
----------------------------------------------------------------------------------------------------
[[728 250]
 [ 43 155]]
Confusion Matrix Test
----------------------------------------------------------------------------------------------------
[[198  57]
 [ 16  23]]
****************************************************************************************************
Classification Report Train
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

          No       0.94      0.74      0.83       978
         Yes       0.38      0.78      0.51       198

    accuracy                           0.75      1176
   macro avg       0.66      0.76      0.67      1176
weighted avg       0.85      0.75      0.78      1176

Classification Report Test
----------------------------------------------------------------------------------------------------
              precisio

In [None]:
# SVC pipeline, ADASYN variant with a polynomial kernel:
# preprocess -> ADASYN oversampling -> support-vector classifier.
# Note: this rebinds `svc_pipeline`, replacing the SMOTE variant defined earlier.

svc_pipeline = IMBPipeline([
    ('preprocess', preprocess),
    ('Sampling', ADASYN(random_state=42)),
    ('svc', SVC(kernel='poly', class_weight="balanced")),
])

svc_pipeline.fit(X_train, Y_train)

# Predictions on both splits so train and test can be compared
Y_train_pred = svc_pipeline.predict(X_train)
Y_test_pred = svc_pipeline.predict(X_test)

divider = '--' * 50

for heading, actual, predicted in (
    ('Confusion Matrix Train', Y_train, Y_train_pred),
    ('Confusion Matrix Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(confusion_matrix(actual, predicted))
print('**' * 50)
for heading, actual, predicted in (
    ('Classification Report Train', Y_train, Y_train_pred),
    ('Classification Report Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(classification_report(actual, predicted))
print(divider)


Confusion Matrix Train
----------------------------------------------------------------------------------------------------
[[823 155]
 [ 15 183]]
Confusion Matrix Test
----------------------------------------------------------------------------------------------------
[[206  49]
 [ 17  22]]
****************************************************************************************************
Classification Report Train
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

          No       0.98      0.84      0.91       978
         Yes       0.54      0.92      0.68       198

    accuracy                           0.86      1176
   macro avg       0.76      0.88      0.79      1176
weighted avg       0.91      0.86      0.87      1176

Classification Report Test
----------------------------------------------------------------------------------------------------
              precisio

##### DecisionTreeClassifier

In [None]:
# Decision-tree pipeline: preprocess -> SMOTE oversampling -> decision tree

dt_pipeline = IMBPipeline(steps=[
    ('preprocess', preprocess),
    ('Sampling', SMOTE(random_state=42)),
    # random_state is fixed so the fitted tree is reproducible across runs,
    # in line with the seeded split and samplers used elsewhere in this notebook.
    ('dt', DecisionTreeClassifier(
        class_weight={'No': 1, 'Yes': 8},
        max_depth=8,
        min_samples_split=15,
        min_samples_leaf=15,
        random_state=42,
    ))
])

dt_pipeline

In [None]:
# Fit the decision-tree pipeline on the training split
dt_pipeline.fit(X_train, Y_train)

In [None]:
# Evaluate the fitted decision-tree pipeline on both splits
Y_train_pred = dt_pipeline.predict(X_train)
Y_test_pred = dt_pipeline.predict(X_test)

divider = '--' * 50

for heading, actual, predicted in (
    ('Confusion Matrix Train', Y_train, Y_train_pred),
    ('Confusion Matrix Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(confusion_matrix(actual, predicted))
print('**' * 50)
for heading, actual, predicted in (
    ('Classification Report Train', Y_train, Y_train_pred),
    ('Classification Report Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(classification_report(actual, predicted))
print(divider)

Confusion Matrix Train
----------------------------------------------------------------------------------------------------
[[605 373]
 [ 10 188]]
Confusion Matrix Test
----------------------------------------------------------------------------------------------------
[[133 122]
 [ 16  23]]
****************************************************************************************************
Classification Report Train
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

          No       0.98      0.62      0.76       978
         Yes       0.34      0.95      0.50       198

    accuracy                           0.67      1176
   macro avg       0.66      0.78      0.63      1176
weighted avg       0.87      0.67      0.72      1176

Classification Report Test
----------------------------------------------------------------------------------------------------
              precisio

In [None]:
# Decision-tree pipeline, ADASYN variant:
# preprocess -> ADASYN oversampling -> decision tree.
# Note: this rebinds `dt_pipeline`, replacing the SMOTE variant defined earlier.

dt_pipeline = IMBPipeline(steps=[
    ('preprocess', preprocess),
    ('Sampling', ADASYN(random_state=42)),
    # random_state is fixed so the fitted tree is reproducible across runs,
    # in line with the seeded split and samplers used elsewhere in this notebook.
    ('dt', DecisionTreeClassifier(
        class_weight={'No': 1, 'Yes': 8},
        max_depth=8,
        min_samples_split=15,
        min_samples_leaf=15,
        random_state=42,
    ))
])

dt_pipeline.fit(X_train, Y_train)

# Predictions on both splits so train and test can be compared
Y_train_pred = dt_pipeline.predict(X_train)
Y_test_pred = dt_pipeline.predict(X_test)

divider = '--' * 50

for heading, actual, predicted in (
    ('Confusion Matrix Train', Y_train, Y_train_pred),
    ('Confusion Matrix Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(confusion_matrix(actual, predicted))
print('**' * 50)
for heading, actual, predicted in (
    ('Classification Report Train', Y_train, Y_train_pred),
    ('Classification Report Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(classification_report(actual, predicted))
print(divider)


Confusion Matrix Train
----------------------------------------------------------------------------------------------------
[[651 327]
 [ 12 186]]
Confusion Matrix Test
----------------------------------------------------------------------------------------------------
[[161  94]
 [ 15  24]]
****************************************************************************************************
Classification Report Train
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

          No       0.98      0.67      0.79       978
         Yes       0.36      0.94      0.52       198

    accuracy                           0.71      1176
   macro avg       0.67      0.80      0.66      1176
weighted avg       0.88      0.71      0.75      1176

Classification Report Test
----------------------------------------------------------------------------------------------------
              precisio

##### Random Forest

In [None]:
# Random-forest pipeline: preprocess -> SMOTE oversampling -> random forest

RF_pipeline = IMBPipeline(steps=[
    ('preprocess', preprocess),
    ('Sampling', SMOTE(random_state=42)),
    # random_state is fixed so the forest is reproducible across runs;
    # an unseeded forest gives different metrics every time the cell is executed.
    ('rf', RandomForestClassifier(
        class_weight={'No': 4, 'Yes': 6},
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=20,
        n_estimators=500,
        random_state=42,
    ))
])

RF_pipeline

In [None]:
# Fit the random-forest pipeline on the training split
RF_pipeline.fit(X_train, Y_train)

In [None]:
# Evaluate the fitted random-forest pipeline on both splits
Y_train_pred = RF_pipeline.predict(X_train)
Y_test_pred = RF_pipeline.predict(X_test)

divider = '--' * 50

for heading, actual, predicted in (
    ('Confusion Matrix Train', Y_train, Y_train_pred),
    ('Confusion Matrix Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(confusion_matrix(actual, predicted))
print('**' * 50)
for heading, actual, predicted in (
    ('Classification Report Train', Y_train, Y_train_pred),
    ('Classification Report Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(classification_report(actual, predicted))
print(divider)

Confusion Matrix Train
----------------------------------------------------------------------------------------------------
[[883  95]
 [ 51 147]]
Confusion Matrix Test
----------------------------------------------------------------------------------------------------
[[224  31]
 [ 21  18]]
****************************************************************************************************
Classification Report Train
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

          No       0.95      0.90      0.92       978
         Yes       0.61      0.74      0.67       198

    accuracy                           0.88      1176
   macro avg       0.78      0.82      0.80      1176
weighted avg       0.89      0.88      0.88      1176

Classification Report Test
----------------------------------------------------------------------------------------------------
              precisio

##### XGBClassifier

In [None]:
# Class counts in the training target, used below to derive the class ratio
Y_train.value_counts()

Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
No,978
Yes,198


In [None]:
# Ratio of 'No' rows to 'Yes' rows in the training target
len(Y_train.loc[Y_train == 'No']) / len(Y_train.loc[Y_train == 'Yes'])

4.9393939393939394

In [None]:
# Keep the 'No' : 'Yes' ratio of the training target for the XGBoost model below
class_ratio = len(Y_train.loc[Y_train == 'No']) / len(Y_train.loc[Y_train == 'Yes'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string target as integers; the encoder is fitted on the training labels only
le = LabelEncoder()
Y_train_enc = le.fit_transform(Y_train)   # No=0, Yes=1
Y_test_enc = le.transform(Y_test)

# XGBoost pipeline: preprocess -> SMOTE oversampling -> gradient-boosted trees
# NOTE(review): scale_pos_weight uses the pre-resampling class ratio although SMOTE
# runs first in this pipeline - confirm that weighting the positive class on top of
# oversampling is intended.
XGB_pipeline = IMBPipeline([
    ('Preprcess', preprocess),
    ('Sampling', SMOTE(random_state=42)),
    ('Model', XGBClassifier(max_depth=4, scale_pos_weight=class_ratio)),
])

# Train on the integer-encoded target
XGB_pipeline.fit(X_train, Y_train_enc)

Y_train_pred = XGB_pipeline.predict(X_train)
Y_test_pred = XGB_pipeline.predict(X_test)

for heading, actual, predicted in (
    ('Confusion Matrix Train', Y_train_enc, Y_train_pred),
    ('Confusion Matrix Test', Y_test_enc, Y_test_pred),
):
    print(heading)
    print(confusion_matrix(actual, predicted))

# target_names restores the original string labels in the printed reports
for heading, actual, predicted in (
    ('Classification Report Train', Y_train_enc, Y_train_pred),
    ('Classification Report Test', Y_test_enc, Y_test_pred),
):
    print(heading)
    print(classification_report(actual, predicted, target_names=le.classes_))


Confusion Matrix Train
[[978   0]
 [  0 198]]
Confusion Matrix Test
[[226  29]
 [ 22  17]]
Classification Report Train
              precision    recall  f1-score   support

          No       1.00      1.00      1.00       978
         Yes       1.00      1.00      1.00       198

    accuracy                           1.00      1176
   macro avg       1.00      1.00      1.00      1176
weighted avg       1.00      1.00      1.00      1176

Classification Report Test
              precision    recall  f1-score   support

          No       0.91      0.89      0.90       255
         Yes       0.37      0.44      0.40        39

    accuracy                           0.83       294
   macro avg       0.64      0.66      0.65       294
weighted avg       0.84      0.83      0.83       294



##### KNeighborsClassifier

In [None]:
# KNN pipeline: preprocess -> SMOTE oversampling -> k-nearest-neighbours classifier
knn_settings = dict(
    n_neighbors=5,        # Number of nearest neighbors (k value)
    weights='uniform',    # 'uniform' = all neighbors equal, 'distance' = closer neighbors have more influence
    algorithm='auto',     # 'auto', 'ball_tree', 'kd_tree', 'brute'
    leaf_size=30,         # Leaf size for BallTree/KDTree (affects speed)
    p=2,                  # Power parameter for distance: p=1 -> Manhattan, p=2 -> Euclidean
    metric='minkowski',   # Distance metric (minkowski with p=2 = Euclidean)
    metric_params=None,   # Extra metric arguments (rarely used)
    n_jobs=None,          # Parallel jobs (-1 = use all CPUs)
)

KNeighborsClassifier_pipeline = IMBPipeline([
    ('preprocess', preprocess),
    ('Sampling', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier(**knn_settings)),
])

KNeighborsClassifier_pipeline.fit(X_train, Y_train)

# Predictions on both splits so train and test can be compared
Y_train_pred = KNeighborsClassifier_pipeline.predict(X_train)
Y_test_pred = KNeighborsClassifier_pipeline.predict(X_test)

divider = '--' * 50

for heading, actual, predicted in (
    ('Confusion Matrix Train', Y_train, Y_train_pred),
    ('Confusion Matrix Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(confusion_matrix(actual, predicted))
print('**' * 50)
for heading, actual, predicted in (
    ('Classification Report Train', Y_train, Y_train_pred),
    ('Classification Report Test', Y_test, Y_test_pred),
):
    print(heading)
    print(divider)
    print(classification_report(actual, predicted))
print(divider)


Confusion Matrix Train
----------------------------------------------------------------------------------------------------
[[762 216]
 [  1 197]]
Confusion Matrix Test
----------------------------------------------------------------------------------------------------
[[178  77]
 [ 14  25]]
****************************************************************************************************
Classification Report Train
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

          No       1.00      0.78      0.88       978
         Yes       0.48      0.99      0.64       198

    accuracy                           0.82      1176
   macro avg       0.74      0.89      0.76      1176
weighted avg       0.91      0.82      0.84      1176

Classification Report Test
----------------------------------------------------------------------------------------------------
              precisio