In [172]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_recall_curve, auc

from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [173]:
# Read the attrition dataset; the relative path assumes the CSV sits in the working directory.
df = pd.read_csv("emp_attr.csv")
# Display the first row as a column -> value listing to inspect the schema.
df.iloc[0]

Age                                      41
Attrition                               Yes
BusinessTravel                Travel_Rarely
DailyRate                              1102
Department                            Sales
DistanceFromHome                          1
Education                                 2
EducationField                Life Sciences
EmployeeCount                             1
EmployeeNumber                            1
EnvironmentSatisfaction                   2
Gender                               Female
HourlyRate                               94
JobInvolvement                            3
JobLevel                                  2
JobRole                     Sales Executive
JobSatisfaction                           4
MaritalStatus                        Single
MonthlyIncome                          5993
MonthlyRate                           19479
NumCompaniesWorked                        8
Over18                                    Y
OverTime                        

In [174]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

<h3>No nulls</h3>

In [175]:
# Count missing values per column.
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [176]:
# Encode the target as 1 (Yes) / 0 (No).
# The guard makes the cell idempotent: Series.map yields NaN for any value that
# is not a key of the mapping, so re-running an unguarded map on the already
# encoded 0/1 column would silently replace the whole target with NaN.
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

In [177]:
# Class balance of the encoded target.
df['Attrition'].value_counts()

Attrition
0    1233
1     237
Name: count, dtype: int64

In [178]:
# EmployeeNumber looks like a per-employee identifier rather than a predictor
# (assumption based on the column name -- confirm against the data dictionary).
# NOTE: re-running this cell raises KeyError because the column is already gone.
df = df.drop('EmployeeNumber', axis=1)
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [179]:
# Target vector first, then every remaining column as the feature matrix.
y = df['Attrition']
x = df.drop(columns=['Attrition'])

In [180]:
# First feature row, to confirm the target column is no longer present.
x.iloc[0]

Age                                      41
BusinessTravel                Travel_Rarely
DailyRate                              1102
Department                            Sales
DistanceFromHome                          1
Education                                 2
EducationField                Life Sciences
EmployeeCount                             1
EnvironmentSatisfaction                   2
Gender                               Female
HourlyRate                               94
JobInvolvement                            3
JobLevel                                  2
JobRole                     Sales Executive
JobSatisfaction                           4
MaritalStatus                        Single
MonthlyIncome                          5993
MonthlyRate                           19479
NumCompaniesWorked                        8
Over18                                    Y
OverTime                                Yes
PercentSalaryHike                        11
PerformanceRating               

In [181]:
# One-hot encode the non-numeric columns; drop_first=True omits the first level
# of each encoded column, so a k-level column yields k-1 indicator columns.
x_dum = pd.get_dummies(x, drop_first = True)
# First encoded row, for a quick look at the resulting columns.
x_dum.iloc[0]

Age                                     41
DailyRate                             1102
DistanceFromHome                         1
Education                                2
EmployeeCount                            1
EnvironmentSatisfaction                  2
HourlyRate                              94
JobInvolvement                           3
JobLevel                                 2
JobSatisfaction                          4
MonthlyIncome                         5993
MonthlyRate                          19479
NumCompaniesWorked                       8
PercentSalaryHike                       11
PerformanceRating                        3
RelationshipSatisfaction                 1
StandardHours                           80
StockOptionLevel                         0
TotalWorkingYears                        8
TrainingTimesLastYear                    0
WorkLifeBalance                          1
YearsAtCompany                           6
YearsInCurrentRole                       4
YearsSinceL

In [182]:
# Display the encoded feature matrix.
x_dum

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1102,1,2,1,2,94,3,2,4,...,False,False,False,False,False,True,False,False,True,True
1,49,279,8,1,1,3,61,2,2,2,...,False,False,False,False,True,False,False,True,False,False
2,37,1373,2,2,1,4,92,2,1,3,...,True,False,False,False,False,False,False,False,True,True
3,33,1392,3,4,1,4,56,3,1,3,...,False,False,False,False,True,False,False,True,False,True
4,27,591,2,1,1,1,40,3,1,2,...,True,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,1,3,41,4,2,4,...,True,False,False,False,False,False,False,True,False,False
1466,39,613,6,1,1,4,42,2,3,1,...,False,False,False,False,False,False,False,True,False,False
1467,27,155,4,3,1,2,87,4,2,2,...,False,False,True,False,False,False,False,True,False,True
1468,49,1023,2,3,1,4,63,2,2,2,...,False,False,False,False,False,True,False,True,False,False


<h3>Train test split</h3>

In [183]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_dum,
    y,
    test_size=0.2,
    stratify=y,
    random_state=13,
)

<h1 style="text-align:center">Applying filter methods</h1>

<h3>Remove constant / near-zero-variance features</h3>

In [184]:
# Filter that drops low-variance features, using a variance threshold of 0.01.
var_thresh = VarianceThreshold(threshold=0.01)
var_thresh

In [185]:
# Learn the variance filter from the training split only, then apply it to that split.
var_thresh.fit(x_train)
x_train_var = var_thresh.transform(x_train)

# Recover the names of the surviving columns from the boolean support mask.
selected_columns = x_train.columns[var_thresh.get_support()]
selected_columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gender_Male',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scienti

In [186]:
# Wrap the filtered training array back into a DataFrame with the surviving
# column names. The original row labels are passed through explicitly:
# without `index=`, the new frame would get a fresh 0..n-1 RangeIndex while
# y_train (and x_test) keep the labels assigned by train_test_split, so any
# later label-based operation between x_train and y_train would misalign.
x_train = pd.DataFrame(x_train_var, columns=selected_columns, index=x_train.index)

# Restrict the test split to the same columns (selection learned on train only).
x_test = x_test[selected_columns]

<h3>Remove highly correlated features</h3>

In [187]:
# Absolute pairwise correlations between the remaining training features.
corr_matrix = x_train.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that each
# feature pair appears once; every other cell becomes NaN.
upper_triangle = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
)
upper_triangle

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
Age,,0.011338,0.001763,0.228089,0.013282,0.016154,0.010363,0.50446,0.009671,0.496408,...,0.147759,0.314973,0.056771,0.16168,0.143499,0.00783,0.16829,0.07736,0.11691,0.003952
DailyRate,,,0.006832,0.026838,0.005643,0.022838,0.060803,0.007091,0.041458,0.013361,...,0.008269,0.001377,0.006556,0.010598,0.010941,6.8e-05,0.000515,0.042193,0.08703,0.003441
DistanceFromHome,,,,0.031479,0.020562,0.027698,0.002331,0.002824,0.001089,0.024197,...,0.000868,0.053916,0.00852,0.020136,0.010649,0.048612,0.017345,0.055916,0.035344,0.017636
Education,,,,,0.01818,0.026359,0.052229,0.104825,0.012196,0.101889,...,0.057647,0.037849,0.002246,0.039386,0.004544,0.064088,0.107629,0.005089,0.00717,0.026418
EnvironmentSatisfaction,,,,,,0.051591,0.005085,0.007013,0.005798,0.012271,...,0.009204,0.006922,0.071119,0.051176,0.018061,0.050736,0.004485,0.010959,0.014528,0.091775
HourlyRate,,,,,,,0.063191,0.034001,0.060195,0.02469,...,0.010124,0.000322,0.007355,0.013827,0.01449,0.012909,0.018077,0.018384,0.023307,0.001389
JobInvolvement,,,,,,,,0.021249,0.011693,0.018539,...,0.034202,0.022668,0.02519,0.006372,0.032985,0.006277,0.004165,0.035266,0.054772,0.01448
JobLevel,,,,,,,,,0.002564,0.950699,...,0.351337,0.576662,0.120957,0.390256,0.387166,0.124065,0.217155,0.04559,0.086895,0.01251
JobSatisfaction,,,,,,,,,,0.001653,...,0.014018,0.014935,0.010612,0.011892,0.00259,0.011171,0.013931,0.017157,0.039179,0.016653
MonthlyIncome,,,,,,,,,,,...,0.324058,0.647059,0.058216,0.462756,0.341912,0.035021,0.204549,0.056001,0.087431,0.000918


In [188]:
# Flag every column whose absolute correlation with an earlier column exceeds 0.85.
to_drop = [
    name
    for name in upper_triangle.columns
    if upper_triangle[name].gt(0.85).any()
]
to_drop

['MonthlyIncome', 'Department_Sales']

In [189]:
# Remove the columns listed in to_drop from both splits.
# NOTE: re-running this cell raises KeyError because the columns are already gone.
x_train, x_test = (split.drop(columns=to_drop) for split in (x_train, x_test))

print("Remaining features after filter: ", len(x_train.columns))

Remaining features after filter:  42


<h3>Random forest feature selection (embedded method)</h3>

In [190]:
# Fit a random forest on the filtered training features; its feature
# importances are read in the next cell to rank the features.
rf = RandomForestClassifier(n_estimators = 200, random_state = 32)
rf.fit(x_train, y_train)

In [191]:
# Rank features by the fitted forest's importances, highest first.
importances = (
    pd.Series(rf.feature_importances_, index=x_train.columns)
    .sort_values(ascending=False)
)

importances.iloc[:15]

Age                        0.071754
TotalWorkingYears          0.060202
OverTime_Yes               0.055658
DailyRate                  0.053487
HourlyRate                 0.052640
MonthlyRate                0.052410
DistanceFromHome           0.049897
YearsAtCompany             0.044027
PercentSalaryHike          0.033815
YearsInCurrentRole         0.032135
EnvironmentSatisfaction    0.031229
NumCompaniesWorked         0.031046
YearsWithCurrManager       0.030404
StockOptionLevel           0.029988
TrainingTimesLastYear      0.029537
dtype: float64

In [192]:
# Keep the names of the top_k highest-ranked features
# (importances is already sorted in descending order).
top_k = 12
top_features = importances.index[:top_k]
top_features

Index(['Age', 'TotalWorkingYears', 'OverTime_Yes', 'DailyRate', 'HourlyRate',
       'MonthlyRate', 'DistanceFromHome', 'YearsAtCompany',
       'PercentSalaryHike', 'YearsInCurrentRole', 'EnvironmentSatisfaction',
       'NumCompaniesWorked'],
      dtype='object')

In [193]:
# Restrict both splits to the selected feature columns.
x_train_selected = x_train.loc[:, top_features]
x_test_selected = x_test.loc[:, top_features]

In [194]:
# Class balance within the training split.
y_train.value_counts()

Attrition
0    986
1    190
Name: count, dtype: int64

<h3>Scaled input features</h3>

In [195]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted transform to both splits so no test statistics leak into training.
scaler = StandardScaler().fit(x_train_selected)
x_train_scaled = scaler.transform(x_train_selected)
x_test_scaled = scaler.transform(x_test_selected)

<h3>Class imbalance</h3>
Use class_weight="balanced" first.

Evaluate recall for Attrition = Yes.

If recall is still low → try SMOTE.

<h3>Logistic regression</h3>

In [196]:
# Logistic regression with balanced class weights to counter the class imbalance.
# 'lbfgs' is spelled out for clarity; it is also the library default solver.
log_model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
)
log_model.fit(x_train_scaled, y_train)

In [197]:
# Predict on the held-out split and report accuracy plus per-class metrics.
y_pred_log = log_model.predict(x_test_scaled)
# Arguments are passed as (y_true, y_pred), matching the metric's signature and
# the classification_report call below. Accuracy is symmetric, so the value is
# unchanged, but the swapped order would be wrong for any asymmetric metric.
print("The accuracy of logistic regression model on test data is: ", accuracy_score(y_test, y_pred_log))
print("Classification report: \n", classification_report(y_test, y_pred_log))

The accuracy of logistic regression model on test data is:  0.7074829931972789
Classification report: 
               precision    recall  f1-score   support

           0       0.94      0.70      0.80       247
           1       0.32      0.74      0.45        47

    accuracy                           0.71       294
   macro avg       0.63      0.72      0.62       294
weighted avg       0.84      0.71      0.74       294



<b> Changing the probability threshold </b>

In [198]:
# Per-class probabilities for every test row (one column per entry of log_model.classes_).
y_probs = log_model.predict_proba(x_test_scaled)
y_probs

array([[0.75901342, 0.24098658],
       [0.91872515, 0.08127485],
       [0.3554517 , 0.6445483 ],
       [0.64133005, 0.35866995],
       [0.39676338, 0.60323662],
       [0.60147924, 0.39852076],
       [0.78545467, 0.21454533],
       [0.38166383, 0.61833617],
       [0.91403335, 0.08596665],
       [0.58124182, 0.41875818],
       [0.6171197 , 0.3828803 ],
       [0.2898447 , 0.7101553 ],
       [0.31206964, 0.68793036],
       [0.37724954, 0.62275046],
       [0.66969466, 0.33030534],
       [0.51592232, 0.48407768],
       [0.5972307 , 0.4027693 ],
       [0.32475045, 0.67524955],
       [0.51543107, 0.48456893],
       [0.86421351, 0.13578649],
       [0.67845344, 0.32154656],
       [0.59048914, 0.40951086],
       [0.40119868, 0.59880132],
       [0.60790862, 0.39209138],
       [0.40874553, 0.59125447],
       [0.28740802, 0.71259198],
       [0.62858327, 0.37141673],
       [0.52992313, 0.47007687],
       [0.71246286, 0.28753714],
       [0.55103435, 0.44896565],
       [0.

In [199]:
# Class order determines the predict_proba column order.
log_model.classes_  # 0 (stayed in the job) -> 1st column, 1 (quit the job) -> 2nd column

array([0, 1])

In [200]:
# Column 1 is the probability of class 1 (attrition).
y_probs_quit = y_probs[:, 1]
# Lowering the cut-off below 0.5 flags more employees as likely leavers.
# NOTE(review): the cut-off is picked and judged on the test split here;
# consider tuning it on a validation split instead.
threshold = 0.4
y_pred = np.where(y_probs_quit >= threshold, 1, 0)
y_pred

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1])

In [201]:
# Per-class metrics for the predictions made with the lowered threshold.
print("Classfication report: \n", classification_report(y_test, y_pred))

Classfication report: 
               precision    recall  f1-score   support

           0       0.94      0.53      0.67       247
           1       0.25      0.81      0.38        47

    accuracy                           0.57       294
   macro avg       0.59      0.67      0.52       294
weighted avg       0.82      0.57      0.63       294




In HR attrition, missing an employee who is about to resign is costlier than flagging a loyal employee as likely to resign. Recall on the attrition class therefore matters more than precision.

<h3>Trying out with SMOTE</h3>

In [202]:
# Apply SMOTE to the training data only; the test split stays untouched so the
# evaluation reflects the real class distribution.
sm = SMOTE(random_state = 32)

# Resample the already-scaled training features together with their labels.
x_train_smote, y_train_smote = sm.fit_resample(x_train_scaled, y_train)

# No class_weight here: this model is trained on the resampled data instead.
log_model2 = LogisticRegression(max_iter = 1000)
log_model2.fit(x_train_smote, y_train_smote)

In [203]:
# Evaluate the SMOTE-trained logistic regression on the original (non-resampled) test split.
pred = log_model2.predict(x_test_scaled)
print("Classification report: \n", classification_report(y_test, pred))

Classification report: 
               precision    recall  f1-score   support

           0       0.94      0.69      0.79       247
           1       0.32      0.77      0.45        47

    accuracy                           0.70       294
   macro avg       0.63      0.73      0.62       294
weighted avg       0.84      0.70      0.74       294



<h3>SVM</h3>

<b>Vanilla SVM </b>

In [204]:
# Baseline RBF-kernel SVM with no handling of the class imbalance.
svm_model = SVC(kernel = 'rbf', C = 1.0, random_state = 31)
svm_model.fit(x_train_scaled, y_train)

In [205]:
# Evaluate the baseline SVM on the test split.
y_pred_svm = svm_model.predict(x_test_scaled)
print(f"Classification report: \n{classification_report(y_test, y_pred_svm)}")

# NOTE: the unweighted SVM rarely predicts the positive class. In the recorded run its
# recall for Attrition = 1 is only 0.06, i.e. nearly every leaver is missed.

Classification report: 
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       247
           1       0.75      0.06      0.12        47

    accuracy                           0.85       294
   macro avg       0.80      0.53      0.52       294
weighted avg       0.83      0.85      0.79       294



<b>Using class_weight</b>

In [206]:
# Same RBF SVM, now with class_weight='balanced' to compensate for the class imbalance.
svm_model1 = SVC(kernel = 'rbf', class_weight = 'balanced', C = 1.0, random_state = 42)
svm_model1.fit(x_train_scaled, y_train)

In [207]:
# Evaluate the class-weighted SVM on the test split.
y_pred_svm1 = svm_model1.predict(x_test_scaled)
print(f"Classification report: \n{classification_report(y_test, y_pred_svm1)}")

Classification report: 
              precision    recall  f1-score   support

           0       0.91      0.78      0.84       247
           1       0.33      0.57      0.42        47

    accuracy                           0.74       294
   macro avg       0.62      0.68      0.63       294
weighted avg       0.81      0.74      0.77       294



<b>Using SMOTE </b>

In [208]:
# Oversample the minority class in the scaled training split.
sm = SMOTE(random_state = 17)
x_train_smote, y_train_smote = sm.fit_resample(x_train_scaled, y_train)
# NOTE(review): this model uses a linear kernel whereas the two SVMs above use 'rbf',
# so the comparison mixes a kernel change with the resampling change -- confirm intent.
svm_model2 = SVC(kernel = 'linear', C = 1.0, random_state = 32)

In [209]:
# Train on the resampled data, then evaluate on the original test split.
svm_model2.fit(x_train_smote, y_train_smote)
y_pred_svm2 = svm_model2.predict(x_test_scaled)
print("Classification report: ", classification_report(y_test, y_pred_svm2))

Classification report:                precision    recall  f1-score   support

           0       0.93      0.73      0.82       247
           1       0.34      0.72      0.46        47

    accuracy                           0.73       294
   macro avg       0.63      0.73      0.64       294
weighted avg       0.84      0.73      0.76       294



<h3>Decision Trees</h3>

<b>Vanilla decision tree </b>

In [210]:
# Baseline depth-limited tree, trained on the unscaled selected features.
tree_model = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 2)
tree_model.fit(x_train_selected, y_train)

In [211]:
# Evaluate the baseline tree on the test split.
y_pred_tree = tree_model.predict(x_test_selected)
print(f"Classification report: \n{classification_report(y_test, y_pred_tree)}")

Classification report: 
              precision    recall  f1-score   support

           0       0.85      0.96      0.90       247
           1       0.38      0.13      0.19        47

    accuracy                           0.83       294
   macro avg       0.61      0.54      0.55       294
weighted avg       0.78      0.83      0.79       294



<b>Using class weight </b>

In [212]:
# Class-weighted tree; note it is also shallower (max_depth=2) than the baseline (max_depth=4).
tree_model1 = DecisionTreeClassifier(criterion = 'gini', class_weight = 'balanced', max_depth = 2, random_state = 13)
tree_model1.fit(x_train_selected, y_train)

In [213]:
# Evaluate the class-weighted tree on the test split.
y_pred_tree1 = tree_model1.predict(x_test_selected)
print(f"Classification report: \n{classification_report(y_test, y_pred_tree1)}")

Classification report: 
              precision    recall  f1-score   support

           0       0.90      0.70      0.79       247
           1       0.27      0.60      0.37        47

    accuracy                           0.68       294
   macro avg       0.59      0.65      0.58       294
weighted avg       0.80      0.68      0.72       294



<b>Using SMOTE</b>

In [214]:
# Oversample the minority class for the tree model.
# NOTE(review): resampling runs on the unscaled features here; SMOTE interpolates
# between nearest neighbours, so large-range columns dominate the neighbour
# search -- confirm this is intended.
sm = SMOTE(random_state = 15)
x_train_smote, y_train_smote = sm.fit_resample(x_train_selected, y_train)

In [215]:
# Tree trained on the SMOTE-resampled data (no class weights).
tree_model2 = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state = 13)
tree_model2.fit(x_train_smote, y_train_smote)

In [216]:
# Evaluate the SMOTE-trained tree on the original test split.
y_pred_tree2 = tree_model2.predict(x_test_selected)
print("Classification report: ", classification_report(y_test, y_pred_tree2))

Classification report:                precision    recall  f1-score   support

           0       0.86      0.74      0.80       247
           1       0.21      0.36      0.27        47

    accuracy                           0.68       294
   macro avg       0.53      0.55      0.53       294
weighted avg       0.76      0.68      0.71       294



<h3>KNN</h3>

<b>Vanilla KNN </b>

In [217]:
# Baseline KNN on the scaled features (distance-based, so scaling matters).
knn_model = KNeighborsClassifier(n_neighbors = 10, metric = 'euclidean', weights = 'distance')
# weights='distance' lets closer neighbours count more in the vote; the author's
# rationale is that this helps on imbalanced data (not verified in this notebook).
knn_model.fit(x_train_scaled, y_train)

In [218]:
# Evaluate the baseline KNN on the test split.
y_pred_knn = knn_model.predict(x_test_scaled)
print(f"Classification report: \n{classification_report(y_test, y_pred_knn)}")

Classification report: 
              precision    recall  f1-score   support

           0       0.85      0.98      0.91       247
           1       0.50      0.09      0.15        47

    accuracy                           0.84       294
   macro avg       0.67      0.53      0.53       294
weighted avg       0.79      0.84      0.79       294



<b>SMOTE</b>

In [219]:
# Build the sampler in this cell instead of relying on whichever `sm` object an
# earlier section happened to leave in the kernel. The seed (15) matches the
# sampler this cell previously inherited, so the resampled data is unchanged.
sm = SMOTE(random_state = 15)
x_train_smote, y_train_smote = sm.fit_resample(x_train_scaled, y_train)

# Same KNN configuration as the baseline, trained on the resampled data.
knn_model1 = KNeighborsClassifier(n_neighbors = 10, metric = 'euclidean', weights='distance')
knn_model1.fit(x_train_smote, y_train_smote)

In [220]:
# Evaluate the SMOTE-trained KNN on the original test split.
y_pred_knn1 = knn_model1.predict(x_test_scaled)
print("Classification report: \n", classification_report(y_test, y_pred_knn1))

Classification report: 
               precision    recall  f1-score   support

           0       0.88      0.73      0.80       247
           1       0.26      0.49      0.34        47

    accuracy                           0.69       294
   macro avg       0.57      0.61      0.57       294
weighted avg       0.78      0.69      0.72       294



<h3>Bagging</h3>

In [221]:
# Bagging ensemble of 100 class-weighted decision trees.
bag = BaggingClassifier(
    estimator = DecisionTreeClassifier(class_weight = 'balanced'),
    n_estimators = 100,
    max_samples = 0.8,   # each tree is trained on a sample sized at 80% of the training data
    max_features = 1.0, # each tree sees all features
    bootstrap = True,   # samples are drawn with replacement
    random_state = 17,
    n_jobs = -1         # use all available CPU cores
)

In [222]:
# Train the bagging ensemble on the unscaled selected features.
bag.fit(x_train_selected, y_train)

In [223]:
# Evaluate the bagging ensemble on the test split.
y_pred_bag = bag.predict(x_test_selected)
print("Classification report: \n", classification_report(y_test, y_pred_bag))

Classification report: 
               precision    recall  f1-score   support

           0       0.85      0.98      0.91       247
           1       0.38      0.06      0.11        47

    accuracy                           0.83       294
   macro avg       0.61      0.52      0.51       294
weighted avg       0.77      0.83      0.78       294



<h3>Random Forest</h3>

In [224]:

# Class-weighted random forest used as a classifier on the selected features.
# NOTE: this rebinds `rf`, which earlier held the forest used for feature ranking.
rf = RandomForestClassifier(n_estimators=100, max_depth = 10, random_state = 13, n_jobs = -1, class_weight = 'balanced')
rf.fit(x_train_selected, y_train)

In [225]:
# Evaluate the class-weighted random forest on the test split.
y_pred_rf = rf.predict(x_test_selected)
print("Classification report: \n", classification_report(y_test, y_pred_rf))

Classification report: 
               precision    recall  f1-score   support

           0       0.85      0.98      0.91       247
           1       0.50      0.09      0.15        47

    accuracy                           0.84       294
   macro avg       0.67      0.53      0.53       294
weighted avg       0.79      0.84      0.79       294



<h3>Balanced Random Forest</h3>

In [226]:
# Balanced random forest from imbalanced-learn; fit returns the estimator itself,
# so construction and training are chained.
brf = BalancedRandomForestClassifier(n_estimators=200, random_state=42).fit(
    x_train_selected, y_train
)

# Evaluate on the test split.
y_pred_rf1 = brf.predict(x_test_selected)
print("Classification report: \n", classification_report(y_test, y_pred_rf1))

Classification report: 
               precision    recall  f1-score   support

           0       0.89      0.86      0.88       247
           1       0.38      0.45      0.41        47

    accuracy                           0.80       294
   macro avg       0.64      0.65      0.64       294
weighted avg       0.81      0.80      0.80       294



<h3>XGBoost</h3>

In [227]:
# Weight for the positive class: number of majority-class (0) training samples
# divided by the number of minority-class (1) training samples.
scale_pos_weight = int((y_train == 0).sum()) / int((y_train == 1).sum())

# Shallow, slowly-learning boosted trees with row and column subsampling.
xgb_model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    n_estimators=500,
    max_depth=2,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=41,
)

xgb_model.fit(x_train_selected, y_train)

In [228]:
# Evaluate the XGBoost model on the test split.
y_pred_xgb = xgb_model.predict(x_test_selected)
print("Classification report: \n", classification_report(y_test, y_pred_xgb))

Classification report: 
               precision    recall  f1-score   support

           0       0.90      0.82      0.86       247
           1       0.35      0.51      0.42        47

    accuracy                           0.77       294
   macro avg       0.63      0.67      0.64       294
weighted avg       0.81      0.77      0.79       294



In [229]:
# Positive-class probability: column 1 of the two-column binary output.
y_scores = xgb_model.predict_proba(x_test_selected)[:, 1]

# Area under the precision-recall curve, integrated with recall on the x-axis.
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
pr_auc = auc(recall, precision)
print(f"PR-AUC: {pr_auc}")

PR-AUC: 0.3277619104967166


<h3>Saving the logistic regression model trained with SMOTE</h3>

In [231]:

# Model persistence: joblib serialises Python objects to disk.
# log_model2 is the logistic regression trained on the SMOTE-resampled, scaled data.
joblib.dump(log_model2, "logistic_model.pkl")

# Persist the fitted StandardScaler as well: new data must be scaled with the
# same fitted parameters before prediction, otherwise the predictions will be wrong.
# The scaler was fitted on the `top_features` columns, so inference inputs need
# those same columns.
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']