In [250]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

from imblearn.over_sampling import SMOTE

In [140]:
df = pd.read_csv("emp_attr.csv")
df.iloc[0]

Age                                      41
Attrition                               Yes
BusinessTravel                Travel_Rarely
DailyRate                              1102
Department                            Sales
DistanceFromHome                          1
Education                                 2
EducationField                Life Sciences
EmployeeCount                             1
EmployeeNumber                            1
EnvironmentSatisfaction                   2
Gender                               Female
HourlyRate                               94
JobInvolvement                            3
JobLevel                                  2
JobRole                     Sales Executive
JobSatisfaction                           4
MaritalStatus                        Single
MonthlyIncome                          5993
MonthlyRate                           19479
NumCompaniesWorked                        8
Over18                                    Y
OverTime                        

<h3>No nulls</h3>

In [141]:
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [142]:
df['Attrition'] = df['Attrition'].map({'Yes':1, 'No':0})

In [143]:
df['Attrition'].value_counts()

Attrition
0    1233
1     237
Name: count, dtype: int64

In [144]:
x = df.drop('Attrition', axis = 1)
y = df['Attrition']

In [145]:
x.iloc[0]

Age                                      41
BusinessTravel                Travel_Rarely
DailyRate                              1102
Department                            Sales
DistanceFromHome                          1
Education                                 2
EducationField                Life Sciences
EmployeeCount                             1
EmployeeNumber                            1
EnvironmentSatisfaction                   2
Gender                               Female
HourlyRate                               94
JobInvolvement                            3
JobLevel                                  2
JobRole                     Sales Executive
JobSatisfaction                           4
MaritalStatus                        Single
MonthlyIncome                          5993
MonthlyRate                           19479
NumCompaniesWorked                        8
Over18                                    Y
OverTime                                Yes
PercentSalaryHike               

In [146]:
x_dum = pd.get_dummies(x, drop_first = True)
x_dum.iloc[0]

Age                                     41
DailyRate                             1102
DistanceFromHome                         1
Education                                2
EmployeeCount                            1
EmployeeNumber                           1
EnvironmentSatisfaction                  2
HourlyRate                              94
JobInvolvement                           3
JobLevel                                 2
JobSatisfaction                          4
MonthlyIncome                         5993
MonthlyRate                          19479
NumCompaniesWorked                       8
PercentSalaryHike                       11
PerformanceRating                        3
RelationshipSatisfaction                 1
StandardHours                           80
StockOptionLevel                         0
TotalWorkingYears                        8
TrainingTimesLastYear                    0
WorkLifeBalance                          1
YearsAtCompany                           6
YearsInCurr

In [147]:
x_dum

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1102,1,2,1,1,2,94,3,2,...,False,False,False,False,False,True,False,False,True,True
1,49,279,8,1,1,2,3,61,2,2,...,False,False,False,False,True,False,False,True,False,False
2,37,1373,2,2,1,4,4,92,2,1,...,True,False,False,False,False,False,False,False,True,True
3,33,1392,3,4,1,5,4,56,3,1,...,False,False,False,False,True,False,False,True,False,True
4,27,591,2,1,1,7,1,40,3,1,...,True,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,1,2061,3,41,4,2,...,True,False,False,False,False,False,False,True,False,False
1466,39,613,6,1,1,2062,4,42,2,3,...,False,False,False,False,False,False,False,True,False,False
1467,27,155,4,3,1,2064,2,87,4,2,...,False,False,True,False,False,False,False,True,False,True
1468,49,1023,2,3,1,2065,4,63,2,2,...,False,False,False,False,False,True,False,True,False,False


<h3>Train test split</h3>

In [148]:
x_train, x_test, y_train, y_test = train_test_split(x_dum, y, test_size=0.2, random_state = 13, stratify = y)

<h1 style="text-align:center">Applying filter methods</h1>

<h3>Remove constant/ Near zero variance features</h3>

In [149]:
var_thresh = VarianceThreshold(threshold=0.01)
var_thresh

In [150]:
x_train_var = var_thresh.fit_transform(x_train)

# keep selected column names
selected_columns = x_train.columns[var_thresh.get_support()]
selected_columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gender_Male',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRol

In [151]:
x_train = pd.DataFrame(x_train_var, columns = selected_columns)
x_test = x_test[selected_columns]

<h3>Remove highly correlated features</h3>

In [152]:
corr_matrix = x_train.corr().abs()

upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))
upper_triangle

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
Age,,0.011338,0.001763,0.228089,0.019947,0.013282,0.016154,0.010363,0.50446,0.009671,...,0.147759,0.314973,0.056771,0.16168,0.143499,0.00783,0.16829,0.07736,0.11691,0.003952
DailyRate,,,0.006832,0.026838,0.0901,0.005643,0.022838,0.060803,0.007091,0.041458,...,0.008269,0.001377,0.006556,0.010598,0.010941,6.8e-05,0.000515,0.042193,0.08703,0.003441
DistanceFromHome,,,,0.031479,0.023361,0.020562,0.027698,0.002331,0.002824,0.001089,...,0.000868,0.053916,0.00852,0.020136,0.010649,0.048612,0.017345,0.055916,0.035344,0.017636
Education,,,,,0.053322,0.01818,0.026359,0.052229,0.104825,0.012196,...,0.057647,0.037849,0.002246,0.039386,0.004544,0.064088,0.107629,0.005089,0.00717,0.026418
EmployeeNumber,,,,,,0.006275,0.032822,0.012523,0.026284,0.045048,...,0.012419,0.053222,0.024675,0.001245,0.01163,0.004676,0.003633,0.051559,0.040615,0.022549
EnvironmentSatisfaction,,,,,,,0.051591,0.005085,0.007013,0.005798,...,0.009204,0.006922,0.071119,0.051176,0.018061,0.050736,0.004485,0.010959,0.014528,0.091775
HourlyRate,,,,,,,,0.063191,0.034001,0.060195,...,0.010124,0.000322,0.007355,0.013827,0.01449,0.012909,0.018077,0.018384,0.023307,0.001389
JobInvolvement,,,,,,,,,0.021249,0.011693,...,0.034202,0.022668,0.02519,0.006372,0.032985,0.006277,0.004165,0.035266,0.054772,0.01448
JobLevel,,,,,,,,,,0.002564,...,0.351337,0.576662,0.120957,0.390256,0.387166,0.124065,0.217155,0.04559,0.086895,0.01251
JobSatisfaction,,,,,,,,,,,...,0.014018,0.014935,0.010612,0.011892,0.00259,0.011171,0.013931,0.017157,0.039179,0.016653


In [153]:
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.85)]
to_drop           

['MonthlyIncome', 'Department_Sales']

In [154]:
x_train = x_train.drop(columns = to_drop)
x_test = x_test.drop(columns = to_drop)

print("Remaining features after filter: ", x_train.shape[1])

Remaining features after filter:  43


<h3>Random forest feature selection(embedded method)</h3>

In [155]:
rf = RandomForestClassifier(n_estimators = 200, random_state = 32)
rf.fit(x_train, y_train)

In [156]:
# extracting feature importance
importances = pd.Series(rf.feature_importances_, index = x_train.columns).sort_values(ascending = False)

importances.head(15)

Age                        0.072649
TotalWorkingYears          0.060034
OverTime_Yes               0.052813
DailyRate                  0.052542
EmployeeNumber             0.051168
HourlyRate                 0.050453
MonthlyRate                0.049877
YearsAtCompany             0.046162
DistanceFromHome           0.045836
YearsWithCurrManager       0.032100
PercentSalaryHike          0.031023
NumCompaniesWorked         0.030671
EnvironmentSatisfaction    0.028664
StockOptionLevel           0.027751
YearsInCurrentRole         0.026793
dtype: float64

In [157]:
# selecting top K features

top_k = 12
top_features = importances.head(top_k).index
top_features

Index(['Age', 'TotalWorkingYears', 'OverTime_Yes', 'DailyRate',
       'EmployeeNumber', 'HourlyRate', 'MonthlyRate', 'YearsAtCompany',
       'DistanceFromHome', 'YearsWithCurrManager', 'PercentSalaryHike',
       'NumCompaniesWorked'],
      dtype='object')

In [158]:
x_train_selected = x_train[top_features]
x_test_selected = x_test[top_features]

In [159]:
y_train.value_counts()

Attrition
0    986
1    190
Name: count, dtype: int64

<h3>Scaled input features</h3>

In [160]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_selected)
x_test_scaled = scaler.transform(x_test_selected)

<h3>Class imbalance</h3>
Use class_weight="balanced" first.

Evaluate recall for Attrition = Yes.

If recall is still low → try SMOTE.

<h3>Logistic regression</h3>

In [175]:
# using class_weight

log_model = LogisticRegression(class_weight = 'balanced',max_iter = 1000, solver = 'lbfgs')  # lbfgs is the default solver for logistic regression
log_model.fit(x_train_scaled, y_train)

In [174]:
y_pred_log = log_model.predict(x_test_scaled)
print("The accuracy of logistic regression model on test data is: ", accuracy_score(y_pred_log, y_test))
print("Classification report: \n", classification_report(y_test, y_pred_log))

The accuracy of logistic regression model on test data is:  0.6666666666666666
Classification report: 
               precision    recall  f1-score   support

           0       0.91      0.67      0.77       247
           1       0.27      0.66      0.39        47

    accuracy                           0.67       294
   macro avg       0.59      0.66      0.58       294
weighted avg       0.81      0.67      0.71       294



<b> Changing the probability threshold </b>

In [163]:
y_probs = log_model.predict_proba(x_test_scaled)
y_probs

array([[0.77353367, 0.22646633],
       [0.92745477, 0.07254523],
       [0.46845921, 0.53154079],
       [0.599538  , 0.400462  ],
       [0.37642212, 0.62357788],
       [0.48394757, 0.51605243],
       [0.79006974, 0.20993026],
       [0.36520736, 0.63479264],
       [0.89817224, 0.10182776],
       [0.61945467, 0.38054533],
       [0.5770938 , 0.4229062 ],
       [0.28989583, 0.71010417],
       [0.34406074, 0.65593926],
       [0.35066044, 0.64933956],
       [0.73907185, 0.26092815],
       [0.41198927, 0.58801073],
       [0.46788676, 0.53211324],
       [0.32974217, 0.67025783],
       [0.42451539, 0.57548461],
       [0.90542493, 0.09457507],
       [0.77880767, 0.22119233],
       [0.8099248 , 0.1900752 ],
       [0.35424211, 0.64575789],
       [0.41167202, 0.58832798],
       [0.47986891, 0.52013109],
       [0.25015488, 0.74984512],
       [0.66477125, 0.33522875],
       [0.65688495, 0.34311505],
       [0.55440108, 0.44559892],
       [0.40034396, 0.59965604],
       [0.

In [164]:
log_model.classes_  # 0(stayed in the job) -> 1st column ,  1(quit the job) -> 2nd column

array([0, 1])

In [176]:
y_probs_quit = y_probs[:, 1]
threshold = 0.4
y_pred = (y_probs_quit >= threshold).astype(int)
y_pred

array([0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1])

In [177]:
# after changing threshold,
print("Classfication report: \n", classification_report(y_test, y_pred))

Classfication report: 
               precision    recall  f1-score   support

           0       0.94      0.49      0.64       247
           1       0.23      0.83      0.37        47

    accuracy                           0.54       294
   macro avg       0.59      0.66      0.50       294
weighted avg       0.83      0.54      0.60       294




In HR attrition, missing someone who is about to resign is worse than flagging a loyal employee as probable to resign. Thus 
Recall is more important than Precision

<h3>Trying out with SMOTE</h3>

In [171]:
# apply smote only to training data
sm = SMOTE(random_state = 32)

x_train_smote, y_train_smote = sm.fit_resample(x_train_scaled, y_train)

log_model2 = LogisticRegression(max_iter = 1000)
log_model2.fit(x_train_smote, y_train_smote)

In [172]:
pred = log_model2.predict(x_test_scaled)
print("Classification report: \n", classification_report(y_test, pred))

Classification report: 
               precision    recall  f1-score   support

           0       0.91      0.67      0.77       247
           1       0.27      0.66      0.39        47

    accuracy                           0.67       294
   macro avg       0.59      0.66      0.58       294
weighted avg       0.81      0.67      0.71       294



<h3>SVM</h3>

<b>Vanilla SVM </b>

In [178]:
svm_model = SVC(kernel = 'rbf', C = 1.0, random_state = 31)
svm_model.fit(x_train_scaled, y_train)

In [179]:
y_pred_svm = svm_model.predict(x_test_scaled)
print(f"Classification report: \n{classification_report(y_test, y_pred_svm)}")

# the warning arises because, the model predicted no one is going to leave

Classification report: 
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       247
           1       0.00      0.00      0.00        47

    accuracy                           0.83       294
   macro avg       0.42      0.50      0.45       294
weighted avg       0.70      0.83      0.76       294



<b>Using class_weight</b>

In [182]:
svm_model1 = SVC(kernel = 'rbf', class_weight = 'balanced', C = 1.0, random_state = 19)
svm_model1.fit(x_train_scaled, y_train)

In [183]:
y_pred_svm1 = svm_model1.predict(x_test_scaled)
print(f"Classification report: \n{classification_report(y_test, y_pred_svm1)}")

Classification report: 
              precision    recall  f1-score   support

           0       0.90      0.77      0.83       247
           1       0.30      0.53      0.39        47

    accuracy                           0.73       294
   macro avg       0.60      0.65      0.61       294
weighted avg       0.80      0.73      0.76       294



<b>Using SMOTE </b>

In [189]:
sm = SMOTE(random_state = 17)
x_train_smote, y_train_smote = sm.fit_resample(x_train_scaled, y_train)
svm_model2 = SVC(kernel = 'linear', C = 1.0, random_state = 32)

In [190]:
svm_model2.fit(x_train_smote, y_train_smote)
y_pred_svm2 = svm_model2.predict(x_test_scaled)
print("Classification report: ", classification_report(y_test, y_pred_svm2))

Classification report:                precision    recall  f1-score   support

           0       0.88      0.74      0.81       247
           1       0.27      0.49      0.35        47

    accuracy                           0.70       294
   macro avg       0.58      0.62      0.58       294
weighted avg       0.79      0.70      0.73       294



<h3>Decision Trees</h3>

<b>Vanilla decision tree </b>

In [201]:
tree_model = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 2)
tree_model.fit(x_train_selected, y_train)

In [202]:
y_pred_tree = tree_model.predict(x_test_selected)
print(f"Classification report: \n{classification_report(y_test, y_pred_tree)}")

Classification report: 
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       247
           1       0.47      0.19      0.27        47

    accuracy                           0.84       294
   macro avg       0.67      0.58      0.59       294
weighted avg       0.80      0.84      0.81       294



<b>Using class weight </b>

In [208]:
tree_model1 = DecisionTreeClassifier(criterion = 'gini', class_weight = 'balanced', max_depth = 2, random_state = 13)
tree_model1.fit(x_train_selected, y_train)

In [209]:
y_pred_tree1 = tree_model1.predict(x_test_selected)
print(f"Classification report: \n{classification_report(y_test, y_pred_tree1)}")

Classification report: 
              precision    recall  f1-score   support

           0       0.90      0.70      0.79       247
           1       0.27      0.60      0.37        47

    accuracy                           0.68       294
   macro avg       0.59      0.65      0.58       294
weighted avg       0.80      0.68      0.72       294



<b>Using SMOTE</b>

In [211]:
sm = SMOTE(random_state = 15)
x_train_smote, y_train_smote = sm.fit_resample(x_train_selected, y_train)

In [218]:
tree_model2 = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state = 13)
tree_model2.fit(x_train_smote, y_train_smote)

In [219]:
y_pred_tree2 = tree_model2.predict(x_test_selected)
print("Classification report: ", classification_report(y_test, y_pred_tree2))

Classification report:                precision    recall  f1-score   support

           0       0.88      0.84      0.86       247
           1       0.33      0.40      0.36        47

    accuracy                           0.77       294
   macro avg       0.60      0.62      0.61       294
weighted avg       0.79      0.77      0.78       294



<h3>KNN</h3>

<b>Vanilla KNN </b>

In [233]:
knn_model = KNeighborsClassifier(n_neighbors = 10, metric = 'euclidean', weights = 'distance')
# weights = distance -> this is better for imbalanced datasets
knn_model.fit(x_train_scaled, y_train)

In [234]:
y_pred_knn = knn_model.predict(x_test_scaled)
print(f"Classification report: \n{classification_report(y_test, y_pred_knn)}")

Classification report: 
              precision    recall  f1-score   support

           0       0.85      0.98      0.91       247
           1       0.33      0.06      0.11        47

    accuracy                           0.83       294
   macro avg       0.59      0.52      0.51       294
weighted avg       0.76      0.83      0.78       294



<b>SMOTE</b>

In [236]:
x_train_smote, y_train_smote = sm.fit_resample(x_train_scaled, y_train)
knn_model1 = KNeighborsClassifier(n_neighbors = 10, metric = 'euclidean', weights='distance')
knn_model1.fit(x_train_smote, y_train_smote)

In [237]:
y_pred_knn1 = knn_model1.predict(x_test_scaled)
print("Classification report: \n", classification_report(y_test, y_pred_knn1))

Classification report: 
               precision    recall  f1-score   support

           0       0.90      0.72      0.80       247
           1       0.28      0.57      0.38        47

    accuracy                           0.69       294
   macro avg       0.59      0.65      0.59       294
weighted avg       0.80      0.69      0.73       294



<h3>Bagging</h3>

In [239]:
bag = BaggingClassifier(
    estimator = DecisionTreeClassifier(class_weight = 'balanced'),
    n_estimators = 100,
    max_samples = 0.8,   # each tree sees 80% of the training data
    max_features = 1.0, # each tree sees complete features
    bootstrap = True,
    random_state = 17,
    n_jobs = -1
)

In [241]:
bag.fit(x_train_selected, y_train)

In [242]:
y_pred_bag = bag.predict(x_test_selected)
print("Classification report: \n", classification_report(y_test, y_pred_bag))

Classification report: 
               precision    recall  f1-score   support

           0       0.85      0.96      0.90       247
           1       0.31      0.09      0.13        47

    accuracy                           0.82       294
   macro avg       0.58      0.52      0.52       294
weighted avg       0.76      0.82      0.78       294



<h3>Random Forest</h3>

In [244]:

rf = RandomForestClassifier(n_estimators=100, max_depth = 10, random_state = 13, n_jobs = -1, class_weight = 'balanced')
rf.fit(x_train_selected, y_train)

In [245]:
y_pred_rf = rf.predict(x_test_selected)
print("Classification report: \n", classification_report(y_test, y_pred_rf))

Classification report: 
               precision    recall  f1-score   support

           0       0.85      0.96      0.90       247
           1       0.36      0.11      0.16        47

    accuracy                           0.83       294
   macro avg       0.60      0.53      0.53       294
weighted avg       0.77      0.83      0.79       294



<h3>Balanced Random Forest</h3>

In [248]:
brf = BalancedRandomForestClassifier(
    n_estimators=200,
    random_state=42
)

brf.fit(x_train_selected, y_train)
y_pred_rf1 = brf.predict(x_test_selected)
print("Classification report: \n", classification_report(y_test, y_pred_rf1))

Classification report: 
               precision    recall  f1-score   support

           0       0.88      0.85      0.87       247
           1       0.33      0.38      0.36        47

    accuracy                           0.78       294
   macro avg       0.61      0.62      0.61       294
weighted avg       0.79      0.78      0.78       294



<h3>XGBoost</h3>

In [269]:
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
#  ratio of no.of majority class samples to the no. of minority class samples


xgb_model = XGBClassifier(
    n_estimators = 500,
    learning_rate = 0.05,
    max_depth = 2,
    subsample = 0.8,
    colsample_bytree = 0.8,    
    random_state = 41,
    scale_pos_weight = scale_pos_weight
)

xgb_model.fit(x_train_selected, y_train)

In [270]:
y_pred_xgb = xgb_model.predict(x_test_selected)
print("Classification report: \n", classification_report(y_test, y_pred_xgb))

Classification report: 
               precision    recall  f1-score   support

           0       0.89      0.82      0.86       247
           1       0.34      0.49      0.40        47

    accuracy                           0.77       294
   macro avg       0.62      0.66      0.63       294
weighted avg       0.81      0.77      0.78       294

