## 0. Data Preparation

In [None]:
from pandas import read_csv, DataFrame, get_dummies, Series
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# Load the raw customer-churn table.
# NOTE(review): absolute Colab path; adjust when running outside Colab.
CC = read_csv('/content/Customer_Churn.csv')

# Encode the target and the ordered categorical columns as integers.
# Target: 1 = existing customer, 0 = attrited customer.
# NOTE(review): Series.map turns every label that is missing from these
# dictionaries into NaN; confirm the CSV contains no other category values.
CC['Attrition_Flag'] = CC['Attrition_Flag'].map({'Existing Customer':1, 'Attrited Customer':0})
CC['Gender'] = CC['Gender'].map({'M':0,'F':1})
CC['Education_Level'] = CC['Education_Level'].map({'Uneducated':0, 'High School':1, 'Graduate':2, 'Post-Graduate':3, 'Doctorate':4})
CC['Income_Category'] = CC['Income_Category'].map({'Less than $40K':0, '$40K - $60K':1, '$60K - $80K':2, '$80K - $120K':3, '$120K +':4})
CC['Card_Category'] = CC['Card_Category'].map({'Blue':0, 'Silver':1, 'Gold':2, 'Platinum':3})

# One-hot encode marital status. `columns=` must be passed by keyword: the
# second positional argument of get_dummies is `prefix`, not `columns`.
CC2 = get_dummies(CC, columns=['Marital_Status'], dtype=int)

# Feature matrix / target vector.
x = CC2.drop('Attrition_Flag', axis=1)
y = CC2['Attrition_Flag']

# Standardise all features; x_scaled is also consumed by the grid-search cells.
# NOTE(review): the scaler is fitted before the split, so test-set statistics
# leak into the training data; consider fitting it on the training split only.
x_scaled = StandardScaler().fit_transform(x)

# 80/20 hold-out split, then oversample the minority class in the training part only.
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=100)
# Seed SMOTE so that the resampled training set is reproducible between runs.
x_train, y_train = SMOTE(random_state=100).fit_resample(x_train, y_train)

## 1. Regular method (method #1) + Evaluation

In [None]:
from sklearn import ensemble
# Baseline random forest (method #1): 50 entropy-split trees fitted on the training split.
# max_features='sqrt' is what the deprecated 'auto' alias meant for classifiers;
# 'auto' triggers a FutureWarning and is rejected by scikit-learn >= 1.3.
RF_classifier1 = ensemble.RandomForestClassifier(n_estimators=50, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier1.fit(x_train, y_train)
# Predictions on the held-out test split.
y_pred1 = RF_classifier1.predict(x_test)

  warn(


In [None]:
from sklearn import metrics
# Recall of the baseline forest on the held-out test split.
recall = metrics.recall_score(y_true=y_test, y_pred=y_pred1)
print(recall)

0.9598086124401913


## 2. GridSearchCV (method #2)

In [None]:
# Random Forest Classifier (method 2)
from sklearn.model_selection import GridSearchCV
# Forest template; its number of trees is chosen by the grid search below.
RF_classifier2 = ensemble.RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=100)
# Candidate tree counts (every entry must be comma-separated).
no_trees = {'n_estimators': [545, 550, 560, 575]}
# 5-fold cross-validated search scored on recall, run on the full scaled
# data set (not SMOTE-resampled).
grid_search1 = GridSearchCV(estimator=RF_classifier2, param_grid=no_trees, scoring='recall', cv=5)
grid_search1.fit(x_scaled, y)
# Report the winning tree count and its mean cross-validated recall.
best_parameters = grid_search1.best_params_
print(best_parameters)
best_result = grid_search1.best_score_
print(best_result)

{'n_estimators': 550}
0.9729951196217254


## 3. Regular method with best parameter + Evaluation

In [None]:
# Refit the plain forest with the tree count reported by the section 2 grid
# search (550); the previous value was not among the searched candidates.
# max_features='sqrt' replaces the deprecated 'auto' alias (same behaviour for
# classifiers; 'auto' is rejected by scikit-learn >= 1.3).
RF_classifier3 = ensemble.RandomForestClassifier(n_estimators=550, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier3.fit(x_train, y_train)
Y_pred3 = RF_classifier3.predict(x_test)
# Rank the features by importance; list(x) yields the column names of x.
imp_features = Series(RF_classifier3.feature_importances_, index=list(x)).sort_values(ascending=False)
print(imp_features)

  warn(


Survival Months                            0.310992
Reginol Node Positive                      0.114524
Age                                        0.089236
Regional Node Examined                     0.086817
Tumor Size                                 0.086081
Progesterone Status                        0.034743
Marital Status_Married                     0.023976
N Stage_N1                                 0.020408
Marital Status_Single                      0.017038
T Stage_T1                                 0.014867
T Stage_T2                                 0.013684
Grade_3                                    0.013599
differentiate_Poorly differentiated        0.012501
Race_White                                 0.012277
Estrogen Status                            0.012055
Marital Status_Divorced                    0.011984
6th Stage_IIA                              0.011701
Race_Other                                 0.010246
N Stage_N3                                 0.009418
differentiat

In [None]:
# Test-split recall of the forest fitted in the previous cell.
recall = metrics.recall_score(y_true=y_test, y_pred=Y_pred3)
print(recall)

0.934402332361516


## 4. GridSearchCV with best features

In [None]:
# Keep only the five most important features. imp_features is sorted in
# descending order, so its first five index labels are the top-ranked columns
# of x. (The former `data2` frame is never defined in this notebook.)
top_features = list(imp_features.index[:5])
X2 = x[top_features]
X_scaled = StandardScaler().fit_transform(X2)

# Forest template; the number of trees is tuned by the grid search.
RF_classifier4 = ensemble.RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)
no_trees = {'n_estimators': [398, 399, 400, 401, 402]}
# 5-fold cross-validated search scored on recall, using the reduced feature set.
grid_search2 = GridSearchCV(estimator=RF_classifier4, param_grid=no_trees, scoring='recall', cv=5)
grid_search2.fit(X_scaled, y)

# Report the winning tree count and its mean cross-validated recall.
best_parameters = grid_search2.best_params_
print(best_parameters)
best_result = grid_search2.best_score_
print(best_result)


{'n_estimators': 399}
0.9738869439025756


## 5. Regular method with best parameter and best features + Evaluation

In [None]:
# Forest with 399 trees, the count reported by the section 4 grid search.
# max_features='sqrt' replaces the deprecated 'auto' alias (same behaviour for
# classifiers; 'auto' is rejected by scikit-learn >= 1.3).
# NOTE(review): the section title promises "best features", but x_train/x_test
# still hold every feature column; confirm whether the top-feature subset
# should be used here instead.
RF_classifier5 = ensemble.RandomForestClassifier(n_estimators=399, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier5.fit(x_train, y_train)
Y_pred4 = RF_classifier5.predict(x_test)
# Rank the features by importance; list(x) yields the column names of x.
imp_features = Series(RF_classifier5.feature_importances_, index=list(x)).sort_values(ascending=False)
print(imp_features)

  warn(


Survival Months                            0.310528
Reginol Node Positive                      0.114379
Age                                        0.089184
Regional Node Examined                     0.086894
Tumor Size                                 0.086148
Progesterone Status                        0.034608
Marital Status_Married                     0.023939
N Stage_N1                                 0.020382
Marital Status_Single                      0.017113
T Stage_T1                                 0.014944
T Stage_T2                                 0.013670
Grade_3                                    0.013566
differentiate_Poorly differentiated        0.012510
Race_White                                 0.012332
Estrogen Status                            0.012185
Marital Status_Divorced                    0.012076
6th Stage_IIA                              0.011853
Race_Other                                 0.010297
N Stage_N3                                 0.009193
Grade_2     

In [None]:
# Test-split recall of the 399-tree forest.
recall = metrics.recall_score(y_true=y_test, y_pred=Y_pred4)
print(recall)

0.9358600583090378


## 6. A balanced GridSearchCV (method #3)

In [None]:
# Using pipeline (method #3)
from imblearn.pipeline import Pipeline
# imblearn pipeline: SMOTE is applied inside each cross-validation fit, so only
# the training folds are resampled, followed by a random forest.
# max_features='sqrt' replaces the deprecated 'auto' alias (same behaviour for
# classifiers; 'auto' is rejected by scikit-learn >= 1.3).
RF_classifier6 = Pipeline([
    ('balancing', SMOTE(random_state = 101)),
    ('classification', ensemble.RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)),
])
# The 'classification__' prefix routes the parameter to the forest step.
no_trees = {'classification__n_estimators': [165, 166, 167, 168, 169]}
grid_search3 = GridSearchCV(estimator=RF_classifier6, param_grid=no_trees, scoring='recall', cv=5)
# Fit on x_scaled (all features). X_scaled holds the reduced feature subset
# built in the "best features" sections and belongs to those experiments.
grid_search3.fit(x_scaled, y)

# Report the winning tree count and its mean cross-validated recall.
best_parameters = grid_search3.best_params_
print(best_parameters)
best_result = grid_search3.best_score_
print(best_result)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


{'classification__n_estimators': 167}
0.936618996559312


## 7. Regular method with best parameter + Evaluation

In [None]:
# Random forest (method #1) with 167 trees, the count reported by the
# section 6 grid search.
# max_features='sqrt' replaces the deprecated 'auto' alias (same behaviour for
# classifiers; 'auto' is rejected by scikit-learn >= 1.3).
RF_classifier7 = ensemble.RandomForestClassifier(n_estimators=167, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier7.fit(x_train, y_train)
Y_pred5 = RF_classifier7.predict(x_test)
# Rank the features by importance; list(x) yields the column names of x.
imp_features = Series(RF_classifier7.feature_importances_, index=list(x)).sort_values(ascending=False)
print(imp_features)

  warn(


Survival Months                            0.311367
Reginol Node Positive                      0.113635
Age                                        0.089391
Regional Node Examined                     0.086476
Tumor Size                                 0.086046
Progesterone Status                        0.035835
Marital Status_Married                     0.023576
N Stage_N1                                 0.020656
Marital Status_Single                      0.017321
T Stage_T1                                 0.015866
T Stage_T2                                 0.014104
Grade_3                                    0.012628
Race_White                                 0.012126
Estrogen Status                            0.011859
differentiate_Poorly differentiated        0.011830
Marital Status_Divorced                    0.011432
6th Stage_IIA                              0.010845
Race_Other                                 0.010211
Grade_2                                    0.009582
differentiat

In [None]:
# Test-split recall of the 167-tree forest.
recall = metrics.recall_score(y_true=y_test, y_pred=Y_pred5)
print(recall)

0.9402332361516035


## 8. A balanced GridSearchCV with best features

In [None]:
# Pipeline (method #3) restricted to the most important features.
# imp_features is sorted in descending order, so its first five index labels
# are the top-ranked columns of x. (The former `data2` frame is never defined
# in this notebook.)
top_features = list(imp_features.index[:5])
X3 = x[top_features]
X_scaled = StandardScaler().fit_transform(X3)

# NOTE(review): this rebinds RF_classifier7, which previously referred to the
# fitted forest from section 7; consider a distinct name for the pipeline.
# max_features='sqrt' replaces the deprecated 'auto' alias (same behaviour for
# classifiers; 'auto' is rejected by scikit-learn >= 1.3).
RF_classifier7 = Pipeline([
    ('balancing', SMOTE(random_state = 101)),
    ('classification', ensemble.RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)),
])
# The 'classification__' prefix routes the parameter to the forest step.
no_trees = {'classification__n_estimators': [191, 192, 193, 194, 195]}
grid_search4 = GridSearchCV(estimator=RF_classifier7, param_grid=no_trees, scoring='recall', cv=5)
grid_search4.fit(X_scaled, y)

# Report the winning tree count and its mean cross-validated recall.
best_parameters = grid_search4.best_params_
print(best_parameters)
best_result = grid_search4.best_score_
print(best_result)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


{'classification__n_estimators': 193}
0.9339784084988008


## 9. Regular method with best parameter and best features + Evaluation

In [None]:
# Forest with 193 trees, the count reported by the section 8 grid search.
# max_features='sqrt' replaces the deprecated 'auto' alias (same behaviour for
# classifiers; 'auto' is rejected by scikit-learn >= 1.3).
# NOTE(review): the section title promises "best features", but x_train/x_test
# still hold every feature column; confirm whether the top-feature subset
# should be used here instead.
RF_classifier8 = ensemble.RandomForestClassifier(n_estimators=193, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier8.fit(x_train, y_train)
Y_pred8 = RF_classifier8.predict(x_test)
# Rank the features by importance; list(x) yields the column names of x.
imp_features = Series(RF_classifier8.feature_importances_, index=list(x)).sort_values(ascending=False)
print(imp_features)


  warn(


Survival Months                            0.310597
Reginol Node Positive                      0.113613
Age                                        0.089165
Regional Node Examined                     0.087057
Tumor Size                                 0.085630
Progesterone Status                        0.035590
Marital Status_Married                     0.023568
N Stage_N1                                 0.020583
Marital Status_Single                      0.017110
T Stage_T1                                 0.015491
T Stage_T2                                 0.014121
Grade_3                                    0.013022
Estrogen Status                            0.012174
Race_White                                 0.012116
differentiate_Poorly differentiated        0.011782
Marital Status_Divorced                    0.011619
6th Stage_IIA                              0.011380
Race_Other                                 0.010281
N Stage_N3                                 0.010234
Grade_2     

In [None]:
# Test-split recall of the 193-tree forest.
recall = metrics.recall_score(y_true=y_test, y_pred=Y_pred8)
print(recall)

0.9402332361516035


## 10. Reasoning behind choice of Evaluation Metric

Evaluation Metric: Recall<br>
<br>
The target feature, i.e. Dead/Alive, was encoded as follows: 'Dead':0, 'Alive':1 <br>
<br>
It is preferable to eliminate false negatives, i.e. cases where the model incorrectly predicts that a person will die when in fact they will survive. <br>
<br>
Hence, the model was tuned so that recall is maximized and the greatest possible number of survivors is correctly identified.