##0. Data Preparation

In [5]:
from pandas import read_csv,DataFrame,get_dummies,Series
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
data1=read_csv('/content/Breast_Cancer.csv')
data1['Status']=data1['Status'].map({'Alive':1,'Dead':0})
data1['A Stage']=data1['A Stage'].map({'Regional':1,'Distant':0})
data1['Estrogen Status']=data1['Estrogen Status'].map({'Positive':1,'Negative':0})
data1['Progesterone Status']=data1['Progesterone Status'].map({'Positive':1,'Negative':0})
data2=get_dummies(data1,['Race','Marital Status','T Stage','N Stage','6th Stage','differentiate','Grade'],dtype=int)
x=data2.drop('Status', axis=1)
y=data2['Status']
x_scaled=StandardScaler().fit_transform(x)
x_train,x_test,y_train,y_test=train_test_split(x_scaled, y, test_size=0.20, random_state=100)
x_train,y_train=SMOTE().fit_resample(x_train,y_train)

##1. Regular method + Evaluation

In [6]:
from sklearn import ensemble
RF_classifier1 =ensemble.RandomForestClassifier(n_estimators=50, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier1.fit(x_train,y_train)
y_pred1=RF_classifier1.predict(x_test)

In [7]:
from sklearn import metrics
recall = metrics.recall_score(y_test, y_pred1)
print (recall)

0.9271137026239067


##2. GridSearchCV Method

In [8]:
# Random Forest Classifier (method 2)
from sklearn.model_selection import GridSearchCV
RF_classifier2 = ensemble.RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=100)
no_trees = {'n_estimators': [545, 550, 560, 575]}
grid_search1 = GridSearchCV(estimator=RF_classifier2, param_grid=no_trees, scoring='recall', cv=5)
grid_search1.fit(x_scaled, y)
best_parameters = grid_search1.best_params_
print(best_parameters)
best_result = grid_search1.best_score_
print(best_result)

{'n_estimators': 545}
0.980928512063939


##3. Regular method with best parameter + Evaluation

In [9]:
RF_classifier3 = ensemble.RandomForestClassifier(n_estimators=371, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier3.fit(x_train,y_train)
Y_pred3=RF_classifier3.predict(x_test)
imp_features = Series(RF_classifier3.feature_importances_, index=list(x)).sort_values(ascending=False)
print(imp_features)

Survival Months                            0.305610
Reginol Node Positive                      0.116660
Age                                        0.092761
Regional Node Examined                     0.086364
Tumor Size                                 0.085127
Progesterone Status                        0.034149
Marital Status_Married                     0.022949
N Stage_N1                                 0.020926
Marital Status_Single                      0.017263
Marital Status_Divorced                    0.013623
Race_White                                 0.013283
T Stage_T1                                 0.013102
T Stage_T2                                 0.012935
Race_Other                                 0.011476
6th Stage_IIA                              0.011402
N Stage_N3                                 0.011366
6th Stage_IIIC                             0.011166
differentiate_Poorly differentiated        0.011018
Grade_3                                    0.010809
Estrogen Sta

In [10]:
recall = metrics.recall_score(y_test, Y_pred3)
print (recall)

0.9314868804664723


##4. GridSearchCV with best features

In [11]:
X2 = data2[['Survival Months', 'Reginol Node Positive', 'Age', 'Regional Node Examined', 'Tumor Size']]
X_scaled = StandardScaler().fit_transform(X2)
# X_train, X_test, Y_train, Y_test = train_test_split( X_scaled, Y, test_size = 0.3, random_state = 100)# splitting
# X_train,Y_train =SMOTE (random_state = 100).fit_resample(X_train,Y_train)# balancing

RF_classifier4 = ensemble.RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)
no_trees = {'n_estimators': [398, 399, 400, 401, 402]}
grid_search2 = GridSearchCV(estimator=RF_classifier4, param_grid=no_trees, scoring='recall', cv=5)
grid_search2.fit(X_scaled, y) # training, testing , evaluation, ranking.

best_parameters = grid_search2.best_params_
print(best_parameters)
best_result = grid_search2.best_score_
print(best_result)


{'n_estimators': 399}
0.9738869439025756


##5. Regular method with best parameter and best features + Evaluation

In [12]:
RF_classifier5 = ensemble.RandomForestClassifier(n_estimators=399, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier5.fit(x_train,y_train)
Y_pred4=RF_classifier5.predict(x_test)
imp_features = Series(RF_classifier5.feature_importances_, index=list(x)).sort_values(ascending=False)
print(imp_features)

Survival Months                            0.304829
Reginol Node Positive                      0.116179
Age                                        0.092875
Regional Node Examined                     0.086586
Tumor Size                                 0.085460
Progesterone Status                        0.034023
Marital Status_Married                     0.022967
N Stage_N1                                 0.020924
Marital Status_Single                      0.017192
Marital Status_Divorced                    0.013721
Race_White                                 0.013239
T Stage_T1                                 0.013024
T Stage_T2                                 0.012915
Race_Other                                 0.011599
N Stage_N3                                 0.011432
6th Stage_IIA                              0.011430
6th Stage_IIIC                             0.011269
differentiate_Poorly differentiated        0.011033
Estrogen Status                            0.010972
Grade_3     

In [13]:
recall = metrics.recall_score(y_test, Y_pred4)
print (recall)

0.9285714285714286


##6. A balanced GridSearchCV

In [14]:
# Using pipeline (method #3)
from imblearn.pipeline import Pipeline
RF_classifier6 = Pipeline([('balancing', SMOTE(random_state = 101)),('classification', ensemble.RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1))]) # building classifier
no_trees = {'classification__n_estimators': [165, 166, 167, 168, 169]}
grid_search3 = GridSearchCV(estimator=RF_classifier6, param_grid=no_trees, scoring='recall', cv=5)
grid_search3.fit(X_scaled, y)

best_parameters = grid_search3.best_params_
print(best_parameters)
best_result = grid_search3.best_score_
print(best_result)

{'classification__n_estimators': 167}
0.936618996559312


##7. Regular method with best parameter + Evaluation

In [15]:
# Building random forest (method #1 ) with the best number of trees
RF_classifier7 = ensemble.RandomForestClassifier(n_estimators=167, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier7.fit(x_train,y_train)
Y_pred5=RF_classifier7.predict(x_test)
imp_features = Series(RF_classifier7.feature_importances_, index=list(x)).sort_values(ascending=False)
print(imp_features)

Survival Months                            0.305564
Reginol Node Positive                      0.116129
Age                                        0.093151
Regional Node Examined                     0.086708
Tumor Size                                 0.085443
Progesterone Status                        0.035195
Marital Status_Married                     0.023420
N Stage_N1                                 0.019971
Marital Status_Single                      0.017295
Marital Status_Divorced                    0.013412
Race_White                                 0.013128
T Stage_T2                                 0.012828
T Stage_T1                                 0.012620
6th Stage_IIIC                             0.011873
6th Stage_IIA                              0.011520
Race_Other                                 0.011158
Grade_3                                    0.011030
differentiate_Poorly differentiated        0.010840
N Stage_N3                                 0.010544
Estrogen Sta

In [16]:
recall = metrics.recall_score(y_test, Y_pred5)
print (recall)

0.934402332361516


##8. A balanced GridSearchCV with best features

In [17]:
# # Using pipeline (method #3) using the most important features
X3 = data2[['Survival Months', 'Reginol Node Positive','Age','Tumor Size','Regional Node Examined']]
X_scaled = StandardScaler().fit_transform(X3)

RF_classifier7 = Pipeline([('balancing', SMOTE(random_state = 101)),('classification', ensemble.RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1) )])
no_trees = {'classification__n_estimators': [191, 192, 193, 194, 195]}
grid_search4 = GridSearchCV(estimator=RF_classifier7, param_grid=no_trees, scoring='recall', cv=5)
grid_search4.fit(X_scaled, y)

best_parameters = grid_search4.best_params_
print(best_parameters)
best_result = grid_search4.best_score_
print(best_result)

{'classification__n_estimators': 193}
0.9339784084988008


##9. Regular method with best parameter and best features + Evaluation

In [18]:
RF_classifier8 = ensemble.RandomForestClassifier(n_estimators=193, criterion='entropy', max_features='sqrt', random_state=1)
RF_classifier8.fit(x_train,y_train)
Y_pred8=RF_classifier8.predict(x_test)
imp_features = Series(RF_classifier8.feature_importances_, index=list(x)).sort_values(ascending=False)
print(imp_features)


Survival Months                            0.305474
Reginol Node Positive                      0.116102
Age                                        0.092993
Regional Node Examined                     0.086592
Tumor Size                                 0.085347
Progesterone Status                        0.034480
Marital Status_Married                     0.023294
N Stage_N1                                 0.020823
Marital Status_Single                      0.017083
Marital Status_Divorced                    0.013456
Race_White                                 0.013209
T Stage_T2                                 0.012902
T Stage_T1                                 0.012866
N Stage_N3                                 0.012297
Race_Other                                 0.011323
6th Stage_IIA                              0.011180
6th Stage_IIIC                             0.011035
Grade_3                                    0.010836
differentiate_Poorly differentiated        0.010802
Estrogen Sta

In [19]:
recall = metrics.recall_score(y_test, Y_pred8)
print (recall)

0.9358600583090378


##10. Reasoning behind choice of Evaluation Metric

Evaluation Metric: Recall<br>
<br>
The target feature, i.e. Dead/Alive, was encoded as follows: 'Dead':0, 'Alive':1 <br>
<br>
It is preferrable to eliminate false negatives, i.e. incorrectly predict that a person will die when in fact they will survive. <br>
<br>
Hence, the model was tuned such that the recall is maximized and most number of survivors are correctly identified.