In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
data=pd.read_csv("breast_cancer_dataset.csv",na_values=['-100000'])
data

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitosis,class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
564,4,1,1,1,2,1.0,3,2,1,2
565,5,7,10,10,5,10.0,10,10,1,4
566,3,1,2,1,2,1.0,3,1,1,2
567,4,1,1,1,2,3.0,2,1,1,2


In [3]:
data.isna().sum()

clump_thickness                 0
uniformity_of_cell_size         0
uniformity_of_cell_shape        0
marginal_adhesion               0
single_epithelial_cell_size     0
bare_nuclei                    15
bland_chromatin                 0
normal_nucleoli                 0
mitosis                         0
class                           0
dtype: int64

In [4]:
data['bare_nuclei'].fillna(data['bare_nuclei'].mean(),inplace=True)

In [5]:
Y=data.pop('class')
X=data.iloc[:,:9]
Y

0      2
1      2
2      2
3      2
4      2
      ..
564    2
565    4
566    2
567    2
568    4
Name: class, Length: 569, dtype: int64

In [6]:
data.describe(include='all')

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitosis
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,4.539543,3.184534,3.265378,2.845343,3.29877,3.785199,3.490334,2.989455,1.637961
std,2.896501,3.002236,2.955935,2.873626,2.304775,3.68606,2.324925,3.091315,1.773941
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
50%,4.0,1.0,2.0,1.0,2.0,1.0,3.0,1.0,1.0
75%,6.0,5.0,5.0,4.0,4.0,8.0,5.0,4.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [7]:
from sklearn.preprocessing import StandardScaler
scalar=StandardScaler()
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=100)
scaled_X_train=scalar.fit_transform(X_train)
scaled_X_test=scalar.transform(X_test)

In [8]:
model=RandomForestClassifier(n_estimators=1000,oob_score=True,n_jobs=-1)
model.fit(scaled_X_train,Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [9]:
n_job_list=[1,-1]
n_estimators_list=[50,100,200,500,1000,2000,3000]
max_features_list=['auto','log2','sqrt',None,0.1,0.9]
min_samples_leaf_list=[1,2,3,4,5,6,7,8,9,10]

In [10]:
%%timeit
for n_job in n_job_list:
    model=RandomForestClassifier(n_estimators=1000,oob_score=True,n_jobs=n_job)

21.4 µs ± 87.7 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [13]:
for n_estimator in n_estimators_list:
    model=RandomForestClassifier(n_estimators=n_estimator,oob_score=True,n_jobs=-1)
    model.fit(scaled_X_train,Y_train)
    print(n_estimator,'trees')
    oob=model.oob_score_
    print('OOB:',oob)

50 trees
OOB: 0.967032967032967
100 trees
OOB: 0.9560439560439561
200 trees
OOB: 0.9626373626373627
500 trees
OOB: 0.9648351648351648
1000 trees
OOB: 0.9692307692307692
2000 trees
OOB: 0.9692307692307692
3000 trees
OOB: 0.967032967032967


In [14]:
for max_feature in max_features_list:
    model=RandomForestClassifier(n_estimators=500,oob_score=True,n_jobs=-1,max_features=max_feature)
    model.fit(scaled_X_train,Y_train)
    print(max_feature,'features')
    oob=model.oob_score_
    print('OOB:',oob)

auto features
OOB: 0.9648351648351648
log2 features
OOB: 0.967032967032967
sqrt features
OOB: 0.9604395604395605
None features
OOB: 0.9516483516483516
0.1 features
OOB: 0.967032967032967
0.9 features
OOB: 0.9538461538461539


In [15]:
for min_samples_leaves in min_samples_leaf_list:
    model=RandomForestClassifier(n_estimators=500,oob_score=True,n_jobs=-1,max_features='sqrt',min_samples_leaf=min_samples_leaves)
    model.fit(scaled_X_train,Y_train)
    print(min_samples_leaves,'leaves')
    oob=model.oob_score_
    print('OOB:',oob)

1 leaves
OOB: 0.967032967032967
2 leaves
OOB: 0.9648351648351648
3 leaves
OOB: 0.967032967032967
4 leaves
OOB: 0.967032967032967
5 leaves
OOB: 0.967032967032967
6 leaves
OOB: 0.9692307692307692
7 leaves
OOB: 0.9648351648351648
8 leaves
OOB: 0.9692307692307692
9 leaves
OOB: 0.9692307692307692
10 leaves
OOB: 0.9626373626373627


In [16]:
model_random=RandomForestClassifier(n_estimators=500,oob_score=True,n_jobs=-1,max_features='sqrt',min_samples_leaf=5)
model_random.fit(scaled_X_train,Y_train)
oob=model_random.oob_score_
print('OOB:',oob)

OOB: 0.9692307692307692


In [17]:
y_pred=model.predict(scaled_X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test,y_pred)

array([[61,  4],
       [ 1, 48]], dtype=int64)

In [31]:
clean_ups={4:1,2:0}
Y_test.replace(clean_ups,inplace=True)
Y_test

400    1
225    0
321    0
173    1
506    1
      ..
359    1
261    1
37     0
6      0
524    0
Name: class, Length: 114, dtype: int64

In [36]:
y_pred=pd.DataFrame(y_pred)
y_pred.replace(clean_ups,inplace=True)

In [38]:
from sklearn import metrics
print("Recall:",metrics.recall_score(Y_test,y_pred))
print("Precision:",metrics.precision_score(Y_test,y_pred))
print("Accuracy:",metrics.accuracy_score(Y_test,y_pred))
print("F-score:",metrics.f1_score(Y_test,y_pred))
print("Log-loss",metrics.log_loss(Y_test,y_pred))

Recall: 0.9795918367346939
Precision: 0.9230769230769231
Accuracy: 0.956140350877193
F-score: 0.9504950495049506
Log-loss 1.514886669862055
