In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [14]:
df = pd.read_csv('wdbc.data',header=None).drop(0,axis=1)
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [15]:
df[1].value_counts() # B > M

B    357
M    212
Name: 1, dtype: int64

In [26]:
y_target = df[1].copy()
x_explanatory = df.drop(1,axis=1).copy()

LE = LabelEncoder()
y_target = LE.fit_transform(y_target)

In [27]:
x_train,x_test,y_train,y_test = train_test_split(x_explanatory,y_target,test_size=0.33,stratify=y_target,random_state=42)

In [28]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((381, 30), (188, 30), (381,), (188,))

## Directly applied Random Forest

In [29]:
from sklearn.metrics import classification_report

RFC = RandomForestClassifier()
RFC.fit(x_train,y_train)
y_predict = RFC.predict(x_test)

class_rep = classification_report(y_test,y_pred=y_predict)
print(class_rep)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       118
           1       1.00      0.90      0.95        70

    accuracy                           0.96       188
   macro avg       0.97      0.95      0.96       188
weighted avg       0.96      0.96      0.96       188



# Precision Score on Cross validation

In [30]:
from sklearn.model_selection import cross_val_score
RFC = RandomForestClassifier()    
cvs = cross_val_score(RFC,X=x_explanatory,y=y_target,cv=5,scoring='accuracy')
cvs.mean()

0.9560937742586555

## Extra Tree Feature Selection

In [31]:
from sklearn.ensemble import ExtraTreesClassifier
EX_T_C = ExtraTreesClassifier()
EX_T_C.fit(X=x_explanatory,y=y_target)
features = EX_T_C.feature_importances_

In [32]:
most_importants = []
for index in range(len(features)):
    if features[index] >= 0.03: ##Threshold
        most_importants.append(index)

most_importants = x_explanatory.columns[most_importants]

In [33]:
new_x = x_explanatory[most_importants].copy()
new_x

Unnamed: 0,2,4,5,8,9,22,24,25,28,29
0,17.99,122.80,1001.0,0.30010,0.14710,25.380,184.60,2019.0,0.7119,0.2654
1,20.57,132.90,1326.0,0.08690,0.07017,24.990,158.80,1956.0,0.2416,0.1860
2,19.69,130.00,1203.0,0.19740,0.12790,23.570,152.50,1709.0,0.4504,0.2430
3,11.42,77.58,386.1,0.24140,0.10520,14.910,98.87,567.7,0.6869,0.2575
4,20.29,135.10,1297.0,0.19800,0.10430,22.540,152.20,1575.0,0.4000,0.1625
...,...,...,...,...,...,...,...,...,...,...
564,21.56,142.00,1479.0,0.24390,0.13890,25.450,166.10,2027.0,0.4107,0.2216
565,20.13,131.20,1261.0,0.14400,0.09791,23.690,155.00,1731.0,0.3215,0.1628
566,16.60,108.30,858.1,0.09251,0.05302,18.980,126.70,1124.0,0.3403,0.1418
567,20.60,140.10,1265.0,0.35140,0.15200,25.740,184.60,1821.0,0.9387,0.2650


## Final Precision score with stratified kfold cross validation and ExtraTree Feature Selection

In [34]:
# Shuffling in this context means that the data is first randomly shuffled before splitting into test/train. 
# The random_state will allow the way in which the data is shuffled to be repeatable. 
# Without the shuffling switched on, the random_state has no meaning.

final_scores = []

for i in range(30):
    kfold = StratifiedKFold(n_splits=5,random_state=i,shuffle=True)
    RFC = RandomForestClassifier()    
    cvs = cross_val_score(RFC,X=new_x,y=y_target,cv=kfold,scoring='precision')
    final_scores.append(cvs.mean())
sum(final_scores) / len(final_scores)

0.9416157816692942

## Precision score with UnderSampling

In [None]:
# pip install -U imbalanced-learn

In [40]:
from imblearn.under_sampling import TomekLinks
tl = TomekLinks(sampling_strategy='majority')
x_under,y_under = tl.fit_resample(x_explanatory,y_target) # Whole Dataset

In [42]:
np.unique(y_target,return_counts=True)

(array([0, 1]), array([357, 212], dtype=int64))

In [41]:
np.unique(y_under,return_counts=True)

(array([0, 1]), array([345, 212], dtype=int64))

In [43]:
final_scores = []

for i in range(30):
    kfold = StratifiedKFold(n_splits=5,random_state=i,shuffle=True)
    RFC = RandomForestClassifier()    
    cvs = cross_val_score(RFC,X=x_under,y=y_under,cv=kfold,scoring='precision')
    final_scores.append(cvs.mean())
sum(final_scores) / len(final_scores)

0.9584558929318374

## Precision score with OverSampling

In [49]:
from imblearn.over_sampling import SMOTE
tl = SMOTE(sampling_strategy='minority')
x_over,y_over = tl.fit_resample(x_explanatory,y_target) # Whole Dataset

In [50]:
np.unique(y_target,return_counts=True)

(array([0, 1]), array([357, 212], dtype=int64))

In [51]:
np.unique(y_over,return_counts=True)

(array([0, 1]), array([357, 357], dtype=int64))

In [52]:
final_scores = []

for i in range(30):
    kfold = StratifiedKFold(n_splits=5,random_state=i,shuffle=True)
    RFC = RandomForestClassifier()    
    cvs = cross_val_score(RFC,X=x_over,y=y_over,cv=kfold,scoring='precision')
    final_scores.append(cvs.mean())
sum(final_scores) / len(final_scores)

0.9680371415207734