In [1]:
import pandas as pd

# Column layout of wdbc.data: an id, the diagnosis label, then three groups of
# ten measurement columns suffixed _mean, _stderr and _worst.
column_names = (
    ['Id', 'Outcome']
    # *_mean group
    + ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean',
       'symmetry_mean', 'fractal dimension_mean']
    # *_stderr group
    + ['radius_stderr', 'texture_stderr', 'perimeter_stderr', 'area_stderr',
       'smoothness_stderr', 'compactness_stderr', 'concavity_stderr', 'concave points_stderr',
       'symmetry_stderr', 'fractal dimension_stderr']
    # *_worst group
    + ['radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal dimension_worst']
)

# The names are supplied explicitly via names=, so pandas does not take them
# from the first line of the file.
df = pd.read_csv("wdbc.data", names=column_names)


In [2]:
# Preview the first rows to check that the supplied column names line up with the data.
df.head()

Unnamed: 0,Id,Outcome,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Summarise dtype and non-null count for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        569 non-null    int64  
 1   Outcome                   569 non-null    object 
 2   radius_mean               569 non-null    float64
 3   texture_mean              569 non-null    float64
 4   perimeter_mean            569 non-null    float64
 5   area_mean                 569 non-null    float64
 6   smoothness_mean           569 non-null    float64
 7   compactness_mean          569 non-null    float64
 8   concavity_mean            569 non-null    float64
 9   concave points_mean       569 non-null    float64
 10  symmetry_mean             569 non-null    float64
 11  fractal dimension_mean    569 non-null    float64
 12  radius_stderr             569 non-null    float64
 13  texture_stderr            569 non-null    float64
 14  perimeter_

In [4]:
# Class balance of the raw 'Outcome' labels before they are encoded as integers.
df['Outcome'].value_counts()

B    357
M    212
Name: Outcome, dtype: int64

In [5]:
# Encode the diagnosis label in place: 'B' becomes 0 and 'M' becomes 1.
for raw_label, encoded in (('B', 0), ('M', 1)):
    df.loc[df['Outcome'] == raw_label, 'Outcome'] = encoded
# The column was loaded as object dtype; cast it so it is a proper integer target.
df['Outcome'] = df['Outcome'].astype(int)

In [6]:
# Model inputs are the *_mean and *_worst measurement columns.
# NOTE(review): the *_stderr columns loaded from the file are not selected here;
# confirm that leaving them out is intentional.
mean_features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
                 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean',
                 'symmetry_mean', 'fractal dimension_mean']
worst_features = ['radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
                  'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst',
                  'symmetry_worst', 'fractal dimension_worst']

feature = df[mean_features + worst_features]
# Target vector: the integer-encoded diagnosis.
outcome = df['Outcome']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out a test split. stratify=outcome splits on the label so both parts keep
# the class ratio; random_state=42 pins the shuffle. No test_size is given, so
# the library default applies.
train_feature, test_feature, train_target, test_target = train_test_split(feature, outcome, random_state=42, stratify=outcome)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters; the seed is fixed for repeatability.
clf = RandomForestClassifier(n_jobs=-1,random_state=42)

In [9]:
# Train the baseline on the original (un-resampled) training split.
clf.fit(train_feature, train_target)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [10]:
# Predict labels for the held-out test split. The name y_pred is reassigned by
# each later model's prediction cell.
y_pred = clf.predict(test_feature)

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the baseline random forest's predictions against the held-out labels.
scorers = (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
)
for caption, scorer in scorers:
    print(caption, scorer(test_target, y_pred))
# labels=[1,0] puts class 1 in the first row/column and class 0 in the second.
print(confusion_matrix(test_target, y_pred, labels=[1,0]))

Accuracy : 0.9790209790209791
Precision : 1.0
Recall : 0.9433962264150944
F1 score : 0.970873786407767
[[50  3]
 [ 0 90]]


In [12]:
# Class counts of the training split before any resampling.
print("Before oversampling", train_target.value_counts(), sep="\n")

Before oversampling
0    267
1    159
Name: Outcome, dtype: int64


In [13]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; the test split is left untouched.
# n_jobs is deliberately not passed: imbalanced-learn deprecated that argument on
# SMOTE in 0.10 and later releases reject it. It only parallelised the neighbour
# search, so dropping it does not change the resampled data.
sm = SMOTE(random_state=42)
train_feature_oversampling, train_target_oversampling = sm.fit_resample(train_feature, train_target)

In [14]:
# Class counts of the SMOTE-resampled training labels.
print("After oversampling", train_target_oversampling.value_counts(), sep="\n")

After oversampling
0    267
1    267
Name: Outcome, dtype: int64


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Second random forest, configured identically to clf, to be trained on the oversampled data.
clf2 = RandomForestClassifier(n_jobs=-1,random_state=42)

In [16]:
# Train on the SMOTE-oversampled training data.
clf2.fit(train_feature_oversampling, train_target_oversampling)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [17]:
# Evaluate on the same, un-resampled test split; overwrites the previous y_pred.
y_pred = clf2.predict(test_feature)

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the oversampling-trained random forest on the held-out labels.
scorers = (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
)
for caption, scorer in scorers:
    print(caption, scorer(test_target, y_pred))
# labels=[1,0] puts class 1 in the first row/column and class 0 in the second.
print(confusion_matrix(test_target, y_pred, labels=[1,0]))

Accuracy : 0.965034965034965
Precision : 1.0
Recall : 0.9056603773584906
F1 score : 0.9504950495049505
[[48  5]
 [ 0 90]]


In [19]:
# Class counts of the original training split, shown again for comparison.
print("Before undersampling", train_target.value_counts(), sep="\n")

Before undersampling
0    267
1    159
Name: Outcome, dtype: int64


In [20]:
from imblearn.under_sampling import CondensedNearestNeighbour

# Undersample the original training split with Condensed Nearest Neighbour.
# random_state fixes the sampler's seed; the test split is not resampled.
cnn = CondensedNearestNeighbour(random_state=42, n_jobs=-1)
train_feature_undersampling, train_target_undersampling = cnn.fit_resample(train_feature, train_target)

In [21]:
# Class counts of the undersampled training labels.
print("After undersampling", train_target_undersampling.value_counts(), sep="\n")

After undersampling
1    159
0     36
Name: Outcome, dtype: int64


In [22]:
from sklearn.ensemble import RandomForestClassifier

# Third random forest, same configuration, to be trained on the undersampled data.
clf3 = RandomForestClassifier(n_jobs=-1,random_state=42)

In [23]:
# Train on the CondensedNearestNeighbour-undersampled training data.
clf3.fit(train_feature_undersampling, train_target_undersampling)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [24]:
# Evaluate on the same, un-resampled test split; overwrites the previous y_pred.
y_pred = clf3.predict(test_feature)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the undersampling-trained random forest on the held-out labels.
scorers = (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
)
for caption, scorer in scorers:
    print(caption, scorer(test_target, y_pred))
# labels=[1,0] puts class 1 in the first row/column and class 0 in the second.
print(confusion_matrix(test_target, y_pred, labels=[1,0]))

Accuracy : 0.951048951048951
Precision : 0.896551724137931
Recall : 0.9811320754716981
F1 score : 0.9369369369369369
[[52  1]
 [ 6 84]]


In [26]:
# Class counts of the original training split, shown again for comparison.
print("Before Over-undersampling", train_target.value_counts(), sep="\n")

Before Over-undersampling
0    267
1    159
Name: Outcome, dtype: int64


In [27]:
from imblearn.under_sampling import CondensedNearestNeighbour

# First stage of the combined approach: undersample with Condensed Nearest Neighbour.
# This repeats the earlier undersampling cell with the same inputs and seed, and
# rebinds cnn and the *_undersampling names.
cnn = CondensedNearestNeighbour(random_state=42, n_jobs=-1)
train_feature_undersampling, train_target_undersampling = cnn.fit_resample(train_feature, train_target)

In [28]:
from imblearn.over_sampling import SMOTE

# Second stage of the combined approach: run SMOTE on the already-undersampled data.
# n_jobs is deliberately not passed: imbalanced-learn deprecated that argument on
# SMOTE in 0.10 and later releases reject it. It only parallelised the neighbour
# search, so dropping it does not change the resampled data.
# NOTE: this rebinds train_feature_oversampling / train_target_oversampling, which
# the earlier SMOTE-only cell also assigned. Every later cell that reads those
# names gets this CNN -> SMOTE result.
sm = SMOTE(random_state=42)
train_feature_oversampling, train_target_oversampling = sm.fit_resample(train_feature_undersampling, train_target_undersampling)

In [29]:
# Class counts after undersampling followed by SMOTE.
print("After Over-undersampling", train_target_oversampling.value_counts(), sep="\n")

After Over-undersampling
0    159
1    159
Name: Outcome, dtype: int64


In [30]:
from sklearn.ensemble import RandomForestClassifier

# Fourth random forest, same configuration, for the combined under- then over-sampled data.
clf4 = RandomForestClassifier(n_jobs=-1,random_state=42)

In [31]:
# Train on the CNN -> SMOTE data (the *_oversampling names were rebound by the SMOTE cell just above).
clf4.fit(train_feature_oversampling, train_target_oversampling)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [32]:
# Evaluate on the same, un-resampled test split; overwrites the previous y_pred.
y_pred = clf4.predict(test_feature)

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the random forest trained on the combined under/over-sampled data.
scorers = (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
)
for caption, scorer in scorers:
    print(caption, scorer(test_target, y_pred))
# labels=[1,0] puts class 1 in the first row/column and class 0 in the second.
print(confusion_matrix(test_target, y_pred, labels=[1,0]))

Accuracy : 0.9090909090909091
Precision : 0.8225806451612904
Recall : 0.9622641509433962
F1 score : 0.8869565217391304
[[51  2]
 [11 79]]


In [34]:
from sklearn.ensemble import ExtraTreesClassifier

# Switch model family: ExtraTrees with default hyper-parameters and the same fixed seed.
clf5 = ExtraTreesClassifier(n_jobs=-1,random_state=42)

In [35]:
# Train ExtraTrees on the original (un-resampled) training split.
clf5.fit(train_feature, train_target)

ExtraTreesClassifier(n_jobs=-1, random_state=42)

In [36]:
# Evaluate on the same, un-resampled test split; overwrites the previous y_pred.
y_pred = clf5.predict(test_feature)

In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the ExtraTrees baseline on the held-out labels.
scorers = (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
)
for caption, scorer in scorers:
    print(caption, scorer(test_target, y_pred))
# labels=[1,0] puts class 1 in the first row/column and class 0 in the second.
print(confusion_matrix(test_target, y_pred, labels=[1,0]))

Accuracy : 0.9790209790209791
Precision : 1.0
Recall : 0.9433962264150944
F1 score : 0.970873786407767
[[50  3]
 [ 0 90]]


In [38]:
from sklearn.ensemble import ExtraTreesClassifier

# Second ExtraTrees model, same configuration, to be trained on resampled data.
clf6 = ExtraTreesClassifier(n_jobs=-1,random_state=42)

In [39]:
# NOTE(review): at this point train_feature_oversampling / train_target_oversampling
# hold the CondensedNearestNeighbour -> SMOTE result (they were rebound in the
# over-undersampling section), not the SMOTE-only data from the first oversampling
# cell. Confirm which training set this ExtraTrees run is meant to use.
clf6.fit(train_feature_oversampling, train_target_oversampling)

ExtraTreesClassifier(n_jobs=-1, random_state=42)

In [40]:
# Evaluate on the same, un-resampled test split; overwrites the previous y_pred.
y_pred = clf6.predict(test_feature)

In [41]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the resampling-trained ExtraTrees model on the held-out labels.
scorers = (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
)
for caption, scorer in scorers:
    print(caption, scorer(test_target, y_pred))
# labels=[1,0] puts class 1 in the first row/column and class 0 in the second.
print(confusion_matrix(test_target, y_pred, labels=[1,0]))

Accuracy : 0.9440559440559441
Precision : 0.8813559322033898
Recall : 0.9811320754716981
F1 score : 0.9285714285714285
[[52  1]
 [ 7 83]]
