In [1]:
import pandas as pd

# Column layout passed to read_csv: three leading fields, ten measurements repeated
# for the mean / stderr / worst groups, then two trailing fields (35 names in total).
nucleus_measurements = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                        'compactness', 'concavity', 'concave points', 'symmetry',
                        'fractal dimension']
wpbc_columns = (
    ['Id', 'Outcome', 'Time']
    + [f'{measurement}_{group}'
       for group in ('mean', 'stderr', 'worst')
       for measurement in nucleus_measurements]
    + ['Tumor size', 'Lymph node status']
)

# Names are supplied explicitly, so pandas treats every row of the file as data.
df = pd.read_csv("wpbc.data", names=wpbc_columns)


In [2]:
# Preview the first five rows to check that the supplied column names line up with the data.
df.head()

Unnamed: 0,Id,Outcome,Time,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,...,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal dimension_worst,Tumor size,Lymph node status
0,119513,N,31,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,5
1,8423,N,61,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,2
2,842517,N,116,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,0
3,843483,N,123,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,0
4,843584,R,27,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,0


In [3]:
# Summarize the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        198 non-null    int64  
 1   Outcome                   198 non-null    object 
 2   Time                      198 non-null    int64  
 3   radius_mean               198 non-null    float64
 4   texture_mean              198 non-null    float64
 5   perimeter_mean            198 non-null    float64
 6   area_mean                 198 non-null    float64
 7   smoothness_mean           198 non-null    float64
 8   compactness_mean          198 non-null    float64
 9   concavity_mean            198 non-null    float64
 10  concave points_mean       198 non-null    float64
 11  symmetry_mean             198 non-null    float64
 12  fractal dimension_mean    198 non-null    float64
 13  radius_stderr             198 non-null    float64
 14  texture_st

In [4]:
# Class balance of the target column before any cleaning ('N' vs 'R' labels).
df['Outcome'].value_counts()

N    151
R     47
Name: Outcome, dtype: int64

In [5]:
# Missing lymph-node values are encoded as the string '?', which makes pandas load the
# whole column as strings (object dtype). Coerce it to numeric: every non-numeric
# entry, including '?', becomes NaN so a later dropna() can remove those rows, and the
# remaining values become real numbers instead of numeric-looking strings.
df['Lymph node status'] = pd.to_numeric(df['Lymph node status'], errors='coerce')

In [6]:
# Discard every row that contains a missing value, then renumber the index from 0
# (the old index is dropped rather than kept as a column).
df = df.dropna().reset_index(drop=True)

In [7]:
# Encode the target labels as integers: 'N' -> 0, 'R' -> 1.
for outcome_label, outcome_code in (('N', 0), ('R', 1)):
    df.loc[df['Outcome'] == outcome_label, 'Outcome'] = outcome_code
# The column is still object dtype after the replacements; make it a true integer column.
df['Outcome'] = df['Outcome'].astype(int)

In [8]:
# Model inputs: the "_mean" and "_worst" variants of the ten measurements plus
# 'Tumor size' and 'Lymph node status'. The "_stderr" columns, 'Id' and 'Time'
# are not selected.
measurement_names = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                     'compactness', 'concavity', 'concave points', 'symmetry',
                     'fractal dimension']
feature_columns = (
    [f'{name}_mean' for name in measurement_names]
    + [f'{name}_worst' for name in measurement_names]
    + ['Tumor size', 'Lymph node status']
)
feature = df[feature_columns]

# Prediction target.
outcome = df['Outcome']

In [9]:
from sklearn.model_selection import train_test_split

# Split into train and test sets (default test size). Stratifying on the target keeps
# the class ratio the same in both parts; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    feature,
    outcome,
    stratify=outcome,
    random_state=42,
)
train_feature, test_feature, train_target, test_target = split_parts

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: random forest with default hyperparameters, a fixed seed, and
# n_jobs=-1 to use all available CPU cores.
clf = RandomForestClassifier(n_jobs=-1,random_state=42)

In [11]:
# Train the baseline forest on the original (non-resampled) training split.
clf.fit(train_feature, train_target)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [12]:
# Predict labels for the held-out test split with the baseline forest.
y_pred = clf.predict(test_feature)

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the baseline forest's predictions on the held-out split. Precision, recall and
# F1 use the scorers' default positive label (1).
for metric_label, metric_fn in (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
):
    print(metric_label, metric_fn(test_target, y_pred))
# labels=[1, 0] orders both rows (actual) and columns (predicted) as class 1, then class 0.
print(confusion_matrix(test_target, y_pred, labels=[1, 0]))

Accuracy : 0.7959183673469388
Precision : 1.0
Recall : 0.16666666666666666
F1 score : 0.2857142857142857
[[ 2 10]
 [ 0 37]]


In [14]:
# Class counts of the training target before SMOTE is applied.
print("Before oversampling", train_target.value_counts(), sep="\n")

Before oversampling
0    111
1     34
Name: Outcome, dtype: int64


In [15]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE, using the training split only; the test
# split is not resampled.
# SMOTE's n_jobs argument is deprecated since imbalanced-learn 0.10 and slated for
# removal. It only set the parallelism of the internal nearest-neighbour search, so
# leaving it out does not change the resampled data.
sm = SMOTE(random_state=42)
train_feature_oversampling, train_target_oversampling = sm.fit_resample(train_feature, train_target)

In [16]:
# Class counts of the training target after SMOTE.
print("After oversampling", train_target_oversampling.value_counts(), sep="\n")

After oversampling
0    111
1    111
Name: Outcome, dtype: int64


In [17]:
from sklearn.ensemble import RandomForestClassifier

# Second random forest (same settings as the baseline), to be trained on the
# oversampled training set.
clf2 = RandomForestClassifier(n_jobs=-1,random_state=42)

In [18]:
# Train on the SMOTE-oversampled training set.
clf2.fit(train_feature_oversampling, train_target_oversampling)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [19]:
# Predict on the original test split; this rebinds y_pred from the previous model.
y_pred = clf2.predict(test_feature)

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the predictions of the forest trained on oversampled data (clf2) against the
# untouched test split.
for metric_label, metric_fn in (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
):
    print(metric_label, metric_fn(test_target, y_pred))
# labels=[1, 0] orders both rows (actual) and columns (predicted) as class 1, then class 0.
print(confusion_matrix(test_target, y_pred, labels=[1, 0]))

Accuracy : 0.7142857142857143
Precision : 0.4166666666666667
Recall : 0.4166666666666667
F1 score : 0.4166666666666667
[[ 5  7]
 [ 7 30]]


In [21]:
# Class counts of the original training target, shown again as the undersampling baseline.
print("Before undersampling", train_target.value_counts(), sep="\n")

Before undersampling
0    111
1     34
Name: Outcome, dtype: int64


In [22]:
from imblearn.under_sampling import CondensedNearestNeighbour

# Undersample the training split with Condensed Nearest Neighbour (fixed seed, all cores).
cnn = CondensedNearestNeighbour(n_jobs=-1, random_state=42)
undersampled_parts = cnn.fit_resample(train_feature, train_target)
train_feature_undersampling, train_target_undersampling = undersampled_parts

In [23]:
# Class counts of the training target after undersampling.
print("After undersampling", train_target_undersampling.value_counts(), sep="\n")

After undersampling
0    44
1    34
Name: Outcome, dtype: int64


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Third random forest (same settings as the baseline), to be trained on the
# undersampled training set.
clf3 = RandomForestClassifier(n_jobs=-1,random_state=42)

In [25]:
# Train on the undersampled training set.
clf3.fit(train_feature_undersampling, train_target_undersampling)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [26]:
# Predict on the original test split; this rebinds y_pred from the previous model.
y_pred = clf3.predict(test_feature)

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the predictions of the forest trained on undersampled data (clf3) against the
# untouched test split.
for metric_label, metric_fn in (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
):
    print(metric_label, metric_fn(test_target, y_pred))
# labels=[1, 0] orders both rows (actual) and columns (predicted) as class 1, then class 0.
print(confusion_matrix(test_target, y_pred, labels=[1, 0]))

Accuracy : 0.6122448979591837
Precision : 0.29411764705882354
Recall : 0.4166666666666667
F1 score : 0.3448275862068966
[[ 5  7]
 [12 25]]


In [28]:
# Class counts of the original training target, shown again before the combined resampling.
print("Before Over-undersampling", train_target.value_counts(), sep="\n")

Before Over-undersampling
0    111
1     34
Name: Outcome, dtype: int64


In [29]:
from imblearn.under_sampling import CondensedNearestNeighbour

# Same Condensed Nearest Neighbour undersampling as in the earlier undersampling cell,
# recomputed here as the first stage of the combined under- then over-sampling.
cnn = CondensedNearestNeighbour(n_jobs=-1, random_state=42)
undersampled_parts = cnn.fit_resample(train_feature, train_target)
train_feature_undersampling, train_target_undersampling = undersampled_parts

In [30]:
from imblearn.over_sampling import SMOTE

# Second stage of the combined resampling: apply SMOTE to the already-undersampled
# training set so the two classes end up with equal counts.
# SMOTE's n_jobs argument is deprecated since imbalanced-learn 0.10 and slated for
# removal. It only set the parallelism of the internal nearest-neighbour search, so
# leaving it out does not change the resampled data.
# NOTE: this rebinds train_feature_oversampling / train_target_oversampling, replacing
# the SMOTE-only training set created earlier in the notebook. Every later cell that
# reads these names gets the undersampled-then-oversampled data.
sm = SMOTE(random_state=42)
train_feature_oversampling, train_target_oversampling = sm.fit_resample(train_feature_undersampling, train_target_undersampling)

In [31]:
# Class counts after undersampling followed by SMOTE.
print("After Over-undersampling", train_target_oversampling.value_counts(), sep="\n")

After Over-undersampling
0    44
1    44
Name: Outcome, dtype: int64


In [32]:
from sklearn.ensemble import RandomForestClassifier

# Fourth random forest (same settings as the baseline), to be trained on the
# undersampled-then-oversampled training set.
clf4 = RandomForestClassifier(n_jobs=-1,random_state=42)

In [33]:
# Train on the undersampled-then-oversampled set. The *_oversampling names were rebound
# by the most recent SMOTE call, so they no longer hold the SMOTE-only data.
clf4.fit(train_feature_oversampling, train_target_oversampling)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [34]:
# Predict on the original test split; this rebinds y_pred from the previous model.
y_pred = clf4.predict(test_feature)

In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the predictions of the forest trained on undersampled-then-oversampled data
# (clf4) against the untouched test split.
for metric_label, metric_fn in (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
):
    print(metric_label, metric_fn(test_target, y_pred))
# labels=[1, 0] orders both rows (actual) and columns (predicted) as class 1, then class 0.
print(confusion_matrix(test_target, y_pred, labels=[1, 0]))

Accuracy : 0.673469387755102
Precision : 0.4
Recall : 0.6666666666666666
F1 score : 0.5
[[ 8  4]
 [12 25]]


In [36]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees classifier with default hyperparameters, all CPU cores and a fixed seed,
# to be trained on the original training split.
clf5 = ExtraTreesClassifier(n_jobs=-1,random_state=42)

In [37]:
# Train the extra-trees model on the original (non-resampled) training split.
clf5.fit(train_feature, train_target)

ExtraTreesClassifier(n_jobs=-1, random_state=42)

In [38]:
# Predict on the original test split; this rebinds y_pred from the previous model.
y_pred = clf5.predict(test_feature)

In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the predictions of the extra-trees model trained on the original training
# split (clf5) against the test split.
for metric_label, metric_fn in (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
):
    print(metric_label, metric_fn(test_target, y_pred))
# labels=[1, 0] orders both rows (actual) and columns (predicted) as class 1, then class 0.
print(confusion_matrix(test_target, y_pred, labels=[1, 0]))

Accuracy : 0.7755102040816326
Precision : 0.6
Recall : 0.25
F1 score : 0.35294117647058826
[[ 3  9]
 [ 2 35]]


In [40]:
from sklearn.ensemble import ExtraTreesClassifier

# Second extra-trees classifier (same settings as clf5), to be trained on resampled data.
clf6 = ExtraTreesClassifier(n_jobs=-1,random_state=42)

In [41]:
# NOTE(review): train_feature_oversampling / train_target_oversampling were last assigned
# by the SMOTE call that ran on the undersampled data, so this trains on the
# undersampled-then-oversampled set, not the SMOTE-only set created earlier. Confirm
# that this is the intended training data.
clf6.fit(train_feature_oversampling, train_target_oversampling)

ExtraTreesClassifier(n_jobs=-1, random_state=42)

In [42]:
# Predict on the original test split; this rebinds y_pred from the previous model.
y_pred = clf6.predict(test_feature)

In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the predictions of the extra-trees model trained on resampled data (clf6)
# against the untouched test split.
for metric_label, metric_fn in (
    ("Accuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("F1 score :", f1_score),
):
    print(metric_label, metric_fn(test_target, y_pred))
# labels=[1, 0] orders both rows (actual) and columns (predicted) as class 1, then class 0.
print(confusion_matrix(test_target, y_pred, labels=[1, 0]))

Accuracy : 0.673469387755102
Precision : 0.4
Recall : 0.6666666666666666
F1 score : 0.5
[[ 8  4]
 [12 25]]
