# Import required packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,train_test_split,KFold
import sklearn.impute 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,auc,classification_report,confusion_matrix

# Import data 

In [2]:
df=pd.read_csv('Wisconsin_Breast_Cancer_Dataset.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


# Data validation

In [3]:
df.isnull().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

# Select input and Output data

In [4]:
X=df.iloc[:,2:32]
X.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
y=df['diagnosis']
y.head()

0    M
1    M
2    M
3    M
4    M
Name: diagnosis, dtype: object

# Replace non-numeric values to numeric

In [6]:
y = y.replace('M',1)
y = y.replace('B',0)
SEED = 1 # for reproducing

# Create Test and Train data 

In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=SEED,stratify=y)

In [8]:
X_train.shape # (455, 2)

(455, 30)

In [9]:
y_train.shape # (455,)

(455,)

# Create Model using criterion as entropy

DecisionTreeClassifier(
    ["criterion='gini'", "splitter='best'", 'max_depth=None', 'min_samples_split=2', 'min_samples_leaf=1', 'min_weight_fraction_leaf=0.0', 'max_features=None', 'random_state=None', 'max_leaf_nodes=None', 'min_impurity_decrease=0.0', 'min_impurity_split=None', 'class_weight=None', 'presort=False']

In [10]:
dt_entropy=DecisionTreeClassifier(criterion='entropy',max_depth=8,random_state=SEED)

In [11]:
# Fit dt_entropy to the training set
dt_entropy.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [12]:
# Use dt_entropy to predict test set labels
y_pred = dt_entropy.predict(X_test)

In [13]:
# Evaluate accuracy_entropy
accuracy_entropy = accuracy_score(y_test, y_pred)
accuracy_entropy

0.9298245614035088

# Create Model using criterion as gini

In [14]:
# Instantiate dt_gini, set 'gini' as the information criterion
dt_gini= DecisionTreeClassifier(max_depth=8, 
                                    criterion='gini', 
                                    random_state=SEED)

In [15]:
# Fit dt_entropy to the training set
dt_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [16]:
# Use dt_entropy to predict test set labels
y_pred_gini = dt_gini.predict(X_test)

In [17]:
# Evaluate accuracy_gini
accuracy_gini = accuracy_score(y_test, y_pred_gini)
accuracy_gini

0.9298245614035088

 # Create Model using KNN classifier

In [18]:
from sklearn.neighbors import KNeighborsClassifier

In [19]:
knn=KNeighborsClassifier()

In [20]:
# Fit knn to the training set
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [21]:
# Use knn to predict test set labels
y_pred_knn = knn.predict(X_test)

In [22]:
accuracy_knn=accuracy_score(y_test,y_pred_knn)
accuracy_knn

0.9210526315789473

# Create Model using Support vector classifier

In [23]:
from sklearn.svm import SVC

In [24]:
svc=SVC()

In [25]:
# Fit SVC to the training set
svc.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [26]:
# Use svc to predict test set labels
y_pred_svc = svc.predict(X_test)

In [27]:
accuracy_svc=accuracy_score(y_test,y_pred_svc)
accuracy_svc

0.631578947368421

# Create Model using Logistic regression classifier

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
lr=LogisticRegression()

In [31]:
lr.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [32]:
y_pred_lr=lr.predict(X_test)

In [33]:
accuracy_lr=accuracy_score(y_test,y_pred_lr)
accuracy_lr

0.9649122807017544

# Compare accuracy of diffrent models

In [34]:
# Print accuracy_entropy
print('Accuracy achieved by using entropy       : ', accuracy_entropy)
# Print accuracy_gini
print('Accuracy achieved by using gini          : ', accuracy_gini)
# Print accuracy_knn
print('Accuracy achieved by using knn           : ', accuracy_knn)
# Print accuracy_svc
print('Accuracy achieved by using svc           : ', accuracy_svc)
# Print accuracy_lr
print('Accuracy achieved by using lr            : ', accuracy_lr)

Accuracy achieved by using entropy       :  0.9298245614035088
Accuracy achieved by using gini          :  0.9298245614035088
Accuracy achieved by using knn           :  0.9210526315789473
Accuracy achieved by using svc           :  0.631578947368421
Accuracy achieved by using lr            :  0.9649122807017544


In [35]:
confusion_matrix(y_test,y_pred_lr)

array([[70,  2],
       [ 2, 40]], dtype=int64)