# Predicting Breast Cancer by modelling on UCI Breast Cancer Dataset

In [11]:
# Importing packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn import tree, linear_model, neighbors
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# setting OS directory
os.chdir('C:\\Users\\rckar\\OneDrive\\Documents\\MSBA\\Fall Semester\\6420 Predictive Analytics\\HW1')

In [3]:
# Reading Data
df = pd.read_csv("wdbc.data", header = None)

In [4]:
# Renaming column names
col_names = ['id','diagnosis',
                 'radius_mean','texture_mean','perimeter_mean','area_mean',
                 'smoothness_mean','compactness_mean','concavity_mean',
                 'concave points_mean','symmetry_mean','fractal_dimension_mean',
                 'radius_se','texture_se','perimeter_se','area_se',
                 'smoothness_se','compactness_se','concavity_se','concave points_se',
                 'symmetry_se','fractal_dimension_se','radius_worst','texture_worst',
                 'perimeter_worst','area_worst','smoothness_worst','compactness_worst',
                 'concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst']

df.columns = col_names

In [5]:
# Data pre-processing

# checking for null values
df.isnull().values.any()

False

In [6]:
# Factorizing diagnosis as diagnosis class for modelling purpose
df['diagnosis_class'],class_names = pd.factorize(df['diagnosis'])
print(class_names)
df[['diagnosis','diagnosis_class']].head(n=2)

Index(['M', 'B'], dtype='object')


Unnamed: 0,diagnosis,diagnosis_class
0,M,0
1,M,0


In [7]:
# Checking for class imbalance
df['diagnosis_class'].value_counts()

1    357
0    212
Name: diagnosis_class, dtype: int64

We do not observe heavy class imbalace in the dataset 

##### Creating the Train and Test data

In [8]:
X_df = df.iloc[:,2:32]
y_df = df.iloc[:,32:]

In [9]:
X_train, X_test_holdout, y_train, y_test_holdout = train_test_split(X_df, y_df, test_size=0.25)

## DECISION TREE

###### Tuning the model using GridSearch Cross Validation to find the values of hyperparameters that best fit the model

In [12]:
# Hyper parameter tuning using GridSearch
param_set ={'max_depth': range(1,20), 'min_samples_split' : range(2,30), 'criterion' : ["gini", "entropy"]}
clf_DTree = tree.DecisionTreeClassifier()
grid_DTree = GridSearchCV(clf_DTree, param_grid = param_set, cv=10)
grid_DTree.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': range(1, 20), 'min_samples_split': range(2, 30), 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

##### Examining the best model

In [14]:
# examine the best model

print("Best score achieved across all parameters: ", grid_DTree.best_score_)

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print(grid_DTree.best_params_)

print(" ")
print("Best estimator")
print(grid_DTree.best_estimator_)

Best score achieved across all parameters:  0.9436619718309859
 
Best parameters
{'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 4}
 
Best estimator
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


##### cross - validation with the best hyperparameters

In [15]:
clf_Dtree_best = grid_DTree.best_estimator_

scores = cross_val_score(clf_Dtree_best, X_train, y_train, cv=10)

print("Below are the scores for each model run")
print(scores)

print(" ")
print("Mean Accuracy and variance: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Below are the scores for each model run
[0.93181818 0.90909091 0.88372093 0.95348837 0.95238095 0.9047619
 0.97619048 0.9047619  0.95238095 0.97619048]
 
Mean Accuracy and variance: 0.93 (+/- 0.06)


##### Fitting the Train data and Prediction on test data

In [16]:
clf_Dtree_best.fit(X_train,y_train)

y_pred = clf_Dtree_best.predict(X_test_holdout)
print("Decision Tree : accuracy on test data is ",round(accuracy_score(y_test_holdout, y_pred)*100,2),"%")
print(" ")
print("Decision Tree : Confusion Matrix")
print(" ")
print(confusion_matrix(y_test_holdout, y_pred))
print(" ")
print("Decision Tree : Classification Report")
print(classification_report(y_test_holdout, y_pred))

Decision Tree : accuracy on test data is  95.8 %
 
Decision Tree : Confusion Matrix
 
[[57  1]
 [ 5 80]]
 
Decision Tree : Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        58
           1       0.99      0.94      0.96        85

   micro avg       0.96      0.96      0.96       143
   macro avg       0.95      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143



## Logistic Regression

###### Tuning the model using GridSearch Cross Validation to find the values of hyperparameters that best fit the model

In [17]:
param_set ={'penalty' : ["l1","l2"], 'C':np.arange(0.2,1,0.1), 'class_weight': [None, 'balanced']}

clf_logistic = linear_model.LogisticRegression()

grid_logistic = GridSearchCV(clf_logistic, param_grid = param_set, cv=10)
grid_logistic.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]), 'class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

##### examine the best model

In [18]:
# Single best score achieved across all params
print("Best score achieved across all parameters: ", grid_logistic.best_score_)

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print(grid_logistic.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(" ")
print("Best estimator")
print(grid_logistic.best_estimator_)

Best score achieved across all parameters:  0.9530516431924883
 
Best parameters
{'C': 0.5000000000000001, 'class_weight': 'balanced', 'penalty': 'l1'}
 
Best estimator
LogisticRegression(C=0.5000000000000001, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)


##### cross - validation with the best hyperparameters

In [19]:
clf_logistic_best = grid_logistic.best_estimator_

scores = cross_val_score(clf_logistic_best, X_train, y_train, cv=10)

print("Below are the scores for each model run")
print(scores)

print(" ")
print("Mean Accuracy and variance: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Below are the scores for each model run
[0.90909091 1.         0.93023256 0.97674419 0.92857143 0.95238095
 0.97619048 0.92857143 0.95238095 0.97619048]
 
Mean Accuracy and variance: 0.95 (+/- 0.05)


##### Fitting the Train data and Prediction on test data

In [20]:
clf_logistic_best.fit(X_train,y_train)

y_pred = clf_logistic_best.predict(X_test_holdout)

print("Logistic Regression : accuracy on test data is ",round(accuracy_score(y_test_holdout, y_pred)*100,2),"%")
print(" ")
print("Logistic Regression : Confusion Matrix")
print(" ")
print(confusion_matrix(y_test_holdout, y_pred))
print(" ")
print("Logistic Regression : Classification Report")
print(classification_report(y_test_holdout, y_pred))

Logistic Regression : accuracy on test data is  97.2 %
 
Logistic Regression : Confusion Matrix
 
[[57  1]
 [ 3 82]]
 
Logistic Regression : Classification Report
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        58
           1       0.99      0.96      0.98        85

   micro avg       0.97      0.97      0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143



## KNN Classifier

##### Normalizing the input parameters

In [21]:
scaler = MinMaxScaler(feature_range=(0, 1))

In [22]:
x_train_scaled = scaler.fit_transform(X_train)
x_train_scaled = pd.DataFrame(x_train_scaled)

x_test_scaled = scaler.fit_transform(X_test_holdout)
x_test_scaled = pd.DataFrame(x_test_scaled)

###### Tuning the model using GridSearch Cross Validation to find the values of hyperparameters that best fit the model

In [23]:
param_set ={'n_neighbors': list(range(1,30)), 'weights': ["uniform", "distance"]}

clf_knn = neighbors.KNeighborsClassifier()

grid_knn = GridSearchCV(clf_knn, param_grid = param_set, cv=10)
grid_knn.fit(x_train_scaled,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

##### examine the best model

In [24]:
# Single best score achieved across all params
print("Best score achieved across all parameters: ", grid_knn.best_score_)

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print(grid_knn.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(" ")
print("Best estimator")
print(grid_knn.best_estimator_)

Best score achieved across all parameters:  0.9788732394366197
 
Best parameters
{'n_neighbors': 10, 'weights': 'uniform'}
 
Best estimator
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=10, p=2,
           weights='uniform')


##### cross - validation with the best hyperparameters

In [25]:
clf_knn_best = grid_knn.best_estimator_

scores = cross_val_score(clf_knn_best, x_train_scaled, y_train, cv=10)

print("Below are the scores for each model run")
print(scores)

print(" ")
print("Mean Accuracy and variance: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Below are the scores for each model run
[0.97727273 0.95454545 1.         0.97674419 0.95238095 0.95238095
 0.97619048 1.         1.         1.        ]
 
Mean Accuracy and variance: 0.98 (+/- 0.04)


##### Fitting the Train data and Prediction on test data

In [26]:
clf_knn_best.fit(X_train,y_train)


y_pred = clf_knn_best.predict(X_test_holdout)

print("KNN : accuracy on test data is ",round(accuracy_score(y_test_holdout, y_pred)*100,2),"%")
print(" ")
print("KNN : Confusion Matrix")
print(" ")
print(confusion_matrix(y_test_holdout, y_pred))
print(" ")
print("KNN : Classification Report")
print(classification_report(y_test_holdout, y_pred))

KNN : accuracy on test data is  93.71 %
 
KNN : Confusion Matrix
 
[[52  6]
 [ 3 82]]
 
KNN : Classification Report
              precision    recall  f1-score   support

           0       0.95      0.90      0.92        58
           1       0.93      0.96      0.95        85

   micro avg       0.94      0.94      0.94       143
   macro avg       0.94      0.93      0.93       143
weighted avg       0.94      0.94      0.94       143

