In [1]:
from ucimlrepo import fetch_ucirepo 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
  
# fetch dataset 
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
  
# data (as pandas dataframes) 
df_features = breast_cancer_wisconsin_original.data.features 
df_targets = breast_cancer_wisconsin_original.data.targets 

## Prediction Goal: 
### To create a prediction model to find what patients are more likely to develop malignant breast cancer. (1989 - 1991)

| Variable Name | Role | Type | Description |
|-----------|:-----------:|:----------:|------------:|
| Sample_code_number | ID | Categorical | ID Number |
| Clump_Thickness | Feature | Integer | 1 - 10 |
| Size_Cell_Uniformity | Feature | Integer | 1 - 10 |
| Shape_Cell_Uniformity | Feature | Integer | 1 - 10 |
| Marginal_Adhesion | Feature | Integer | 1 - 10 |
| Single_Epithelial_Cell_Size | Feature |  Integer | 1 - 10|
| Bare_Nuclei | Feature | Integer | 1 - 10 |
| Bland_Chromatin | Feature | Integer | 1 - 10 |
| Normal_Nuclei | Feature | Integer | 1 - 10 |
| Mitoses | Feature | Integer | 1 - 10 |
| Class | Target | Binary | 2 = Benign, 4 = Malignant |


In [2]:
df_targets.value_counts()
# Checking how many benign vs malignant targets there are 

Class
2        458
4        241
Name: count, dtype: int64

In [3]:
# Concatenate features and targets dataframe into one 
df = pd.concat([df_targets, df_features], axis = 1)
df.head(8)

Unnamed: 0,Class,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses
0,2,5,1,1,1,2,1.0,3,1,1
1,2,5,4,4,5,7,10.0,3,2,1
2,2,3,1,1,1,2,2.0,3,1,1
3,2,6,8,8,1,3,4.0,3,7,1
4,2,4,1,1,3,2,1.0,3,1,1
5,4,8,10,10,8,7,10.0,9,7,1
6,2,1,1,1,1,2,10.0,3,1,1
7,2,2,1,2,1,2,1.0,3,1,1


In [4]:
# Check for any null values, and there are so we have to find out what null values there are
df.isnull().values.any()

True

In [5]:
# We know that Bare_nuclei has null values
df.isnull().sum()
# We will impute later after splitting data

Class                           0
Clump_thickness                 0
Uniformity_of_cell_size         0
Uniformity_of_cell_shape        0
Marginal_adhesion               0
Single_epithelial_cell_size     0
Bare_nuclei                    16
Bland_chromatin                 0
Normal_nucleoli                 0
Mitoses                         0
dtype: int64

In [6]:
class_map = {2:1, 4:0}

In [7]:
# Check ratio of benign vs malignant cancer

num_obs = len(df)
benign_num = len(df.loc[df['Class'] == 2])
malig_num = len(df.loc[df['Class'] == 4])

print("Number of Benign Cases: {0} ({1:2.2f}%)".format(benign_num, (benign_num / (benign_num + malig_num)) * 100))
print("Number of Malignant Cases: {0} ({1:2.2f}%)".format(malig_num, (malig_num / (benign_num + malig_num)) * 100))

Number of Benign Cases: 458 (65.52%)
Number of Malignant Cases: 241 (34.48%)


## Splitting the data into Training and Test sets

In [8]:
# Using scikit-learn to split the data

from sklearn.model_selection import train_test_split

feature_column_names = ["Clump_thickness", "Uniformity_of_cell_size", "Uniformity_of_cell_shape", "Marginal_adhesion", "Single_epithelial_cell_size", "Bare_nuclei", "Bland_chromatin", "Normal_nucleoli", "Mitoses"]
predicted_class_name = ["Class"]

X = df.loc[:, feature_column_names]
y = df.loc[:, predicted_class_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 35)

X_train.shape, y_train.shape
X_test.shape , y_test.shape

((140, 9), (140, 1))

In [9]:
# Checking data splitting

print("{0:0.2f}% of features data in training set".format((len(X_train)/len(df.index))*100))
print("{0:0.2f}% of features data in test set".format((len(X_test)/len(df.index))*100))
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


79.97% of features data in training set
20.03% of features data in test set
(559, 9)
(140, 9)
(559, 1)
(140, 1)


## Splitting the Training data into Validation set

In [9]:
# Splitting for a validation set of data

from sklearn.model_selection import train_test_split
feature_column_names = ["Clump_thickness", "Uniformity_of_cell_size", "Uniformity_of_cell_shape", "Marginal_adhesion", "Single_epithelial_cell_size", "Bare_nuclei", "Bland_chromatin", "Normal_nucleoli", "Mitoses"]
predicted_class_name = ["Class"]

x = df[feature_column_names]
y = df[predicted_class_name]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 35)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size = 0.1, random_state = 35)


#And checking split for training/ test/ validation sets
print("{0:0.2f}% of features data in training set".format((len(x_train)/len(df.index))*100))
print("{0:0.2f}% of features data in test set".format((len(x_test)/len(df.index))*100))
print("{0:0.2f}% of features data in validation set".format((len(x_valid)/len(df.index))*100))


71.96% of features data in training set
20.03% of features data in test set
8.01% of features data in validation set


In [10]:
#Check if the percentages of 2 and 4 are similar between sets

print("Original Benign: {0} ({1:0.2f}%)".format(len(df.loc[df["Class"] == 2]), (len(df.loc[df["Class"] == 2])/len(df.index))*100 ))
print("Original Malignant: {0} ({1:0.2f}%)".format(len(df.loc[df['Class'] == 4]), (len(df.loc[df['Class'] == 4])/len(df.index)) * 100))
print("")

print("Training Benign  : {0} ({1:0.2f}%)".format(len(y_train[y_train['Class'] == 2]), (len(y_train[y_train['Class'] == 2])/len(y_train) * 100.0)))
print("Training Malignant : {0} ({1:0.2f}%)".format(len(y_train[y_train['Class'] == 4]), (len(y_train[y_train['Class'] == 4])/len(y_train) * 100.0)))

print("")

print("Test Benign  : {0} ({1:0.2f}%)".format(len(y_test[y_test['Class'] == 2]), (len(y_test[y_test['Class'] == 2])/len(y_test)) * 100.0))
print("Test Malignant : {0} ({1:0.2f}%)".format(len(y_test[y_test['Class'] == 4]), (len(y_test[y_test['Class'] == 4])/len(y_test)) * 100.0))
print("")
print("Valid Benign  : {0} ({1:0.2f}%)".format(len(y_valid[y_valid['Class'] == 2]), (len(y_valid[y_valid['Class'] == 2])/len(y_valid)) * 100.0))
print("Valid Malignant : {0} ({1:0.2f}%)".format(len(y_valid[y_valid['Class'] == 4]), (len(y_valid[y_valid['Class']== 4])/len(y_valid)) * 100.0))

Original Benign: 458 (65.52%)
Original Malignant: 241 (34.48%)

Training Benign  : 328 (65.21%)
Training Malignant : 175 (34.79%)

Test Benign  : 91 (65.00%)
Test Malignant : 49 (35.00%)

Valid Benign  : 39 (69.64%)
Valid Malignant : 17 (30.36%)


In [14]:
# Impute missing data in Bare Nuclei
from sklearn.impute import SimpleImputer

bare_nuclei_impute = SimpleImputer(missing_values = np.nan, strategy = "mean")

x_train = bare_nuclei_impute.fit_transform(x_train)
x_test = bare_nuclei_impute.fit_transform(x_test)
x_valid = bare_nuclei_impute.fit_transform(x_valid)

# Splitting and Preprocessing data completed (for now)

# Training model on a Logistic Regression algorithm

In [48]:
from sklearn.linear_model import LogisticRegressionCV

lrcv_model = LogisticRegressionCV(Cs = 1, fit_intercept = False, random_state = 35, solver = "liblinear", cv = 10, class_weight = 'balanced', n_jobs = -1)

lrcv_model.fit(x_train, y_train.values.ravel())

In [43]:
import sklearn.metrics as metrics

lrcv_predict_test = lrcv_model.predict(x_test)

print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lrcv_predict_test)))
print(metrics.confusion_matrix(y_test, lrcv_predict_test))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lrcv_predict_test))

Accuracy: 0.3500
[[ 0 91]
 [ 0 49]]

Classification Report
              precision    recall  f1-score   support

           2       0.00      0.00      0.00        91
           4       0.35      1.00      0.52        49

    accuracy                           0.35       140
   macro avg       0.17      0.50      0.26       140
weighted avg       0.12      0.35      0.18       140



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
