### Wisconsin Breast Cancer Data Prediction Model

This is a dataset of diagnositc information on biopses from actual patients.

Your task is to use the training data provided to create a model for predicting whether a biopsy is Benign (Good News) or Malignant (Cancer).  

You are given the dataset in the first cell, you should use this data for training and validation.  At the end of class, a separate dataset will be provided to test the accuracy of your Machine Learning Model.

Goal is to create a Machine Learning Model (KNN) that makes predictions with the greatest level of accuracy.


In [1]:
import pandas as pd
import numpy as np

'''
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)
'''

# load the data with class labels from the file called:
# "wisconsin_training_data.csv"
df = pd.read_csv('wisconsin_training_data.csv')
df.columns = df.columns.str.strip()
df.head(15)

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1116116,9,10,10,1,10,8,3,3,1,4
1,1000025,5,1,1,1,2,1,3,1,1,2
2,1116715,5,1,1,1,3,2,2,2,1,2
3,1072179,10,7,7,3,8,5,7,4,3,4
4,1026122,2,1,1,1,2,1,1,1,1,2
5,1229929,1,1,1,1,2,1,2,1,1,2
6,1136142,2,1,1,1,3,1,2,1,1,2
7,1043068,3,1,1,1,2,1,2,1,1,2
8,859164,5,3,3,1,3,3,3,3,3,4
9,1238777,6,1,1,3,2,1,1,1,1,2


In [2]:
# first explore the data a little
# do you need to drop anything? fix anything?  change any datatypes?
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Sample_code_number           599 non-null    int64 
 1   Clump_Thickness              599 non-null    int64 
 2   Uniformity_of_Cell_Size      599 non-null    int64 
 3   Uniformity_of_Cell_Shape     599 non-null    int64 
 4   Marginal_Adhesion            599 non-null    int64 
 5   Single_Epithelial_Cell_Size  599 non-null    int64 
 6   Bare_Nuclei                  599 non-null    object
 7   Bland_Chromatin              599 non-null    int64 
 8   Normal_Nucleoli              599 non-null    int64 
 9   Mitoses                      599 non-null    int64 
 10  Class                        599 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 51.6+ KB


In [3]:
# Next, make any fixes or adjustments or "data cleaning" you 
# discovered by exploring above 
# use this cell to create a cleaned dataset
df = df[df['Bare_Nuclei'] != '?']
#df['Bare_Nuclei'] = df['Bare_Nuclei'].astype(int)
df['Bare_Nuclei'] = df['Bare_Nuclei'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 584 entries, 0 to 598
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Sample_code_number           584 non-null    int64
 1   Clump_Thickness              584 non-null    int64
 2   Uniformity_of_Cell_Size      584 non-null    int64
 3   Uniformity_of_Cell_Shape     584 non-null    int64
 4   Marginal_Adhesion            584 non-null    int64
 5   Single_Epithelial_Cell_Size  584 non-null    int64
 6   Bare_Nuclei                  584 non-null    int32
 7   Bland_Chromatin              584 non-null    int64
 8   Normal_Nucleoli              584 non-null    int64
 9   Mitoses                      584 non-null    int64
 10  Class                        584 non-null    int64
dtypes: int32(1), int64(10)
memory usage: 52.5 KB


In [4]:
# Next, take a look at some correlations between features and between features and the target
# This should help you select or remove features to include in your model
df.corr().style.background_gradient(cmap='tab20c')

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
Sample_code_number,1.0,-0.068009,-0.042984,-0.044361,-0.065921,-0.046814,-0.104734,-0.069275,-0.054912,-0.036655,-0.091219
Clump_Thickness,-0.068009,1.0,0.63935,0.647067,0.492726,0.519874,0.590349,0.558704,0.53317,0.340916,0.707555
Uniformity_of_Cell_Size,-0.042984,0.63935,1.0,0.911498,0.703445,0.752787,0.691944,0.774224,0.729704,0.463056,0.820694
Uniformity_of_Cell_Shape,-0.044361,0.647067,0.911498,1.0,0.69235,0.724239,0.722745,0.754893,0.735961,0.442496,0.824945
Marginal_Adhesion,-0.065921,0.492726,0.703445,0.69235,1.0,0.588639,0.668978,0.683064,0.612688,0.403076,0.702385
Single_Epithelial_Cell_Size,-0.046814,0.519874,0.752787,0.724239,0.588639,1.0,0.578658,0.620254,0.649531,0.500484,0.684352
Bare_Nuclei,-0.104734,0.590349,0.691944,0.722745,0.668978,0.578658,1.0,0.678303,0.582971,0.331992,0.826016
Bland_Chromatin,-0.069275,0.558704,0.774224,0.754893,0.683064,0.620254,0.678303,1.0,0.682029,0.366133,0.765692
Normal_Nucleoli,-0.054912,0.53317,0.729704,0.735961,0.612688,0.649531,0.582971,0.682029,1.0,0.439276,0.729432
Mitoses,-0.036655,0.340916,0.463056,0.442496,0.403076,0.500484,0.331992,0.366133,0.439276,1.0,0.422348


In [5]:
# Select your features
# Decide if you need to do any imputing of data
# Divide data into training and validation data
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.1, shuffle=True, random_state=23)
X_train = train_data[['Sample_code_number', 'Clump_Thickness', 'Uniformity_of_Cell_Size',
       'Uniformity_of_Cell_Shape', 'Marginal_Adhesion',
       'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin',
       'Normal_Nucleoli', 'Mitoses']].values
y_train = train_data['Class'].tolist()

In [6]:
# Train your model on your training data
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

classifier = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('estimator', KNeighborsClassifier(n_neighbors = 3))
])

classifier.fit(X_train, y_train)

# tune your parameters using the validation dataset
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

# Use the fitted model to make predictions on the train dataset
# Train data going through the Pipeline it's first imputed (with means from the train), scaled (with the min/max from the train data), and finally used to make predictions
train_predictions = classifier.predict(X_train)

print('Model performance on the train set:')
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))
print("Train accuracy:", accuracy_score(y_train, train_predictions))
# tune your parameters using the validation dataset

Model performance on the train set:
[[327   9]
 [  4 185]]
              precision    recall  f1-score   support

           2       0.99      0.97      0.98       336
           4       0.95      0.98      0.97       189

    accuracy                           0.98       525
   macro avg       0.97      0.98      0.97       525
weighted avg       0.98      0.98      0.98       525

Train accuracy: 0.9752380952380952


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
x = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
model1 = RandomForestRegressor(n_estimators=100, random_state=42)
model1.fit(X_train, y_train)
print('Train:', model1.score(X_train, y_train))
print("MSE:",mean_squared_error(y_train, model1.predict(X_train)))
print()
print('Test:', model1.score(X_test, y_test))
print("MSE:",mean_squared_error(y_test, model1.predict(X_test)))

Train: 0.9805044792164357
MSE: 0.01796145610278373

Test: 0.8784307692307692
MSE: 0.10806153846153847


In [8]:
# create a list of your predictions of the outcomes 
# for the test data set called `wisconsin_test_data_without_class_info.csv`


# When you are happy with the performance of your model,
# paste the array of your predictions to the #general channel on Slack 
# and Matt will test the accuracy of your model on the test data

In [9]:
model_features = df.columns.drop('Class')
model_target = 'Class'

print('Model features: ', model_features)
print('Model target: ', model_target)

train_data, test_data = train_test_split(df, test_size=0.1, shuffle=True, random_state=23)
X_train = train_data[model_features].values
y_train = train_data[model_target].tolist()

Model features:  Index(['Sample_code_number', 'Clump_Thickness', 'Uniformity_of_Cell_Size',
       'Uniformity_of_Cell_Shape', 'Marginal_Adhesion',
       'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin',
       'Normal_Nucleoli', 'Mitoses'],
      dtype='object')
Model target:  Class


In [16]:
df1 = pd.read_csv('wisconsin_test_data_without_class_info.csv')
df1.Bare_Nuclei = df1.Bare_Nuclei.replace('?', 1)
df1['Bare_Nuclei'] = df1['Bare_Nuclei'].astype(int)
df1['Bare_Nuclei'].fillna(df1['Bare_Nuclei'].mean())
df1['Bare_Nuclei'] = df1['Bare_Nuclei'].astype(int)
X_test = df1
train_predictions = classifier.predict(X_test)
train_predictions



array([2, 4, 2, 2, 2, 4, 4, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2,
       2, 2, 4, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4,
       2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4])

In [1]:
case1 = [2, 4, 2, 2, 2, 4, 4, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2,
       2, 2, 4, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4,
       2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4]
case2 = [2, 4, 2, 2, 2, 4, 4, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 4, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4]
count = 0
for i in range(len(case1)):
    if case1[i] == case2[i]:
        count += 1
print(count/len(case1))

0.97
