# Breast Cancer Diagnostic Classification Project

Coded by Luna McBride

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split #Split the data into train and test
from sklearn.ensemble import RandomForestClassifier #Forest for prediction and regression
from sklearn.metrics import mean_squared_error #Error testing
from sklearn.metrics import classification_report #Report of Classification

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/breast-cancer-wisconsin-data/data.csv


In [2]:
cancer = pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv") #Put our data into a dataframe
cancer.head() #Take a peek at the data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
print(len(cancer.index)) #Print the number of rows

569


---

# Check for Null Values

In [4]:
print(cancer.isnull().any()) #Check if the columns have null values

id                         False
diagnosis                  False
radius_mean                False
texture_mean               False
perimeter_mean             False
area_mean                  False
smoothness_mean            False
compactness_mean           False
concavity_mean             False
concave points_mean        False
symmetry_mean              False
fractal_dimension_mean     False
radius_se                  False
texture_se                 False
perimeter_se               False
area_se                    False
smoothness_se              False
compactness_se             False
concavity_se               False
concave points_se          False
symmetry_se                False
fractal_dimension_se       False
radius_worst               False
texture_worst              False
perimeter_worst            False
area_worst                 False
smoothness_worst           False
compactness_worst          False
concavity_worst            False
concave points_worst       False
symmetry_w

In [5]:
print(cancer.loc[cancer["Unnamed: 32"].isnull() != True]) #Check to see if unnamed32 actually has values

Empty DataFrame
Columns: [id, diagnosis, radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave points_worst, symmetry_worst, fractal_dimension_worst, Unnamed: 32]
Index: []

[0 rows x 33 columns]


Everything is not null except the column Unnamed: 32, which is entirely null. I will drop Unnamed: 32. I will also drop the ID, since the ID is not something I want to test against

In [6]:
cancer = cancer.drop(columns = {"Unnamed: 32", "id"}) #Drop the null column
cancer.head() #Take a peek and make sure it dropped

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


---

# Data Exploration

In [7]:
diagnosis = cancer["diagnosis"].copy() #Put the diagnosis into its own variable
characteristics = cancer.drop(columns = {"diagnosis"}).copy() #Put the characteristics in a separate dataframe

#For each column, print the min and max values for Malignant and Beneign tumors
for column in characteristics.columns:
    mal = cancer.loc[cancer["diagnosis"] == "M"][column] #Get the malignant values for the column
    ben = cancer.loc[cancer["diagnosis"] == "B"][column] #Get the beneign values for the column
    
    print("The max and min malignant values for {} are {}, {}".format(column, mal.max(), mal.min())) #Print malignant values
    print("The max and min beneign values for {} are {}, {}".format(column, ben.max(), ben.min())) #Print beneign values

The max and min malignant values for radius_mean are 28.11, 10.95
The max and min beneign values for radius_mean are 17.85, 6.981
The max and min malignant values for texture_mean are 39.28, 10.38
The max and min beneign values for texture_mean are 33.81, 9.71
The max and min malignant values for perimeter_mean are 188.5, 71.9
The max and min beneign values for perimeter_mean are 114.6, 43.79
The max and min malignant values for area_mean are 2501.0, 361.6
The max and min beneign values for area_mean are 992.1, 143.5
The max and min malignant values for smoothness_mean are 0.1447, 0.07371
The max and min beneign values for smoothness_mean are 0.1634, 0.052629999999999996
The max and min malignant values for compactness_mean are 0.3454, 0.04605
The max and min beneign values for compactness_mean are 0.2239, 0.01938
The max and min malignant values for concavity_mean are 0.4268, 0.02398
The max and min beneign values for concavity_mean are 0.4108, 0.0
The max and min malignant values for

It seems a malignant tumor tends to have higher min and max values overall, though with overlap to beneign values. This fits most variables, but variables like concavity appear to completely overlap the malignant numbers with the beneign. I believe characteristics like that will have low importance to the classification, but I will have to see.

---

# Build the Classifier (Full Data)

Here, I would like to see if a classifier has an easier/harder time when having all these worst/mean/deviation characteristics for the same variable versus just one or the other.

## Train-Test Split

In [8]:
diagnosis = pd.get_dummies(diagnosis) #Get the encoding for the diagnosis variable
print(diagnosis) #Take a peek at the diagnosis dummies

     B  M
0    0  1
1    0  1
2    0  1
3    0  1
4    0  1
..  .. ..
564  0  1
565  0  1
566  0  1
567  0  1
568  1  0

[569 rows x 2 columns]


In [9]:
charaTrain, charaTest, diagTrain, diagTest = train_test_split(characteristics, diagnosis, test_size = 0.2) #Create train and test sets
print(diagTrain) #Print one of the splits to have an idea about the structure

     B  M
420  1  0
326  1  0
428  1  0
286  1  0
425  1  0
..  .. ..
86   0  1
369  0  1
459  1  0
156  0  1
483  1  0

[455 rows x 2 columns]


## Fit a Random Forest Classifier

In [10]:
forest = RandomForestClassifier(n_estimators = 100) #Build a forest
forest.fit(charaTrain, diagTrain) #Fit the forest model

RandomForestClassifier()

In [11]:
predict = forest.predict(charaTest) #Get a list of predictions

In [12]:
overallAccuracy = ("Overall", forest.score(charaTest, diagTest)) #Get the overall accuracy
print("Forest Accuracy: ", forest.score(charaTest, diagTest)) #Print the accuracy
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(diagTest, predict))) #Print the root mean square error
print("Classification Report:\n ", classification_report(diagTest, predict, target_names = ["B", "M"])) #Print a classification report

Forest Accuracy:  0.9649122807017544
Root Mean Square Error:  0.1873171623163388
Classification Report:
                precision    recall  f1-score   support

           B       0.96      0.99      0.97        72
           M       0.97      0.93      0.95        42

   micro avg       0.96      0.96      0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114
 samples avg       0.96      0.96      0.96       114



In [13]:
attributes = characteristics.columns #Get the tested attributes
attributes = list(zip(attributes, forest.feature_importances_)) #Zip the attributes together with their coefficient
sortAtt = sorted(attributes, key = lambda x: x[1], reverse = True) #Sort the zipped attributes by their coefficients

print("According to the Random Forest, the most important factors for cancer status are: ") #Start printing the most important labels
i=0 #Counter variable so only the top five are printed

#For each attribute in the sorted attributes
for label, coef in sortAtt:
    if i<5: #If there has not been five printed yet
        print(label) #Print the label as an important factor
    i += 1 #Increase i by 1

According to the Random Forest, the most important factors for cancer status are: 
concave points_mean
area_worst
concave points_worst
perimeter_worst
radius_worst


The forest of overall characteristics was able to attain a 97% accuracy on the test set. The worst characteristics appear to be what the forest dubbed most important, so it should be fair to assume the worsts, when alone, will do best in a new classifier.

---

# Build a Forest for Each Worst/SE/Mean

In [14]:
#SplitData: splits this data based on whether it is the mean, se, or worst column for the dataset
#Input: the list of characteristic columns
#Output: A list that contains the lists of each column (se, mean, and worst)
def splitData(charactColumns):
    se = [] #A list holder for all SE columns
    mean = [] #A list holder for all columns that end in mean
    worst = [] #A list holder for all columns that end in worst
    
    #For each characteristics column, put it in the correct se, mean, or worst list
    for column in charactColumns:
        if column.find("se") > -1: #If the column name contains se (I checked; the only SE sequence is the _se at the end)
            se.append(column) #Add it to the SE list
        elif column.find("mean") > -1: #If the column contains mean
            mean.append(column) #Add it to the mean list
        else: #If the column contains neither, which means it contains worst
            worst.append(column) #Add it to the worst list
    
    return [se, mean, worst] #Return a list with all the previous lists inside

#RunForest: runs a forest for the specified characteristic type (colType), assuming the diagnosis is in its dummied form
#Input: the diagnosis, the characteristics, the column/characteristic type
#Output: None
def runForest(diag, chara, colType):
    charaTrain, charaTest, diagTrain, diagTest = train_test_split(chara, diag, test_size = 0.2) #Split the data into train and test
    
    forest = RandomForestClassifier(n_estimators = 100) #Build a forest for this data
    forest.fit(charaTrain, diagTrain) #Fit the forest
    
    predict = forest.predict(charaTest) #Make predictions for the test set
    
    print("Forest Accuracy for {}: {}".format(colType, forest.score(charaTest, diagTest))) #Print the accuracy
    print("Root Mean Square Error for {}: {}".format(colType, np.sqrt(mean_squared_error(diagTest, predict)))) #Print the root mean square error
    print("Classification Report for {}:\n {}".format(colType, classification_report(diagTest, predict, target_names = ["B", "M"]))) #Print a classification report

In [15]:
charact = characteristics.columns #Get the characteristics columns

columnList = splitData(charact) #Split the data into se, mean, and worst
colTypes = ["se", "mean", "worst"] #Set a list of types to the corresponding column types in order
i = 0 #Set an i variable to get the correct colType 

#For each column type, run a forest with just that type
for colList in columnList:
    chara = cancer[colList] #Get the characteristics of just the columns of the specified type
    runForest(diagnosis, chara, colTypes[i]) #Run a forest for this specific type
    
    i = i + 1 #Increase i so the column type remains consistent

Forest Accuracy for se: 0.8070175438596491
Root Mean Square Error for se: 0.4392976851069794
Classification Report for se:
               precision    recall  f1-score   support

           B       0.77      0.94      0.85        65
           M       0.89      0.63      0.74        49

   micro avg       0.81      0.81      0.81       114
   macro avg       0.83      0.79      0.79       114
weighted avg       0.82      0.81      0.80       114
 samples avg       0.81      0.81      0.81       114

Forest Accuracy for mean: 0.9649122807017544
Root Mean Square Error for mean: 0.1873171623163388
Classification Report for mean:
               precision    recall  f1-score   support

           B       0.99      0.96      0.97        70
           M       0.93      0.98      0.96        44

   micro avg       0.96      0.96      0.96       114
   macro avg       0.96      0.97      0.96       114
weighted avg       0.97      0.96      0.97       114
 samples avg       0.96      0.96      

100 estimators appears to be best for accuracy. The worst appears to be the best predictor with an accuracy and recall being 96+% (recall being most important in this case since we need to identify these correctly). Mean is also very close, being 95+% in accuracy and recall (plus or minus depending on the run). This is based on the data, which only has about 500 entries, so I bet these results would change with more data. Despite this, I would say either the worst values or the mean values would do fine if there is no access to the other types.