# Predicting the recurrance of breast cancer using logistic regression

In [1]:
%%HTML  
<style>div.prompt {display:none}</style>


   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)
  

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics #use metric to get the model stats
#http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_Brain2BodyWeight

df=pd.read_csv('../data/breast-cancer-wisconsin.data', sep=',',names=['ID', 'Thickness', 'Size Uniformity', 'Shape Uniformity', 'Adhesion', 'Single Cell Size', 'Bare Nuclei', 'Chromatin', 'Nucleoli', 'Mitosis', 'Class'])
print(df)

          ID  Thickness  Size Uniformity  Shape Uniformity  Adhesion  \
0    1000025          5                1                 1         1   
1    1002945          5                4                 4         5   
2    1015425          3                1                 1         1   
3    1016277          6                8                 8         1   
4    1017023          4                1                 1         3   
5    1017122          8               10                10         8   
6    1018099          1                1                 1         1   
7    1018561          2                1                 2         1   
8    1033078          2                1                 1         1   
9    1033078          4                2                 1         1   
10   1035283          1                1                 1         1   
11   1036172          2                1                 1         1   
12   1041801          5                3                 3      

    This dataset is broken in that it contains 16 question marks in certain rows, we will need to drop those rows in order to process the data without changing it.

In [3]:
for row in df['Bare Nuclei']:
    df.replace(["?", 'NaN'], np.nan, inplace = True)

df = df.dropna(how='any') #get rid of any NA / NaN
print(df.shape)

(683, 11)


In [15]:
X = df.drop(['ID', 'Class'], axis=1)
Y = df['Class']
logistic = LogisticRegression()
fit = logistic.fit(X,Y)
prediction = logistic.predict(X)

print(logistic.coef_)
print(logistic.intercept_)
print(logistic.n_iter_)
exp_v = metrics.explained_variance_score(Y, prediction)	#Explained variance regression score function
mar = metrics.mean_absolute_error(Y, prediction)	#Mean absolute error regression loss
mse = metrics.mean_squared_error(Y, prediction)	#Mean squared error regression loss
mae = metrics.median_absolute_error(Y, prediction)	#Median absolute error regression loss
r2 = metrics.r2_score(Y, prediction)	#R^2 (coefficient of determination) regression score function.

print(exp_v)
print(mar)
print(mse)
print(mae)
print(r2)

[[ 0.24515515  0.16142571  0.27161638  0.17946675 -0.04158785  0.34698953
   0.1815928   0.19130598  0.20634236]]
[-6.25906995]
[8]
0.864921406762
0.0614934114202
0.12298682284
0.0
0.864836593916


In [37]:
print(X.loc[[0]])
print(logistic.predict_proba(X)[0])
print(logistic.classes_)

   Thickness  Size Uniformity  Shape Uniformity  Adhesion  Single Cell Size  \
0          5                1                 1         1                 2   

  Bare Nuclei  Chromatin  Nucleoli  Mitosis  
0           1          3         1        1  
[ 0.96137797  0.03862203]
[2 4]


This shows that there is a probability of 96% that it will classify as 2, which if you recall is 

Class: (2 for benign, 4 for malignant)

We already know this is the correct answer from the data, but we can use this to make further predictions by inputting another set of the predictos

We can run RLR to see if there might be a way we can customize our LR to get better results

In [13]:
from sklearn.linear_model import RandomizedLogisticRegression
logistic = RandomizedLogisticRegression()
fit = logistic.fit(X,Y)

    
for x,y in (list(zip(list(X), fit.scores_))):
    print(x,y)

Thickness 0.55
Size Uniformity 0.58
Shape Uniformity 0.555
Adhesion 0.48
Single Cell Size 0.43
Bare Nuclei 0.875
Chromatin 0.475
Nucleoli 0.57
Mitosis 0.04


We can see that probably Mitosis doesnt contribute very much to reoccuring breast cancer