In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
cancer = pd.read_csv('../../__DATA__/breast-cancer-wisconsin.data', header = None)
cancer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [11]:
cancer = cancer.replace({10:{2:'Benign', 4:'Malignant'}})
cancer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,Benign
1,1002945,5,4,4,5,7,10,3,2,1,Benign
2,1015425,3,1,1,1,2,2,3,1,1,Benign
3,1016277,6,8,8,1,3,4,3,7,1,Benign
4,1017023,4,1,1,3,2,1,3,1,1,Benign


In [12]:
cancer.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64

In [18]:
cancer.dtypes

0      int64
1      int64
2      int64
3      int64
4      int64
5      int64
6     object
7      int64
8      int64
9      int64
10    object
dtype: object

In [15]:
cancer.skew()

0    13.675326
1     0.592859
2     1.233137
3     1.161859
4     1.524468
5     1.712172
7     1.099969
8     1.422261
9     3.560658
dtype: float64

It appears that only column 6 contains bad values we will proceed to remove them.

In [36]:
cancer.replace({6:{'?':np.nan}})
cancer.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64

In [46]:
num_strings = ['1','2','3','4','5','6','7','8','9','10']
rows_ind = ~cancer[6].isin(num_strings)
rows_to_drop = [i for i in rows_ind.index if rows_ind[i] == True]
rows_to_drop

[23, 40, 139, 145, 158, 164, 235, 249, 275, 292, 294, 297, 315, 321, 411, 617]

In [47]:
cancer_cleaned = cancer.drop(rows_to_drop)
cancer_cleaned = 

In [53]:
cancer_log = cancer_cleaned.copy()
for col in cancer_cleaned.iloc[:,1:10].columns:
    cancer_log[col] = cancer_cleaned[col].astype(int)
    cancer_log[col] = np.log(cancer_log[col])
cancer_log.skew()

0    13.748410
1    -0.458445
2     0.653086
3     0.542271
4     0.857404
5     0.712799
6     0.685048
7     0.100977
8     0.959574
9     2.421858
dtype: float64

In [55]:
cancer_inv = cancer_cleaned.copy()
for col in cancer_cleaned.iloc[:,1:10].columns:
    cancer_inv[col] = cancer_cleaned[col].astype(int)
    cancer_inv[col] = -1/(cancer_inv[col])
cancer_inv.skew()

0    13.748410
1    -1.154863
2     0.328286
3     0.191275
4     0.475816
5    -0.848350
6     0.467949
7    -0.795973
8     0.680200
9     1.904772
dtype: float64

### Some exploratory analysis before modeling

7. Attribute Information: (class attribute has been moved to last column)

      Attribute                     Domain
   -- -----------------------------------------
   0. Sample code number            id number
   1. Clump Thickness               1 - 10
   2. Uniformity of Cell Size       1 - 10
   3. Uniformity of Cell Shape      1 - 10
   4. Marginal Adhesion             1 - 10
   5. Single Epithelial Cell Size   1 - 10
   6. Bare Nuclei                   1 - 10
   7. Bland Chromatin               1 - 10
   8. Normal Nucleoli               1 - 10
   9. Mitoses                       1 - 10
   10. Class:                       (2 for benign, 4 for malignant)

In [85]:
# Individual Correlations of attributes with malignancy
import seaborn as sns
cancer_explore = cancer_log.copy().replace({10:{'Benign':0,'Malignant':1}})
cormat = cancer_explore.corr()
cormat.iloc[10,1:10]

1    0.714790
2    0.871118
3    0.857951
4    0.747059
5    0.739315
6    0.844576
7    0.737336
8    0.759357
9    0.495474
Name: 10, dtype: float64

All attributes have a positive correlation with malignancy. The weakest attribute as a predictor of malignancy is mitoses. The strongest indicators of malignancy include Bare Nuclei, Uniformity of Cell shape, and the strongest indicator being uniformity of cell size. 

The midrange indicators (though still quite strong with correlations in the 0.7 region) include Normality of Nucleoli, Bland Chromatin, Single Epithelial Cell Size, Marginal Adhesion, and Clump Thickness.

In [56]:
from sklearn.model_selection import train_test_split

X = cancer_log.iloc[:,1:10]
Y = cancer_log.iloc[:,10]

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.30)

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

lreg = LogisticRegression(penalty = 'none', solver = 'lbfgs') #penalty = 'none', solver = 'lbfgs'
lreg.fit(X_train, Y_train)
predict = lreg.predict(X_test)
conmat = confusion_matrix(Y_test, predict)
score = lreg.score(X_test, Y_test)

cv_scores = cross_val_score(lreg, X, Y, cv = 5)
precision = conmat[1,1]/conmat[:,1].sum()
recall = conmat[1,1]/conmat[1,:].sum()
print('Validation Scores: ', cv_scores)
print('Overal Model Accuracy: ', score)
print('Precision (TrueMalignantPredictions/AllMalignantPrediction): ', precision)
print('Recall (TrueMalignantPredictions/AllTrueMaligance):', recall)

Validation Scores:  [0.96350365 0.94890511 0.97080292 0.97080292 0.98518519]
Overal Model Accuracy:  0.9658536585365853
Precision (TrueMalignantPredictions/AllMalignantPrediction):  0.9324324324324325
Recall (TrueMalignantPredictions/AllTrueMaligance): 0.971830985915493


Results are great but Recall must be as close to 100 percent as possible. False negatives in the case of can

In [88]:
def cancerclassifier(clf):
    clf.fit(X_train, Y_train)
    predict = clf.predict(X_test)
    conmat = confusion_matrix(Y_test, predict)
    score = clf.score(X_test, Y_test)

    cv_scores = cross_val_score(clf, X, Y, cv = 5)
    precision = conmat[1,1]/conmat[:,1].sum()
    recall = conmat[1,1]/conmat[1,:].sum()
    print('Validation Scores: ', cv_scores)
    print('Overall Model Accuracy: ', score)
    print('Precision (TrueMalignantPredictions/AllMalignantPrediction): ', precision)
    print('Recall (TrueMalignantPredictions/AllTrueMaligance):', recall)

In [69]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
cancerclassifier(rfc)

Validation Scores:  [0.93430657 0.94890511 0.97080292 0.97080292 0.98518519]
Overal Model Accuracy:  0.9658536585365853
Precision (TrueMalignantPredictions/AllMalignantPrediction):  0.9444444444444444
Recall (TrueMalignantPredictions/AllTrueMaligance): 0.9577464788732394


In [78]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators = 100)
cancerclassifier(gbc)

Validation Scores:  [0.9270073  0.94890511 0.96350365 0.97810219 0.97037037]
Overal Model Accuracy:  0.9609756097560975
Precision (TrueMalignantPredictions/AllMalignantPrediction):  0.9315068493150684
Recall (TrueMalignantPredictions/AllTrueMaligance): 0.9577464788732394


In [87]:
from sklearn.svm import SVC

svc = SVC()
cancerclassifier(svc)

Validation Scores:  [0.96350365 0.94160584 0.97810219 0.98540146 0.97777778]
Overal Model Accuracy:  0.9658536585365853
Precision (TrueMalignantPredictions/AllMalignantPrediction):  0.9102564102564102
Recall (TrueMalignantPredictions/AllTrueMaligance): 1.0




Achieved a 100% recall rate with a support vector classifier.

Priortizing this model to detect cancer will most likely allow that all true cases of cancer are detected. However, the SVC predicts true malignant cases with 100% accuracy at the expense of predicting benign cases with a lower accuracy than in other classification models. More than one model can be used in tandem in this case, SVC to capture all true positives, and another model to validate who among the predicted positives in the SVC model may potentially be benign.