In [1]:
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
from sklearn.svm import SVC

In [2]:
# Pre-processing: read '?' as missing, drop the two mostly unfilled columns,
# then drop every patient row that still has a missing answer.
cerv_data = pd.read_csv('risk_factors_cervical_cancer.csv', na_values=["?"])
print(len(cerv_data.columns))
cerv_data = (
    cerv_data
    .drop(columns=['STDs: Time since last diagnosis', 'STDs: Time since first diagnosis'])
    .dropna()
)
# Split by position: everything except the last four columns goes to x,
# the last four columns go to y.
x = cerv_data.iloc[:, :-4]
y = cerv_data.iloc[:, -4:]

36


In [3]:
# Confusion matrix utilities
def print_confusion(cm):
    """Show a confusion matrix as a labelled two-by-two table.

    Rows are labelled ``Observed -`` / ``Observed +`` and columns
    ``Predicted -`` / ``Predicted +``; the values of ``cm`` are placed in the
    table unchanged.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Values to display.
    """
    def _axis_labels(kind):
        # Two-level labels (kind, sign); both levels are named with an empty string.
        label_frame = pd.DataFrame([[kind, '-'],
                                    [kind, '+']],
                                   columns=['', ''])
        return pd.MultiIndex.from_frame(label_frame)

    table = pd.DataFrame(cm,
                         index=_axis_labels('Observed'),
                         columns=_axis_labels('Predicted'))
    display(table)

def get_score(cm):
    """Return the fraction of samples on the diagonal of a confusion matrix.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix of counts.

    Returns
    -------
    float
        Sum of the diagonal entries divided by the sum of all entries.
    """
    # Use the full diagonal rather than only cm[0,0] + cm[1,1], so a matrix
    # with more than two classes is not silently under-counted.
    return np.trace(cm) / np.sum(cm)

def get_fp_rate(cm):
    """Return the share of row 0 of ``cm`` that falls in column 1.

    With rows as observed classes and columns as predicted classes, this is
    the false positive rate: observed negatives that were predicted positive.
    """
    true_neg, false_pos = cm[0, 0], cm[0, 1]
    return false_pos / (true_neg + false_pos)

def get_fn_rate(cm):
    """Return the share of row 1 of ``cm`` that falls in column 0.

    With rows as observed classes and columns as predicted classes, this is
    the false negative rate: observed positives that were predicted negative.
    """
    false_neg, true_pos = cm[1, 0], cm[1, 1]
    return false_neg / (true_pos + false_neg)

In [4]:
###### Logistic regression ######
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.utils import compute_sample_weight
from sklearn.metrics import confusion_matrix

# One target series per diagnostic test, selected from y by column name.
hinselmann, schiller, citology, biopsy = (
    y.loc[:, target] for target in ('Hinselmann', 'Schiller', 'Citology', 'Biopsy')
)

# Shared pipeline: a StandardScaler step followed by a LogisticRegression step.
lr_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('logreg', LogisticRegression()),
])

# Direct fitting
def lr_direct_fit(x, y):
    """Fit the shared ``lr_pipe`` on all rows and evaluate it on those same rows.

    The ``logreg`` step is fitted with per-sample weights from
    ``compute_sample_weight('balanced', y)``.

    Parameters
    ----------
    x : features passed to ``lr_pipe.fit`` / ``predict`` / ``score``.
    y : labels for the rows of ``x``.

    Returns
    -------
    tuple
        ``(score, confusion)`` where ``score`` is the value returned by
        ``lr_pipe.score(x, y)`` and ``confusion`` is
        ``confusion_matrix(y, lr_pipe.predict(x))``.
    """
    balanced_weights = compute_sample_weight('balanced', y)
    lr_pipe.fit(x, y, logreg__sample_weight=balanced_weights)
    in_sample_score = lr_pipe.score(x, y)
    in_sample_cm = confusion_matrix(y, lr_pipe.predict(x))
    return in_sample_score, in_sample_cm

# Cross-validation
def lr_cv(x, y, *cv_args, **cv_kwargs):
    """Evaluate the shared ``lr_pipe`` with K-fold cross-validation.

    For every fold, ``lr_pipe`` is refitted on the training rows only (with
    'balanced' sample weights computed from the training labels) and then
    predicts the held-out rows. The held-out predictions from all folds are
    collected into one array and compared against ``y``.

    Parameters
    ----------
    x : features; indexed positionally through ``.iloc``.
    y : labels; indexed positionally through ``.iloc``.
    *cv_args, **cv_kwargs
        Forwarded unchanged to ``KFold``.

    Returns
    -------
    tuple
        ``(score, confusion)`` where ``confusion`` is the confusion matrix of
        ``y`` against the held-out predictions and ``score`` is
        ``get_score(confusion)``.
    """
    splitter = KFold(*cv_args, **cv_kwargs)
    held_out_pred = np.empty(x.shape[0])
    for fit_rows, held_out_rows in splitter.split(x):
        x_fit, y_fit = x.iloc[fit_rows], y.iloc[fit_rows]
        fold_weights = compute_sample_weight('balanced', y_fit)
        lr_pipe.fit(x_fit, y_fit, logreg__sample_weight=fold_weights)
        held_out_pred[held_out_rows] = lr_pipe.predict(x.iloc[held_out_rows])
    cv_confusion = confusion_matrix(y, held_out_pred)
    return get_score(cv_confusion), cv_confusion

def _report(name, stage, score, cm):
    """Print the score and error rates for one outcome, then show its confusion matrix.

    ``stage`` is the wording inserted after "upon" in the first printed line
    ('fitting' or 'cross-validation').
    """
    print('The prediction score for {} upon {} is {:.3}'.format(name, stage, score))
    print('The false positive rate is {0:.2%}'.format(get_fp_rate(cm)))
    print('The false negative rate is {0:.2%}'.format(get_fn_rate(cm)))
    print_confusion(cm)


for outcome, name in zip([hinselmann, schiller, citology, biopsy],
                         ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']):
    direct_score, direct_cm = lr_direct_fit(x, outcome)
    # Fixed seed: with shuffle=True and no random_state the folds differ on
    # every run, so the cross-validation numbers could not be reproduced.
    cv_score, cv_cm = lr_cv(x, outcome, shuffle=True, random_state=0)

    _report(name, 'fitting', direct_score, direct_cm)
    _report(name, 'cross-validation', cv_score, cv_cm)

The prediction score for Hinselmann upon fitting is 0.719
The false positive rate is 28.37%
The false negative rate is 23.33%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,457.0,181.0
Observed,+,7.0,23.0


The prediction score for Hinselmann upon cross-validation is 0.71
The false positive rate is 27.74%
The false negative rate is 56.67%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,461.0,177.0
Observed,+,17.0,13.0


The prediction score for Schiller upon fitting is 0.765
The false positive rate is 20.83%
The false negative rate is 49.21%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,479.0,126.0
Observed,+,31.0,32.0


The prediction score for Schiller upon cross-validation is 0.75
The false positive rate is 21.32%
The false negative rate is 60.32%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,476.0,129.0
Observed,+,38.0,25.0


The prediction score for Citology upon fitting is 0.719
The false positive rate is 27.34%
The false negative rate is 41.03%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,457.0,172.0
Observed,+,16.0,23.0


The prediction score for Citology upon cross-validation is 0.692
The false positive rate is 28.93%
The false negative rate is 61.54%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,447.0,182.0
Observed,+,24.0,15.0


The prediction score for Biopsy upon fitting is 0.814
The false positive rate is 17.01%
The false negative rate is 40.00%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,517.0,106.0
Observed,+,18.0,27.0


The prediction score for Biopsy upon cross-validation is 0.762
The false positive rate is 20.71%
The false negative rate is 66.67%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,494.0,129.0
Observed,+,30.0,15.0
