In [1]:
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
from sklearn.svm import SVC

In [2]:
# Pre-processing: load the survey, reading every "?" entry as a missing value.
cerv_data = pd.read_csv('risk_factors_cervical_cancer.csv', na_values=["?"])
print(cerv_data.shape[1])  # column count of the raw table, before anything is dropped

# Discard the two mostly unfilled time-since-diagnosis columns first, so that the
# row filter below only removes patients with a missing answer in a kept column.
cerv_data = (
    cerv_data
    .drop(columns=['STDs: Time since last diagnosis', 'STDs: Time since first diagnosis'])
    .dropna()
)

# The last four columns are split off as the targets; everything before them is a feature.
x = cerv_data.iloc[:, :-4]
y = cerv_data.iloc[:, -4:]

36


In [3]:
# Confusion matrix utilities
def print_confusion(cm):
    """Render a 2x2 confusion matrix as a labelled table via ``display``.

    Rows are labelled ``Observed -`` / ``Observed +`` and columns
    ``Predicted -`` / ``Predicted +``; both axes use a two-level index whose
    level names are empty strings.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Confusion-matrix counts, observed classes along the rows.
    """
    def _signed_axis(kind):
        # Outer level repeats `kind`; inner level carries the class sign.
        labels = pd.DataFrame([[kind, '-'],
                               [kind, '+']],
                              columns=['', ''])
        return pd.MultiIndex.from_frame(labels)

    table = pd.DataFrame(cm,
                         index=_signed_axis('Observed'),
                         columns=_signed_axis('Predicted'))
    display(table)

def get_score(cm):
    """Return the accuracy encoded in a confusion matrix.

    Accuracy is the sum of the diagonal (correct predictions) divided by the
    sum of all entries. For a 2x2 matrix this is exactly
    ``(cm[0, 0] + cm[1, 1]) / cm.sum()``; using the trace keeps the result
    correct for matrices with more than two classes as well.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion-matrix counts. Nested lists are accepted.

    Returns
    -------
    numpy floating scalar
        Fraction of correctly classified samples (``nan`` if `cm` sums to 0).
    """
    cm = np.asarray(cm)  # allow plain nested lists, not only ndarrays
    return np.trace(cm) / np.sum(cm)

def get_fp_rate(cm):
    """Return the false positive rate of a 2x2 confusion matrix.

    Only row 0 of `cm` is used: ``cm[0, 1]`` divided by the row total
    ``cm[0, 0] + cm[0, 1]``. With observed classes along the rows, that is
    the share of observed negatives that were predicted positive.
    """
    true_neg = cm[0, 0]
    false_pos = cm[0, 1]
    return false_pos / (true_neg + false_pos)

def get_fn_rate(cm):
    """Return the false negative rate of a 2x2 confusion matrix.

    Only row 1 of `cm` is used: ``cm[1, 0]`` divided by the row total
    ``cm[1, 1] + cm[1, 0]``. With observed classes along the rows, that is
    the share of observed positives that were predicted negative.
    """
    false_neg = cm[1, 0]
    true_pos = cm[1, 1]
    return false_neg / (true_pos + false_neg)

In [4]:
###### Logistic regression ######
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.utils import compute_sample_weight
from sklearn.metrics import confusion_matrix

# Target: the first of the four trailing columns that were split off into `y`.
hinselmann = y.iloc[:, 0]

# Direct fitting: train and evaluate on the same full data set (in-sample result).
# 'balanced' sample weights make both classes contribute equally to the fit.
lr_pipe = Pipeline([('scale', StandardScaler()), ('logreg', LogisticRegression())])
lr_pipe.fit(x, hinselmann, logreg__sample_weight=compute_sample_weight('balanced', hinselmann))
lr_direct_score = lr_pipe.score(x, hinselmann)
lr_cm_hinselmann_direct = confusion_matrix(hinselmann, lr_pipe.predict(x))

# Cross-validation: every sample is predicted by a model that never saw it.
# A fixed random_state makes the shuffled folds, and so the reported figures,
# reproducible across kernel restarts.
cv = KFold(shuffle=True, random_state=0)
# Same dtype as the labels so observed and predicted classes compare like-for-like.
lr_ypred_hinselmann = np.empty(x.shape[0], dtype=hinselmann.dtype)
for train_i, test_i in cv.split(x):
    x_train, y_train = x.iloc[train_i], hinselmann.iloc[train_i]
    # Fit an unfitted copy so `lr_pipe` keeps the full-data fit from above
    # instead of ending up as whichever fold happened to run last.
    fold_pipe = clone(lr_pipe)
    fold_pipe.fit(x_train, y_train, logreg__sample_weight=compute_sample_weight('balanced', y_train))
    lr_ypred_hinselmann[test_i] = fold_pipe.predict(x.iloc[test_i])

lr_cm_hinselmann_cv = confusion_matrix(hinselmann, lr_ypred_hinselmann)


def _report_hinselmann(stage, score, cm):
    """Print the score and error rates for one evaluation stage, then show its confusion matrix."""
    print('The prediction score for Hinselmann upon {} is {:.3}'.format(stage, score))
    print('The false positive rate is {0:.2%}'.format(get_fp_rate(cm)))
    print('The false negative rate is {0:.2%}'.format(get_fn_rate(cm)))
    print_confusion(cm)


_report_hinselmann('fitting', lr_direct_score, lr_cm_hinselmann_direct)

_report_hinselmann('cross-validation', get_score(lr_cm_hinselmann_cv), lr_cm_hinselmann_cv)

The prediction score for Hinselmann upon fitting is 0.719
The false positive rate is 28.37%
The false negative rate is 23.33%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,457.0,181.0
Observed,+,7.0,23.0


The prediction score for Hinselmann upon cross-validation is 0.692
The false positive rate is 29.62%
The false negative rate is 56.67%


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
,,,
Observed,-,449.0,189.0
Observed,+,17.0,13.0
