In [1]:
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
from sklearn.svm import SVC

In [2]:
# Pre-processing: read the survey with "?" entries treated as missing, report the
# raw column count, drop the two columns the original author found mostly
# unfilled, then drop every row that still has a missing value (presumably
# patients who declined to answer a question).
cerv_data = pd.read_csv('risk_factors_cervical_cancer.csv', na_values=["?"])
print(cerv_data.shape[1])

sparse_columns = [
    'STDs: Time since last diagnosis',
    'STDs: Time since first diagnosis',
]
cerv_data = cerv_data.drop(columns=sparse_columns).dropna()

# The last four columns are the outcome labels; everything before them is a feature.
n_outcomes = 4
x = cerv_data.iloc[:, :-n_outcomes]
y = cerv_data.iloc[:, -n_outcomes:]

36


In [3]:
# Confusion matrix utilities
from IPython.display import HTML

def print_confusion(cm):
    """Show a 2x2 confusion matrix as a DataFrame with two-level labels.

    Rows are labelled ('Observed', '-') / ('Observed', '+') and columns
    ('Predicted', '-') / ('Predicted', '+'); both index levels are named ''.
    The frame is handed to the notebook's ``display`` and nothing is returned.
    """
    signs = ['-', '+']
    level_names = ['', '']
    predicted_labels = pd.MultiIndex.from_tuples(
        [('Predicted', sign) for sign in signs], names=level_names)
    observed_labels = pd.MultiIndex.from_tuples(
        [('Observed', sign) for sign in signs], names=level_names)
    table = pd.DataFrame(cm, index=observed_labels, columns=predicted_labels)
    display(table)

def get_score(cm):
    """Return overall accuracy: cells (0,0) plus (1,1) divided by the sum of all cells."""
    n_correct = cm[0, 0] + cm[1, 1]
    n_total = np.sum(cm)
    return n_correct / n_total

def get_fp_rate(cm):
    """Return the false positive rate: cell (0,1) over the total of row 0.

    Row 0 holds the observed negatives, column 1 the positive predictions.
    """
    true_negatives, false_positives = cm[0, 0], cm[0, 1]
    return false_positives / (true_negatives + false_positives)

def get_fn_rate(cm):
    """Return the false negative rate: cell (1,0) over the total of row 1.

    Row 1 holds the observed positives, column 0 the negative predictions.
    """
    false_negatives, true_positives = cm[1, 0], cm[1, 1]
    return false_negatives / (true_positives + false_negatives)

def display_model_stats(cm, name):
    """Render a confusion matrix and its summary rates as two side-by-side HTML tables.

    Parameters
    ----------
    cm : 2x2 array indexed as ``cm[observed, predicted]``
        Index 0 is the negative class and index 1 the positive class.
    name : str
        Caption used as the header of both tables. It is inserted into the
        markup as-is (not HTML-escaped).

    The left table shows the four counts; the right table shows total
    accuracy, false positive rate and false negative rate (via ``get_score``,
    ``get_fp_rate`` and ``get_fn_rate``), each formatted to three significant
    digits. The result is shown with ``display(HTML(...))``; nothing is returned.
    """
    # Literal CSS braces are doubled because the template goes through str.format.
    # Every header cell is closed with </th> (not </td>) and the <thead> cells
    # live inside a <tr>, so the markup is well-formed.
    markup = '''
        <div class="row">
        <style scoped>
            tr th {{
                font-weight: 600;
                text-align: center;
            }}
            thead th {{
                font-weight: bold;
                text-align: center;
            }}
            .rotate {{
                transform: rotate(-180deg);
                writing-mode: vertical-rl;
                margin: 0em;
            }}
            .row {{ display: flex; }}
            .column {{ padding: 5px; }}
        </style>
        <div class="column">
            <table>
            <thead>
                <tr>
                <th colspan="4">{0}</th>
                </tr>
            </thead>
            <tr>
                <td rowspan="2" colspan="2"></td>
                <th colspan="2">Predicted</th>
            </tr>
            <tr>
                <th>-</th>
                <th>+</th>
            </tr>
            <tr>
                <th rowspan="2"><p class="rotate">Observed</p></th>
                <th>-</th>
                <td>{1}</td>
                <td>{2}</td>
            </tr>
            <tr>
                <th>+</th>
                <td>{3}</td>
                <td>{4}</td>
            </tr>
            </table>
        </div>
        <div class="column">
            <table>
            <thead>
                <tr>
                <th colspan="2">{0}</th>
                </tr>
            </thead>
            <tr>
                <th>Total accuracy</th>
                <td>{5:.3}</td>
            </tr>
            <tr>
                <th>False positive rate</th>
                <td>{6:.3}</td>
            </tr>
            <tr>
                <th>False negative rate</th>
                <td>{7:.3}</td>
            </tr>
            </table>
        </div>
        </div>
    '''.format(
        name,
        cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1],
        get_score(cm), get_fp_rate(cm), get_fn_rate(cm)
    )
    display(HTML(markup))

In [4]:
###### Logistic regression ######
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.utils import compute_sample_weight
from sklearn.metrics import confusion_matrix

# Pull each outcome column out of the label frame by name.
hinselmann, schiller, citology, biopsy = (
    y.loc[:, outcome_column]
    for outcome_column in ('Hinselmann', 'Schiller', 'Citology', 'Biopsy')
)

# Two-step pipeline: standardise the features, then fit a logistic regression.
# The step name 'logreg' is what the sample-weight fit parameter is routed to.
lr_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('logreg', LogisticRegression()),
])

# Direct fitting
def direct_fit_classifier(pipe, x, y, balanced_step_name=None):
    """Fit ``pipe`` on all of (x, y) and return its confusion matrix on that same data.

    When ``balanced_step_name`` is truthy, 'balanced' sample weights computed
    from ``y`` are passed to ``pipe.fit`` under the keyword
    ``<balanced_step_name>__sample_weight``; otherwise ``fit`` receives no
    extra keyword arguments. The returned matrix is
    ``confusion_matrix(y, pipe.predict(x))``, i.e. an in-sample evaluation.
    """
    extra_fit_kwargs = {}
    if balanced_step_name:
        weight_key = '{step}__sample_weight'.format(step=balanced_step_name)
        extra_fit_kwargs[weight_key] = compute_sample_weight('balanced', y)
    pipe.fit(x, y, **extra_fit_kwargs)
    in_sample_predictions = pipe.predict(x)
    return confusion_matrix(y, in_sample_predictions)

# Cross-validation
def cv_classifier(pipe, x, y, cv, balanced_step_name=None):
    """Cross-validate ``pipe`` and return one confusion matrix over all held-out predictions.

    Parameters
    ----------
    pipe : estimator with ``fit(X, y, **fit_params)`` and ``predict(X)``
        Refitted from scratch on the training part of every fold.
    x, y : pandas objects
        Features and labels; both are sliced positionally with ``.iloc``.
    cv : splitter exposing ``split(X, y)``
        Must place every row in a test fold, since each row's held-out
        prediction is needed for the final matrix.
    balanced_step_name : str, optional
        When truthy, 'balanced' sample weights computed from the training
        labels of each fold are passed to ``pipe.fit`` as
        ``<balanced_step_name>__sample_weight``.

    Returns
    -------
    ``confusion_matrix(y, ypred)`` where ``ypred`` holds, for each row, the
    prediction made by the model that did not see that row during fitting.

    Raises
    ------
    ValueError
        If the splitter leaves some rows without a held-out prediction.
    """
    n_samples = x.shape[0]
    ypred = np.empty(n_samples)
    # np.empty leaves the buffer uninitialised, so track which rows get filled.
    was_predicted = np.zeros(n_samples, dtype=bool)
    # The labels are passed to the splitter so that stratified splitters, which
    # require y, can be used; splitters that do not need y simply ignore it.
    for train_i, test_i in cv.split(x, y):
        if balanced_step_name:
            fit_params = {'{0}__sample_weight'.format(balanced_step_name): compute_sample_weight('balanced', y.iloc[train_i])}
        else:
            fit_params = {}
        pipe.fit(x.iloc[train_i], y.iloc[train_i], **fit_params)
        ypred[test_i] = pipe.predict(x.iloc[test_i])
        was_predicted[test_i] = True
    if not was_predicted.all():
        raise ValueError(
            'cv left {0} of {1} samples without a held-out prediction'.format(
                int(n_samples - was_predicted.sum()), n_samples))
    return confusion_matrix(y, ypred)

# Evaluate the logistic-regression pipeline on each outcome, both in-sample
# ("upon fitting") and with shuffled K-fold cross-validation ("upon CV").
# The shuffle is seeded so the CV tables are identical on every run; without a
# random_state each execution draws different folds and reports different numbers.
CV_SEED = 0
for outcome, name in zip([hinselmann, schiller, citology, biopsy],
                         ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']):
    direct_cm = direct_fit_classifier(lr_pipe, x, outcome, 'logreg')
    cv_cm = cv_classifier(lr_pipe, x, outcome,
                          KFold(shuffle=True, random_state=CV_SEED), 'logreg')

    display_model_stats(direct_cm, '{0} upon fitting'.format(name))
    display_model_stats(cv_cm, '{0} upon CV'.format(name))

Hinselmann upon fitting,Hinselmann upon fitting.1,Hinselmann upon fitting.2,Hinselmann upon fitting.3
,,Predicted,Predicted
,,-,+
Observed,-,457,181
Observed,+,7,23

Hinselmann upon fitting,Hinselmann upon fitting.1
Total accuracy,0.719
False positive rate,0.284
False negative rate,0.233


Hinselmann upon CV,Hinselmann upon CV.1,Hinselmann upon CV.2,Hinselmann upon CV.3
,,Predicted,Predicted
,,-,+
Observed,-,444,194
Observed,+,20,10

Hinselmann upon CV,Hinselmann upon CV.1
Total accuracy,0.68
False positive rate,0.304
False negative rate,0.667


Schiller upon fitting,Schiller upon fitting.1,Schiller upon fitting.2,Schiller upon fitting.3
,,Predicted,Predicted
,,-,+
Observed,-,479,126
Observed,+,31,32

Schiller upon fitting,Schiller upon fitting.1
Total accuracy,0.765
False positive rate,0.208
False negative rate,0.492


Schiller upon CV,Schiller upon CV.1,Schiller upon CV.2,Schiller upon CV.3
,,Predicted,Predicted
,,-,+
Observed,-,470,135
Observed,+,37,26

Schiller upon CV,Schiller upon CV.1
Total accuracy,0.743
False positive rate,0.223
False negative rate,0.587


Citology upon fitting,Citology upon fitting.1,Citology upon fitting.2,Citology upon fitting.3
,,Predicted,Predicted
,,-,+
Observed,-,457,172
Observed,+,16,23

Citology upon fitting,Citology upon fitting.1
Total accuracy,0.719
False positive rate,0.273
False negative rate,0.41


Citology upon CV,Citology upon CV.1,Citology upon CV.2,Citology upon CV.3
,,Predicted,Predicted
,,-,+
Observed,-,427,202
Observed,+,22,17

Citology upon CV,Citology upon CV.1
Total accuracy,0.665
False positive rate,0.321
False negative rate,0.564


Biopsy upon fitting,Biopsy upon fitting.1,Biopsy upon fitting.2,Biopsy upon fitting.3
,,Predicted,Predicted
,,-,+
Observed,-,517,106
Observed,+,18,27

Biopsy upon fitting,Biopsy upon fitting.1
Total accuracy,0.814
False positive rate,0.17
False negative rate,0.4


Biopsy upon CV,Biopsy upon CV.1,Biopsy upon CV.2,Biopsy upon CV.3
,,Predicted,Predicted
,,-,+
Observed,-,510,113
Observed,+,30,15

Biopsy upon CV,Biopsy upon CV.1
Total accuracy,0.786
False positive rate,0.181
False negative rate,0.667


In [7]:
###### Support vector machines ######
# The C value and the kernel are provisional and still need tuning.
svc_model = SVC(C=1000, kernel='poly')


def _svc_fit_predict(model, features, target):
    """Fit ``model`` on (features, target) and return its predictions for ``features``.

    The target is flattened to 1-D before fitting: passing scikit-learn a
    column vector of shape (n, 1) makes every ``fit`` call emit a
    DataConversionWarning. The predictions are in-sample, i.e. made on the
    same rows the model was just fitted on.
    """
    model.fit(features, np.ravel(target))
    return model.predict(features)


# The same estimator object is refitted for each outcome in turn, so after this
# cell ``svc_model`` holds the fit for the last outcome (biopsy). Each outcome
# variable is rebound to a plain NumPy array, as before.
hinselmann = np.array(hinselmann)
hinselmann_predict = _svc_fit_predict(svc_model, x, hinselmann)
print(hinselmann_predict)

schiller = np.array(schiller)
schiller_predict = _svc_fit_predict(svc_model, x, schiller)
print(schiller_predict)

citology = np.array(citology)
citology_predict = _svc_fit_predict(svc_model, x, citology)
print(citology_predict)

biopsy = np.array(biopsy)
biopsy_predict = _svc_fit_predict(svc_model, x, biopsy)
print(biopsy_predict)


  return f(*args, **kwargs)


[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

  return f(*args, **kwargs)


[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

  return f(*args, **kwargs)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

  return f(*args, **kwargs)


[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 