In [None]:
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
from sklearn.svm import SVC

In [None]:
# Pre-processing: get rid of the two columns that are mostly unfilled, then
# drop every patient row that still has a missing answer ("?" in the CSV is
# read as NaN).
cerv_raw = pd.read_csv('risk_factors_cervical_cancer.csv', na_values=["?"])
print(len(cerv_raw.columns))

sparse_columns = ['STDs: Time since last diagnosis',
                  'STDs: Time since first diagnosis']
cerv_data = cerv_raw.drop(columns=sparse_columns).dropna()

# The last four columns are the outcome columns; everything before them is
# used as the feature matrix.
x = cerv_data.iloc[:, :-4]
y = cerv_data.iloc[:, -4:]

In [None]:
# Confusion matrix utilities
from IPython.display import HTML

def print_confusion(cm):
    """Display a 2x2 confusion matrix as a labelled DataFrame.

    Rows are labelled Observed -/+ and columns Predicted -/+, both as
    two-level indexes with empty level names.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Confusion matrix laid out as [observed, predicted], negative first.
    """
    def _axis_labels(title):
        # Two entries per axis: (title, '-') followed by (title, '+').
        return pd.MultiIndex.from_product([[title], ['-', '+']], names=['', ''])

    labelled = pd.DataFrame(cm,
                            index=_axis_labels('Observed'),
                            columns=_axis_labels('Predicted'))
    display(labelled)

def get_score(cm):
    """Return overall accuracy: correctly classified samples / all samples.

    Parameters
    ----------
    cm : array-like of shape (k, k)
        Confusion matrix indexed [observed, predicted]. The diagonal holds
        the correct predictions for each class.

    Returns
    -------
    float
        Sum of the diagonal divided by the sum of all entries. For a 2x2
        matrix this equals (cm[0,0] + cm[1,1]) / cm.sum().
    """
    # np.trace sums the whole diagonal, so this stays correct when the
    # matrix has more (or fewer) than two classes.
    return np.trace(cm) / np.sum(cm)

def get_fp_rate(cm):
    """Return the false positive rate from a 2x2 confusion matrix.

    `cm` is indexed [observed, predicted] with the negative class first, so
    row 0 holds the observed negatives: cm[0, 0] were predicted negative and
    cm[0, 1] were predicted positive.
    """
    true_neg = cm[0, 0]
    false_pos = cm[0, 1]
    observed_neg = true_neg + false_pos
    return false_pos / observed_neg

def get_fn_rate(cm):
    """Return the false negative rate from a 2x2 confusion matrix.

    `cm` is indexed [observed, predicted] with the negative class first, so
    row 1 holds the observed positives: cm[1, 0] were predicted negative and
    cm[1, 1] were predicted positive.
    """
    false_neg = cm[1, 0]
    true_pos = cm[1, 1]
    observed_pos = true_pos + false_neg
    return false_neg / observed_pos

def display_model_stats(cm, name):
    """Render a model's confusion matrix and summary rates as HTML.

    Two tables are shown side by side (flex row): the 2x2 confusion matrix
    with Observed rows and Predicted columns, and a summary table with total
    accuracy, false positive rate and false negative rate, each formatted to
    three significant digits.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Confusion matrix indexed [observed, predicted], negative class first.
    name : str
        Caption placed in the header of both tables. It is inserted into the
        HTML as-is (not escaped).
    """
    # Doubled braces in the <style> section are literal CSS braces escaped
    # for str.format; the numbered fields are filled in at the bottom.
    # Every header cell is closed with </th> so the markup is well-formed.
    display(HTML('''
        <div class="row">
        <style scoped>
            tr th {{
                font-weight: 600;
                text-align: center;
            }}
            thead th {{
                font-weight: bold;
                text-align: center;
            }}
            .rotate {{
                transform: rotate(-180deg);
                writing-mode: vertical-rl;
                margin: 0em;
            }}
            .row {{ display: flex; }}
            .column {{ padding: 5px; }}
        </style>
        <div class="column">
            <table>
            <thead>
                <th colspan="4">{0}</th>
            </thead>
            <tr>
                <td rowspan="2" colspan="2"></td>
                <th colspan="2">Predicted</th>
            </tr>
            <tr>
                <th>-</th>
                <th>+</th>
            </tr>
            <tr>
                <th rowspan="2"><p class="rotate">Observed</p></th>
                <th>-</th>
                <td>{1}</td>
                <td>{2}</td>
            </tr>
            <tr>
                <th>+</th>
                <td>{3}</td>
                <td>{4}</td>
            </tr>
            </table>
        </div>
        <div class="column">
            <table>
            <thead>
                <th colspan="2">{0}</th>
            </thead>
            <tr>
                <th>Total accuracy</th>
                <td>{5:.3}</td>
            </tr>
            <tr>
                <th>False positive rate</th>
                <td>{6:.3}</td>
            </tr>
            <tr>
                <th>False negative rate</th>
                <td>{7:.3}</td>
            </tr>
            </table>
        </div>
        </div>
    '''.format(
        name,
        cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1],
        get_score(cm), get_fp_rate(cm), get_fn_rate(cm)
    )))

In [None]:
###### Logistic regression ######
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.utils import compute_sample_weight
from sklearn.metrics import confusion_matrix

# Pull each diagnostic outcome out of `y` as its own Series.
hinselmann, schiller, citology, biopsy = (
    y.loc[:, outcome_col]
    for outcome_col in ('Hinselmann', 'Schiller', 'Citology', 'Biopsy')
)

# Standardise the features, then fit a logistic regression with default settings.
lr_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('logreg', LogisticRegression()),
])

# Direct fitting
def direct_fit_classifier(pipe, x, y, balanced_step_name=None):
    """Fit `pipe` on all of (x, y) and return the in-sample confusion matrix.

    Parameters
    ----------
    pipe : estimator with fit/predict
        Fitted in place on the full data.
    x, y : features and target passed straight to `pipe.fit`.
    balanced_step_name : str, optional
        Name of the pipeline step that should receive class-balanced sample
        weights (as `<step>__sample_weight`). If falsy, no weights are passed.

    Returns
    -------
    Confusion matrix of `y` against the predictions on the same `x`.
    """
    fit_kwargs = {}
    if balanced_step_name:
        weight_key = f'{balanced_step_name}__sample_weight'
        fit_kwargs[weight_key] = compute_sample_weight('balanced', y)

    pipe.fit(x, y, **fit_kwargs)
    in_sample_pred = pipe.predict(x)
    return confusion_matrix(y, in_sample_pred)

# Cross-validation
def cv_classifier(pipe, x, y, cv, balanced_step_name=None):
    """Cross-validate `pipe` and return the out-of-fold confusion matrix.

    For every (train, test) split produced by `cv`, the pipeline is refit on
    the training rows and used to predict the test rows; the predictions are
    pooled and compared with `y`.

    Parameters
    ----------
    pipe : estimator with fit/predict
        Refit in place for each fold, so on return it holds the fit from the
        last fold.
    x, y : pandas objects supporting `.iloc` row selection.
    cv : cross-validation splitter exposing `split(X, y)`.
    balanced_step_name : str, optional
        Name of the pipeline step that should receive class-balanced sample
        weights computed on each training fold. If falsy, no weights are used.

    Returns
    -------
    Confusion matrix of observed vs. out-of-fold predicted labels, restricted
    to rows that appeared in at least one test fold.
    """
    n_samples = x.shape[0]
    ypred = np.empty(n_samples)
    # Tracks which rows actually received a prediction; np.empty leaves the
    # others uninitialised and they must not reach the confusion matrix.
    was_tested = np.zeros(n_samples, dtype=bool)

    # y is passed to split() so stratified splitters work; KFold ignores it.
    for train_i, test_i in cv.split(x, y):
        if balanced_step_name:
            fit_params = {'{0}__sample_weight'.format(balanced_step_name): compute_sample_weight('balanced', y.iloc[train_i])}
        else:
            fit_params = {}
        pipe.fit(x.iloc[train_i], y.iloc[train_i], **fit_params)
        ypred[test_i] = pipe.predict(x.iloc[test_i])
        was_tested[test_i] = True

    # With a partitioning splitter such as KFold every row is tested, so this
    # is the confusion matrix over the full data set.
    return confusion_matrix(np.asarray(y)[was_tested], ypred[was_tested])

# Fit and cross-validate the class-balanced logistic regression on each of
# the four outcomes, showing in-sample and out-of-fold statistics.
for outcome, name in zip([hinselmann, schiller, citology, biopsy],
                         ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']):
    direct_cm = direct_fit_classifier(lr_pipe, x, outcome, 'logreg')
    # Fixed seed: shuffled folds are otherwise different on every run, which
    # makes the reported CV numbers impossible to reproduce.
    cv_cm = cv_classifier(lr_pipe, x, outcome,
                          KFold(shuffle=True, random_state=0), 'logreg')

    display_model_stats(direct_cm, '{0} upon fitting'.format(name))
    display_model_stats(cv_cm, '{0} upon CV'.format(name))

In [None]:
###### Support vector machines ######
# We will have to play with the C value and the kernel.
svc_model = Pipeline([('scale', StandardScaler()), ('svm', SVC(C=1000, kernel='rbf'))])


def _svc_fit_predict_cv(model, features, outcome):
    """Fit `model` on one outcome, predict in-sample, and print its CV confusion matrix.

    Returns the outcome as a NumPy array together with the in-sample
    predictions. `model` is refit in place (first on the full data, then once
    per fold inside `cv_classifier`).
    """
    outcome_arr = np.array(outcome)
    model.fit(features, outcome_arr)
    in_sample_predict = model.predict(features)
    print(cv_classifier(model, features, outcome, KFold()))
    return outcome_arr, in_sample_predict


# Same procedure for each of the four outcomes; the module-level names are
# kept so that later cells can still refer to them.
hinselmann_arr, hinselmann_predict = _svc_fit_predict_cv(svc_model, x, hinselmann)
schiller_arr, schiller_predict = _svc_fit_predict_cv(svc_model, x, schiller)
citology_arr, citology_predict = _svc_fit_predict_cv(svc_model, x, citology)
biopsy_arr, biopsy_predict = _svc_fit_predict_cv(svc_model, x, biopsy)


In [None]:
# Grid search over C and kernel for an SVC on each outcome.
# cv_data[a, c, i] holds the CV accuracy for outcome a, kernel c, C value i.
tests = [hinselmann, schiller, citology, biopsy]
kernels_options = ['rbf', 'poly']
c_values = np.logspace(-4, 5, 20)
# Shape is derived from the grids so the three stay in sync.
cv_data = np.zeros((len(tests), len(kernels_options), len(c_values)))
for a, outcome in enumerate(tests):
    for i, b in enumerate(c_values):
        for c, kernel in enumerate(kernels_options):
            svc_model = Pipeline([('scale', StandardScaler()), ('svm', SVC(C=b, kernel=kernel))])
            direct_cm = direct_fit_classifier(svc_model, x, outcome)
            cv_cm = cv_classifier(svc_model, x, outcome, KFold())
            # Label each table with the outcome actually being fitted (the
            # Series name is its column label) plus the kernel and C value,
            # rather than a loop variable left over from an earlier cell.
            label = '{0} ({1} kernel, C={2:.3g})'.format(outcome.name, kernel, b)
            display_model_stats(direct_cm, '{0} upon fitting'.format(label))
            display_model_stats(cv_cm, '{0} upon CV'.format(label))
            cv_data[a, c, i] = get_score(cv_cm)
    

In [None]:
# One scatter plot of CV accuracy against C per (outcome, kernel) pair,
# followed by the best accuracy found for that pair.
for a in range(len(tests)):
    for b in range(len(kernels_options)):
        plt.scatter(c_values, cv_data[a, b, :])
        # Use the Series name (its column label) in the title; formatting the
        # Series itself would paste its entire contents into the title.
        plt.title('test {0}, kernel {1}'.format(tests[a].name, kernels_options[b]))
        plt.xscale('log')
        plt.xlabel('C')
        plt.ylabel('CV accuracy')
        plt.show()
        print(max(cv_data[a, b, :]))