# Notebook to test the chosen umd model on the held-out umd data

### To do: Implement Logan's model for all test scenarios

In [140]:
# Imports general
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Imports scikit-learn
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Import bespoke
import rail_utils
from rail_utils import PlotModels
import PointsModel
import rail_utils

In [141]:
# Load data
# Paths are relative to the notebook's working directory.
train_file = '../../data/processed/umd_data_standard_train.csv'
train_data = pd.read_csv(train_file)
test_file = '../../data/processed/umd_data_standard_test.csv'
test_data = pd.read_csv(test_file)

# Define variables
# Candidate predictor columns; the list order is the column order of the design matrices.
admission_variables = ['gender_Male', 'ethnicity_Hispanic', 'ethnicity_Black', 'ethnicity_White', 'ethnicity_Asian',
                       'term', 'bmi', 'age', 'allergies_Yes', 'asthma_Yes', 'gerd_Yes', 'tonsilsize_3-4', 'zscore']
# Outcome column that is thresholded into the binary label below.
test_variable = ['ahi']
# Subset of predictors handed to the scaler step of the pipeline in the model cell.
continuous_variables = ["bmi", "age", 'zscore']

# Variables to ignore based on distributions
# (names must match the column names exactly, i.e. with underscores)
# ignore_variables = ['ethnicity_Black', 'ethnicity_Hispanic', 'gerd_Yes', 'tonsilsize_3-4', 'age']
ignore_variables = []

# Drop the ignored variables while keeping the declared order. A set difference
# would return the names in an arbitrary order that can change between kernel
# restarts (string hashing is randomized per process), which would make the
# feature column order, and everything fitted on it, irreproducible.
final_variables = [var for var in admission_variables if var not in ignore_variables]
continuous_variables = [var for var in continuous_variables if var not in ignore_variables]

# Set AHI threshold
ahi_thresh = 5

# Set up training set
# Binary label: True when AHI is strictly above the threshold.
# Note: a missing AHI value compares as False, so such rows are labelled negative.
ytrain_orig = train_data[test_variable[0]] > ahi_thresh
xtrain_orig = train_data[final_variables]
ytest_orig = test_data[test_variable[0]] > ahi_thresh
xtest_orig = test_data[final_variables]

### Set up model

In [142]:
# Work on copies so the *_orig objects from the data-loading cell are never
# modified and this cell can be re-run on its own.
ytrain = ytrain_orig.copy()
xtrain = xtrain_orig.copy()
ytest = ytest_orig.copy()
xtest = xtest_orig.copy()

# The solver is pinned explicitly: L1 regularisation is not supported by the
# 'lbfgs' solver that newer scikit-learn releases use by default, so relying on
# the default raises a ValueError there. 'liblinear' was the default in older
# releases. random_state fixes liblinear's data shuffling so reruns are repeatable.
# models = {'best_model': LogisticRegression(penalty='l1', solver='liblinear', random_state=0), 'kang_model': PointsModel.PointsModel()}
models = {'best_model': LogisticRegression(penalty='l1', solver='liblinear', random_state=0)}

# Preprocessing steps as [transformer, columns] pairs consumed by PlotModels.
# NOTE(review): 'from before' presumably means "apply to the output of the
# previous step" -- confirm against rail_utils.PlotModels.
pipeline = [[preprocessing.StandardScaler(), continuous_variables],
            [preprocessing.PolynomialFeatures(degree=2), 'from before'],
            [PCA(n_components=6), 'from before']]

test_models = PlotModels(models=models, pipeline=pipeline)

# Run the preprocessing pipeline over the train and test splits.
# NOTE(review): presumably the transformers are fitted on the training split
# only and then applied to the test split -- confirm in _process_pipeline.
[xtrain, ytrain], [xtest, ytest] = test_models._process_pipeline([xtrain, ytrain], [xtest, ytest])

# Test cross-validation score on training set
auroc_scores = cross_val_score(models['best_model'], xtrain, ytrain, cv=5, scoring='roc_auc')
print('Training set cross-val AUC is: {:0.3f}'.format(auroc_scores.mean()))

Training set cross-val AUC is: 0.906


### Test model

In [143]:
# Fit the chosen classifier on the processed training split, then score the
# processed held-out split.
best_model = models['best_model']
best_model.fit(xtrain, ytrain)

# Keep only column 1 of predict_proba (the second entry of the classifier's classes_).
pred_proba = best_model.predict_proba(xtest)[:, 1]

# ROC curve points and the area under it for the held-out split.
fpr, tpr, _ = metrics.roc_curve(ytest, pred_proba)
auc = metrics.roc_auc_score(ytest, pred_proba)

# Store the results for plotting; the last slot is an empty label placeholder.
test_models._plot_vars['best_model'] = [fpr, tpr, auc, '']

### Test Kang et al.

In [144]:
# ytrain = ytrain_orig.copy()
# xtrain = xtrain_orig.copy()
# ytest = ytest_orig.copy()
# xtest = xtest_orig.copy()


# models['kang_model'].fit(xtrain, ytrain)
# pred_proba = models['kang_model'].predict_proba(xtest)[:, 1]
# auc = metrics.roc_auc_score(ytest, pred_proba)
# fpr, tpr, _ = metrics.roc_curve(ytest, pred_proba)
# test_models._plot_vars['kang_model'] = [fpr, tpr, auc, '']

### Print model results

In [145]:
# Generate the labels, then print the one stored for the chosen model.
# NOTE(review): _make_label presumably fills slot 3 of each _plot_vars entry
# (set to '' when the entry was created) -- confirm in rail_utils.PlotModels.
test_models._make_label()
best_model_entry = test_models._plot_vars['best_model']
print(best_model_entry[3])
# print(test_models._plot_vars['kang_model'][3])

best_model (AUC: 0.773, Sens: 0.620, Spec: 0.881)
