# Notebook to train best umd model on umd data, and test on chat data

In [9]:
# Imports general
import pandas as pd
import numpy as np

# Imports scikit-learn
from sklearn import preprocessing
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Import bespoke
import PointsModel
import rail_utils
from rail_utils import PlotModels

### Load UMD data

In [36]:
# Load data
train_file = '../../data/processed/umd_data_standard_all.csv'
train_data = pd.read_csv(train_file)
test_file = '../../data/processed/chat_data_standard_all.csv'
test_data = pd.read_csv(test_file)

# Define variables
admission_variables = ['gender_Male', 'ethnicity_Hispanic','ethnicity_Black', 'ethnicity_White', 'ethnicity_Asian', \
                       'term', 'bmi', 'age', 'allergies_Yes', 'asthma_Yes', 'gerd_Yes', 'tonsilsize_3-4', 'zscore']
test_variable = ['ahi']
continuous_variables = ["bmi", "age", 'zscore']
reference_variables = ['reference: osa18', 'reference: psq', 'reference: ess']

# Variables to ignore based on distributions
# ignore_variables = ['ethnicity Black', 'ethnicity Hispanic', 'gerd_Yes', 'tonsilsize_3-4', 'age']
ignore_variables = []

final_variables = list(set(admission_variables) - set(ignore_variables))
continuous_variables = list(set(continuous_variables) - set(ignore_variables))

# Set thresholds
ahi_thresh = 5
ref_thresh = {'reference: osa18': 60, 'reference: psq': 0.33, 'reference: ess': 8.01}

# Set up training and test set
ytrain_orig = pd.Series(train_data['ahi']>ahi_thresh, index=train_data.index)
xtrain_orig = train_data[final_variables]
ytest_orig = pd.Series(test_data['ahi']>ahi_thresh, index=test_data.index)
xtest_orig = test_data[final_variables]
yref = test_data[reference_variables]

### Set up model

In [35]:
ytrain = ytrain_orig.copy()
xtrain = xtrain_orig.copy()
ytest = ytest_orig.copy()
xtest = xtest_orig.copy()

models = {'best_model': LogisticRegression(penalty='l1'), 'kang_model': PointsModel.PointsModel()}

pipeline = [[preprocessing.StandardScaler(), continuous_variables],
            [preprocessing.PolynomialFeatures(degree=2), 'from before'],
            [PCA(n_components=6), 'from before']]

test_models = PlotModels(models=models, pipeline=pipeline)

[xtrain, ytrain], [xtest, ytest] = test_models._process_pipeline([xtrain, ytrain], [xtest, ytest])

# Test cross-validation score on training set
auroc_scores = cross_val_score(models['best_model'], xtrain, ytrain, cv=5, scoring ='roc_auc')
print('Training set cross-val AUC is: {:0.3f}'.format(auroc_scores.mean()))

Training set cross-val AUC is: 0.786


### Test model

In [34]:
# Model train and test
models['best_model'].fit(xtrain, ytrain)
pred_proba = models['best_model'].predict_proba(xtest)[:, 1]
auc = metrics.roc_auc_score(ytest, pred_proba)
fpr, tpr, _ = metrics.roc_curve(ytest, pred_proba)
test_models._plot_vars['best_model'] = [fpr, tpr, auc, '']

### Test kang model

In [33]:
ytrain = ytrain_orig.copy()
xtrain = xtrain_orig.copy()
ytest = ytest_orig.copy()
xtest = xtest_orig.copy()

models['kang_model'].fit(xtrain, ytrain)
pred_proba = models['kang_model'].predict_proba(xtest)[:, 1]
auc = metrics.roc_auc_score(ytest, pred_proba)
fpr, tpr, _ = metrics.roc_curve(ytest, pred_proba)
test_models._plot_vars['kang_model'] = [fpr, tpr, auc, '']

Optimization terminated successfully.
         Current function value: 0.605563
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.608981
         Iterations 5


### Print model results

In [32]:
test_models._make_label()
print(test_models._plot_vars['best_model'][3])
print(test_models._plot_vars['kang_model'][3])

best_model (AUC: 0.575, Sens: 0.570, Spec: 0.576)
kang_model (AUC: 0.415, Sens: 0.042, Spec: 0.974)


### Test reference surveys on test set

In [31]:
models['best_model'].fit(xtrain, ytrain)
pred_proba = models['best_model'].predict_proba(xtest)[:, 1]
auc = metrics.roc_auc_score(ytest, pred_proba)
fpr, tpr, _ = metrics.roc_curve(ytest, pred_proba)

ref_sens_spec = {ref_metric: [] for ref_metric in reference_variables}

for ref_metric in reference_variables:
    ypred = pd.Series(yref[ref_metric]>=ref_thresh[ref_metric], index=yref.index)
    sensitivity, specificity = rail_utils.sens_and_spec(ytest, ypred)
    ref_sens_spec[ref_metric] = [sensitivity, specificity]

# Compare sensitivity and specificity to reference values
for ref_metric in reference_variables:
    print('{} --> Sensitivity = {:0.3f}, Specificity = {:0.3f}'.format(ref_metric, \
    ref_sens_spec[ref_metric][0], ref_sens_spec[ref_metric][1]))
    sensitivity, specificity = rail_utils.match_sens(fpr, tpr, ref_sens_spec[ref_metric][0])
    print('Match sensitivity for {} --> Sensitivity = {:0.3f}, Specificity = {:0.3f}'.format(ref_metric, \
    sensitivity, specificity))
    sensitivity, specificity = rail_utils.match_spec(fpr, tpr, ref_sens_spec[ref_metric][1])
    print('Match specificity for {} --> Sensitivity = {:0.3f}, Specificity = {:0.3f}'.format(ref_metric, \
    sensitivity, specificity))
    print('')

reference: osa18 --> Sensitivity = 0.360, Specificity = 0.701
Match sensitivity for reference: osa18 --> Sensitivity = 0.360, Specificity = 0.771
Match specificity for reference: osa18 --> Sensitivity = 0.411, Specificity = 0.710

reference: psq --> Sensitivity = 0.822, Specificity = 0.251
Match sensitivity for reference: psq --> Sensitivity = 0.822, Specificity = 0.238
Match specificity for reference: psq --> Sensitivity = 0.804, Specificity = 0.255

reference: ess --> Sensitivity = 0.364, Specificity = 0.636
Match sensitivity for reference: ess --> Sensitivity = 0.364, Specificity = 0.762
Match specificity for reference: ess --> Sensitivity = 0.458, Specificity = 0.636

