# Figures 3A, 3B and 4A

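This notebook creates:

1. Figure 3A, showing the ROC curves of all models,
2. Figure 3B, showing model performance at different sample sizes (extrapolated up to samples of N=2,000),
3. Figure 4A, showing the impact of the number and nature of predictors on model performance, and
4. two supplementary figures that repeat the Figure 4A comparison for the MLP and XGBoost models.

The sections below appear in the order: Figure 3A, Figure 4A, supplementary figures, Figure 3B.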

In [None]:
# Import relevant packages

import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from sklearn import metrics
from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import StratifiedKFold 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Figure 3A: More complex models?

In [None]:
# Load the LR ROC curve (false positive rate, then true positive rate)

with open('scores_lr_github.npy', 'rb') as f:
    # Both arrays are read one after the other from the same open file handle.
    lr_fpr, lr_tpr = np.load(f), np.load(f)

In [None]:
# Load the MLP ROC curve (false positive rate, then true positive rate)

with open('scores_mlp_github.npy', 'rb') as f:
    # Both arrays are read one after the other from the same open file handle.
    mlp_fpr, mlp_tpr = np.load(f), np.load(f)

In [None]:
# Load the ROC curve of our own XGBoost model (false positive rate, then true positive rate)

with open('scores_xgboost_own_github.npy', 'rb') as f:
    # Both arrays are read one after the other from the same open file handle.
    xgboost_own_fpr, xgboost_own_tpr = np.load(f), np.load(f)

In [None]:
# Load the ROC curve of the external XGBoost model (false positive rate, then true positive rate)

with open('scores_xgboost_other_github.npy', 'rb') as f:
    # Both arrays are read one after the other from the same open file handle.
    xgboost_other_fpr, xgboost_other_tpr = np.load(f), np.load(f)

In [None]:
# Plot the ROC curves of the four models (Figure 3A)
# NOTE: the AUC values shown in the legend are hard-coded in the labels; they are
# not computed in this notebook.

# Create a single 10x8 figure. A preceding plt.figure(1) call was dropped: it only
# opened an extra, empty default-sized figure that was never drawn on.
plt.figure(figsize=(10, 8))

# One curve per model
plt.plot(lr_fpr, lr_tpr, label='LR, AUC = 0.72', color='#7a5071', linewidth=4)
plt.plot(mlp_fpr, mlp_tpr, label='MLP, AUC = 0.70', color='#ce8080', linewidth=4)
plt.plot(xgboost_own_fpr, xgboost_own_tpr, label='XGBoost, AUC = 0.70', color='#5698a3', linewidth=4)
plt.plot(xgboost_other_fpr, xgboost_other_tpr, label='External XGBoost, AUC = 0.62', color='#bacfec', linewidth=4)

# Dashed diagonal as a visual reference (TPR == FPR)
plt.plot([0, 1], [0, 1], 'k--')

plt.xticks(size=25)
plt.yticks(size=25)
plt.xlabel('False positive rate', size=35)
plt.ylabel('True positive rate', size=35)
plt.title('ROC curve', size=35)
plt.legend(loc=4, prop={'size': 19})  # loc=4 is the lower-right corner
plt.savefig('roc_models_Epilepsia.png')

# Figure 4A: Better data? Using LR

In [None]:
# Load LR select predictors (same file as the LR curve loaded for Figure 3A)

with open('scores_lr_github.npy', 'rb') as f:
    # Both arrays are read one after the other from the same open file handle.
    lr_fpr, lr_tpr = np.load(f), np.load(f)

In [None]:
# Load LR all predictors
# (the file name and the 'LR - All predictors' legend label given to lr_random_* in
# the plotting cell below show that this is the all-predictors model, not MRI only)

with open('scores_lr_all_github.npy', 'rb') as f:
    lr_random_fpr = np.load(f)
    lr_random_tpr = np.load(f)

In [None]:
# Load LR MRI only
# (the file name and the 'LR - MRI diagnosis only' legend label given to lr_reduced_*
# in the plotting cell below show that this is the MRI-only model, not all predictors)

with open('scores_lr_mri_only_github.npy', 'rb') as f:
    lr_reduced_fpr = np.load(f)
    lr_reduced_tpr = np.load(f)

In [None]:
# Plot the ROC curves of the three LR predictor sets (Figure 4A)
# NOTE: the AUC values shown in the legend are hard-coded in the labels; they are
# not computed in this notebook.

# Create a single 10x8 figure. A preceding plt.figure(1) call was dropped: it only
# opened an extra, empty default-sized figure that was never drawn on.
plt.figure(figsize=(10, 8))

# One curve per predictor set
plt.plot(lr_fpr, lr_tpr, label='LR - Select predictors, AUC = 0.72', color='#677719', linewidth=4)
plt.plot(lr_random_fpr, lr_random_tpr, label='LR - All predictors, AUC = 0.69', color='#e37c1d', linewidth=4)
plt.plot(lr_reduced_fpr, lr_reduced_tpr, label='LR - MRI diagnosis only, AUC = 0.59', color='#a30234', linewidth=4)

# Dashed diagonal as a visual reference (TPR == FPR)
plt.plot([0, 1], [0, 1], 'k--')

plt.xticks(size=25)
plt.yticks(size=25)
plt.xlabel('False positive rate', size=35)
plt.ylabel('True positive rate', size=35)
plt.title('ROC curve', size=35)
plt.legend(loc=4, prop={'size': 20})  # loc=4 is the lower-right corner
plt.savefig('roc_predictors_Epilepsia.png')

# Supplementary figure: Better data? Using MLP

In [None]:
# Load MLP select predictors
# NOTE: the variable names keep the lr_ prefix of the LR section but hold the MLP
# curve from here on; this overwrites the LR values loaded for Figure 4A.

with open('scores_mlp_github.npy', 'rb') as f:
    lr_fpr = np.load(f)
    lr_tpr = np.load(f)

In [None]:
# Load MLP all predictors
# (the file name and the 'MLP - All predictors' legend label given to lr_random_* in
# the plotting cell below show that this is the all-predictors model, not MRI only)
# NOTE: despite the lr_ prefix, these variables hold MLP results in this section.

with open('scores_mlp_all_github.npy', 'rb') as f:
    lr_random_fpr = np.load(f)
    lr_random_tpr = np.load(f)

In [None]:
# Load MLP MRI only
# (the file name and the 'MLP - MRI diagnosis only' legend label given to lr_reduced_*
# in the plotting cell below show that this is the MRI-only model, not all predictors)
# NOTE: despite the lr_ prefix, these variables hold MLP results in this section.

with open('scores_mlp_mri_only_github.npy', 'rb') as f:
    lr_reduced_fpr = np.load(f)
    lr_reduced_tpr = np.load(f)

In [None]:
# Plot the ROC curves of the three MLP predictor sets (supplementary figure)
# NOTE: the lr_* variables hold MLP results here (they are reloaded in the cells above).
# NOTE: the AUC values shown in the legend are hard-coded in the labels; they are
# not computed in this notebook.

# Create a single 10x8 figure. A preceding plt.figure(1) call was dropped: it only
# opened an extra, empty default-sized figure that was never drawn on.
plt.figure(figsize=(10, 8))

# One curve per predictor set
plt.plot(lr_fpr, lr_tpr, label='MLP - Select predictors, AUC = 0.70', color='#677719', linewidth=4)
plt.plot(lr_random_fpr, lr_random_tpr, label='MLP - All predictors, AUC = 0.65', color='#e37c1d', linewidth=4)
plt.plot(lr_reduced_fpr, lr_reduced_tpr, label='MLP - MRI diagnosis only, AUC = 0.50', color='#a30234', linewidth=4)

# Dashed diagonal as a visual reference (TPR == FPR)
plt.plot([0, 1], [0, 1], 'k--')

plt.xticks(size=25)
plt.yticks(size=25)
plt.xlabel('False positive rate', size=35)
plt.ylabel('True positive rate', size=35)
plt.title('ROC curve', size=35)
plt.legend(loc=4, prop={'size': 19})  # loc=4 is the lower-right corner

# Save next to the notebook, like the main figures, instead of to a machine-specific
# absolute path that only exists on the original author's computer.
plt.savefig('Supp-figure-2.png')

# Supplementary figure: Better data? Using XGBoost

In [None]:
# Load XGBoost select predictors
# NOTE: the variable names keep the lr_ prefix of the LR section but hold the XGBoost
# curve from here on; this overwrites the values loaded in the previous section.

with open('scores_xgboost_own_github.npy', 'rb') as f:
    lr_fpr = np.load(f)
    lr_tpr = np.load(f)

In [None]:
# Load XGBoost all predictors
# (the file name and the 'XGBoost - All predictors' legend label given to lr_random_*
# in the plotting cell below show that this is the all-predictors model, not MRI only)
# NOTE: despite the lr_ prefix, these variables hold XGBoost results in this section.

with open('scores_xgboost_own_all_github.npy', 'rb') as f:
    lr_random_fpr = np.load(f)
    lr_random_tpr = np.load(f)

In [None]:
# Load XGBoost MRI only
# (the file name and the 'XGBoost - MRI diagnosis only' legend label given to
# lr_reduced_* in the plotting cell below show that this is the MRI-only model)
# NOTE: despite the lr_ prefix, these variables hold XGBoost results in this section.

with open('scores_xgboost_own_mri_only_github.npy', 'rb') as f:
    lr_reduced_fpr = np.load(f)
    lr_reduced_tpr = np.load(f)

In [None]:
# Plot the ROC curves of the three XGBoost predictor sets (supplementary figure)
# NOTE: the lr_* variables hold XGBoost results here (they are reloaded in the cells above).
# NOTE: the AUC values shown in the legend are hard-coded in the labels; they are
# not computed in this notebook.

# Create a single 10x8 figure. A preceding plt.figure(1) call was dropped: it only
# opened an extra, empty default-sized figure that was never drawn on.
plt.figure(figsize=(10, 8))

# One curve per predictor set
plt.plot(lr_fpr, lr_tpr, label='XGBoost - Select predictors, AUC = 0.70', color='#677719', linewidth=4)
plt.plot(lr_random_fpr, lr_random_tpr, label='XGBoost - All predictors, AUC = 0.68', color='#e37c1d', linewidth=4)
plt.plot(lr_reduced_fpr, lr_reduced_tpr, label='XGBoost - MRI diagnosis only, AUC = 0.59', color='#a30234', linewidth=4)

# Dashed diagonal as a visual reference (TPR == FPR)
plt.plot([0, 1], [0, 1], 'k--')

plt.xticks(size=25)
plt.yticks(size=25)
plt.xlabel('False positive rate', size=35)
plt.ylabel('True positive rate', size=35)
plt.title('ROC curve', size=35)
plt.legend(loc=4, prop={'size': 19})  # loc=4 is the lower-right corner

# Save next to the notebook, like the main figures, instead of to a machine-specific
# absolute path that only exists on the original author's computer.
plt.savefig('Supp-figure-3.png')

# Figure 3B: Larger samples?

In [None]:
# Load the LR learning-curve data: sample sizes, scores, fitted curve parameters,
# and the lower/upper bounds of the shaded band

with open('larger_sample_lr_github.npy', 'rb') as f:
    # Five arrays are read back in sequence from the same open file handle.
    lr_x, lr_y, lr_popt, lr_y_mins, lr_y_maxs = (np.load(f) for _ in range(5))

In [None]:
# Load the MLP learning-curve data: sample sizes, scores, fitted curve parameters,
# and the lower/upper bounds of the shaded band

with open('larger_sample_mlp_github.npy', 'rb') as f:
    # Five arrays are read back in sequence from the same open file handle.
    mlp_x, mlp_y, mlp_popt, mlp_y_mins, mlp_y_maxs = (np.load(f) for _ in range(5))

In [None]:
# Load the learning-curve data of our own XGBoost model: sample sizes, scores,
# fitted curve parameters, and the lower/upper bounds of the shaded band

with open('larger_sample_xgboost_own_github.npy', 'rb') as f:
    # Five arrays are read back in sequence from the same open file handle.
    (xgboost_own_x,
     xgboost_own_y,
     xgboost_own_popt,
     xgboost_own_y_mins,
     xgboost_own_y_maxs) = (np.load(f) for _ in range(5))

In [None]:
# Import additional packages (SciPy)

from scipy.stats import powerlaw
from scipy.optimize import curve_fit
from scipy.integrate import quad

In [None]:
# Define the curve used for the fitted and extrapolated learning curves

def func_inverse_powerlaw(x, a, b, c):
    """Evaluate the inverse power-law curve ``(1 - a) - b * x**c``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the curve (the sample sizes in this notebook).
    a, b, c : float
        Curve parameters: ``1 - a`` is the constant term, ``b`` scales the
        power term and ``c`` is its exponent.

    Returns
    -------
    Same shape as ``x``: the curve value at each point.
    """
    constant_term = 1 - a
    power_term = b * x**c
    return constant_term - power_term

target_func = func_inverse_powerlaw

# Sample sizes at which the fitted curves are extrapolated (1 ... 1999)
x_expand = np.arange(1, 2000)

# One entry per model:
# (name, colour, sample sizes, scores, fitted parameters, lower band, upper band, legend label of the points)
# Only the LR points carry a legend label, so the observed points appear once in the legend.
learning_curves = [
    ('LR', '#7a5071', lr_x, lr_y, lr_popt, lr_y_mins, lr_y_maxs,
     'Learning curve on dataset points'),
    ('MLP', '#ce8080', mlp_x, mlp_y, mlp_popt, mlp_y_mins, mlp_y_maxs,
     None),
    ('XGBoost', '#5698a3', xgboost_own_x, xgboost_own_y, xgboost_own_popt,
     xgboost_own_y_mins, xgboost_own_y_maxs,
     None),
]

# Plot
plt.figure(figsize=(10, 8))

for name, colour, xs, ys, popt, y_mins, y_maxs, points_label in learning_curves:
    # Observed points, converted to percent. The format string is marker-only ('o'):
    # the colour comes from the keyword, and combining 'ro' with color= made
    # matplotlib warn about a redundantly defined colour.
    plt.plot(xs, ys*100, 'o', color=colour, label=points_label)

    # Fitted curve evaluated at this model's own sample sizes. Driving all models
    # through the same loop fixes the XGBoost curve, which used to be evaluated
    # at the MLP sample sizes (mlp_x) while being plotted against xgboost_own_x.
    plt.plot(xs, target_func(xs, *popt)*100, '-', color=colour,
             label=f'{name} - Fitted inverse power-function')

    # Same fitted curve extrapolated over the expanded range of sample sizes
    plt.plot(x_expand, target_func(x_expand, *popt)*100, '--', color=colour,
             label=f'{name} - Prediction on expanded dataset')

    # Shaded band between the loaded lower and upper bounds.
    # NOTE(review): unlike the curves, the bounds are plotted as loaded (no *100),
    # so they are presumably stored in percent and sampled on x_expand - confirm
    # against the notebook that wrote the .npy files.
    plt.fill_between(x_expand, y_mins, y_maxs, color=colour, alpha=0.15)

plt.ylim([40, 80])
plt.xticks(size=25)
plt.yticks(size=25)
plt.ylabel('Accuracy (%)', fontsize=35)
plt.xlabel('Sample size', fontsize=35)
plt.legend(loc=4, prop={'size': 19})  # loc=4 is the lower-right corner


plt.savefig('inverse_power_law_Epilepsia.png')