In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn import svm, preprocessing
from sklearn.model_selection import cross_validate,cross_val_predict

In [2]:
data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_dataset/volumetric_perfusion_data/onset_scope/curated_onset_known_volumetric_data.xlsx'

In [3]:
data_df = pd.read_excel(data_path)

In [4]:
clinical_covars_names = ['Age (calc.)'	,'Sex',	'Referral'	,'Prestroke disability (Rankin)'	,'NIH on admission'	,'Antiplatelet drugs'	,'Anticoagulants',	'MedHist Stroke'	,'MedHist TIA',	'MedHist ICH',	'MedHist Hypertension',	'MedHist Diabetes', 'MedHist Hyperlipidemia', 'MedHist Smoking', 'MedHist Atrial Fibr.']
volumetric_covars_names = ['CBF', 'T10', 'T8', 'T6', 'T4']

In [5]:
outcome = data_df['TimeOnsetCT']
volumetric_covars = data_df[volumetric_covars_names]
clinical_covars = data_df[clinical_covars_names]
all_covars = data_df[(volumetric_covars_names + clinical_covars_names)]

In [23]:
binarised_outcome = outcome < (4.5 * 60)
binarised_outcome.value_counts()

True     347
False     43
Name: TimeOnsetCT, dtype: int64

## Framing as continuous problem

In [7]:
continuous_results_columns = ['method', 'covar selection', 'covars', 'mean_absolute_error', 'binarised_auc', 'binarised_accuracy']
continuous_results_df = pd.DataFrame(columns=continuous_results_columns)

In [8]:
def evaluate_continuous_model(model, model_name=None):
    if model_name is None:
        model_name = model.__class__.__name__
    volumetric_cv_results = cross_validate(model, volumetric_covars, outcome, cv=5, scoring=('neg_mean_absolute_error', 'neg_mean_absolute_percentage_error'))
    volumetric_cv_predictions = cross_val_predict(model, volumetric_covars, outcome, cv=5)
    volumetric_binarised_cv_predictions = volumetric_cv_predictions < (4.5 * 60)

    volumetric_result_df = pd.DataFrame([[model_name, 'volumetric', volumetric_covars_names,
                                  abs(volumetric_cv_results['test_neg_mean_absolute_error']).mean(),
                                  metrics.roc_auc_score(binarised_outcome, volumetric_binarised_cv_predictions),
                                  metrics.accuracy_score(binarised_outcome, volumetric_binarised_cv_predictions)
                                  ]], columns=continuous_results_columns)

    all_covar_cv_results = cross_validate(model, all_covars, outcome, cv=5, scoring=('neg_mean_absolute_error', 'neg_mean_absolute_percentage_error'))
    all_covar_cv_predictions = cross_val_predict(model, all_covars, outcome, cv=5)
    all_covar_binarised_cv_predictions = all_covar_cv_predictions < (4.5 * 60)

    all_covar_cv_result_df = pd.DataFrame([[model_name, 'volumetric + clinical',
                                            volumetric_covars_names + clinical_covars_names,
                                  abs(all_covar_cv_results['test_neg_mean_absolute_error']).mean(),
                                  metrics.roc_auc_score(binarised_outcome, all_covar_binarised_cv_predictions),
                                  metrics.accuracy_score(binarised_outcome, all_covar_binarised_cv_predictions)
                                  ]], columns=continuous_results_columns)

    return pd.concat([volumetric_result_df, all_covar_cv_result_df])


In [9]:
from sklearn.feature_selection import RFECV

linreg = LinearRegression()
lin_results_df = evaluate_continuous_model(linreg, LinearRegression().__class__.__name__)
continuous_results_df = continuous_results_df.append(lin_results_df)
lin_results_df


Unnamed: 0,method,covar selection,covars,mean_absolute_error,binarised_auc,binarised_accuracy
0,LinearRegression,volumetric,"[CBF, T10, T8, T6, T4]",130.112336,0.494337,0.861538
0,LinearRegression,volumetric + clinical,"[CBF, T10, T8, T6, T4, Age (calc.), Sex, Refer...",138.170033,0.491757,0.802564


In [10]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(random_state=1, max_iter=1000)

mlp_results_df = evaluate_continuous_model(mlp)

continuous_results_df = continuous_results_df.append(mlp_results_df)

mlp_results_df



Unnamed: 0,method,covar selection,covars,mean_absolute_error,binarised_auc,binarised_accuracy
0,MLPRegressor,volumetric,"[CBF, T10, T8, T6, T4]",134.865857,0.480129,0.8
0,MLPRegressor,volumetric + clinical,"[CBF, T10, T8, T6, T4, Age (calc.), Sex, Refer...",148.761841,0.51079,0.782051


In [11]:
from sklearn import linear_model

clf = linear_model.Lasso()

clf_results_df = evaluate_continuous_model(clf)
continuous_results_df = continuous_results_df.append(clf_results_df)

clf_results_df

Unnamed: 0,method,covar selection,covars,mean_absolute_error,binarised_auc,binarised_accuracy
0,Lasso,volumetric,"[CBF, T10, T8, T6, T4]",130.097795,0.494337,0.861538
0,Lasso,volumetric + clinical,"[CBF, T10, T8, T6, T4, Age (calc.), Sex, Refer...",136.856867,0.488774,0.815385


In [12]:

svr = make_pipeline(preprocessing.StandardScaler(), svm.SVR())

svr_results_df = evaluate_continuous_model(svr, svm.SVR().__class__.__name__)
continuous_results_df = continuous_results_df.append(svr_results_df)

svr_results_df

Unnamed: 0,method,covar selection,covars,mean_absolute_error,binarised_auc,binarised_accuracy
0,SVR,volumetric,"[CBF, T10, T8, T6, T4]",102.175419,0.5,0.889744
0,SVR,volumetric + clinical,"[CBF, T10, T8, T6, T4, Age (calc.), Sex, Refer...",102.564593,0.5,0.889744


In [13]:
from sklearn.tree import DecisionTreeRegressor

DTregressor = DecisionTreeRegressor()

dtr_results = evaluate_continuous_model(DTregressor)
continuous_results_df = continuous_results_df.append(dtr_results)

dtr_results

Unnamed: 0,method,covar selection,covars,mean_absolute_error,binarised_auc,binarised_accuracy
0,DecisionTreeRegressor,volumetric,"[CBF, T10, T8, T6, T4]",183.542906,0.494739,0.789744
0,DecisionTreeRegressor,volumetric + clinical,"[CBF, T10, T8, T6, T4, Age (calc.), Sex, Refer...",193.628205,0.538369,0.794872


## Framing as binary problem

In [14]:
binary_results_columns = ['method', 'covar selection', 'covars', 'auc', 'accuracy', 'f1', 'precision', 'recall']
binary_results_df = pd.DataFrame(columns=binary_results_columns)

In [15]:

def evaluate_binary_model(model, model_name=None):
    if model_name is None:
        model_name = model.__class__.__name__
    scoring = ('roc_auc', 'accuracy', 'f1', 'precision', 'recall')
    volumetric_cv_results = cross_validate(model, volumetric_covars, binarised_outcome, cv=5, scoring=scoring)

    volumetric_result_df = pd.DataFrame([[model_name, 'volumetric', volumetric_covars_names,
                                  np.median(volumetric_cv_results['test_roc_auc']),
                                  np.median(volumetric_cv_results['test_accuracy']),
                                  np.median(volumetric_cv_results['test_f1']),
                                  np.median(volumetric_cv_results['test_precision']),
                                  np.median(volumetric_cv_results['test_recall']),
                                  ]], columns=binary_results_columns)

    all_covar_cv_results = cross_validate(model, all_covars, binarised_outcome, cv=5, scoring=scoring)

    all_covar_cv_result_df = pd.DataFrame([[model_name, 'volumetric + clinical',
                                            volumetric_covars_names + clinical_covars_names,
                                  np.median(all_covar_cv_results['test_roc_auc']),
                                  np.median(all_covar_cv_results['test_accuracy']),
                                  np.median(all_covar_cv_results['test_f1']),
                                  np.median(all_covar_cv_results['test_precision']),
                                  np.median(all_covar_cv_results['test_recall']),
                                  ]], columns=binary_results_columns)

    return pd.concat([volumetric_result_df, all_covar_cv_result_df])

In [16]:
list(metrics.SCORERS.keys())

['explained_variance',
 'r2',
 'max_error',
 'neg_median_absolute_error',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_root_mean_squared_error',
 'neg_mean_poisson_deviance',
 'neg_mean_gamma_deviance',
 'accuracy',
 'top_k_accuracy',
 'roc_auc',
 'roc_auc_ovr',
 'roc_auc_ovo',
 'roc_auc_ovr_weighted',
 'roc_auc_ovo_weighted',
 'balanced_accuracy',
 'average_precision',
 'neg_log_loss',
 'neg_brier_score',
 'adjusted_rand_score',
 'rand_score',
 'homogeneity_score',
 'completeness_score',
 'v_measure_score',
 'mutual_info_score',
 'adjusted_mutual_info_score',
 'normalized_mutual_info_score',
 'fowlkes_mallows_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'jaccard',
 'jaccard_macro',
 'jaccard_m

In [17]:
logreg = LogisticRegression()

logreg_results_df = evaluate_binary_model(logreg)
binary_results_df = binary_results_df.append(logreg_results_df)
logreg_results_df


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,method,covar selection,covars,auc,accuracy,f1,precision,recall
0,LogisticRegression,volumetric,"[CBF, T10, T8, T6, T4]",0.539286,0.884615,0.938776,0.884615,1.0
0,LogisticRegression,volumetric + clinical,"[CBF, T10, T8, T6, T4, Age (calc.), Sex, Refer...",0.464286,0.884615,0.938776,0.884615,1.0


In [18]:
from sklearn.ensemble import RandomForestClassifier

rfm = RandomForestClassifier()
rfm_results_df = evaluate_binary_model(rfm)
binary_results_df = binary_results_df.append(rfm_results_df)
rfm_results_df

Unnamed: 0,method,covar selection,covars,auc,accuracy,f1,precision,recall
0,RandomForestClassifier,volumetric,"[CBF, T10, T8, T6, T4]",0.557971,0.871795,0.931507,0.883117,0.985507
0,RandomForestClassifier,volumetric + clinical,"[CBF, T10, T8, T6, T4, Age (calc.), Sex, Refer...",0.626409,0.884615,0.938776,0.884615,1.0


In [19]:
clf_svm = svm.SVC()
clf_svm_results_df = evaluate_binary_model(clf_svm)
binary_results_df = binary_results_df.append(clf_svm_results_df)
clf_svm_results_df

Unnamed: 0,method,covar selection,covars,auc,accuracy,f1,precision,recall
0,SVC,volumetric,"[CBF, T10, T8, T6, T4]",0.414654,0.884615,0.938776,0.884615,1.0
0,SVC,volumetric + clinical,"[CBF, T10, T8, T6, T4, Age (calc.), Sex, Refer...",0.447665,0.884615,0.938776,0.884615,1.0


In [29]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier

tuned_parameters = {
        'activation': (['relu','logistic']),
        'hidden_layer_sizes':([[80,160,80],[78,156,78],[88,176,88],[80,160]]),
        'alpha':     ([0.01, 0.001, 0.0001]),
        'batch_size':         [32,64],
        'learning_rate_init':    [0.01, 0.001],
        'solver': ["adam"]}

mlp = MLPClassifier(max_iter=5000)
mlp_clf = RandomizedSearchCV(mlp, tuned_parameters)

mlp_svm_results_df = evaluate_binary_model(mlp_clf)
binary_results_df = binary_results_df.append(mlp_svm_results_df)
mlp_svm_results_df

Unnamed: 0,method,covar selection,covars,auc,accuracy,f1,precision,recall
0,MLPClassifier,volumetric,"[CBF, T10, T8, T6, T4]",0.453571,0.871795,0.930556,0.893333,0.971014
0,MLPClassifier,volumetric + clinical,"[CBF, T10, T8, T6, T4, Age (calc.), Sex, Refer...",0.497585,0.871795,0.931507,0.890411,0.971429


In [27]:
always_true = np.ones(len(binarised_outcome))
always_true_results = pd.DataFrame([['always_true', 'none', [],
                                  metrics.roc_auc_score(binarised_outcome, always_true),
                                  metrics.accuracy_score(binarised_outcome, always_true),
                                  metrics.f1_score(binarised_outcome, always_true),
                                  metrics.precision_score(binarised_outcome, always_true),
                                  metrics.recall_score(binarised_outcome, always_true),
                                  ]], columns=binary_results_columns)

binary_results_df = binary_results_df.append(always_true_results)

always_true_results

Unnamed: 0,method,covar selection,covars,auc,accuracy,f1,precision,recall
0,always_true,none,[],0.5,0.889744,0.941655,0.889744,1.0


In [28]:
import os
output_dir = '/Users/jk1/OneDrive - unige.ch/stroke_research/scope/onset_output/volumetric_performance'
binary_results_df.to_excel(os.path.join(output_dir, 'binary_prediction_from_volumetric_variables.xlsx'))
continuous_results_df.to_excel(os.path.join(output_dir, 'continuous_prediction_from_volumetric_variables.xlsx'))