# Additional experiments

- Models:
    - Logistic Regresion
    - Random Forest
    - Imbalanced bagging classifier
- Feature selection:
    - SelectFromModel (final feature selection used)
    - RFE (Ranking Feature)

## Import libraries

In [1]:
# Data handling
import pandas as pd
import numpy as np

# Stadistics
from scipy.stats import chi2_contingency,mannwhitneyu, ttest_ind
import statsmodels
import statsmodels.api as sm

# Machine Learning
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import roc_auc_score, precision_score, recall_score, confusion_matrix, roc_curve, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier 
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

# Notebooks options
%pylab inline
pd.options.display.max_columns = None
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


## Load data


In [2]:
df = pd.read_excel("/home/ubuntu/TROMIA/data_proc/data_clean_final.xlsx")
df.shape

(1150, 265)

## Model variables
These variables have been chosen after a database cleaning and preprocessing process.

In [3]:
vars_model = ['age', 'gender', 'weight', 'height', 'bmi', 'bsa', 'hta', 'diabetes', 'smoking',
              'fa_type_1.0', 'fa_type_2.0', 'atrial_flutter', 'priorstroke', 'priorhemorrhagicstroke',
              'prior_tia', 'prior_systemic_embolization_all', 'prior_peripheral_embolization',
              'prior_heart_failure', 'prior_vascular_disease', 'previous_cad', 'previous_mi',
              'previous_pci', 'previous_cabg', 'prior_valvsurgery_description_grouped_1',
              'prior_valvsurgery_description_grouped_2', 'prior_valvsurgery_description_grouped_3',
              'chads2', 'chads2-vasc', 'has-bled', 'creatclcockroft_new', 'insuficiencia_renal_grave',
              'prior_dialysis', 'liver_disease_clean', 'prior_bleeding_clean',
              'priorbleeding_number_of_episodes', 'labile_inr_clean', 'alcohol',
              'chronicnsaiduse', 'prior_high_risk_falls_clean', 'pre_aas', 'pre_adp_inhibitor',
              'pre_coumadin_warfarin', 'pre_othersanticoag', 'pre_e_lvef', 'pre_severe_mitral_regurgitation',
              'pre_severe_mitral_stenosis', 'procedure_anesthesia', 'procedural_tee', 'procedure_combined',
              'procedure_reposition_(if_available)', 'procedure_rhythm', 'procedure_contrast',
              'laa_morphology_comb_1.0', 'laa_morphology_comb_2.0', 'laa_morphology_comb_3.0',
              'laa_morphology_comb_4.0', 'laa_morphology_comb_5.0', 'laa_closure_device_size',
              'laa_closure_device_number', 'procedure_complications', 'pr_comp_other',
              'pr_comp_major_bleeding', 'pr_comp_major_vascular_complications', 'pr_comp_device_embolization',
              'pr_comp_cardiac_tamponade', 'h_days', 'h_stroke', 'h_systemic_embolism', 'h_pericardialefussion',
              'h_dialysis', 'd_asa', 'd_adp_inhibitor', 'd_coumadin_warfarin', 'd_otheranticoag', 'd_enoxaparin',
              'complete_occlusion']

## Functions

In [4]:
def specificity(y_true, y_pred):
    '''calculate specificty
    Args:
        y_true: Array of the label to be predicted
        y_pred: Array of the label predicted by the model
    Returns:
        Specificity
    '''
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn+fp)

In [5]:
def experimentMLmethod(featureSelectionMethod,
                       model_ml,
                       gkf,
                       df = df.copy(), 
                       vars_model = vars_model,
                       endpoint = "last_fu_tee_thrombus"):
    ''' Feature selection + Modeling by fold
    Args:
        featureSelectionMethod: method of feature selector
        model_ml: model machine learning
        gkf: generator k-fold
        df: dataframe
        vars_model: name os predictors var
        endpoint: name of var to predict
    Returns:
        new_df = data frame with the results by k-fold
    '''
    
    # Preprare data
    inds = df[endpoint].isna() == False

    X = df.loc[inds,vars_model]
    Y = df.loc[inds,endpoint]

    i = 0
    results = {}

    # Training for each k-fold
    for train_index, test_index in gkf.split(X,Y):
        i+=1
        fold = '{} fold'.format(i)
        result = {}
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

        # Feature selection (SelectFromModel)
        selector = featureSelectionMethod.fit(X_train, Y_train)
        X_new = selector.transform(X)

        X_train, X_test = X_new[train_index], X_new[test_index]

        feature_mask = np.array(selector.get_support())
        feature_names = np.array(X.columns.tolist())
        selected_features = list(feature_names[feature_mask])

        # Perform the training of the model
        model = model_ml.fit(X_train, Y_train)
        ## cut-off short
        prtr = model.predict_proba(X_train)
        fpr, tpr, thresholds = roc_curve(Y_train, prtr[:,1])
        ROC_scores = []
        for thresh in thresholds:
            ROC_scores.append(roc_auc_score(Y_train, [m > thresh for m in prtr[:,1]]))

        ROCs = np.array(ROC_scores)
        max_ROC = ROCs.max() 
        pc =  thresholds[ROCs.argmax()]

        result['feats'] = list(selector.get_feature_names_out())
        try:
            result['coefs'] = model.coef_[0]
        except:
            result['coefs'] = np.nan

        preds = model.predict_proba(X_test)
        try:
            ## Results
            result['ROC'] = roc_auc_score(Y_test, preds[:,1])
            result['specificity'] = specificity(Y_test, np.array([1 if p>pc else 0 for p in preds[:,1]]))
            result['sensitivity'] = recall_score(Y_test, np.array([1 if p>pc else 0 for p in preds[:,1]]))
        except ValueError:
            pass
        results[fold] = result
        
    new_df = pd.DataFrame()
    for k in results.keys():
        df_temp = pd.DataFrame()
        df_temp['fold'] = [int("".join(filter(str.isdigit, k)))]*len(results[k]['feats'])
        df_temp['feats'] = results[k]['feats']
        df_temp['coefs'] = results[k]['coefs']
        if 'ROC' in results[k].keys():
            df_temp['ROC'] = [results[k]['ROC']]*len(results[k]['feats'])
            df_temp['specificity'] = [results[k]['specificity']]*len(results[k]['feats'])
            df_temp['sensitivity'] = [results[k]['sensitivity']]*len(results[k]['feats'])
        else:
            df_temp['ROC'] = [None]*len(results[k]['feats'])
            df_temp['specificity'] = [None]*len(results[k]['feats'])
            df_temp['sensitivity'] = [None]*len(results[k]['feats'])
        new_df = pd.concat([new_df,df_temp])
    return new_df

## Experiments

In [6]:
cvfolds = 2
rep = 5
seed = 0

endpoint = "last_fu_tee_thrombus"

# generator k-fold
gkf = RepeatedKFold(n_splits= cvfolds, n_repeats= rep, random_state= seed)

### Experiment IV
Model: **Logistic Regresion**  
Feature selection: **Select From Model**

In [7]:
# Generate the LR with a fixed seed
lr = LogisticRegression(class_weight="auto",random_state=seed, C=1.75)
# Feature selection
featureSelectionMethod = SelectFromModel(estimator=lr)

# Experiment IV
df_KFOLD_ExpIV = experimentMLmethod(featureSelectionMethod = featureSelectionMethod, 
                                   model_ml = lr,
                                   gkf = gkf)

In [8]:
print('AUC:\t\t',df_KFOLD_ExpIV['ROC'].mean(), '±', df_KFOLD_ExpIV['ROC'].std())
print('sensitivity:\t',df_KFOLD_ExpIV['sensitivity'].mean(), '±', df_KFOLD_ExpIV['sensitivity'].std())
print('specificity:\t',df_KFOLD_ExpIV['specificity'].mean(), '±', df_KFOLD_ExpIV['specificity'].std())

AUC:		 0.5325048793233625 ± 0.03493085393122986
sensitivity:	 0.28035204918164286 ± 0.07707287614858943
specificity:	 0.7634286134994225 ± 0.05176101563325354


### Experiment V
Model: **Logistic Regresion**  
Feature selection: **RFE**

In [9]:
# Generate the LR with a fixed seed
lr = LogisticRegression(class_weight="auto",random_state=seed, C=1.75)
# Feature selection
featureSelectionMethod = RFE(estimator=lr, )

# Experiment V
df_KFOLD_ExpV = experimentMLmethod(featureSelectionMethod = featureSelectionMethod, 
                                   model_ml = lr,
                                   gkf = gkf)

In [10]:
print('AUC:\t\t',df_KFOLD_ExpV['ROC'].mean(), '±', df_KFOLD_ExpV['ROC'].std())
print('sensitivity:\t',df_KFOLD_ExpV['sensitivity'].mean(), '±', df_KFOLD_ExpV['sensitivity'].std())
print('specificity:\t',df_KFOLD_ExpV['specificity'].mean(), '±', df_KFOLD_ExpV['specificity'].std())

AUC:		 0.5274065823848352 ± 0.026172201643472226
sensitivity:	 0.1606302521008403 ± 0.10119488447369068
specificity:	 0.8125560150001486 ± 0.06139524479854509


### Experiment VI
Model: **Random Forest**  
Feature selection: **Select From Model**

In [11]:
# Generate the RF with a fixed seed
rf = RandomForestClassifier()
# Feature selection
featureSelectionMethod = SelectFromModel(estimator=rf, )

# Experiment VI
df_KFOLD_ExpVI = experimentMLmethod(featureSelectionMethod = featureSelectionMethod, 
                                   model_ml = rf,
                                   gkf = gkf)

In [12]:
print('AUC:\t\t',df_KFOLD_ExpVI['ROC'].mean(), '±', df_KFOLD_ExpVI['ROC'].std())
print('sensitivity:\t',df_KFOLD_ExpVI['sensitivity'].mean(), '±', df_KFOLD_ExpVI['sensitivity'].std())
print('specificity:\t',df_KFOLD_ExpVI['specificity'].mean(), '±', df_KFOLD_ExpVI['specificity'].std())

AUC:		 0.4157303810590389 ± 0.05807137988651967
sensitivity:	 0.0273567329905358 ± 0.027464489871399603
specificity:	 0.9030806413696075 ± 0.02207370324530736


### Experiment VII
Model: **Random Forest**  
Feature selection: **RFE**

In [13]:
# Generate the RF with a fixed seed
rf = RandomForestClassifier()
# Feature selection
featureSelectionMethod = RFE(estimator=rf, )

# Experiment VII
df_KFOLD_ExpVII = experimentMLmethod(featureSelectionMethod = featureSelectionMethod, 
                                   model_ml = rf,
                                   gkf = gkf)

In [14]:
print('AUC:\t\t',df_KFOLD_ExpVII['ROC'].mean(), '±', df_KFOLD_ExpVII['ROC'].std())
print('sensitivity:\t',df_KFOLD_ExpVII['sensitivity'].mean(), '±', df_KFOLD_ExpVII['sensitivity'].std())
print('specificity:\t',df_KFOLD_ExpVII['specificity'].mean(), '±', df_KFOLD_ExpVII['specificity'].std())

AUC:		 0.47545187023649715 ± 0.07107261957499394
sensitivity:	 0.050961718020541544 ± 0.06144089185910559
specificity:	 0.9130765172332662 ± 0.02874075243001848


### Experiment VIII
Model: **Balanced Bagging Classifier**  
Feature selection: **Select From Model**

In [15]:
# Generate the LR y BBC with a fixed seed
lr = LogisticRegression(class_weight="auto",random_state=seed, C=1.75)
bbc = BalancedBaggingClassifier(random_state=seed)

# Feature selection
featureSelectionMethod = SelectFromModel(estimator=lr, )

# Experiment VIII
df_KFOLD_ExpVIII = experimentMLmethod(featureSelectionMethod = featureSelectionMethod, 
                                   model_ml = bbc,
                                   gkf = gkf)

In [16]:
print('AUC:\t\t',df_KFOLD_ExpVIII['ROC'].mean(), '±', df_KFOLD_ExpVIII['ROC'].std())
print('sensitivity:\t',df_KFOLD_ExpVIII['sensitivity'].mean(), '±', df_KFOLD_ExpVIII['sensitivity'].std())
print('specificity:\t',df_KFOLD_ExpVIII['specificity'].mean(), '±', df_KFOLD_ExpVIII['specificity'].std())

AUC:		 0.5311312919941665 ± 0.05731413301655689
sensitivity:	 0.18695986985859636 ± 0.09611166893396776
specificity:	 0.8500080687209852 ± 0.06876877508222827


### Experiment IX
Model: **Balanced Bagging Classifier**  
Feature selection: **RFE**

In [17]:
# Generate the LR y BBC with a fixed seed
lr = LogisticRegression(class_weight="auto",random_state=seed, C=1.75)
bbc = BalancedBaggingClassifier(random_state=seed)

# Feature selection
featureSelectionMethod = RFE(estimator=lr, )

# Experiment IX
df_KFOLD_ExpIX = experimentMLmethod(featureSelectionMethod = featureSelectionMethod, 
                                   model_ml = bbc,
                                   gkf = gkf)

In [18]:
print('AUC:\t\t',df_KFOLD_ExpIX['ROC'].mean(), '±', df_KFOLD_ExpIX['ROC'].std())
print('sensitivity:\t',df_KFOLD_ExpIX['sensitivity'].mean(), '±', df_KFOLD_ExpIX['sensitivity'].std())
print('specificity:\t',df_KFOLD_ExpIX['specificity'].mean(), '±', df_KFOLD_ExpIX['specificity'].std())

AUC:		 0.5027853202408172 ± 0.05949585770020268
sensitivity:	 0.1895798319327731 ± 0.15866304599082545
specificity:	 0.7961599879651461 ± 0.1082821820983072
