In [1]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from eipy.ei import EnsembleIntegration
import eipy.utils as ut
from eipy.additional_ensembles import MeanAggregation, CES
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn import datasets
import pickle as pkl
import os
from sklearn.impute import KNNImputer
pd.set_option('display.max_columns', None)

In [2]:
data_csvs = "/home/opc/block_vol/COVID-19 data/Modalities"
modalities = {}
for file_name in os.listdir(data_csvs):
        file_path = os.path.join(data_csvs, file_name)
        modality = os.path.splitext(file_name)[0]

        data = pd.read_csv(file_path)
        data = data.drop(columns=["NEW_MASKED_MRN"])
        modalities[modality] = data
y = pd.read_csv("/home/opc/block_vol/COVID-19 data/outcome.csv")


In [3]:
y

Unnamed: 0,NEW_MASKED_MRN,DECEASED_INDICATOR,DECEASED_in_0-3_DAYS,DECEASED_in_3-5_DAYS,DECEASED_in_5-7_DAYS,DECEASED_in_7-10_DAYS,DECEASED_in_0-5_DAYS,DECEASED_in_0-7_DAYS,DECEASED_in_0-10_DAYS,DECEASED_after_10_DAYS,DECEASED_after_5_DAYS
0,000226E8E50EBB57EEA5AC7328FA4903687A04405E72DC...,0,0.0,,,,0.0,0.0,0.0,,
1,0002B45EE1826821EA5F494FC2372029D281F7CB437925...,0,,,,,,,,,
2,00088A4B347D5C2A76701EAA3EB005878F4069991D428D...,1,,,,1.0,,,1.0,,1.0
3,0019BE48F6F4826E17B7E7198670B4222D14E152B5B95C...,0,0.0,,,,0.0,0.0,0.0,,
4,00204453E214D454BDE77BB70F7863CC5114CCCFA65A4E...,1,1.0,,,,1.0,1.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...
4778,FFD96857C2177E45055C3C70DDE3EC3877D9AA7FC8FA2C...,0,,,0.0,,,0.0,0.0,,0.0
4779,FFEB14484DB6A8CAE7D1B1DA07CE927E385EF7448D703D...,1,,1.0,,,1.0,1.0,1.0,,
4780,FFED69BBC5388C89549E1269D40CF294955B3D41DDA770...,0,,,,,,,,0.0,0.0
4781,FFEE8497E642E5B17E0FA82B8D43E9DDA375C67BD30468...,0,0.0,,,,0.0,0.0,0.0,,


In [4]:
y = y.drop(columns=["NEW_MASKED_MRN", "DECEASED_in_0-5_DAYS", "DECEASED_in_0-7_DAYS", "DECEASED_in_0-10_DAYS", "DECEASED_after_5_DAYS"], axis=1)
y['DECEASED_INDICATOR'] = y['DECEASED_INDICATOR'].replace({0: 1, 1: 0})
y = y.rename(columns={'DECEASED_INDICATOR': 'SURVIVED_INDICATOR'})
y=y.fillna(0)

In [5]:
y

Unnamed: 0,SURVIVED_INDICATOR,DECEASED_in_0-3_DAYS,DECEASED_in_3-5_DAYS,DECEASED_in_5-7_DAYS,DECEASED_in_7-10_DAYS,DECEASED_after_10_DAYS
0,1,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,1.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0
4,0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
4778,1,0.0,0.0,0.0,0.0,0.0
4779,0,0.0,1.0,0.0,0.0,0.0
4780,1,0.0,0.0,0.0,0.0,0.0
4781,1,0.0,0.0,0.0,0.0,0.0


In [6]:
row_sums = y.sum(axis=1)
bad_rows = y[(row_sums != 1)].index
bad_rows

Index([847, 1741, 2259, 2368], dtype='int64')

In [7]:
y = y[row_sums == 1]
y

Unnamed: 0,SURVIVED_INDICATOR,DECEASED_in_0-3_DAYS,DECEASED_in_3-5_DAYS,DECEASED_in_5-7_DAYS,DECEASED_in_7-10_DAYS,DECEASED_after_10_DAYS
0,1,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,1.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0
4,0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
4778,1,0.0,0.0,0.0,0.0,0.0
4779,0,0.0,1.0,0.0,0.0,0.0
4780,1,0.0,0.0,0.0,0.0,0.0
4781,1,0.0,0.0,0.0,0.0,0.0


In [8]:
for k,v in modalities.items():
    modalities[k] = v[~v.index.isin(bad_rows)]

In [9]:
modalities["admission"]

Unnamed: 0,Admission_AGE,Admission_DIASTOLIC_BP,Admission_HEART_RATE,Admission_O2_SAT,Admission_RESPIRATORY_RATE,Admission_SYSTOLIC_BP,Admission_TEMPERATURE,Admission_EMERGENCY_DEPARTMENT,Admission_RACE_ETHNICITY_COMBINED_AMERICAN INDIAN OR ALASKA NATIVE,Admission_RACE_ETHNICITY_COMBINED_ASIAN,Admission_RACE_ETHNICITY_COMBINED_BLACK OR AFRICAN-AMERICAN,Admission_RACE_ETHNICITY_COMBINED_HISPANIC,Admission_RACE_ETHNICITY_COMBINED_NATIVE HAWAIIAN OR PACIFIC ISLANDER,Admission_RACE_ETHNICITY_COMBINED_OTHER,Admission_RACE_ETHNICITY_COMBINED_UNKNOWN,Admission_RACE_ETHNICITY_COMBINED_WHITE,Admission_SEX_FEMALE,Admission_SEX_MALE,Admission_SMOKING_STATUS_NEVER,Admission_SMOKING_STATUS_NOT ASKED,Admission_SMOKING_STATUS_PASSIVE,Admission_SMOKING_STATUS_QUIT,Admission_SMOKING_STATUS_YES
0,50.0,65.0,124.0,99.0,20.0,102.0,103.3,1.0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
1,62.0,91.0,97.0,94.0,18.0,150.0,99.0,1.0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0
2,75.0,74.0,112.0,100.0,20.0,118.0,98.9,1.0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0
3,50.0,82.0,85.0,85.0,18.0,126.0,98.9,1.0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0
4,90.0,,132.0,73.0,25.0,,99.5,1.0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4778,69.0,83.0,100.0,95.0,20.0,146.0,98.1,1.0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0
4779,85.0,75.0,97.0,88.0,26.0,134.0,98.4,1.0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0
4780,78.0,68.0,84.0,93.0,16.0,97.0,97.6,1.0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
4781,49.0,96.0,89.0,95.0,18.0,178.0,97.7,1.0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0


In [10]:
imputer = KNNImputer(n_neighbors=6)
for k,v in modalities.items():
    modalities[k] = pd.DataFrame(imputer.fit_transform(v), columns=v.columns)

In [11]:
for k,v in modalities.items():
    if v.isna().any().any():
        print(f"There are NaN values in {k}")
    else:
        print(f"ALL G IN {k}")

ALL G IN labs
ALL G IN admission
ALL G IN comorbidities
ALL G IN vitals


In [12]:
for i, col in enumerate(y.columns):
    y = y.rename(columns={col: i})

In [13]:
y["labels"] = y.idxmax(axis=1)
y

Unnamed: 0,0,1,2,3,4,5,labels
0,1,0.0,0.0,0.0,0.0,0.0,0
1,1,0.0,0.0,0.0,0.0,0.0,0
2,0,0.0,0.0,0.0,1.0,0.0,4
3,1,0.0,0.0,0.0,0.0,0.0,0
4,0,1.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...
4778,1,0.0,0.0,0.0,0.0,0.0,0
4779,0,0.0,1.0,0.0,0.0,0.0,2
4780,1,0.0,0.0,0.0,0.0,0.0,0
4781,1,0.0,0.0,0.0,0.0,0.0,0


In [14]:
base_predictors = {
                    'ADAB': AdaBoostClassifier(),
                    'XGB': XGBClassifier(),
                    'DT': DecisionTreeClassifier(),
                    'RF': RandomForestClassifier(),
                    'GB': GradientBoostingClassifier(),
                    'KNN': KNeighborsClassifier(),
                    'LR': LogisticRegression(multi_class="auto", solver="lbfgs"),
                    'NB': GaussianNB(),
                    'MLP': MLPClassifier(),
                    'SVM': SVC(probability=True)
}

In [15]:
modalities.keys()

dict_keys(['labs', 'admission', 'comorbidities', 'vitals'])

In [16]:
labels = y["labels"].to_numpy()
labels

array([0, 0, 4, ..., 0, 0, 0])

In [17]:
labs_train, labs_test, y_train, y_test = train_test_split(modalities["labs"],labels, random_state=3, test_size=0.25, stratify=y)
admission_train, admission_test, _,_ = train_test_split(modalities["admission"],labels, random_state=3, test_size=0.25, stratify=y)
comorbidities_train, comorbidities_test, _,_ = train_test_split(modalities["comorbidities"],labels, random_state=3, test_size=0.25, stratify=y)
vitals_train, vitals_test, _,_ = train_test_split(modalities["vitals"],labels, random_state=3, test_size=0.25, stratify=y)

In [18]:
X_train = {
    "labs": labs_train,
    "admission": admission_train,
    "comorbities": comorbidities_train,
    "vitals" : vitals_train
}
X_test = {
    "labs": labs_test,
    "admission": admission_test,
    "comorbities": comorbidities_test,
    "vitals" : vitals_test
}

In [20]:
sampling_strategies = ["undersampling", "oversampling"] #hybrid as well?
metrics = ["accuracy", "precision", "recall", "f1"]

performance_frame = pd.DataFrame()

for strat in sampling_strategies:
    EI = EnsembleIntegration(
                    base_predictors=base_predictors,
                    k_outer=5,
                    k_inner=5,
                    n_samples=1,
                    sampling_strategy=strat,
                    sampling_aggregation=None,
                    n_jobs=-1,
                    random_state=42,
                    project_name="tadpole",
                    model_building=True,
                    )
    print(f"\n EI for {strat}")
    EI.fit_base(X_train, y_train)

    ut.predictive_multiclass_data(EI.meta_training_data)
    EI.fit_meta(meta_predictors=base_predictors)

    argmax = lambda x: np.argmax(x)
    cols_to_transform = [col for col in EI.meta_predictions.columns if col != 'labels']
    for column in cols_to_transform:
        EI.meta_predictions[column] = EI.meta_predictions[column].apply(argmax)
    
    df = EI.meta_predictions
    model_columns = df.columns[:-1]
    accuracies = {}
    for model in model_columns:
        correct_predictions = (df[model] == df["labels"]).sum()
        total_predictions = len(df)
        accuracy = correct_predictions / total_predictions
        accuracies[model] = accuracy
    accuracies = pd.Series(accuracies)
    accuracies = pd.DataFrame(accuracies)
    accuracies.rename(columns={0:"accuracy"}, inplace=True)
    accuracies = accuracies.T
    EI.meta_summary["metrics"] = pd.concat([EI.meta_summary["metrics"], accuracies])
    
    strat_scores = []
    for metric in metrics:
        preferred_meta_model = EI.meta_summary["metrics"].loc[metric].idxmax()
        y_pred = EI.predict(X_dict=X_test, meta_model_key=preferred_meta_model)
        y_pred = [np.argmax(np.array(y)) for y in y_pred]

        if metric == "accuracy":
            strat_scores.append(sum([1*(y==y_hat)+0*(y!=y_hat) for y,y_hat in list(zip(y_test, y_pred))])/len(y_test))
        elif metric == "precision":
            strat_scores.append(ut.precision_score(y_test, y_pred, average='macro'))
        elif metric == "recall":
            strat_scores.append(ut.recall_score(y_test, y_pred, average='macro'))
        elif metric == "f1":
            strat_scores.append(ut.f1_score(y_test, y_pred, average='macro'))
        print(strat_scores)
    
    performance_frame[strat] = strat_scores   


 EI for undersampling
Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |          |  0%

Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


[0.7271966527196653]
[0.7271966527196653, 0.25594422958829394]
[0.7271966527196653, 0.25594422958829394, 0.29446159545247125]
[0.7271966527196653, 0.25594422958829394, 0.29446159545247125, 0.26192340166084077]

 EI for oversampling
Training base predictors on labs...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on admission...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on comorbities...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%




Training base predictors on vitals...
        
... for ensemble performance analysis...


Generating meta training data: |██████████|100%
Generating meta test data: |██████████|100%



... for final ensemble...


Generating meta training data: |██████████|100%
Training final base predictors: |██████████|100%






Analyzing ensembles: |██████████|100%
Training final meta models: |██████████|100%


[0.7313807531380753]
[0.7313807531380753, 0.24398240299384819]
[0.7313807531380753, 0.24398240299384819, 0.2565406579792146]
[0.7313807531380753, 0.24398240299384819, 0.2565406579792146, 0.27485532734872425]


In [21]:
performance_frame = performance_frame.set_index(pd.Index(metrics))
performance_frame

Unnamed: 0,undersampling,oversampling
accuracy,0.727197,0.731381
precision,0.255944,0.243982
recall,0.294462,0.256541
f1,0.261923,0.274855
