In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from eipy.metrics import fmax_score
from sklearn.metrics import roc_auc_score, matthews_corrcoef, f1_score
import pandas as pd
import numpy as np
import eipy.ei as e
import os
import pickle as pkl
import longitudinal_tadpole.pipeline as p

In [None]:
# Scoring functions, keyed by the short name they are reported under.
# This mapping is handed to EnsembleIntegration through its `metrics=` argument.
metrics = dict(
    f_1=f1_score,
    auc=roc_auc_score,
    mcc=matthews_corrcoef,
)

In [None]:
# Base classifiers, keyed by the abbreviation used to identify them downstream.
# All estimators use library defaults except the SVM, which is built with
# probability=True.
base_predictors = dict(
    ADAB=AdaBoostClassifier(),
    XGB=XGBClassifier(),
    DT=DecisionTreeClassifier(),
    RF=RandomForestClassifier(),
    GB=GradientBoostingClassifier(),
    KNN=KNeighborsClassifier(),
    LR=LogisticRegression(),
    NB=GaussianNB(),
    MLP=MLPClassifier(),
    SVM=SVC(probability=True),
)

In [None]:
# Both pickles are resolved relative to the directory the notebook is run from.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted files.
cwd = os.getcwd()
data_pickle_path = f"{cwd}/longitudinal_tadpole/tadpole_data/tadpole_data_as_dfs/tadpole_data_time_imptn_norm_thrshld30_dfs.pickle"
labels_pickle_path = f"{cwd}/longitudinal_tadpole/tadpole_data/tadpole_data_as_dfs/tadpole_labels_time_imptn_norm_thrshld30_dfs.pickle"

with open(data_pickle_path, "rb") as file:
    data_nested_dict = pkl.load(file)

with open(labels_pickle_path, "rb") as file:
    labels = pkl.load(file)

In [None]:
# One entry per top-level key of the loaded data, in the mapping's own key order.
# (assumes data_nested_dict is a plain dict -- confirm against the pickle contents)
data_list_of_dicts = list(data_nested_dict.values())

In [None]:
# Replace every label entry with a copy whose index is a fresh 0..n-1 range.
# Iterating over a snapshot of the keys keeps the in-place reassignment explicit.
for key in list(labels):
    labels[key] = labels[key].reset_index(drop=True)

In [None]:
# Go through a DataFrame (one column per key of `labels`) so the labels end up
# ordered correctly in time, then keep only the underlying NumPy array.
labels = pd.DataFrame(labels).to_numpy()

In [None]:
# Multiclass version of the data: map each diagnosis string to its integer code.
# A label missing from encoding_dict raises KeyError.
encoding_dict = {'NL': 0, 'MCI': 1, 'Dementia': 2}

labels = np.vectorize(encoding_dict.__getitem__)(labels)

## Generate base predictor (bp) data

In [None]:
def get_column_names(df):
    """Collect the unique values of each column level, minus the label placeholder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns contain a ``"labels"`` entry on level 0 and an
        empty string ``''`` on every deeper level.

    Returns
    -------
    list of pandas.Index
        One Index per column level, in level order. ``"labels"`` is dropped
        from level 0 and ``''`` from every other level.

    Raises
    ------
    KeyError
        Raised by ``Index.drop`` when the entry to drop is absent from a level.
    """
    columns = df.columns
    return [
        columns.get_level_values(level).unique().drop("labels" if level == 0 else '')
        for level in range(columns.nlevels)
    ]

def fix_first_time_point(df):
    """Expand each prediction column of ``df`` into three per-class columns.

    A new frame is built whose columns are the product of the level values
    returned by ``get_column_names(df)`` and the classes ``[0, 1, 2]``. For
    every original column ``c``:

    * class 0 holds ``1 - df[c]``
    * class 1 holds ``df[c]``
    * class 2 holds the constant ``0``

    The ``'labels'`` column of ``df`` is copied over unchanged.

    The hard-coded level names mean ``df`` must have exactly three column
    levels. Presumably this is used because the first time point carries a
    single score per predictor rather than one per class -- TODO confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with MultiIndex columns plus a ``'labels'`` column.

    Returns
    -------
    pandas.DataFrame
        Frame with a fourth column level named ``"class"`` and a ``'labels'`` column.
    """
    level_values = get_column_names(df)
    level_values.append([0, 1, 2])
    expanded_columns = pd.MultiIndex.from_product(
        iterables=level_values,
        names=["modality", "base predictor", "sample", "class"],
    )
    new_df = pd.DataFrame(columns=expanded_columns)

    for column in new_df.columns:
        # The last entry is the class; the rest addresses the source column in df.
        source_column, class_id = column[:-1], column[-1]
        if class_id == 0:
            new_df[column] = 1 - df[source_column]
        elif class_id == 1:
            new_df[column] = df[source_column]
        else:
            new_df[column] = 0

    new_df['labels'] = df['labels']

    return new_df

In [None]:
def generate_RNN_data(seed, aligned=True):
    """Fit the base predictors at every time step and regroup the results by fold.

    For each entry of the notebook-level ``data_list_of_dicts`` an
    ``EnsembleIntegration`` object is built and ``fit_base`` is called on it
    with the full ``labels`` array. The per-time-step training and test data
    are then rearranged to be indexed ``[fold][time step]``. The first time
    step of every fold is passed through ``fix_first_time_point``.

    Parameters
    ----------
    seed : int
        Passed to ``EnsembleIntegration`` as ``random_state``.
    aligned : bool, default True
        Accepted for interface compatibility; not used in the body.

    Returns
    -------
    tuple
        ``(RNN_training_data, RNN_test_data, base_summaries, EIs)``. The first
        two are lists of 5 folds, each a list with one item per time step.
        The last two are lists with one item per time step.
    """
    # Single source for the outer fold count: used for k_outer and for regrouping.
    n_outer_folds = 5

    per_timestep = []
    for t, X_train_test_timestep in enumerate(data_list_of_dicts):
        # time dependent data splitting
        ensemble = e.EnsembleIntegration(
            base_predictors=base_predictors,
            k_outer=n_outer_folds,
            k_inner=5,
            n_samples=1,
            sampling_strategy='oversampling',
            n_jobs=-1,
            metrics=metrics,
            random_state=seed,
            project_name=f'time step {t}',
            model_building=True,
            time_series=(1, t),
        )
        print(f"generating metadata for timestep {t}")
        ensemble.fit_base(X_train_test_timestep, labels)
        per_timestep.append(
            (
                ensemble.ensemble_training_data,
                ensemble.ensemble_test_data,
                ensemble.base_summary,
                ensemble,
            )
        )

    # Swap arrangement across folds and time: [time][fold] -> [fold][time].
    # assumes the ensemble data containers are indexable by fold number -- TODO confirm
    folds = range(n_outer_folds)
    RNN_training_data = [[train[fold] for train, _, _, _ in per_timestep] for fold in folds]
    RNN_test_data = [[test[fold] for _, test, _, _ in per_timestep] for fold in folds]
    base_summaries = [summary for _, _, summary, _ in per_timestep]
    EIs = [fitted for _, _, _, fitted in per_timestep]

    for train_series, test_series in zip(RNN_training_data, RNN_test_data):
        train_series[0] = fix_first_time_point(train_series[0])
        test_series[0] = fix_first_time_point(test_series[0])

    return RNN_training_data, RNN_test_data, base_summaries, EIs


In [None]:
#data at final timepoint is copy of data from 2nd to last timepoint. Only there to make downstream formatting easier.

# Write one pickle per seed. The output directory is resolved from the current
# working directory (as the input pickles are) instead of a machine-specific
# absolute path, and is created if it does not exist yet.
output_dir = os.path.join(
    os.getcwd(),
    "longitudinal_tadpole",
    "base_predictions",
    "multiclass",
    "data_at_n_w_labels_at_n",
    "oversampling",
)
os.makedirs(output_dir, exist_ok=True)

for seed in range(5):
    # Generate first, open afterwards: opening with "wb" truncates the file, so a
    # failure inside generate_RNN_data must not leave an empty or clobbered pickle.
    base_prediction_data = generate_RNN_data(seed=seed, aligned=True)
    with open(os.path.join(output_dir, f"split_{seed}.pkl"), "wb") as file:
        pkl.dump(obj=base_prediction_data, file=file)