In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np
import eipy.ei as e
import tensorflow as tf
import keras
from keras.callbacks import EarlyStopping, TensorBoard, Callback # type: ignore
from tensorboard import notebook
import keras.backend as K # type: ignore
from keras.models import Sequential, Model # type: ignore
from keras.layers import LSTM,Dense, Bidirectional, GRU, Dropout # type: ignore
from keras.layers import BatchNormalization, SimpleRNN, Input, Lambda, TimeDistributed # type: ignore
from keras.regularizers import l2 # type: ignore
import matplotlib.pyplot as plt
import os
import pickle as pkl
import longitudinal_tadpole.pipeline as p
from sklearn.model_selection import StratifiedKFold
from longitudinal_tadpole.threshmax import ThreshMax

In [None]:
cwd = os.getcwd()

# Directory holding the pickled TADPOLE inputs. The default is the original
# location; set the TADPOLE_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
TADPOLE_DATA_DIR = os.environ.get(
    "TADPOLE_DATA_DIR",
    "/Users/susmaa01/Documents/eipy/longitudinal_tadpole/tadpole_data",
)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point TADPOLE_DATA_DIR at pickles produced by a trusted source.
with open(os.path.join(TADPOLE_DATA_DIR, "tadpole_data_time_imptn_norm_thrshld30.pickle"), "rb") as file:
    data_nested_dict = pkl.load(file)
with open(os.path.join(TADPOLE_DATA_DIR, "tadpole_labels_time_imptn_norm_thrshld30.pickle"), "rb") as file:
    labels = pkl.load(file)

In [None]:
# For every outer key, join all of its inner arrays side by side (axis=1) into
# one array. NOTE(review): presumably outer keys are time points and inner keys
# are data modalities — confirm against the pickle's producer.
concatenated_data = {
    outer_key: np.concatenate(list(inner_arrays.values()), axis=1)
    for outer_key, inner_arrays in data_nested_dict.items()
}

In [None]:
# Stack the per-key arrays along a new axis 1, in the dict's insertion order.
X = np.stack(list(concatenated_data.values()), axis=1)

In [None]:
# Derive the prepared data and the label array from the raw nested dict and labels
# using the project pipeline helper.
# NOTE(review): `data` is not referenced again in the visible cells; `y` is later
# sliced as a 2-D array, presumably (subjects, time points) — confirm.
data, y = p.initial_data_label_prep(data_nested_dict, labels)

# TIME SERIES TIME

In [None]:
def ohe(y, num_classes=3):
    """One-hot encode integer class labels.

    Parameters
    ----------
    y : array-like of int
        Class indices in ``[0, num_classes)``; may have any shape.
    num_classes : int, default 3
        Length of the one-hot axis. The default keeps the original
        three-class behaviour.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``y.shape + (num_classes,)`` where the last
        axis holds the one-hot encoding of each label.
    """
    # Row i of the identity matrix is the one-hot vector for class i, so
    # indexing the identity with the labels encodes every entry at once.
    # np.asarray makes lists/tuples behave like arrays under fancy indexing.
    labels_across_time = np.eye(num_classes)[np.asarray(y)]

    return labels_across_time

In [None]:
# Drop the first label column so the targets start at the second column of y.
# NOTE(review): presumably column 0 is the baseline visit and the model is meant
# to predict the later visits — confirm.
y_offset = y[:,1:]

In [None]:
# Ordinal categorical cross-entropy: each time step's error is weighted by how many classes the prediction was off by (ordinal weight between 1 and 2), and per-time-step class weights are applied as well.
# https://stats.stackexchange.com/questions/87826/machine-learning-with-ordered-labels
#gamma = d(MCI,CN) - d(MCI,Dementia) = d(MCI,CN) - 1

def occ_loss(gamma=0):
    """Build an ordinal, class-weighted categorical cross-entropy loss.

    Parameters
    ----------
    gamma : int or 'none', default 0
        Extra distance inserted between class 0 and the other classes when
        measuring how far a prediction is from the truth. With ``'none'``
        the ordinal weight is skipped and only class weights are applied.

    Returns
    -------
    callable
        ``loss(y_true, y_pred)`` that averages the weighted cross-entropy of
        the first four time steps for each sample.

    Notes
    -----
    The returned function reads the module-level ``class_weights`` when it
    is called: a sequence indexed by time step whose items are dicts; the
    dict values, in insertion order, are looked up by true class index.
    """
    def loss(y_true, y_pred):
        global class_weights
        true_idx = tf.argmax(y_true, axis=-1)
        pred_idx = tf.argmax(y_pred, axis=-1)

        per_step_losses = []
        for t in range(4):
            true_idx_t = true_idx[:, t]

            # Class weight of every sample's true class at this time step.
            weight_table_t = tf.constant(list(class_weights[t].values()), dtype=tf.float32)
            w_c_t = tf.gather(weight_table_t, true_idx_t)

            cce_t = tf.keras.losses.categorical_crossentropy(y_true[:, t], y_pred[:, t])

            if gamma == 'none':
                step_loss = cce_t * w_c_t
            else:
                # Shift every non-zero class by gamma so class 0 sits further
                # from the rest, then scale the class distance into [0, 1]
                # and add 1, giving an ordinal weight between 1 and 2.
                pred_idx_t = pred_idx[:, t]
                true_shifted = true_idx_t + tf.cast(true_idx_t != 0, tf.int64) * gamma
                pred_shifted = pred_idx_t + tf.cast(pred_idx_t != 0, tf.int64) * gamma
                w_o_t = tf.cast(tf.abs(true_shifted - pred_shifted) / (2 + gamma), dtype='float32') + 1

                step_loss = cce_t * w_o_t * w_c_t

            per_step_losses.append(step_loss)

        # Stack the time steps on a trailing axis and average over them.
        return tf.reduce_mean(tf.stack(per_step_losses, axis=-1), axis=-1)
    return loss

In [None]:
def dem_prev(label_seq):
    """Count, for each label sequence, how many entries equal class 2.

    Parameters
    ----------
    label_seq : iterable of numpy.ndarray
        One label array per sample (e.g. the rows of a 2-D label array).

    Returns
    -------
    list
        One count per sequence, in input order.
    """
    counts = []
    for seq in label_seq:
        counts.append(np.sum(seq == 2))
    return counts

In [None]:
def fit_restandardize(data):
    """Standardize ``data`` along axis 0 and return the statistics used.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose first axis indexes samples.

    Returns
    -------
    tuple
        ``(standardized, means, stds)`` where ``means`` and ``stds`` are the
        (adjusted) axis-0 statistics, reusable with ``restandardize``.
    """
    center = np.mean(data, axis=0)
    scale = np.std(data, axis=0)

    # Entries whose mean is exactly 0 are centred on 1 instead, and zero
    # standard deviations are replaced by 1, so an all-zero column maps to
    # -1 rather than dividing by zero.
    # NOTE(review): the mean mask also hits non-constant columns whose mean
    # happens to be exactly 0 — confirm that is intended.
    np.putmask(center, center == 0, 1)
    np.putmask(scale, scale == 0, 1)

    standardized = (data - center) / scale
    return standardized, center, scale

def restandardize(data, means, stds):
    """Apply previously fitted statistics: subtract ``means``, divide by ``stds``."""
    centered = data - means
    return centered / stds

Train LSTMs

In [None]:
cell_list = ["LSTM"]  # other candidate cell names: "GRU", 'RNN', "biLSTM", "biGRU"

# Decision rules understood by train_eval_RNNs.
_DECISION_RULES = ("argmax", "threshmax")


def _per_class_scores_over_time(y_true, y_hat):
    """Return per-class F1, precision and recall stacked over time steps.

    ``y_true`` and ``y_hat`` are label arrays indexed as [sample, time step].
    Each returned array has one row per time step holding the per-class
    scores that scikit-learn reports with ``average=None``.
    """
    steps = range(y_true.shape[1])
    f_scores = np.stack([f1_score(y_true[:, i], y_hat[:, i], average=None) for i in steps])
    precisions = np.stack([precision_score(y_true[:, i], y_hat[:, i], average=None) for i in steps])
    recalls = np.stack([recall_score(y_true[:, i], y_hat[:, i], average=None) for i in steps])
    return f_scores, precisions, recalls


def train_eval_RNNs(seed, decision, cells=cell_list, random_bl=False, sampling=None, multiclass_weights='longitudinal', gamma=0):
    """Cross-validate one RNN per cell type and append its test scores.

    For every cell type, a 5-fold stratified split of the module-level ``X``
    and ``y_offset`` is run. In each fold, 9% of the training part is held
    out for early stopping, features are standardized with training
    statistics, per-time-step balanced class weights are published through
    the module-level ``class_weights``, and a model from ``p.build_RNN`` is
    fitted. Out-of-fold test predictions are pooled over folds, turned into
    hard labels with the chosen decision rule, and scored per class and per
    time step.

    Parameters
    ----------
    seed : int
        Seeds the fold split (``seed``) and the validation split (``seed**2``).
    decision : {'argmax', 'threshmax'}
        ``'argmax'`` picks the most probable class; ``'threshmax'`` fits one
        ``ThreshMax`` threshold per time step on the pooled training
        predictions and applies it to the test predictions.
    cells : list of str
        Cell names forwarded to ``p.build_RNN``.
    random_bl : bool
        If true, the one-hot training and validation labels are shuffled
        before fitting and results are stored under the "Random" key.
    sampling, multiclass_weights :
        Accepted for call compatibility; not used by this function.
    gamma : int or 'none'
        Forwarded to ``p.build_RNN``.

    Raises
    ------
    ValueError
        If ``decision`` is not a supported rule. Checked before any training
        so a typo does not cost a full cross-validation run.

    Side effects
    ------------
    Appends one array per cell to ``f_scores_test``, ``p_scores_test`` and
    ``r_scores_test`` under the key ``"<Real|Random> <cell>"`` and rebinds
    the module-level ``class_weights`` on every fold.
    """
    global f_scores_test
    global p_scores_test
    global r_scores_test
    global class_weights

    if decision not in _DECISION_RULES:
        raise ValueError(
            f"Unknown decision rule {decision!r}; expected one of {_DECISION_RULES}."
        )

    # Derived once, from the same truthiness test that triggers the label shuffle below.
    real = "Random" if random_bl else "Real"

    for cell in cells:
        y_preds_test = []
        y_tests = []

        y_preds_train = []
        y_trains = []

        skf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
        # Folds are stratified on the number of class-2 labels in each sequence.
        for fold, (train_index, test_index) in enumerate(skf.split(X, dem_prev(y_offset)), 1):
            print(real, cell, "seed",seed+1, "fold", fold, "gamma=", gamma)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y_offset[train_index], y_offset[test_index]

            # Hold out part of the training fold for early stopping.
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=0.09,
                stratify=dem_prev(y_train), random_state=seed**2)

            y_train_ohe, y_val_ohe = ohe(y_train), ohe(y_val)
            y_tests.append(y_test)
            y_trains.append(y_train)

            # Fit the scaling on the training part only, then reuse it.
            X_train, means, stds = fit_restandardize(X_train)
            X_test = restandardize(X_test, means=means, stds=stds)
            X_val = restandardize(X_val, means=means, stds=stds)

            # One {class: weight} dict per time step, balanced on this fold's training labels.
            class_weights = [
                dict(zip(range(3), compute_class_weight(class_weight='balanced', classes=[0,1,2], y=y_train[:,t])))
                for t in range(y_train.shape[1])
            ]

            model = p.build_RNN(units=64, cell=cell, drout=0.2, deep=True, L2=0.00,
                               activation='tanh', reg_layer='batchnorm', gamma=gamma)

            early_stop = EarlyStopping(
                monitor='val_loss', patience=30, verbose=1,
                restore_best_weights=True, start_from_epoch=10)

            if random_bl:
                # Random baseline: break the feature/label pairing (in place).
                np.random.shuffle(y_train_ohe)
                np.random.shuffle(y_val_ohe)

            # validation_data is passed as the documented (x_val, y_val) tuple.
            model.fit(X_train, y_train_ohe, epochs=2000, validation_data=(X_val, y_val_ohe),
                       verbose=1, callbacks=[early_stop])

            y_preds_train.append(model.predict(X_train))
            y_preds_test.append(model.predict(X_test))

        # Pool predictions and labels over the folds.
        y_pred_test = np.concatenate(y_preds_test, axis=0)
        y_test_all = np.concatenate(y_tests, axis=0)

        y_pred_train = np.concatenate(y_preds_train, axis=0)
        y_train_all = np.concatenate(y_trains, axis=0)

        # Turn class probabilities into hard labels once; all three metrics reuse them.
        if decision == "argmax":  # traditional decision making
            y_hat_test = np.argmax(y_pred_test, axis=-1)
        else:  # 'threshmax'
            tm = ThreshMax(classes=[0,1,2], thresh_class=1, class_to_optimize='avg')

            thresholds = [tm.find_tmax(y_trues=y_train_all[:,i], y_preds=y_pred_train[:,i,:])
                          for i in range(y_train_all.shape[1])]
            y_hat_test = np.stack(
                [np.array([tm.compute_threshmax(y_hat, thresholds[i]) for y_hat in y_pred_test[:,i,:]])
                 for i in range(y_test_all.shape[1])],
                axis=1)

        (f_scores_cell_seed_across_time_test,
         p_scores_cell_seed_across_time_test,
         r_scores_cell_seed_across_time_test) = _per_class_scores_over_time(y_test_all, y_hat_test)

        f_scores_test[f'{real} {cell}'].append(f_scores_cell_seed_across_time_test)
        p_scores_test[f'{real} {cell}'].append(p_scores_cell_seed_across_time_test)
        r_scores_test[f'{real} {cell}'].append(r_scores_cell_seed_across_time_test)

In [None]:
seeds=5

# Score containers keyed "<Real|Random> <cell>"; train_eval_RNNs appends one
# array (time steps x classes) per call.
f_scores_test = dict()
p_scores_test = dict()
r_scores_test = dict()
for cell in cell_list:
    for real in ['Real', 'Random']:
        f_scores_test[f'{real} {cell}'] = []
        p_scores_test[f'{real} {cell}'] = []
        r_scores_test[f'{real} {cell}'] = []

# Sweep gamma for every seed. Runs are appended seed-major / gamma-minor, so
# entry `seed * 5 + gamma` of each list belongs to that (seed, gamma) pair;
# the grouping cell that follows relies on this order (index % 5 == gamma).
for seed in range(seeds):
    for gamma in range(5):
        # Pass the loop variable: a constant gamma here would make all five
        # "gamma" groups identical repeats of the same configuration.
        train_eval_RNNs(seed=seed, decision='threshmax', random_bl=False, sampling=None, multiclass_weights='sklearn', gamma=gamma)

In [None]:
# Group the 'Real LSTM' runs by gamma: run number `index` belongs to gamma
# `index % 5`, i.e. every fifth entry starting at position `gamma`.
f_scores_gamma = {
    f'gamma={gamma}': np.stack(f_scores_test['Real LSTM'][gamma::5], axis=0)
    for gamma in range(5)
}
p_scores_gamma = {
    f'gamma={gamma}': np.stack(p_scores_test['Real LSTM'][gamma::5], axis=0)
    for gamma in range(5)
}
r_scores_gamma = {
    f'gamma={gamma}': np.stack(r_scores_test['Real LSTM'][gamma::5], axis=0)
    for gamma in range(5)
}

In [None]:
# Repeat the evaluation once per seed with gamma='none'. Each call appends its
# scores to the same *_scores_test lists, after any entries already stored there.
for seed in range(seeds):
    train_eval_RNNs(seed=seed, decision='threshmax', random_bl=False, sampling=None, multiclass_weights='sklearn', gamma='none')

In [None]:
# Collect the five most recent 'Real LSTM' runs (the gamma='none' runs appended
# last) under the 'no gamma' key of each per-gamma score dict.
for scores_by_gamma, scores_test in (
    (f_scores_gamma, f_scores_test),
    (p_scores_gamma, p_scores_test),
    (r_scores_gamma, r_scores_test),
):
    scores_by_gamma['no gamma'] = np.stack(scores_test['Real LSTM'][-5:], axis=0)