In [None]:
import json
import torch as ch
import matplotlib.pyplot as plt
import os, traceback
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, matthews_corrcoef
import argparse
import sys
import os
from functools import partial
from torch import optim, nn, utils, Tensor
from torch.utils.data import TensorDataset, DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import pytorch_lightning as pl
from torchmetrics import AUROC
from sklearn.preprocessing import StandardScaler
sys.path.insert(0, '/home/guillaume/julian/OPSUM/')


from prediction.outcome_prediction.LSTM.training.utils import initiate_log_files
from prediction.outcome_prediction.data_loading.data_formatting import format_to_2d_table_with_time, \
    link_patient_id_to_outcome, features_to_numpy, numpy_to_lookup_table, feature_order_verification
from prediction.utils.scoring import precision, matthews, recall
from prediction.utils.utils import generate_balanced_arrays, check_data, ensure_dir, save_json
from prediction.outcome_prediction.LSTM.LSTM import lstm_generator

from prediction.outcome_prediction.Transformer.architecture import OPSUMTransformer


def _first_outcome_per_admission(outcome_df, admission_ids):
    """Return, as a float32 array, the first `outcome` recorded for each id in `admission_ids` (order preserved)."""
    # Index the outcomes once instead of scanning the frame for every admission.
    # keep='first' reproduces the original `.outcome.values[0]` choice when an admission has several rows.
    first_outcome = (
        outcome_df.drop_duplicates(subset='case_admission_id', keep='first')
        .set_index('case_admission_id')
        .outcome
    )
    return first_outcome.loc[list(admission_ids)].to_numpy().astype('float32')


def prep(features_path: str, labels_path:str, outcome:str, test_size:float,seed=0, n_splits=7):
    """Generate stratified cross-validation folds from the preprocessed feature/outcome tables.

    The data is first split by patient id into a training pool and a held-out test set
    (stratified on the per-patient outcome); only the training pool is used here.
    The pool is then divided into `n_splits` stratified folds.

    Args:
        features_path: path passed to `format_to_2d_table_with_time` as `feature_df_path`.
        labels_path: path passed to `format_to_2d_table_with_time` as `outcome_df_path`.
        outcome: name of the outcome to load and to stratify on.
        test_size: fraction of patients held out by `train_test_split` (never yielded).
        seed: random state for both the train/test split and the k-fold shuffling.
        n_splits: number of cross-validation folds.

    Yields:
        Tuples `(fold_X_train, fold_X_val, fold_y_train, fold_y_val)` of float32 numpy arrays.
        The X arrays are the last channel (`[..., -1]`) of the 4-D output of `features_to_numpy`
        (presumably admissions x time steps x features — TODO confirm against `features_to_numpy`);
        the y arrays hold one outcome per admission, in the same admission order as X.
    """
    feature_columns = ['case_admission_id', 'relative_sample_date_hourly_cat', 'sample_label', 'value']

    ### LOAD THE DATA
    X, y = format_to_2d_table_with_time(feature_df_path=features_path, outcome_df_path=labels_path,
                                        outcome=outcome)

    # test if data is corrupted
    check_data(X)

    # SPLITTING DATA
    # Splitting is done by patient id (and not admission id): in the rare case of multiple admissions
    # per patient there would otherwise be a risk of data leakage between the sets.
    # Reduce every patient to a single outcome (to avoid duplicates)
    all_pids_with_outcome = link_patient_id_to_outcome(y, outcome)
    pid_outcomes = all_pids_with_outcome.outcome.tolist()
    # The held-out test patients are split off here but deliberately not used any further.
    pid_train, _, y_pid_train, _ = train_test_split(all_pids_with_outcome.patient_id.tolist(),
                                                    pid_outcomes,
                                                    stratify=pid_outcomes,
                                                    test_size=test_size,
                                                    random_state=seed)
    pid_train = np.array(pid_train)

    # define K fold
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    ### BUILD ONE (TRAIN, VAL) PAIR PER FOLD
    for fold_pid_train_idx, fold_pid_val_idx in kfold.split(pid_train, y_pid_train):
        fold_train_pidx = pid_train[fold_pid_train_idx]
        fold_val_pidx = pid_train[fold_pid_val_idx]

        fold_X_train_df = X.loc[X.patient_id.isin(fold_train_pidx)]
        fold_y_train_df = y.loc[y.patient_id.isin(fold_train_pidx)]
        fold_X_val_df = X.loc[X.patient_id.isin(fold_val_pidx)]
        fold_y_val_df = y.loc[y.patient_id.isin(fold_val_pidx)]

        fold_X_train = features_to_numpy(fold_X_train_df, feature_columns)
        fold_X_val = features_to_numpy(fold_X_val_df, feature_columns)

        # Channel 0 of the numpy table carries the case_admission_id; use it to align labels with X.
        fold_y_train = _first_outcome_per_admission(fold_y_train_df, fold_X_train[:, 0, 0, 0])
        fold_y_val = _first_outcome_per_admission(fold_y_val_df, fold_X_val[:, 0, 0, 0])

        # Keep only the last channel (the 'value' column) as the model input.
        fold_X_train = fold_X_train[:, :, :, -1].astype('float32')
        fold_X_val = fold_X_val[:, :, :, -1].astype('float32')

        yield fold_X_train, fold_X_val, fold_y_train, fold_y_val


In [None]:
# Materialise every cross-validation fold produced by `prep` into a list.
# NOTE(review): both CSV locations are absolute, machine-specific paths.
features_csv = '/home/guillaume/julian/preprocessed_features_01012023_233050.csv'
labels_csv = '/home/guillaume/julian/preprocessed_outcomes_01012023_233050.csv'
fold_generator = prep(features_csv, labels_csv, outcome="3M mRS 0-2", test_size=0.2, seed=5)
scenarios = list(fold_generator)

In [None]:
# Persist the list of folds with torch.save; 'data_splits' is a relative path,
# so the file lands in the kernel's current working directory.
ch.save(scenarios, 'data_splits')

In [None]:
# Show the kernel's working directory (where 'data_splits' is written).
# Plain Python instead of the bare `pwd`, which only works with IPython automagic enabled.
os.getcwd()

In [None]:
# Size of `aa` in GiB (bytes / 2**30).
# NOTE(review): `aa` is not defined in any visible cell — this only works with leftover kernel state.
aa.nbytes / 2**30

In [None]:
from pytorch_lightning.loggers.logger import Logger
class DictLogger(Logger):
    """In-memory Lightning logger: every metrics dict passed to `log_metrics` is appended to `self.metrics`."""

    def __init__(self, version):
        super().__init__()
        self._version = version
        # One entry per `log_metrics` call, in call order.
        self.metrics = []

    @property
    def name(self):
        """Fixed experiment name reported by this logger."""
        return 'optuna'

    @property
    def version(self):
        """Version value supplied at construction time."""
        return self._version

    @property
    def experiment(self):
        """No backing experiment object exists for this logger, so this is always None."""
        return None

    def log_metrics(self, metrics, step=None):
        """Store `metrics` as-is; `step` is accepted but not recorded."""
        self.metrics.append(metrics)

    def log_hyperparams(self, params):
        """Accept hyperparameters and discard them (nothing is recorded).

        Args:
            params: :class:`~argparse.Namespace` containing the hyperparameters
        """
        return None

In [None]:
# Pick one cross-validation fold and turn it into standardised GPU data loaders.
# NOTE(review): the original cell read X_train / X_val / y_train / y_val from leftover kernel state;
# taking fold 0 of `scenarios` is an assumption — confirm which fold was intended.
FOLD_INDEX = 0
X_train_raw, X_val_raw, y_train, y_val = scenarios[FOLD_INDEX]

# Standardise per feature: flatten every leading axis so each row is one feature vector.
# The feature count is read from the data instead of being hard-coded (it was 84).
n_features = X_train_raw.shape[-1]
scaler = StandardScaler()
# Fit on the training fold only, then reuse the same statistics for validation.
# Scaling always starts from the raw arrays, so re-running this cell does not scale twice.
X_train = scaler.fit_transform(X_train_raw.reshape(-1, n_features)).reshape(X_train_raw.shape)
X_val = scaler.transform(X_val_raw.reshape(-1, n_features)).reshape(X_val_raw.shape)

# Whole datasets are moved to the GPU up front; labels are stored as int32.
train_dataset = TensorDataset(ch.from_numpy(X_train).cuda(), ch.from_numpy(y_train.astype(np.int32)).cuda())
train_loader = DataLoader(train_dataset, batch_size=256)
val_dataset = TensorDataset(ch.from_numpy(X_val).cuda(), ch.from_numpy(y_val.astype(np.int32)).cuda())
val_loader = DataLoader(val_dataset, batch_size=256)

In [None]:

# Two small fully-connected stacks: 28*28 -> 64 -> 3 and back 3 -> 64 -> 28*28.
# NOTE(review): neither `encoder` nor `decoder` is referenced by any other visible cell.
encoder = nn.Sequential(
    nn.Linear(28 * 28, 64),
    nn.ReLU(),
    nn.Linear(64, 3),
)
decoder = nn.Sequential(
    nn.Linear(3, 64),
    nn.ReLU(),
    nn.Linear(64, 28 * 28),
)


# define the LightningModule
class LitModel(pl.LightningModule):
    """Lightning wrapper that trains `model` as a binary classifier with BCE-with-logits loss.

    Each sample's label is repeated `x.shape[1]` times and compared against the flattened
    model output, so the wrapped model is assumed to emit one logit per (sample, step)
    — TODO confirm against the model's output shape.
    Binary AUROC is tracked separately for training and validation and logged once per epoch.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.criterion = ch.nn.BCEWithLogitsLoss()
        self.train_auroc = AUROC(task="binary")
        self.val_auroc = AUROC(task="binary")

    def _shared_step(self, batch, auroc):
        """Compute the loss for one batch and update `auroc` with the sigmoid probabilities."""
        x, y = batch
        # ravel() already flattens every axis, so no separate squeeze is needed.
        logits = self.model(x).ravel()
        # Repeat each label along axis 1 of x so targets line up with the flattened logits.
        targets = y.unsqueeze(1).repeat(1, x.shape[1]).ravel()
        loss = self.criterion(logits, targets.float())
        auroc(ch.sigmoid(logits), targets)
        return loss

    def training_step(self, batch, batch_idx, mode='train'):
        """Return the training loss; logs the current learning rate per step and AUROC per epoch.

        `batch_idx` and `mode` are accepted but not used.
        """
        loss = self._shared_step(batch, self.train_auroc)
        cur_lr = self.trainer.optimizers[0].param_groups[0]['lr']
        self.log("lr", cur_lr, prog_bar=True, on_step=True)
        self.log("train_auroc", self.train_auroc, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self,batch, batch_idx, mode='train'):
        """Return the validation loss; logs validation AUROC per epoch.

        `batch_idx` and `mode` are accepted but not used.
        """
        loss = self._shared_step(batch, self.val_auroc)
        self.log("val_auroc", self.val_auroc, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with weight decay 0.0005 (learning rate left at Adam's default)."""
        optimizer = optim.Adam(self.parameters(), weight_decay=0.0005)
        return optimizer


In [None]:
from torch.optim.lr_scheduler import ExponentialLR, LambdaLR
from pytorch_lightning.callbacks import LearningRateMonitor

In [None]:
# Learning-rate values recorded by the logger, in logging order.
# NOTE(review): `logger` is only created in the later trainer cell, so this cell must run after training.
lr_trace = [record['lr'] for record in logger.metrics if 'lr' in record]
plt.plot(lr_trace)

In [None]:
# Per-epoch AUROC curves collected by the logger: training first, validation second.
# NOTE(review): `logger` is only created in the later trainer cell, so this cell must run after training.
train_auroc_curve = [record['train_auroc'] for record in logger.metrics if 'train_auroc' in record]
val_auroc_curve = [record['val_auroc'] for record in logger.metrics if 'val_auroc' in record]
plt.plot(train_auroc_curve)
plt.plot(val_auroc_curve)

In [None]:
# Training vs. validation AUROC per epoch (this cell repeats the plot of the cell before it).
auroc_by_split = {
    split: [record[split] for record in logger.metrics if split in record]
    for split in ('train_auroc', 'val_auroc')
}
plt.plot(auroc_by_split['train_auroc'])
plt.plot(auroc_by_split['val_auroc'])

In [None]:
# Hyperparameters for the transformer; the input width is taken from the last axis of X_train.
transformer_kwargs = dict(
    input_dim=X_train.shape[2],
    num_layers=6,
    model_dim=128,
    # NOTE(review): 0.99 is an unusually high dropout rate — confirm this is intentional.
    dropout=0.99,
    ff_dim=256,
    num_heads=8,
    num_classes=1,
    max_dim=500,
    pos_encode_factor=1e-3,
)
model = OPSUMTransformer(**transformer_kwargs)

In [None]:
# Wrap the transformer in the Lightning module that defines loss, metrics and optimiser.
module = LitModel(model)

In [None]:
# Train for up to 50 epochs on a single GPU, collecting metrics in memory via DictLogger.
lr_monitor = LearningRateMonitor(logging_interval='step')
logger = DictLogger(1)
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=50,
    callbacks=[lr_monitor],
    logger=logger,
)
trainer.fit(model=module, train_dataloaders=train_loader, val_dataloaders=val_loader)

In [None]:
# Run the network on the whole training set in one forward pass.
# eval() puts any dropout layers into inference mode, and no_grad() avoids building an
# autograd graph over the full array (which would otherwise hold all activations in GPU memory).
model.eval()
with ch.no_grad():
    prediction = model.cuda()(ch.from_numpy(X_train).cuda())

In [None]:
# Size of X_train in MiB, counting 4 bytes per element (i.e. assuming a 32-bit dtype).
np.prod(X_train.shape) * 4 / 2**20

In [None]:
def prep_average(features, labels):
    """Flatten per-sample sequences into one row per (sample, step) with running statistics.

    For every step along axis 1 the raw features are concatenated (along axis 2) with the
    running mean, running minimum and running maximum of the sequence up to and including
    that step. Each label is repeated once per step so it lines up with the flattened rows.

    Args:
        features: 3-D array indexed as (sample, step, feature).
        labels: 1-D array-like with one label per sample.

    Returns:
        Tuple `(all_features, labels)` where `all_features` has shape
        `(n_samples * n_steps, 4 * n_features)` and `labels` has shape `(n_samples * n_steps,)`,
        both in sample-major order.
    """
    # Take the sequence length from the data rather than assuming 72 steps.
    n_time_steps = features.shape[1]
    # Number of steps seen so far at each position, broadcast over samples and features.
    steps_seen = np.arange(1, n_time_steps + 1)[None, :, None]

    avg_features = np.cumsum(features, 1) / steps_seen
    min_features = np.minimum.accumulate(features, 1)
    max_features = np.maximum.accumulate(features, 1)

    all_features = np.concatenate([features, avg_features, min_features, max_features], 2)
    all_features = all_features.reshape(-1, all_features.shape[-1])
    # Consecutive repetition keeps labels aligned with the sample-major row order above.
    labels = np.repeat(np.asarray(labels), n_time_steps)
    return all_features, labels

In [None]:
# XGBClassifier is not imported in any visible cell, so bring it into scope here
# (otherwise this cell raises NameError on a fresh kernel).
from xgboost import XGBClassifier

# Gradient-boosted trees with strong regularisation (reg_lambda=50, alpha=70).
classifier = XGBClassifier(n_estimators=65, learning_rate=0.1, reg_lambda=50, alpha=70)

In [None]:
# Flatten both splits into one row per (sample, step) with running mean/min/max features.
flat_features_train, flat_labels_train = prep_average(X_train, y_train)
flat_features_val, flat_labels_val = prep_average(X_val, y_val)

In [None]:
# Fit the XGBoost classifier on the flattened training rows and report the wall time.
%time classifier.fit(flat_features_train, flat_labels_train)

In [None]:
# Positive-class probabilities, reshaped to (step, sample) so that row t holds every
# sample's prediction at step t. The step count comes from the data instead of a fixed 72.
n_time_steps = X_train.shape[1]
prediction_train = classifier.predict_proba(flat_features_train)[:, 1].reshape(-1, n_time_steps).T
prediction_val = classifier.predict_proba(flat_features_val)[:, 1].reshape(-1, n_time_steps).T

# ROC AUC at each step, scored against the per-sample labels.
scores_train = [roc_auc_score(y_train, prediction_train[step]) for step in range(n_time_steps)]
scores_val = [roc_auc_score(y_val, prediction_val[step]) for step in range(n_time_steps)]

In [None]:
# ROC AUC over all (sample, step) validation predictions pooled together;
# `.T.ravel()` undoes the earlier transpose so the order matches flat_labels_val.
roc_auc_score(flat_labels_val, prediction_val.T.ravel())

In [None]:
# ROC AUC per step for validation and training, plus the pooled validation AUC as a horizontal line.
plt.plot(scores_val, label='Val set')
plt.gca().set_ylabel('ROC AUC')
plt.xlabel('Hours from admission')
plt.axhline(roc_auc_score(flat_labels_val, prediction_val.T.ravel()), label='average over time')
# scores_train is computed against y_train, so this curve is the training set (was mislabelled 'Test set').
plt.plot(scores_train, label='Train set')
plt.grid()
plt.legend()

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# Bare name: only displays the function object's repr; nothing is computed here.
roc_auc_score

In [None]:
# Pooled validation ROC curve. Per the scikit-learn API, roc_curve returns
# (false positive rates, true positive rates, thresholds), so a = FPR, b = TPR, c = thresholds.
a, b, c  = roc_curve(flat_labels_val, prediction_val.T.ravel())

In [None]:
# Draw the curve (a, b) together with (1 - b, 1 - a), its mirror image across the
# anti-diagonal, on equally scaled axes.
roc_ax = plt.gca()
roc_ax.plot(a, b)
roc_ax.plot(1 - b, 1 - a)
roc_ax.grid()
roc_ax.set_aspect('equal')

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Multi-layer perceptron with hidden layers of 128, 128 and 64 units and alpha=12.5.
# NOTE(review): per the scikit-learn docs, `learning_rate` is only used when solver='sgd';
# no solver is passed here, so 'adaptive' presumably has no effect — confirm intent.
model2 = MLPClassifier((128, 128, 64), learning_rate='adaptive', alpha=12.5)

In [None]:
# Fit the MLP on the flattened training rows and report the wall time.
%time model2.fit(flat_features_train, flat_labels_train)

In [None]:
# Positive-class probabilities from the MLP, reshaped to (step, sample) so that row t holds
# every sample's prediction at step t. The step count comes from the data instead of a fixed 72.
n_time_steps = X_train.shape[1]
prediction_train = model2.predict_proba(flat_features_train)[:, 1].reshape(-1, n_time_steps).T
prediction_val = model2.predict_proba(flat_features_val)[:, 1].reshape(-1, n_time_steps).T

# ROC AUC at each step, scored against the per-sample labels.
scores_train = [roc_auc_score(y_train, prediction_train[step]) for step in range(n_time_steps)]
scores_val = [roc_auc_score(y_val, prediction_val[step]) for step in range(n_time_steps)]

In [None]:
# ROC AUC over all (sample, step) validation predictions of the MLP pooled together;
# `.T.ravel()` undoes the earlier transpose so the order matches flat_labels_val.
roc_auc_score(flat_labels_val, prediction_val.T.ravel())

In [None]:
# ROC AUC per step for validation and training, plus the pooled validation AUC as a horizontal line.
pooled_val_auc = roc_auc_score(flat_labels_val, prediction_val.T.ravel())
scores_ax = plt.gca()
scores_ax.plot(scores_val, label='Val set')
scores_ax.set_ylabel('ROC AUC')
scores_ax.set_xlabel('Hours from admission')
scores_ax.axhline(pooled_val_auc, label='average over time')
scores_ax.plot(scores_train, label='Train set')
scores_ax.grid()
scores_ax.legend()