In [None]:
import numpy as np
import pandas as pd

In [None]:
# Locations of the trained LSTM weights and of the preprocessed feature / outcome tables.
# The weights file carries the same base name as the run directory that contains it,
# and both CSV exports share one timestamp suffix, so the common parts are spelled out once.
run_name = 'sigmoid_all_unchanged_0.4_1_True_RMSprop_3M mRS 0-2_8_4'
weights_root = '/Users/jk1/temp/opsum_prediction_output/LSTM_72h_test_results'
prepro_dir = '/Users/jk1/temp/opsum_prepro_output/old_preprocessing'
prepro_stamp = '14052022_123333'

model_weights_path = f'{weights_root}/test_LSTM_{run_name}/{run_name}.hdf5'
features_path = f'{prepro_dir}/preprocessed_features_{prepro_stamp}.csv'
labels_path = f'{prepro_dir}/preprocessed_outcomes_{prepro_stamp}.csv'

In [None]:
# --- Prediction target: name of the outcome handed to the data-loading helpers ---
outcome = '3M mRS 0-2'

# --- Architecture settings forwarded to the LSTM generator when the model is rebuilt ---
activation = 'sigmoid'
units = 8
layers = 1
dropout = 0.4
masking = True

# --- Optimizer name given to model.compile ---
optimizer = 'RMSprop'

# --- Train / test split: held-out fraction and random state of the split ---
test_size = 0.20
seed = 42

In [None]:
from prediction.mrs_outcome_prediction.data_loading.data_formatting import format_to_2d_table_with_time

# Read the preprocessed feature and outcome tables for the selected outcome.
# X holds the features and y the outcomes, as returned by the project helper.
X, y = format_to_2d_table_with_time(
    feature_df_path=features_path,
    outcome_df_path=labels_path,
    outcome=outcome,
)



In [None]:
# Length of the time axis: largest value of relative_sample_date_hourly_cat plus one
# (presumably the category is zero-based — TODO confirm).
n_time_steps = X['relative_sample_date_hourly_cat'].max() + 1

# Number of input channels: count of distinct sample labels in the feature table.
n_channels = len(X['sample_label'].unique())

In [None]:
from sklearn.model_selection import train_test_split
from prediction.mrs_outcome_prediction.data_loading.data_formatting import features_to_numpy, link_patient_id_to_outcome

def outcomes_for_cases(outcome_df, case_ids):
    """Return the first listed outcome of every case in ``case_ids`` as a float32 array.

    The lookup table is built once from ``outcome_df`` (keeping the first row of each
    ``case_admission_id``), instead of filtering the whole frame again for every case.

    :param outcome_df: DataFrame with ``case_admission_id`` and ``outcome`` columns
    :param case_ids: iterable of case admission ids, in the order the outcomes are needed
    :return: 1-D float32 numpy array aligned with ``case_ids``
    :raises KeyError: if a case id has no row in ``outcome_df``
    """
    first_rows = outcome_df.drop_duplicates(subset='case_admission_id', keep='first')
    outcome_by_case = dict(zip(first_rows['case_admission_id'], first_rows['outcome']))
    return np.array([outcome_by_case[cid] for cid in case_ids]).astype('float32')


# Reduce every patient to a single outcome (to avoid duplicates)
all_pids_with_outcome = link_patient_id_to_outcome(y, outcome)
patient_ids = all_pids_with_outcome.patient_id.tolist()
patient_outcomes = all_pids_with_outcome.outcome.tolist()

# Split at patient level (stratified on the outcome), so that all rows of a given
# patient_id end up on the same side of the split.
pid_train, pid_test, y_pid_train, y_pid_test = train_test_split(patient_ids,
                                                                patient_outcomes,
                                                                stratify=patient_outcomes,
                                                                test_size=test_size,
                                                                random_state=seed)

test_X_df = X[X.patient_id.isin(pid_test)]
test_y_df = y[y.patient_id.isin(pid_test)]
train_X_df = X[X.patient_id.isin(pid_train)]
train_y_df = y[y.patient_id.isin(pid_train)]

# Columns handed to features_to_numpy; the code below reads the case id at position 0
# and the measured value at position -1 of the last array axis.
feature_columns = ['case_admission_id', 'relative_sample_date_hourly_cat', 'sample_label', 'value']
train_X_np = features_to_numpy(train_X_df, feature_columns)
test_X_np = features_to_numpy(test_X_df, feature_columns)

# Align the outcomes with the order of the cases in the feature arrays
# (the case id is taken from the first time step / first channel of every sample).
train_y_np = outcomes_for_cases(train_y_df, train_X_np[:, 0, 0, 0])
test_y_np = outcomes_for_cases(test_y_df, test_X_np[:, 0, 0, 0])

# (disabled) create look-up table for case_admission_ids, sample_labels and relative_sample_date_hourly_cat
# save_json(numpy_to_lookup_table(test_X_np),
#           os.path.join(output_dir, 'test_lookup_dict.json'))
# save_json(numpy_to_lookup_table(train_X_np),
#             os.path.join(output_dir, 'train_lookup_dict.json'))

# Remove the case_admission_id, sample_label, and time_step_label columns from the data
test_X_np = test_X_np[:, :, :, -1].astype('float32')
train_X_np = train_X_np[:, :, :, -1].astype('float32')

In [None]:
# Time dimension used for the model built in the next cell: one step fewer than the data.
modified_n_time_steps = n_time_steps - 1

In [None]:
from prediction.utils.scoring import precision, recall, matthews
from prediction.mrs_outcome_prediction.LSTM.LSTM import lstm_generator

# Re-create the LSTM with the configured hyper-parameters, using the shortened time axis
# (modified_n_time_steps) as its time dimension.
model = lstm_generator(
    x_time_shape=modified_n_time_steps,
    x_channels_shape=n_channels,
    masking=masking,
    n_units=units,
    activation=activation,
    dropout=dropout,
    n_layers=layers,
)

# Compile with the same loss / optimizer / metrics set, then restore the stored weights.
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy', precision, recall, matthews],
)
model.load_weights(model_weights_path)

In [None]:
# Predict on the hold-out test array.
# NOTE(review): the model was built with x_time_shape = n_time_steps - 1, while test_X_np is
# passed here without trimming its time axis (presumably n_time_steps long) — confirm that
# this mismatch is intended.
y_pred = model.predict(test_X_np)

In [None]:
# Sanity check: display the shapes of the predictions and of the test labels side by side.
y_pred.shape, test_y_np.shape

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the test-set predictions; the thresholds returned by roc_curve are discarded.
fpr, tpr, _ = roc_curve(y_true=test_y_np, y_score=y_pred)

# Area under that curve.
roc_auc = auc(x=fpr, y=tpr)

In [None]:
# Display the area under the ROC curve computed from fpr / tpr.
roc_auc

In [None]:
import matplotlib.pyplot as plt

# ROC curve of the test-set predictions together with the chance-level diagonal.
lw = 2
fig, ax = plt.subplots(figsize=(7.5, 5))

ax.plot(fpr, tpr, color="darkorange", lw=lw, label="ROC curve (area = %0.2f)" % roc_auc)
ax.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC for LSTM model")
ax.legend(loc="lower right")

plt.show()

In [None]:
# Shape of the test array after dropping the last entry along axis 1.
test_X_np[:,0:-1,:].shape

In [None]:
# Predict on the test array with the last entry along axis 1 removed; the cell displays
# the full prediction array.
model.predict(test_X_np[:,0:-1,:])

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the test set when the model only sees the first k time steps, for k = 1..n_time_steps.
# The network is re-created for every window length (its time dimension changes) and the
# same stored weights are loaded each time.
lstm_settings = dict(x_channels_shape=n_channels, masking=masking, n_units=units,
                     activation=activation, dropout=dropout, n_layers=layers)

roc_auc_scores = []
for modified_time_steps in range(1, n_time_steps + 1):
    model = lstm_generator(x_time_shape=modified_time_steps, **lstm_settings)
    model.compile(loss='binary_crossentropy', optimizer=optimizer,
                  metrics=['accuracy', precision, recall, matthews])
    model.load_weights(model_weights_path)

    # Keep only the leading `modified_time_steps` entries along axis 1.
    test_X_with_first_n_ts = test_X_np[:, :modified_time_steps, :]
    y_pred = model.predict(test_X_with_first_n_ts)

    # One [window length, ROC AUC] record per iteration.
    roc_auc_scores.append([modified_time_steps, roc_auc_score(test_y_np, y_pred)])



In [None]:
# Tabulate the [window length, ROC AUC] records collected by the loop and preview the first rows.
results_df = pd.DataFrame(data=roc_auc_scores, columns=['n_hours', 'roc_auc_score'])
results_df.head(n=5)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Line plot of ROC AUC against the number of observed hours, saved as a PNG.
figure_path = os.path.join('/Users/jk1/Downloads', 'roc_auc_scores.png')

ax = sns.lineplot(data=results_df, x='n_hours', y='roc_auc_score', legend=True)
ax.set(
    title='Model performance in the holdout test dataset as a function of observation period',
    xlabel='Time after admission (hours)',
    ylabel='ROC AUC',
)

plt.tight_layout()
plt.savefig(figure_path, bbox_inches='tight')