In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the preprocessing output. Both files come from the same directory and the
# same run timestamp, so each is spelled once here instead of twice in the paths.
# NOTE: this is a machine-specific absolute path; point it at your local preprocessing output.
PREPRO_OUTPUT_DIR = '/Users/jk1/temp/opsum_prepro_output'
PREPRO_RUN_ID = '02092022_083046'

features_df_path = f'{PREPRO_OUTPUT_DIR}/preprocessed_features_{PREPRO_RUN_ID}.csv'
outcome_df_path = f'{PREPRO_OUTPUT_DIR}/preprocessed_outcomes_{PREPRO_RUN_ID}.csv'

In [None]:
# Read the outcomes table and the features table from disk.
outcome_df = pd.read_csv(outcome_df_path)
# 'Unnamed: 0' is presumably the index column written by an earlier to_csv — it is dropped right after reading.
features_df = pd.read_csv(features_df_path).drop(columns=['Unnamed: 0'])

In [None]:
# Preview the first rows of the loaded features table.
features_df.head()

In [None]:
# Display the outcomes table as the cell output.
outcome_df

## Linear data representation without time dimension

Goal: a flat table with one row per admission that can be fed into a tree-based or simple neural-network model.

In [None]:
# Wide layout: one row per case_admission_id and one column per
# (hourly time bin, sample label) pair. Repeated entries for the same cell are
# combined by pivot_table's default aggregation.
pivoted_features = features_df.pivot_table(
    index='case_admission_id',
    columns=['relative_sample_date_hourly_cat', 'sample_label'],
    values=['value'],
)

In [None]:
# Preview the pivoted table; its columns are still multi-level keys at this point.
pivoted_features.head()

In [None]:
# Flatten the multi-level column keys into "<sample_label>_hcat_<hourly bin>" names:
# entry 1 of each key is the hourly time bin, entry 2 is the sample label.
flat_column_names = []
for column_key in pivoted_features.columns.values:
    hourly_bin = column_key[1]
    label = column_key[2]
    flat_column_names.append(f'{label}_hcat_{hourly_bin}')
pivoted_features.columns = flat_column_names

In [None]:
# check for duplicated index: the cell output is the number of index labels
# (case_admission_id) that repeat an earlier one
pivoted_features.index.duplicated().sum()

In [None]:
# Display the table with case_admission_id moved from the index into a regular column
# (the result is only shown, not stored).
pivoted_features.reset_index()


In [None]:
# Convert the wide table to a numpy array; after reset_index() column 0 holds
# case_admission_id and the remaining columns hold the features.
features_with_id = pivoted_features.reset_index()
pivoted_features_np = features_with_id.to_numpy()
pivoted_features_np

In [None]:
# Feature matrix: every column except column 0 (the case_admission_id column).
X = pivoted_features_np[:, 1:]

In [None]:
# Name of the column in outcome_df from which y is built.
outcome_value = '3M mRS 0-2'

In [None]:
def _first_outcome_or_nan(case_admission_id):
    """Return the first `outcome_value` entry recorded for the admission, or NaN if there is none."""
    recorded = outcome_df[outcome_df.case_admission_id == case_admission_id][outcome_value].values
    if len(recorded) > 0:
        return recorded[0]
    return np.nan


# One outcome per row of pivoted_features_np, in the same order (column 0 is case_admission_id).
y = [_first_outcome_or_nan(cid) for cid in pivoted_features_np[:, 0]]

In [None]:
# Shape of the feature part of the array (same slice that was assigned to X).
pivoted_features_np[:, 1:].shape

In [None]:
# Frequency of each raw outcome value.
# NOTE(review): value_counts drops NaN by default in recent pandas, so missing outcomes are presumably not listed here — confirm.
pd.DataFrame(y).value_counts()

In [None]:
# count nan in y
# list.count(np.nan) only matches entries that are the very same np.nan object
# (NaN != NaN), so NaN values read from outcome_df can be missed. pd.isna-style
# checking tests every entry by value instead. dtype=object keeps mixed
# (string / numeric) outcome values untouched.
pd.Series(y, dtype=object).isna().sum()

In [None]:
# Imported in this cell so that it runs on a fresh kernel (Restart & Run All):
# the only other import of binarize_to_int sits in a later cell.
from prediction.mrs_outcome_prediction.data_loading.data_formatting import binarize_to_int

# Map every raw outcome through the project's binarize_to_int helper.
y = list(map(binarize_to_int, y))

In [None]:
# Number of NaN entries in y after applying binarize_to_int.
np.isnan(y).sum()

In [None]:
# Sum of y ignoring NaN entries.
# NOTE(review): presumably the number of positive cases if binarize_to_int yields 0/1 — confirm.
np.nansum(y)

In [None]:
# Collect the case_admission_ids (column 0) of the rows whose outcome is NaN.
outcome_is_missing = np.isnan(y)
cid_with_no_outcome = pivoted_features_np[outcome_is_missing, 0]
n_without_outcome = len(cid_with_no_outcome)
print('Found {} case_admission_ids with no outcome'.format(n_without_outcome))

In [None]:
# Keep only the rows of X and entries of y that have an outcome; the mask is
# computed once from the unfiltered y and applied to both.
outcome_is_present = ~np.isnan(y)
X = X[outcome_is_present]
y = np.array(y)[outcome_is_present]

## 2-dimensional data representation with time dimension

In [None]:
def _patient_id_from(case_admission_id):
    """Return the part of a case_admission_id that precedes the first underscore."""
    return case_admission_id.split('_')[0]


features_df['patient_id'] = features_df['case_admission_id'].apply(_patient_id_from)


In [None]:
# Preview the features table, which now also carries the patient_id column.
features_df.head()

In [None]:
# number of unique case_admission_ids per patient_id
# (a value above 1 means several admissions share the same patient_id)
features_df.groupby('patient_id')['case_admission_id'].nunique()

In [None]:
# Outcome column of outcome_df used in this section (same column name as outcome_value).
outcome = '3M mRS 0-2'

In [None]:
def _patient_id_of(case_admission_id):
    """Return the part of a case_admission_id that precedes the first underscore."""
    return case_admission_id.split('_')[0]


def _outcome_of(case_admission_id):
    """Return the first `outcome` entry recorded for the admission, or NaN if there is none."""
    recorded = outcome_df[outcome_df.case_admission_id == case_admission_id][outcome].values
    if len(recorded) > 0:
        return recorded[0]
    return np.nan


# NOTE: X and y are rebound here — from this point X is the long-format features
# DataFrame and y is a DataFrame with one row per case_admission_id.
X = features_df.copy()
y = pd.DataFrame(X['case_admission_id'].unique(), columns=['case_admission_id'])

y['patient_id'] = y['case_admission_id'].apply(_patient_id_of)
y['outcome'] = y.case_admission_id.apply(_outcome_of)


In [None]:
# Preview y with the raw (not yet binarized) outcome column.
y.head()

In [None]:
# Project helper used to convert each raw outcome entry.
from prediction.mrs_outcome_prediction.data_loading.data_formatting import binarize_to_int

# Replace the outcome column with its binarized version, one value at a time.
y['outcome'] = y['outcome'].apply(binarize_to_int)


In [None]:
# Preview y after the outcome column has been passed through binarize_to_int.
y.head()

In [None]:
# Distinct case_admission_ids whose outcome is missing.
y[y.outcome.isna()]['case_admission_id'].unique()

In [None]:
# Ad-hoc inspection of the y row(s) for one specific case_admission_id.
y[y.case_admission_id == '10699578052_02032019']

In [None]:
# Collect the distinct case_admission_ids whose outcome is missing and report how many there are.
outcome_is_missing = y['outcome'].isna()
cid_with_no_outcome = y.loc[outcome_is_missing, 'case_admission_id'].unique()
n_without_outcome = len(cid_with_no_outcome)
print('Found {} case_admission_ids with no outcome. These will be excluded.'.format(n_without_outcome))

In [None]:
# Drop every row, in both X and y, that belongs to an admission without an outcome.
X_row_has_outcome = ~X['case_admission_id'].isin(cid_with_no_outcome)
y_row_has_outcome = ~y['case_admission_id'].isin(cid_with_no_outcome)
X = X[X_row_has_outcome]
y = y[y_row_has_outcome]

In [None]:
# Preview the filtered long-format features.
X.head()

## Transform pandas dataframe to multidimensional numpy array

In [None]:
# Show the long-format table as a raw numpy array (displayed only, not stored).
X.values

In [None]:
# Number of time steps is the largest hourly bin plus one
# (presumably because bins are numbered from 0 — confirm).
n_time_steps = X['relative_sample_date_hourly_cat'].max() + 1
# Number of channels is the number of distinct sample labels.
n_channels = len(X['sample_label'].unique())

In [None]:
# Working copy restricted to the four columns needed for the array conversion.
# Target of the following cells: a 4d numpy array with shape
# (n_cases, n_time_steps, n_sample_labels, n_features)
long_format_columns = ['case_admission_id', 'relative_sample_date_hourly_cat', 'sample_label', 'value']
df = X[long_format_columns].copy()


In [None]:
# One sub-DataFrame per case_admission_id, in the order the groupby yields them.
gb_cid = [admission_rows for _, admission_rows in df.groupby('case_admission_id')]

In [None]:
# Inspect the sub-DataFrame of the first admission group.
gb_cid[0]

In [None]:
# For every admission, split its rows by hourly time bin, then wrap the nested
# list of sub-DataFrames in a numpy array.
# NOTE(review): np.array converts the nested DataFrames as well, so the resulting shape
# depends on all groups having identical shapes — confirm this is the intended result.
gb_time = np.array([[x for _, x in gb_cid_x.groupby('relative_sample_date_hourly_cat')] for gb_cid_x in gb_cid])

In [None]:
# Shape of the manually grouped array.
gb_time.shape

In [None]:
# Project helper that converts the long-format features frame to a numpy array.
from prediction.mrs_outcome_prediction.data_loading.data_formatting import features_to_numpy

# NOTE(review): judging by how df_np is indexed in the cells below, the result is laid out as
# [case, time step, sample label, field] with fields 0..3 = case_admission_id, time bin,
# sample label, value — confirm against features_to_numpy.
df_np = features_to_numpy(X)

In [None]:
# Shape of the array returned by features_to_numpy.
df_np.shape

In [None]:
# verify that the order of sample_labels is the same in every case_admission_id and time_step
# (this shows entry 2 of the last axis for every case, but only at time index 0)
df_np[:, 0, :, 2]

In [None]:
# Map each case_admission_id / time step / sample label to its position along the
# corresponding axis of df_np. The keys are read from the first entry of the other axes.
case_ids_along_axis = df_np[:, 0, 0, 0]
time_steps_along_axis = df_np[0, :, 0, 1]
sample_labels_along_axis = df_np[0, 0, :, 2]

case_admission_id_lookup = dict(zip(case_ids_along_axis, range(len(case_ids_along_axis))))
time_step_lookup = dict(zip(time_steps_along_axis, range(len(time_steps_along_axis))))
sample_label_lookup = dict(zip(sample_labels_along_axis, range(len(sample_labels_along_axis))))


In [None]:
# Display the sample label -> position mapping.
sample_label_lookup

In [None]:
# Spot check: sample a random (case, time step, sample label) entry from df_np
idx_cid = np.random.randint(0, df_np.shape[0])
idx_time = np.random.randint(0, df_np.shape[1])
idx_sl = np.random.randint(0, df_np.shape[2])

# Entries 0..3 of the last axis are read as case_admission_id, time bin, sample label, value.
sampled_entry = df_np[idx_cid, idx_time, idx_sl, :]
print(sampled_entry)

# check that this corresponds to the same information as the original dataframe
# The three conditions are combined into a single mask on df's own index. Chaining
# df[m1][m2][m3] would apply masks built on the full frame to an already-filtered
# frame, which forces pandas to reindex them and emits a UserWarning.
matches_sampled_entry = (
    (df.case_admission_id == sampled_entry[0])
    & (df.relative_sample_date_hourly_cat == sampled_entry[1])
    & (df.sample_label == sampled_entry[2])
)
matching_values = df.loc[matches_sampled_entry, 'value'].values
print(matching_values)

# Cell output: True when the first matching value in df equals the value stored in df_np.
matching_values[0] == sampled_entry[3]

## Link patient id to a single outcome

In [None]:
# Independent copy of the patient_id / outcome columns of y.
pid_outcome_columns = ['patient_id', 'outcome']
all_pids = y.loc[:, pid_outcome_columns].copy()

In [None]:
# Preview the patient_id / outcome pairs.
all_pids.head()

In [None]:
# Patients that appear on more than one row are collapsed to a single row per
# patient_id that keeps the minimum outcome.
appears_more_than_once = all_pids.duplicated(subset='patient_id', keep=False)
duplicated_pids = all_pids[appears_more_than_once].copy()
min_outcome_per_patient = duplicated_pids.groupby('patient_id').min()
reduced_pids = min_outcome_per_patient.reset_index()
reduced_pids

In [None]:
# Start from the patients that appear on exactly one row ...
all_pids_no_duplicates = all_pids[~all_pids.duplicated(subset='patient_id', keep=False)].copy()
# ... and add the one-row-per-patient summary of the repeated patients.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; like append,
# it keeps the original index labels of both frames.
all_pids_no_duplicates = pd.concat([all_pids_no_duplicates, reduced_pids])

In [None]:
# Sanity check: number of rows with a missing patient_id.
all_pids_no_duplicates.patient_id.isnull().sum()

In [None]:
# Display the final patient_id / outcome table.
all_pids_no_duplicates