In [None]:
import pandas
import numpy as np
import pandas as pd

In [None]:
# Hard-coded, machine-specific input locations (absolute paths under /Users/jk1).
# Adjust them before running this notebook on another machine.
# All three are handed to assemble_variable_database in the following cell.
# NOTE(review): judging by the path, this is the per-value extraction directory — confirm.
data_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815'
# NOTE(review): judging by the file name, this is the post-hoc modified stroke registry workbook — confirm.
admission_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'
# NOTE(review): judging by the file name, this CSV lists the selected patients — confirm.
patient_selection_path = '/Users/jk1/temp/opsum_extraction_output/high_frequency_data_patient_selection_with_details.csv'

In [None]:
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# Build the feature table from the three input locations configured above.
# Later cells rely on it exposing (after the following transforms) at least the columns
# case_admission_id, sample_label, source and value.
feature_df = assemble_variable_database(data_path, admission_data_path, patient_selection_path)

In [None]:
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps

# Switch to relative timestamps; the original columns are kept (drop_old_columns=False)
# and the time-range restriction is enabled (restrict_to_time_range=True).
# NOTE(review): the closing note of this notebook mentions a 72h timeframe — presumably
# that is the range enforced here; confirm in transform_to_relative_timestamps.
restricted_feature_df = transform_to_relative_timestamps(feature_df, drop_old_columns=False, restrict_to_time_range=True)

In [None]:
from preprocessing.encoding_categorical_variables.encode_categorical_variables import encode_categorical_variables

# Run the project's categorical encoding on the time-restricted frame.
# The result is the frame every following cell of this notebook works on.
cat_encoded_restricted_feature_df = encode_categorical_variables(restricted_feature_df)

In [None]:
# Preview the first rows of the encoded frame.
cat_encoded_restricted_feature_df.head()

In [None]:
# Add a time-bin column by flooring relative_sample_date to the next lower whole number.
# NOTE(review): the column name suggests relative_sample_date is expressed in hours, so each
# bin would be one hour — confirm against transform_to_relative_timestamps.
# This mutates cat_encoded_restricted_feature_df in place (re-running the cell just overwrites the column).
cat_encoded_restricted_feature_df['relative_sample_date_hourly_cat'] = np.floor(cat_encoded_restricted_feature_df['relative_sample_date'])

In [None]:
# Preview again to check the newly added relative_sample_date_hourly_cat column.
cat_encoded_restricted_feature_df.head()

In [None]:
# sample_label values that are summarised per time bin by three statistics (median, max, min)
# in the downsampling loop further down; every other label is reduced to a single median.
variables_to_down_sample = [
    'NIHSS',
    'oxygen_saturation',
    'systolic_blood_pressure',
    'diastolic_blood_pressure',
    'mean_blood_pressure',
    'heart_rate',
    'respiratory_rate'
]

In [None]:
# List every sample_label that is NOT part of variables_to_down_sample,
# one quoted, comma-terminated label per line (ready to paste into a Python list).
print('These values will not be downsampled (if more than one sample per hour is present, take the median)')
for variable in cat_encoded_restricted_feature_df['sample_label'].unique():
    if variable in variables_to_down_sample:
        # handled by the min/max/median downsampling instead
        continue
    print(f"'{variable}',")

In [None]:
# Median NIHSS value per admission and time bin, returned as a flat frame
# with columns case_admission_id, relative_sample_date_hourly_cat and value.
median_NIHSS = (
    cat_encoded_restricted_feature_df
    .loc[cat_encoded_restricted_feature_df['sample_label'] == 'NIHSS']
    .groupby(['case_admission_id', 'relative_sample_date_hourly_cat'])['value']
    .median()
    .reset_index()
)
median_NIHSS

In [None]:
# Same grouping as the median cell, but showing the minimum NIHSS per admission and time bin.
(
    cat_encoded_restricted_feature_df
    .loc[cat_encoded_restricted_feature_df['sample_label'] == 'NIHSS']
    .groupby(['case_admission_id', 'relative_sample_date_hourly_cat'])['value']
    .min()
    .reset_index()
)

In [None]:
# Short alias for the encoded frame used by the prototype cells below.
# This binds the same object (no copy), so in-place changes to either name affect both.
df = cat_encoded_restricted_feature_df

In [None]:
# Resampling only works on a minimal set of columns: the two grouping keys,
# the label, the source and the value. Start from an independent copy of those.
columns_to_keep = [
    'case_admission_id', 'relative_sample_date_hourly_cat',
    'sample_label', 'source', 'value',
]
resampled_df = df.loc[:, columns_to_keep].copy()

In [None]:
# Downsample the high-frequency variables: for every (admission, time bin) pair, replace the
# raw samples of each variable by three rows labelled median_<variable>, max_<variable> and
# min_<variable>.
#
# Implementation notes:
# - the aggregated frames are collected in a list and concatenated once; DataFrame.append
#   was removed in pandas 2.0 and re-growing the frame on every iteration is quadratic
# - the filter + groupby is built once per variable and reused for all three statistics
# - the resulting row order is unchanged: untouched rows first, then median/max/min per variable
verbose = True
group_keys = ['case_admission_id', 'relative_sample_date_hourly_cat']
downsampled_frames = []
for variable in variables_to_down_sample:
    if verbose:
        print(f"Downsampling: {variable}")
    hourly_values = df[df.sample_label == variable].groupby(group_keys)['value']
    for statistic in ('median', 'max', 'min'):
        statistic_df = hourly_values.agg(statistic).reset_index()
        statistic_df['sample_label'] = f'{statistic}_{variable}'
        # all variables to downsample are from EHR
        statistic_df['source'] = 'EHR'
        downsampled_frames.append(statistic_df)

# drop the raw rows of the downsampled variables, then add the aggregated rows
resampled_df = pd.concat(
    [
        resampled_df[~resampled_df.sample_label.isin(variables_to_down_sample)],
        *downsampled_frames,
    ],
    axis=0,
)

In [None]:
# List the labels now present, to check that the downsampled variables appear
# under their median_/max_/min_ prefixed names.
resampled_df.sample_label.unique()

In [None]:
# Every label of the source frame that is not handled by the min/max/median downsampling.
all_other_vars = [
    label
    for label in df['sample_label'].unique()
    if label not in variables_to_down_sample
]

In [None]:
# For all other variables, when more than one sample per time bin is present, take the median.
#
# Implementation notes:
# - the per-variable frames are collected in a list and concatenated once; DataFrame.append
#   was removed in pandas 2.0 and re-growing the frame on every iteration is quadratic
# - the resulting row order is unchanged: rows of labels outside all_other_vars first,
#   then the median rows in the order of all_other_vars
group_keys = ['case_admission_id', 'relative_sample_date_hourly_cat']
hourly_median_frames = []
for variable in all_other_vars:
    variable_rows = df[df.sample_label == variable]
    median_variable_df = variable_rows.groupby(group_keys)['value'].median().reset_index()
    median_variable_df['sample_label'] = f'{variable}'

    # The source is lost by the aggregation; use the most frequent source of the variable.
    # Using mode as source leads to errors for the following labels: LDL cholesterol calcule, weight, cholesterol total
    # (they are wrongly labeled as coming from the stroke registry)
    median_variable_df['source'] = variable_rows['source'].mode()[0]
    hourly_median_frames.append(median_variable_df)

# drop old rows of these variables, then add their per-bin medians
resampled_df = pd.concat(
    [
        resampled_df[~resampled_df.sample_label.isin(all_other_vars)],
        *hourly_median_frames,
    ],
    axis=0,
)


In [None]:
# count number of values per case_admission_id, relative_sample_date_hourly_cat, sample_label
# (count() reports the non-null entries of each remaining column; after resampling every
# combination is expected to hold exactly one value)
resampled_df.groupby(['case_admission_id', 'relative_sample_date_hourly_cat', 'sample_label']).count().reset_index()

In [None]:
# Inspect the raw rows of one of the labels whose mode-based source was flagged as suspicious above.
cat_encoded_restricted_feature_df[cat_encoded_restricted_feature_df.sample_label == 'LDL cholesterol calcule']


## Test final function

In [None]:
from preprocessing.resample_to_time_bins.resample_to_hourly_features import resample_to_hourly_features

# Run the packaged function on the encoded frame.
# This rebinds resampled_df, replacing the frame built by hand in the cells above.
resampled_df = resample_to_hourly_features(cat_encoded_restricted_feature_df)
resampled_df.head()

In [None]:
# Sanity check: True only if every (admission, time bin, label) combination
# holds exactly one non-null value after resampling.
(
    resampled_df
    .groupby(['case_admission_id', 'relative_sample_date_hourly_cat', 'sample_label'])
    .count()
    .reset_index()['value']
    .eq(1)
    .all()
)

In [None]:
# Show which source value(s) each label carries after resampling.
resampled_df.groupby('sample_label').source.unique()

Note: weight is indeed more frequent in the stroke registry after applying the 72h timeframe.