In [None]:
import argparse
import os

import pandas as pd
import time

from preprocessing.encoding_categorical_variables.encode_categorical_variables import encode_categorical_variables
from preprocessing.handling_missing_values.impute_missing_values import impute_missing_values
from preprocessing.normalisation.normalisation import normalise_data
from preprocessing.resample_to_time_bins.resample_to_hourly_features import resample_to_hourly_features
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps


In [None]:
# Input locations. Each default is the original machine-specific path; set the
# matching OPSUM_* environment variable to run the notebook on another machine
# without editing this cell.
ehr_data_path = os.environ.get(
    'OPSUM_EHR_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction20220629',
)
stroke_registry_data_path = os.environ.get(
    'OPSUM_STROKE_REGISTRY_DATA_PATH',
    "/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx",
)
patient_selection_path = os.environ.get(
    'OPSUM_PATIENT_SELECTION_PATH',
    '/Users/jk1/temp/opsum_extration_output/high_frequency_data_patient_selection_with_details.csv',
)
# Forwarded as `verbose=` to the preprocessing calls further down.
verbose: bool = True

In [None]:

# Build the feature table. Per the pipeline outline this stage:
#   1. restricts to the patient selection
#   2. preprocesses EHR and stroke registry variables
#   3. restricts to the variable selection
#   4. assembles the database from lab/scales/ventilation/vitals + stroke registry subparts
print('STARTING VARIABLE PREPROCESSING')
feature_df = assemble_variable_database(
    ehr_data_path,
    stroke_registry_data_path,
    patient_selection_path,
    verbose=verbose,
)

In [None]:
# Preview the first rows of the assembled features.
feature_df.head(n=5)

In [None]:
# List the sample_label values that have at least one missing `value` after assembly.
feature_df.loc[feature_df['value'].isna(), 'sample_label'].unique()

In [None]:
# Number of distinct case_admission_id values after assembly.
feature_df['case_admission_id'].nunique()

In [None]:
# Steps 5-6: convert timestamps to be relative to the first measure, then
# restrict to the requested time range.
print('TRANSFORMING TO RELATIVE TIME AND RESTRICTING TIME RANGE')
restricted_feature_df = transform_to_relative_timestamps(
    feature_df,
    drop_old_columns=False,
    restrict_to_time_range=True,
    desired_time_range=72,  # unit is not visible here (presumably hours) - TODO confirm
    enforce_min_time_range=True,
    min_time_range=12,  # same unit as desired_time_range, presumably - TODO confirm
)

In [None]:
# Preview the first rows after the time-range restriction.
restricted_feature_df.head(n=5)

In [None]:
# List the sample_label values that have at least one missing `value` after time restriction.
restricted_feature_df.loc[restricted_feature_df['value'].isna(), 'sample_label'].unique()


In [None]:
# Number of distinct case_admission_id values after time restriction.
restricted_feature_df['case_admission_id'].nunique()

In [None]:
# Step 7: encode categorical variables (one-hot).
print('ENCODING CATEGORICAL VARIABLES')
cat_encoded_restricted_feature_df = encode_categorical_variables(
    restricted_feature_df,
    verbose=verbose,
)

In [None]:
# Preview the first rows after categorical encoding.
cat_encoded_restricted_feature_df.head(n=5)

In [None]:
# List the sample_label values that have at least one missing `value` after encoding.
cat_encoded_restricted_feature_df.loc[
    cat_encoded_restricted_feature_df['value'].isna(), 'sample_label'
].unique()


In [None]:
# Number of distinct case_admission_id values after encoding.
cat_encoded_restricted_feature_df['case_admission_id'].nunique()

In [None]:
# Step 8: resample the encoded features to hourly frequency.
print('RESAMPLING TO HOURLY FREQUENCY')
resampled_df = resample_to_hourly_features(
    cat_encoded_restricted_feature_df,
    verbose=verbose,
)

In [None]:
# Preview the first rows after resampling.
resampled_df.head(n=5)

In [None]:
# List the sample_label values that have at least one missing `value` after resampling.
resampled_df.loc[resampled_df['value'].isna(), 'sample_label'].unique()


In [None]:
# Number of distinct case_admission_id values after resampling.
resampled_df['case_admission_id'].nunique()

In [None]:
# Step 9: impute missing values in the resampled features.
print('IMPUTING MISSING VALUES')
imputed_missing_df = impute_missing_values(
    resampled_df,
    verbose=verbose,
)

In [None]:
# Preview the first rows after imputation.
imputed_missing_df.head(n=5)

In [None]:
# Show every row whose `value` is still missing after imputation.
imputed_missing_df.loc[imputed_missing_df['value'].isna()]

In [None]:
# Number of distinct case_admission_id values after imputation.
imputed_missing_df['case_admission_id'].nunique()

In [None]:
# Step 10: normalise the imputed features.
print('APPLYING NORMALISATION')
normalised_df = normalise_data(
    imputed_missing_df,
    verbose=verbose,
)

In [None]:
# Inspect the first 1000 rows of the normalised data (a much larger preview
# than the default used for the earlier stages).
normalised_df.head(n=1000)

In [None]:
# List the sample_label values that have at least one missing `value` after normalisation.
normalised_df.loc[normalised_df['value'].isna(), 'sample_label'].unique()

In [None]:
# Count distinct case_admission_id values with .nunique(), the same call used
# after every earlier stage, so the per-stage counts are directly comparable.
# (len(.unique()) would additionally count NaN as an id; .nunique() ignores it.)
normalised_df.case_admission_id.nunique()

In [None]:
# Summary statistics for the rows labelled 'FIO2' in the imputed data.
imputed_missing_df.loc[imputed_missing_df['sample_label'] == 'FIO2'].describe()


In [None]:
# Independent copy of the 'FIO2' values, used by the outlier-fence experiments below.
temp = imputed_missing_df.loc[imputed_missing_df['sample_label'] == 'FIO2', 'value'].copy()

In [None]:
# Lower fence: Q1 - 1.5 * IQR, with each quartile computed once.
fio2_q1 = temp.quantile(0.25)
fio2_q3 = temp.quantile(0.75)
fio2_q1 - 1.5 * (fio2_q3 - fio2_q1)

In [None]:
# Check whether the lower fence (Q1 - 1.5 * IQR) equals the upper fence (Q3 + 1.5 * IQR).
fio2_q1 = temp.quantile(0.25)
fio2_q3 = temp.quantile(0.75)
fio2_q1 - 1.5 * (fio2_q3 - fio2_q1) == fio2_q3 + 1.5 * (fio2_q3 - fio2_q1)

In [None]:
# Clip the FIO2 values to IQR-based fences, computing each statistic once.
# NOTE(review): the median/1000 term sits inside the 1.5 * (...) factor, so for a
# positive median it raises the lower fence instead of lowering it - confirm intent.
# This cell reassigns `temp`, so re-running it clips the already-clipped values.
fio2_q1 = temp.quantile(0.25)
fio2_q3 = temp.quantile(0.75)
fio2_median = temp.median()
temp = temp.clip(
    lower=fio2_q1 - 1.5 * (fio2_q3 - fio2_q1 - fio2_median / 1000),
    upper=fio2_q3 + 1.5 * (fio2_q3 - fio2_q1),
)

In [None]:
# Summary statistics of the FIO2 values held in `temp`.
temp.describe(percentiles=None)

In [None]:
# Summary statistics for the rows labelled 'Glasgow Coma Scale' in the imputed data.
imputed_missing_df.loc[imputed_missing_df['sample_label'] == 'Glasgow Coma Scale'].describe()

In [None]:
# Output settings. The default directory is the original machine-specific path;
# set OPSUM_PREPRO_OUTPUT_DIR to write elsewhere without editing this cell.
save_file_prefix: str = 'preprocessed_data'
output_dir = os.environ.get('OPSUM_PREPRO_OUTPUT_DIR', '/Users/jk1/temp/opsum_prepro_output')

In [None]:
# Build the output CSV path: <output_dir>/<prefix>_<DDMMYYYY_HHMMSS>.csv,
# stamped with the local time at which this cell runs.
timestamp = time.strftime("%d%m%Y_%H%M%S")
output_filename = f'{save_file_prefix}_{timestamp}.csv'
save_path = os.path.join(output_dir, output_filename)

In [None]:
# Export is currently commented out; uncomment the line below to write normalised_df to save_path.
# normalised_df.to_csv(save_path)