In [None]:
import argparse
import os
import time

import pandas as pd

from preprocessing.geneva_stroke_unit_preprocessing.utils import create_registry_case_identification_column
from preprocessing.geneva_stroke_unit_preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps
from preprocessing.geneva_stroke_unit_preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database
from preprocessing.preprocessing_tools.encoding_categorical_variables.encode_categorical_variables import encode_categorical_variables
from preprocessing.preprocessing_tools.handling_missing_values.impute_missing_values import impute_missing_values
from preprocessing.preprocessing_tools.normalisation.normalisation import normalise_data
from preprocessing.preprocessing_tools.resample_to_time_bins.resample_to_hourly_features import resample_to_hourly_features

In [None]:
# Run configuration: input locations, output/log directory and verbosity.
# Each path can be overridden through an environment variable so the notebook can run on
# another machine without editing this cell; the defaults are the original local paths.
ehr_data_path = os.environ.get(
    'OPSUM_EHR_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815',
)
stroke_registry_data_path = os.environ.get(
    'OPSUM_STROKE_REGISTRY_DATA_PATH',
    '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx',
)
patient_selection_path = os.environ.get(
    'OPSUM_PATIENT_SELECTION_PATH',
    '/Users/jk1/temp/opsum_extraction_output/gsu_extraction_01012023_222140/high_frequency_data_patient_selection_with_details.csv',
)
log_dir = os.environ.get(
    'OPSUM_LOG_DIR',
    '/Users/jk1/temp/opsum_prepro_output/temp_output',
)

# Passed as `verbose=` to the preprocessing steps below.
verbose: bool = True

In [None]:
# Assemble the feature table. Steps covered by this stage:
#   1. keep only the selected patients (dropping those with no EHR data or EHR data with wrong dates)
#   2. preprocess the EHR and stroke registry variables
#   3. keep only the selected variables
#   4. combine lab / scales / ventilation / vitals with the stroke registry subparts
print('STARTING VARIABLE PREPROCESSING')
feature_df = assemble_variable_database(
    ehr_data_path,
    stroke_registry_data_path,
    patient_selection_path,
    log_dir=log_dir,
    verbose=verbose,
)
print(f"A. Number of patients: {feature_df['case_admission_id'].nunique()}")

In [None]:
# Load the patient selection table with every column read as a string, then add a
# `case_admission_id` column built by create_registry_case_identification_column
# (imported in the top import cell).
# NOTE(review): presumably this id has the same format as feature_df.case_admission_id,
# so the patient counts can be compared - confirm.
patient_selection = pd.read_csv(patient_selection_path, dtype=str)
patient_selection['case_admission_id'] = create_registry_case_identification_column(patient_selection)


In [None]:
# Number of distinct case_admission_id values in the patient selection.
patient_selection['case_admission_id'].nunique()

In [None]:
# Preview the first rows of the assembled feature table.
feature_df.head(n=5)

In [None]:
# sample_label values that have at least one missing `value` after assembly.
feature_df.loc[feature_df['value'].isna(), 'sample_label'].unique()

In [None]:
# Number of distinct admissions in the assembled feature table.
feature_df['case_admission_id'].nunique()

In [None]:
# 5. Convert timestamps to timestamps relative to the first measure
# 6. Apply the time-range constraints:
#    - exclude patients whose data was sampled in a time window < 12h
#    - restrict to the desired time range of 72h
print('TRANSFORMING TO RELATIVE TIME AND RESTRICTING TIME RANGE')
restricted_feature_df = transform_to_relative_timestamps(
    feature_df,
    drop_old_columns=False,
    restrict_to_time_range=True,
    desired_time_range=72,
    enforce_min_time_range=True,
    min_time_range=12,
    log_dir=log_dir,
)
print(f"B. Number of patients: {restricted_feature_df['case_admission_id'].nunique()}")

In [None]:
# Preview the first rows after the time-range restriction.
restricted_feature_df.head(n=5)

In [None]:
# sample_label values that have at least one missing `value` after the time-range restriction.
restricted_feature_df.loc[restricted_feature_df['value'].isna(), 'sample_label'].unique()


In [None]:
# Number of distinct admissions left after the time-range restriction.
restricted_feature_df['case_admission_id'].nunique()

In [None]:
# 7. Encoding categorical variables (one-hot)
print('ENCODING CATEGORICAL VARIABLES')
cat_encoded_restricted_feature_df = encode_categorical_variables(restricted_feature_df, verbose=verbose,
                                                                 log_dir=log_dir)
# Report the patient count for this stage too, so the A-F stage log has no gap between B and D.
print(f'C. Number of patients: {cat_encoded_restricted_feature_df.case_admission_id.nunique()}')

In [None]:
# Preview the first rows after categorical encoding.
cat_encoded_restricted_feature_df.head(n=5)

In [None]:
# sample_label values that have at least one missing `value` after categorical encoding.
cat_encoded_restricted_feature_df.loc[
    cat_encoded_restricted_feature_df['value'].isna(), 'sample_label'
].unique()


In [None]:
# Number of distinct admissions after categorical encoding.
cat_encoded_restricted_feature_df['case_admission_id'].nunique()

In [None]:
# 8. Resample the encoded features to an hourly frequency
print('RESAMPLING TO HOURLY FREQUENCY')
resampled_df = resample_to_hourly_features(
    cat_encoded_restricted_feature_df,
    verbose=verbose,
)
print(f"D. Number of patients: {resampled_df['case_admission_id'].nunique()}")

In [None]:
# Preview the first rows after hourly resampling.
resampled_df.head(n=5)

In [None]:
# sample_label values that have at least one missing `value` after hourly resampling.
resampled_df.loc[resampled_df['value'].isna(), 'sample_label'].unique()


In [None]:
# Number of distinct admissions after hourly resampling.
resampled_df['case_admission_id'].nunique()

In [None]:
# 9. Impute missing values in the hourly-resampled data
print('IMPUTING MISSING VALUES')
imputed_missing_df = impute_missing_values(
    resampled_df,
    verbose=verbose,
)
print(f"E. Number of patients: {imputed_missing_df['case_admission_id'].nunique()}")

In [None]:
# Preview the first rows after imputation.
imputed_missing_df.head(n=5)

In [None]:
# Any rows listed here still have a missing `value` after the imputation step.
imputed_missing_df.loc[imputed_missing_df['value'].isna()]

In [None]:
# Number of distinct admissions after imputation.
imputed_missing_df['case_admission_id'].nunique()

In [None]:
# 10. Normalise the imputed data
print('APPLYING NORMALISATION')
normalised_df = normalise_data(
    imputed_missing_df,
    verbose=verbose,
    log_dir=log_dir,
)
print(f"F. Number of patients: {normalised_df['case_admission_id'].nunique()}")

In [None]:
# Show the first 1000 rows of the normalised data.
normalised_df.head(n=1000)

In [None]:
# sample_label values that have at least one missing `value` after normalisation.
normalised_df.loc[normalised_df['value'].isna(), 'sample_label'].unique()

In [None]:
# Count distinct admissions with nunique(), like every other stage check in this notebook,
# so the figure is directly comparable with the "F. Number of patients" print.
# (len(unique()) would additionally count a missing id as its own value.)
normalised_df.case_admission_id.nunique()

In [None]:
# Summary statistics of the imputed (not yet normalised) rows labelled 'creatinine'.
imputed_missing_df.loc[imputed_missing_df['sample_label'] == 'creatinine'].describe()


In [None]:
# Summary statistics of the imputed (not yet normalised) rows labelled 'max_heart_rate'.
imputed_missing_df.loc[imputed_missing_df['sample_label'] == 'max_heart_rate'].describe()

In [None]:
# TODO:
# - check if all selected variables are present every hour

In [None]:
# Keep the first 1000 normalised rows under `temp`.
# NOTE(review): presumably only for manual inspection - confirm whether this is still needed.
temp = normalised_df.head(n=1000)