In [None]:
import pandas
import numpy as np
import pandas as pd

In [None]:
# Input locations for this notebook.
# NOTE(review): these are absolute paths on one user's machine; adjust them before running elsewhere.
data_path = (
    '/Users/jk1/stroke_datasets/stroke_unit_dataset'
    '/per_value/Extraction_20211110'
)
admission_data_path = (
    '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset'
    '/data/stroke_registry/post_hoc_modified'
    '/stroke_registry_post_hoc_modified.xlsx'
)
patient_selection_path = (
    '/Users/jk1/temp/opsum_extration_output'
    '/high_frequency_data_patient_selection.csv'
)

In [None]:
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# Build feature_df from the three input paths configured above.
# NOTE(review): assemble_variable_database is project code not visible in this notebook;
# the schema of its output should be confirmed against that module.
feature_df = assemble_variable_database(data_path, admission_data_path, patient_selection_path)

In [None]:
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps

# Derive restricted_feature_df from feature_df: the original columns are kept
# (drop_old_columns=False) and the helper is asked to restrict the data to its time range
# (restrict_to_time_range=True).
# NOTE(review): the actual time window is defined inside the helper — confirm there.
restricted_feature_df = transform_to_relative_timestamps(feature_df, drop_old_columns=False, restrict_to_time_range=True)

In [None]:
from preprocessing.normalisation.normalisation import normalise_data

# Normalise the restricted frame with the project helper; verbose=True is passed through to it.
# NOTE(review): which variables are normalised and how is decided inside normalise_data — confirm there.
normalised_restricted_feature_df = normalise_data(restricted_feature_df, verbose=True)

In [None]:
# Preview the first rows of the normalised frame.
normalised_restricted_feature_df.head()

In [None]:
# Sample labels treated as continuous in this notebook: every label of
# normalised_restricted_feature_df that is NOT listed here is handled as a
# categorical variable by the cells that follow.
variables_to_normalize = [
    'proBNP', 'bilirubine totale', 'thrombocytes', 'creatinine',
    'calcium corrige', 'hemoglobine', 'INR', 'potassium',
    'glycemie moyenne estimee', 'hematocrite', 'uree', 'erythrocytes',
    'glucose', 'leucocytes', 'hemoglobine glyquee', 'sodium',
    'proteine C-reactive', 'ALAT', 'FIO2', 'oxygen_saturation',
    'systolic_blood_pressure', 'diastolic_blood_pressure', 'mean_blood_pressure', 'heart_rate',
    'respiratory_rate', 'temperature', 'weight', 'age',
    'NIHSS', 'triglycerides', 'ASAT', 'cholesterol HDL',
    'Glasgow Coma Scale', 'fibrinogene', 'PTT', 'cholesterol total',
    'LDL cholesterol calcule',
]

In [None]:
# For every sample label that is not in variables_to_normalize, print the label
# together with the number of distinct entries found in its 'value' column.
label_column = normalised_restricted_feature_df.sample_label
for variable in label_column.unique():
    if variable not in variables_to_normalize:
        values_of_variable = normalised_restricted_feature_df[label_column == variable]['value']
        print(f"'{variable}', {len(values_of_variable.unique())}")

## Encoding categorical variables

The difference between binary and non-binary variables is irrelevant when dummy-encoding variables:
- binary variables are encoded as 0/1 for one of the two categories, as (variable_category1)
- non-binary variables are encoded as 0/1 for n-1 of the n categories, as (variable_category1, variable_category2, ..., variable_category_n-1)

In [None]:
# Every sample label that is not in variables_to_normalize is treated as categorical.
all_sample_labels = normalised_restricted_feature_df.sample_label.unique()
categorical_variables = [label for label in all_sample_labels if label not in variables_to_normalize]
categorical_variables

In [None]:
# Categorical sample labels listed by hand as having more than two categories.
non_binary_categorical_variables = [
    "Referral",
    "Prestroke disability (Rankin)",
    "categorical_onset_to_admission_time",
    "categorical_IVT",
    "categorical_IAT",
]

In [None]:
# Binary variables: the sample labels that are neither in variables_to_normalize
# nor in the hand-written list of non-binary categorical variables.
binary_variables = [
    label
    for label in normalised_restricted_feature_df.sample_label.unique()
    if not (label in variables_to_normalize or label in non_binary_categorical_variables)
]

In [None]:
# Preview the first rows whose sample_label is one of the categorical variables.
normalised_restricted_feature_df[normalised_restricted_feature_df.sample_label.isin(categorical_variables)].head()

In [None]:
# Trial run of the dummy encoding on a single variable ('Referral'):
# expand its 'value' column into indicator columns, dropping the first level.
referral_rows = normalised_restricted_feature_df[normalised_restricted_feature_df.sample_label == 'Referral']
dummy_coded_temp = pd.get_dummies(referral_rows, columns=['value'], prefix='referral', drop_first=True)
# Normalise the column names: lower case, underscores instead of spaces.
dummy_coded_temp.columns = [str(name).lower().replace(' ', '_') for name in dummy_coded_temp.columns]
dummy_coded_temp.head()

In [None]:
# Remove the original label column, then reshape the indicator columns back into
# long format: each indicator column name becomes a 'sample_label', its 0/1 entry the 'value'.
# The drop is reassigned (not in-place) with errors='ignore' so this cell can be re-run
# without raising a KeyError once 'sample_label' has already been removed.
dummy_coded_temp = dummy_coded_temp.drop(columns=['sample_label'], errors='ignore')
dummy_coded_temp.melt(id_vars=['case_admission_id', 'sample_date', 'source', 'first_sample_date', 'relative_sample_date'], var_name='sample_label', value_name='value')

In [None]:
# Inspect the column names currently held by dummy_coded_temp (as a set).
set(dummy_coded_temp.columns)

In [None]:
# Start the encoded table from a copy, so normalised_restricted_feature_df itself stays unchanged.
one_hot_normalised_restricted_features = normalised_restricted_feature_df.copy()

In [None]:
# One-hot (dummy) encode every categorical variable of the long-format table.
# For each variable its rows are expanded with pd.get_dummies(drop_first=True), so the
# dropped first level acts as the baseline; the indicator columns are then melted back
# into (sample_label, value) rows which replace the original rows of that variable.
id_columns = ['case_admission_id', 'sample_date', 'source', 'first_sample_date', 'relative_sample_date']

hot_one_encoded_variables = []
encoded_frames = []
remaining_features = one_hot_normalised_restricted_features
for categorical_variable in categorical_variables:
    variable_rows = remaining_features[remaining_features.sample_label == categorical_variable]
    prefix = str(categorical_variable).lower()
    dummy_coded_temp = pd.get_dummies(variable_rows, columns=['value'], prefix=prefix, drop_first=True)

    # find baseline value: the level(s) of this variable that did not get an indicator column
    # (indicator columns are named '<prefix>_<level>', so strip the prefix and separator)
    encoded_levels = {
        str(col)[len(prefix) + 1:]
        for col in dummy_coded_temp.columns
        if col not in variable_rows.columns
    }
    baseline_value = [var for var in variable_rows['value'].unique() if str(var) not in encoded_levels]
    print(f'Baseline for {categorical_variable}: {baseline_value}')

    dummy_coded_temp = dummy_coded_temp.drop(columns=['sample_label'])
    dummy_coded_temp.columns = [str(col).lower().replace(' ', '_') for col in dummy_coded_temp.columns]
    # record only the indicator columns; identifier columns are not encoded variables
    hot_one_encoded_variables += [col for col in dummy_coded_temp.columns if col not in id_columns]
    dummy_coded_temp = dummy_coded_temp.melt(id_vars=id_columns, var_name='sample_label', value_name='value')
    encoded_frames.append(dummy_coded_temp)

    # drop original categorical variable
    remaining_features = remaining_features[remaining_features.sample_label != categorical_variable]

# DataFrame.append was removed in pandas 2.0; concatenate once instead of growing the frame per iteration
one_hot_normalised_restricted_features = pd.concat([remaining_features] + encoded_frames)

In [None]:
# Distinct names collected in hot_one_encoded_variables by the encoding loop.
set(hot_one_encoded_variables)

In [None]:
# Sample labels present in the table after the encoding loop.
one_hot_normalised_restricted_features.sample_label.unique()

In [None]:
# Show the rows whose sample_label is among the names collected in hot_one_encoded_variables.
one_hot_normalised_restricted_features[one_hot_normalised_restricted_features.sample_label.isin(hot_one_encoded_variables)]

In [None]:
from preprocessing.encoding_categorical_variables.encode_categorical_variables import encode_categorical_variables

# Run the packaged encoder on the normalised frame; being the last expression of the cell,
# its return value is displayed.
# NOTE(review): presumably the module counterpart of the manual encoding loop in this
# notebook — confirm against the module source.
encode_categorical_variables(normalised_restricted_feature_df)