In [None]:
import pandas
import numpy as np
import pandas as pd

In [None]:
# Input locations used by the cells below.
# NOTE(review): these are absolute paths on one specific machine; adjust them before running elsewhere.
# directory of the 2022-08-15 per-value extraction
data_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815'
# stroke registry spreadsheet (post-hoc modified version)
admission_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'
# CSV describing the patient selection
patient_selection_path = '/Users/jk1/temp/opsum_extraction_output/high_frequency_data_patient_selection_with_details.csv'

In [None]:
# Project helper that assembles the variable database from the three input paths.
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# NOTE(review): presumably returns one long-format frame (one row per sample) — confirm against the helper
feature_df = assemble_variable_database(data_path, admission_data_path, patient_selection_path)

In [None]:
# Project helper that works on the timestamps of `feature_df`.
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps

# Keep the original date columns (drop_old_columns=False) and restrict the samples to the
# helper's time range (restrict_to_time_range=True).
# NOTE(review): the exact time range is defined inside the helper — confirm there
restricted_feature_df = transform_to_relative_timestamps(feature_df, drop_old_columns=False, restrict_to_time_range=True)

## Encoding categorical variables

The distinction between binary and non-binary variables is irrelevant when dummy-encoding variables:
- binary variables are encoded as 0/1 for one of the two categories, as (variable_category1)
- non-binary variables are encoded as 0/1 for n-1 of the n categories, as (variable_category1, variable_category2, ..., variable_category_n-1)

In [None]:
# Values of `sample_label` that are treated as categorical and therefore dummy-encoded.
categorical_variables = [
    # demographics and admission
    'Sex',
    'Referral',
    'Prestroke disability (Rankin)',
    # pre-stroke medication
    'Antihypert. drugs pre-stroke',
    'Lipid lowering drugs pre-stroke',
    'Antiplatelet drugs',
    'Anticoagulants',
    # medical history
    'MedHist Hypertension',
    'MedHist Diabetes',
    'MedHist Hyperlipidemia',
    'MedHist Smoking',
    'MedHist Atrial Fibr.',
    'MedHist CHD',
    'MedHist PAD',
    'MedHist cerebrovascular_event',
    # stroke onset and acute treatment
    'categorical_onset_to_admission_time',
    'wake_up_stroke',
    'categorical_IVT',
    'categorical_IAT',
]

In [None]:
# Print every categorical variable found in the data together with its number of distinct values.
for variable in restricted_feature_df.sample_label.unique():
    if variable not in categorical_variables:
        continue
    distinct_values = restricted_feature_df.loc[restricted_feature_df.sample_label == variable, 'value'].unique()
    print(f"'{variable}', {len(distinct_values)}")


The following variables will not be one-hot encoded, as they are considered continuous:

In [None]:
# Print every variable that is NOT in the categorical list together with its number of distinct values.
for variable in restricted_feature_df.sample_label.unique():
    if variable in categorical_variables:
        continue
    distinct_values = restricted_feature_df.loc[restricted_feature_df.sample_label == variable, 'value'].unique()
    print(f"'{variable}', {len(distinct_values)}")


In [None]:
# Preview the first rows that belong to a categorical variable.
restricted_feature_df.loc[restricted_feature_df.sample_label.isin(categorical_variables)].head()

In [None]:
# Trial run of the dummy encoding on a single variable ('Referral'):
# expand its 'value' column into 'referral_<category>' columns, dropping the first category.
referral_rows = restricted_feature_df.loc[restricted_feature_df.sample_label == 'Referral']
dummy_coded_temp = pd.get_dummies(referral_rows, prefix='referral', columns=['value'], drop_first=True)
# normalise all column names: lower case, spaces replaced by underscores
dummy_coded_temp = dummy_coded_temp.rename(columns=lambda col: str(col).lower().replace(' ', '_'))
dummy_coded_temp.head()

In [None]:
# Remove the original label column: after melting, each dummy column name becomes the new label.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell re-runnable:
# a second execution no longer raises KeyError because 'sample_label' is already gone.
dummy_coded_temp = dummy_coded_temp.drop(columns=['sample_label'], errors='ignore')
# Back to long format: one row per (sample, dummy variable); the result is only displayed here.
dummy_coded_temp.melt(
    id_vars=['case_admission_id', 'sample_date', 'source', 'first_sample_date', 'relative_sample_date'],
    var_name='sample_label',
    value_name='value',
)

In [None]:
# Show the column names of the trial dummy frame.
set(dummy_coded_temp.columns)

In [None]:
# Work on a copy so that `restricted_feature_df` itself is left untouched by the encoding.
one_hot_encoded_df = restricted_feature_df.copy()

In [None]:
# Dummy-encode every categorical variable of the long-format table: each variable is
# expanded with pd.get_dummies (first category dropped as baseline), melted back to one
# row per (sample, dummy variable), and finally swapped in for the original rows.
hot_one_encoded_variables = []  # names of the dummy variables created below
verbose = True  # print the baseline (dropped) category of every variable
# columns identifying a sample; they are carried through the melt unchanged
id_columns = ['case_admission_id', 'sample_date', 'source', 'first_sample_date', 'relative_sample_date']
encoded_frames = []
for categorical_variable in categorical_variables:
    prefix = str(categorical_variable).lower()
    variable_rows = one_hot_encoded_df[one_hot_encoded_df.sample_label == categorical_variable]
    # NOTE: pandas >= 2.0 yields boolean dummy columns by default; pass dtype=int here
    # if integer 0/1 values are required downstream.
    dummy_coded_temp = pd.get_dummies(variable_rows, columns=['value'], prefix=prefix, drop_first=True)
    # get_dummies replaces 'value' by the dummy columns, so every column that was not
    # part of the input frame is a dummy column
    dummy_columns = [col for col in dummy_coded_temp.columns if col not in variable_rows.columns]

    if verbose:
        # the baseline is whichever value did not get a dummy column of its own
        # (missing values get no column either, so they show up in this list as well)
        encoded_values = {str(col)[len(prefix) + 1:] for col in dummy_columns}
        baseline_value = [var for var in variable_rows['value'].unique() if str(var) not in encoded_values]
        print(f'Baseline for {categorical_variable}: {baseline_value}')

    # normalise all column names: lower case, spaces replaced by underscores
    dummy_coded_temp.columns = [str(col).lower().replace(' ', '_') for col in dummy_coded_temp.columns]
    # register only the dummy variables, not the identifier columns of the frame
    hot_one_encoded_variables += [str(col).lower().replace(' ', '_') for col in dummy_columns]
    # back to long format; the dummy column name becomes the new sample_label
    encoded_frames.append(
        dummy_coded_temp
        .drop(columns=['sample_label'])
        .melt(id_vars=id_columns, var_name='sample_label', value_name='value')
    )

# Replace the original categorical rows by their dummy-encoded counterparts in a single
# concatenation (DataFrame.append was removed in pandas 2.0, and concatenating once
# avoids re-copying the whole table for every variable).
one_hot_encoded_df = pd.concat(
    [one_hot_encoded_df[~one_hot_encoded_df.sample_label.isin(categorical_variables)], *encoded_frames]
)

In [None]:
# Distinct names collected in `hot_one_encoded_variables` by the encoding loop.
set(hot_one_encoded_variables)

In [None]:
# First rows of the table after encoding.
one_hot_encoded_df.head()

In [None]:
# All variable names present after encoding.
one_hot_encoded_df.sample_label.unique()

In [None]:
# Show only the rows whose label was registered during the encoding loop.
one_hot_encoded_df.loc[one_hot_encoded_df.sample_label.isin(hot_one_encoded_variables)]

### Testing final function

In [None]:
# Run the packaged project function on the same input frame that the manual loop above started from.
from preprocessing.encoding_categorical_variables.encode_categorical_variables import encode_categorical_variables

# NOTE(review): machine-specific absolute path; what the function writes to `log_dir` is not visible here — confirm
log_dir = '/Users/jk1/temp/opsum_prepro_output/temp_output'

encoded_df = encode_categorical_variables(restricted_feature_df, log_dir=log_dir)

In [None]:
# First rows of the frame returned by encode_categorical_variables.
encoded_df.head()

In [None]:
# Labels that exist only after encoding, i.e. the variables introduced by the function.
set(encoded_df.sample_label.unique()).difference(restricted_feature_df.sample_label.unique())

In [None]:
# Number of non-null entries per column for every variable in the encoded frame.
encoded_df.groupby('sample_label').count()