In [None]:
import os.path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Input locations handed to assemble_variable_database in the next code cell.
# NOTE(review): these are absolute paths on one user's machine; they must be adjusted
# before the notebook can run anywhere else.
data_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815'
admission_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'
patient_selection_path = '/Users/jk1/temp/opsum_extraction_output/high_frequency_data_patient_selection_with_details.csv'

In [None]:
# Options forwarded as keyword arguments to the preprocessing calls further down.
verbose = True
# Empty string during the exploratory part; log_dir is reassigned to a real directory
# before normalise_data is tested at the end of the notebook.
# NOTE(review): presumably an empty log_dir disables log output — confirm in the preprocessing package.
log_dir = ''

In [None]:
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# Build the starting feature dataframe from the three configured input paths.
feature_df = assemble_variable_database(data_path, admission_data_path, patient_selection_path)

In [None]:
from preprocessing.handling_missing_values.impute_missing_values import impute_missing_values
from preprocessing.resample_to_time_bins.resample_to_hourly_features import resample_to_hourly_features
from preprocessing.encoding_categorical_variables.encode_categorical_variables import encode_categorical_variables
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps

# Preprocessing chain that produces the input for all normalisation experiments below.
# Each step consumes the dataframe returned by the previous one.

# Step 1: relative timestamps, with a desired range of 72 and an enforced minimum range of 12.
# NOTE(review): the unit of both ranges is not visible here — presumably hours, confirm.
restricted_feature_df = transform_to_relative_timestamps(feature_df, drop_old_columns=False,
                                                         restrict_to_time_range=True, desired_time_range=72,
                                                         enforce_min_time_range=True, min_time_range=12,
                                                         log_dir=log_dir)


# Step 2: categorical encoding.
cat_encoded_restricted_feature_df = encode_categorical_variables(restricted_feature_df, verbose=verbose,
                                                                 log_dir=log_dir)

# Step 3: resampling to hourly features.
resampled_df = resample_to_hourly_features(cat_encoded_restricted_feature_df, verbose=verbose)

# Step 4: missing-value imputation; `imputed_missing_df` is what the rest of the notebook works on.
imputed_missing_df = impute_missing_values(resampled_df, verbose=verbose)


In [None]:
# Preview the first rows of the preprocessed dataframe (rich display as the cell's last expression).
imputed_missing_df.head()

In [None]:
# Sample labels whose `value` column takes more than two distinct values,
# i.e. everything that is neither binary nor constant.
non_binary_sample_labels = [
    label
    for label in imputed_missing_df.sample_label.unique()
    if imputed_missing_df.loc[imputed_missing_df.sample_label == label, 'value'].nunique() > 2
]

# Rows of the dataframe that belong to one of those non-binary labels.
non_binary_row_mask = imputed_missing_df.sample_label.isin(non_binary_sample_labels)
imputed_missing_df_non_binary_feature_df = imputed_missing_df[non_binary_row_mask]

In [None]:
# Print every non-binary label as a quoted, comma-terminated line.
# NOTE(review): presumably meant for copy-pasting into the `variables_to_normalize` list — confirm.
for variable in imputed_missing_df_non_binary_feature_df.sample_label.unique():
    print(f"'{variable}',")

In [None]:
# Variables that are winsorized and then z-scored further down.
# The order is kept exactly as curated: the loops below iterate in list order,
# so it determines the order of the plots and of the stored normalisation parameters.
variables_to_normalize = [
    'proBNP', 'bilirubine totale', 'thrombocytes', 'creatinine', 'calcium corrige',
    'hemoglobine', 'INR', 'potassium', 'glycemie moyenne estimee', 'hematocrite',
    'uree', 'erythrocytes', 'glucose', 'leucocytes', 'hemoglobine glyquee',
    'sodium', 'proteine C-reactive', 'ALAT', 'FIO2',
    # max_* summary features
    'max_NIHSS', 'max_diastolic_blood_pressure', 'max_heart_rate', 'max_mean_blood_pressure',
    'max_oxygen_saturation', 'max_respiratory_rate', 'max_systolic_blood_pressure',
    # min_* summary features
    'min_NIHSS', 'min_diastolic_blood_pressure', 'min_heart_rate', 'min_mean_blood_pressure',
    'min_oxygen_saturation', 'min_respiratory_rate', 'min_systolic_blood_pressure',
    # median_* summary features
    'median_NIHSS', 'median_diastolic_blood_pressure', 'median_heart_rate', 'median_mean_blood_pressure',
    'median_oxygen_saturation', 'median_respiratory_rate', 'median_systolic_blood_pressure',
    # remaining variables
    'temperature', 'weight', 'age', 'triglycerides', 'ASAT', 'cholesterol HDL',
    'Glasgow Coma Scale', 'fibrinogene', 'PTT', 'cholesterol total',
    'LDL cholesterol calcule', 'chlore', 'lactate',
]

## Following variables are not normalized

In [None]:
# Every sample label in the dataframe that is absent from `variables_to_normalize`.
not_normalized_variables = [
    label
    for label in imputed_missing_df.sample_label.unique()
    if label not in variables_to_normalize
]

# Echo them in the same quoted, comma-terminated format used for the label listing above.
print('Following variables are not normalized:')
for label in not_normalized_variables:
    print(f"'{label}',")

## Normalize variables

For continuous variables:
  - Winsorize: values below Q1 − 1.5 × IQR or above Q3 + 1.5 × IQR are set to the respective limit (Q1 and Q3 are the 25th and 75th percentiles, IQR = Q3 − Q1)
  - Scale to a mean of 0 with an SD of 1

In [None]:
# Worked example: 50-bin histogram of the 'median_heart_rate' values before any normalisation.
imputed_missing_df[imputed_missing_df.sample_label == 'median_heart_rate'].value.plot.hist(bins=50)
plt.show()

In [None]:
# Worked example on 'median_heart_rate': print the IQR plus the upper and lower
# 1.5*IQR fences, then clip the series to those fences.
temp = imputed_missing_df.loc[imputed_missing_df.sample_label == 'median_heart_rate', 'value'].copy()

lower_quartile = temp.quantile(0.25)
upper_quartile = temp.quantile(0.75)
interquartile_range = upper_quartile - lower_quartile
upper_fence = upper_quartile + 1.5 * interquartile_range
lower_fence = lower_quartile - 1.5 * interquartile_range

print(interquartile_range)
print(upper_fence)
print(lower_fence)

temp = temp.clip(lower=lower_fence, upper=upper_fence)

In [None]:
# 20-bin histogram of the example series after clipping.
temp.plot.hist(bins=20)
plt.show()

In [None]:
# Z-score the clipped example series: centre on its mean, scale by its standard deviation.
example_mean = temp.mean()
example_std = temp.std()
temp = (temp - example_mean) / example_std

In [None]:
# 20-bin histogram of the example series after z-scoring.
temp.plot.hist(bins=20)
plt.show()

1. Winsorize

In [None]:
# Winsorize each variable in `variables_to_normalize` on a copy of the imputed data:
# values are clipped to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
winsorized_restricted_feature_df = imputed_missing_df.copy()
for variable in tqdm(variables_to_normalize):
    variable_rows = winsorized_restricted_feature_df.sample_label == variable
    temp = winsorized_restricted_feature_df.loc[variable_rows, 'value'].copy()

    first_quartile = temp.quantile(0.25)
    third_quartile = temp.quantile(0.75)
    # skip variables with insufficient range (FiO2, GCS)
    if third_quartile == first_quartile:
        continue

    lower_limit = first_quartile - 1.5 * (third_quartile - first_quartile)
    upper_limit = third_quartile + 1.5 * (third_quartile - first_quartile)
    temp = temp.clip(lower=lower_limit, upper=upper_limit)
    winsorized_restricted_feature_df.loc[variable_rows, 'value'] = temp

2. Normalize

In [None]:
# Scale to a mean of 0 with an SD of 1
#
# Works on a copy of the winsorized data. For every variable the mean and standard
# deviation are recorded in `normalisation_parameters_df` (one row per variable, in the
# order of `variables_to_normalize`) so that the transformation can be inverted later.
#
# The parameter rows are collected in a list and turned into a dataframe once at the end:
# `DataFrame.append` no longer exists in pandas >= 2.0, and growing a frame row by row is
# quadratic anyway. The resulting frame has a regular 0..n-1 index instead of all zeros.
normalized_winsorized_restricted_feature_df = winsorized_restricted_feature_df.copy()
normalisation_parameters_columns = ['variable', 'original_mean', 'original_std']
normalisation_parameter_rows = []
for variable in tqdm(variables_to_normalize):
    variable_rows = normalized_winsorized_restricted_feature_df.sample_label == variable
    temp = normalized_winsorized_restricted_feature_df.loc[variable_rows, 'value'].copy()

    original_mean = temp.mean()
    original_std = temp.std()
    normalisation_parameter_rows.append([variable, original_mean, original_std])

    # NOTE(review): a variable with a standard deviation of 0 turns into NaN here (0 / 0).
    temp = (temp - original_mean) / original_std
    normalized_winsorized_restricted_feature_df.loc[variable_rows, 'value'] = temp

normalisation_parameters_df = pd.DataFrame(normalisation_parameter_rows,
                                           columns=normalisation_parameters_columns)

In [None]:
from matplotlib.pyplot import cm
import numpy as np

# For every variable to normalize, draw three histograms side by side (shared y axis):
# original values, values after winsorization, values after normalisation.
# Each sample label gets its own colour from the hsv colormap, indexed by the label's
# position in the list of unique labels.
all_sample_labels = imputed_missing_df.sample_label.unique().tolist()
colors = cm.hsv(np.linspace(0, 1, len(all_sample_labels)))

for variable in variables_to_normalize:
    if variable not in all_sample_labels:
        print(f'{variable} is not present in Dataframe')
        continue

    fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
    color = colors[all_sample_labels.index(variable)]

    # (dataframe, panel title) for each processing stage, left to right
    stages = (
        (imputed_missing_df, f'Distribution of {variable}'),
        (winsorized_restricted_feature_df, f'Distribution of {variable} after Winsorization'),
        (normalized_winsorized_restricted_feature_df, f'Distribution of {variable} after Normalization'),
    )
    for ax, (stage_df, panel_title) in zip(axes, stages):
        sns.histplot(stage_df[stage_df.sample_label == variable].value, bins=50, ax=ax, color=color)
        ax.set_xlabel(variable)
        ax.set_title(panel_title)

    plt.show()

In [None]:
# Display the stored per-variable mean and standard deviation.
normalisation_parameters_df

### testing the inverse transformation

In [None]:
# Invert the z-scoring on a copy of the normalised data, using the mean / standard
# deviation stored per variable in `normalisation_parameters_df`.
reverse_normalized_winsorized_restricted_feature_df = normalized_winsorized_restricted_feature_df.copy()
for variable in tqdm(normalisation_parameters_df.variable.unique()):
    variable_rows = reverse_normalized_winsorized_restricted_feature_df.sample_label == variable
    if not variable_rows.any():
        print(f'{variable} is not present in Dataframe')
        continue

    variable_parameters = normalisation_parameters_df[normalisation_parameters_df.variable == variable]
    std = variable_parameters.original_std.iloc[0]
    mean = variable_parameters.original_mean.iloc[0]

    temp = reverse_normalized_winsorized_restricted_feature_df.loc[variable_rows, 'value'].copy()
    temp = (temp * std) + mean
    reverse_normalized_winsorized_restricted_feature_df.loc[variable_rows, 'value'] = temp

In [None]:
# Preview of the normalised data, for comparison with the reversed data in the next cell.
normalized_winsorized_restricted_feature_df.head()

In [None]:
# Preview of the data after undoing the normalisation.
reverse_normalized_winsorized_restricted_feature_df.head()

In [None]:
# First reversed value that is not exactly equal (elementwise `!=`) to its winsorized counterpart.
# NOTE(review): `.iloc[0]` raises IndexError if the two columns match exactly everywhere.
reverse_normalized_winsorized_restricted_feature_df[reverse_normalized_winsorized_restricted_feature_df.value != winsorized_restricted_feature_df.value].value.iloc[0]

In [None]:
# The winsorized value at that same first mismatching row, to compare against the reversed value.
# NOTE(review): `.iloc[0]` raises IndexError if the two columns match exactly everywhere.
winsorized_restricted_feature_df[reverse_normalized_winsorized_restricted_feature_df.value != winsorized_restricted_feature_df.value].value.iloc[0]


Testing whether the reverse operation recovers the state before normalisation (approximate comparison with relative and absolute tolerances of 1e-5)

In [None]:
# True when every reversed value is within rtol=1e-05 / atol=1e-05 of the winsorized value.
# `np` is available here only because an earlier plotting cell imported numpy.
(np.isclose(reverse_normalized_winsorized_restricted_feature_df.value, winsorized_restricted_feature_df.value, rtol=1e-05, atol=1e-05)).all()

## Testing final implementation

In [None]:
# Replaces the empty log_dir used so far; this value is passed to normalise_data in the next cell.
# NOTE(review): absolute path on one user's machine — adjust before running elsewhere.
log_dir = '/Users/jk1/temp/opsum_prepro_output/temp_output'

In [None]:
from preprocessing.normalisation.normalisation import normalise_data

# Run the packaged normalisation on the same imputed input used by the manual steps above.
# NOTE(review): presumably normalise_data implements the same winsorize + z-score procedure — confirm.
test_df = normalise_data(imputed_missing_df, verbose=True, log_dir=log_dir)

In [None]:
from matplotlib.pyplot import cm
import numpy as np

# For every variable to normalize, draw two histograms side by side (shared y axis):
# original values and the output of normalise_data (`test_df`).
# Each sample label gets its own colour from the hsv colormap, indexed by the label's
# position in the list of unique labels.
all_sample_labels = imputed_missing_df.sample_label.unique().tolist()
colors = cm.hsv(np.linspace(0, 1, len(all_sample_labels)))

for variable in variables_to_normalize:
    if variable not in all_sample_labels:
        print(f'{variable} is not present in Dataframe')
        continue

    fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
    color = colors[all_sample_labels.index(variable)]

    # (dataframe, panel title) for each panel, left to right
    stages = (
        (imputed_missing_df, f'Distribution of {variable}'),
        (test_df, f'Distribution of {variable} after Normalization'),
    )
    for ax, (stage_df, panel_title) in zip(axes, stages):
        sns.histplot(stage_df[stage_df.sample_label == variable].value, bins=50, ax=ax, color=color)
        ax.set_xlabel(variable)
        ax.set_title(panel_title)

    plt.show()

In [None]:
# For each variable present in `test_df`, print two lines: first the mean, then the
# standard deviation of its normalised values.
normalised_labels = test_df.sample_label.unique()
for variable in variables_to_normalize:
    if variable not in normalised_labels:
        print(f'{variable} is not present in Dataframe')
        continue

    normalised_values = test_df[test_df.sample_label == variable].value
    print(f'{variable}: {normalised_values.mean()}')
    print(f'{variable}: {normalised_values.std()}')