In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Input locations handed to assemble_variable_database in the next cell.
# NOTE(review): these are absolute paths on a single user's machine; they must be
# adjusted before the notebook can run anywhere else.
# First argument: an extraction directory.
data_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20211110'
# Second argument: an .xlsx stroke registry file.
admission_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'
# Third argument: a .csv patient selection file.
patient_selection_path = '/Users/jk1/temp/opsum_extration_output/high_frequency_data_patient_selection.csv'

In [None]:
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# Build the feature table from the three input paths defined above.
# Later cells read its `sample_label` column to size the plotting colour palette.
feature_df = assemble_variable_database(data_path, admission_data_path, patient_selection_path)

In [None]:
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps

# Derive the frame that every later cell works on (it is read through its
# `sample_label` and `value` columns).
# NOTE(review): judging by the function and keyword names, this presumably converts
# timestamps to relative ones, keeps the original columns and restricts rows to a
# time range -- confirm against the helper's implementation.
restricted_feature_df = transform_to_relative_timestamps(feature_df, drop_old_columns=False, restrict_to_time_range=True)

In [None]:
# Preview the first rows of the restricted feature table.
restricted_feature_df.head()

In [None]:
# Collect every sample label whose `value` column takes more than two distinct
# non-null values, i.e. every label that is neither binary nor constant.
# A single grouped pass replaces one full-frame boolean scan per label;
# sort=False keeps the labels in order of first appearance, as .unique() did.
distinct_value_counts = restricted_feature_df.groupby('sample_label', sort=False)['value'].nunique()
non_binary_sample_labels = distinct_value_counts[distinct_value_counts > 2].index.tolist()

# Keep only the rows that belong to one of the non-binary labels.
restricted_non_binary_feature_df = restricted_feature_df[restricted_feature_df.sample_label.isin(non_binary_sample_labels)]

In [None]:
# Print each non-binary label quoted and followed by a comma, one per line,
# in the same format as the entries of a Python list literal.
for non_binary_label in restricted_non_binary_feature_df['sample_label'].unique():
    print(f"'{non_binary_label}',")

In [None]:
# Sample labels that the winsorization and standardization loops further down
# iterate over. Any label of restricted_feature_df that is not listed here is
# reported as "not normalized" by the following cell.
variables_to_normalize = [
'proBNP',
'bilirubine totale',
'thrombocytes',
'creatinine',
'calcium corrige',
'hemoglobine',
'INR',
'potassium',
'glycemie moyenne estimee',
'hematocrite',
'uree',
'erythrocytes',
'glucose',
'leucocytes',
'hemoglobine glyquee',
'sodium',
'proteine C-reactive',
'ALAT',
'FIO2',
'oxygen_saturation',
'systolic_blood_pressure',
'diastolic_blood_pressure',
'mean_blood_pressure',
'heart_rate',
'respiratory_rate',
'temperature',
'weight',
'age',
'NIHSS',
'triglycerides',
'ASAT',
'cholesterol HDL',
'Glasgow Coma Scale',
'fibrinogene',
'PTT',
'cholesterol total',
'LDL cholesterol calcule',
]

## Following variables are not normalized

In [None]:
# Report every sample label that is absent from `variables_to_normalize`
# and keep those labels for later reference.
print('Following variables are not normalized:')
not_normalized_variables = [
    label
    for label in restricted_feature_df['sample_label'].unique()
    if label not in variables_to_normalize
]
for skipped_label in not_normalized_variables:
    print(f"'{skipped_label}',")

## Normalize variables

For continuous variables:
  - Winsorize: values outside the range from Q1 − 1.5 × IQR to Q3 + 1.5 × IQR are set to the nearest limit of that range
  - Scale to a mean of 0 with an SD of 1

In [None]:
# Histogram of the raw heart-rate values, before any winsorization.
heart_rate_values = restricted_feature_df.loc[restricted_feature_df['sample_label'] == 'heart_rate', 'value']
heart_rate_values.plot.hist(bins=50)
plt.show()

In [None]:
# Worked example on heart rate: derive the quartiles once, print the IQR and both
# clipping limits, then clip the values to those limits.
heart_rate_rows = restricted_feature_df['sample_label'] == 'heart_rate'
temp = restricted_feature_df.loc[heart_rate_rows, 'value'].copy()

first_quartile = temp.quantile(0.25)
third_quartile = temp.quantile(0.75)
interquartile_range = third_quartile - first_quartile
upper_limit = third_quartile + 1.5*interquartile_range
lower_limit = first_quartile - 1.5*interquartile_range

print(interquartile_range)
print(upper_limit)
print(lower_limit)
temp = temp.clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Histogram of `temp` as left by the preceding cell (the clipped heart-rate values).
temp.plot.hist(bins=20)
plt.show()

In [None]:
# Standardize `temp`: subtract its mean, then divide by its standard deviation.
# NOTE: this rebinds `temp`, so re-running the cell standardizes the already-standardized values.
temp = (temp - temp.mean()) / temp.std()

In [None]:
# Histogram of `temp` after the standardization in the preceding cell.
temp.plot.hist(bins=20)
plt.show()

1. Winsorize

In [None]:
# Winsorize each selected variable on a copy, leaving restricted_feature_df untouched:
# values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to the nearest limit.
winsorized_restricted_feature_df = restricted_feature_df.copy()
for variable in variables_to_normalize:
    # The row mask and both quartiles are computed once per variable
    # instead of being re-derived for every use.
    variable_rows = winsorized_restricted_feature_df.sample_label == variable
    variable_values = winsorized_restricted_feature_df.loc[variable_rows, 'value']
    first_quartile = variable_values.quantile(0.25)
    third_quartile = variable_values.quantile(0.75)
    interquartile_range = third_quartile - first_quartile
    winsorized_restricted_feature_df.loc[variable_rows, 'value'] = variable_values.clip(
        lower=first_quartile - 1.5*interquartile_range,
        upper=third_quartile + 1.5*interquartile_range,
    )

2. Normalize

In [None]:
# Standardize each winsorized variable on a fresh copy: subtract the variable's mean
# and divide by its standard deviation, writing the result back into `value`.
normalized_winsorized_restricted_feature_df = winsorized_restricted_feature_df.copy()
for variable in variables_to_normalize:
    variable_rows = normalized_winsorized_restricted_feature_df['sample_label'] == variable
    winsorized_values = normalized_winsorized_restricted_feature_df.loc[variable_rows, 'value']
    centered_values = winsorized_values - winsorized_values.mean()
    # NOTE(review): a variable that is constant after winsorization has a standard
    # deviation of 0, so this division turns all of its values into NaN.
    normalized_winsorized_restricted_feature_df.loc[variable_rows, 'value'] = centered_values / winsorized_values.std()

In [None]:
from matplotlib.pyplot import cm
import numpy as np

# One colour per sample label. The palette size and the label -> colour lookup are
# both derived from restricted_feature_df, so an index can never fall outside the
# palette, and the label list is built once instead of on every loop iteration.
plotted_labels = restricted_feature_df.sample_label.unique().tolist()
colors = cm.hsv(np.linspace(0, 1, len(plotted_labels)))
color_by_label = dict(zip(plotted_labels, colors))

# (title suffix, source frame) for the three panels, left to right.
processing_stages = [
    ('', restricted_feature_df),
    (' after Winsorization', winsorized_restricted_feature_df),
    (' after Normalization', normalized_winsorized_restricted_feature_df),
]

for variable in variables_to_normalize:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
    # Raises KeyError if `variable` has no rows in restricted_feature_df.
    color = color_by_label[variable]

    # Draw the same histogram for the original, winsorized and normalized values.
    for ax, (title_suffix, stage_df) in zip(axes, processing_stages):
        sns.histplot(stage_df.loc[stage_df.sample_label == variable, 'value'], bins=50, ax=ax, color=color)
        ax.set_xlabel(variable)
        ax.set_title(f'Distribution of {variable}{title_suffix}')

    plt.show()
    # Release the figure explicitly so figures do not accumulate over the loop.
    plt.close(fig)

## Testing final implementation

In [None]:
from preprocessing.normalisation.normalisation import normalise_data

# Run the packaged normalisation on the same input frame used by the manual steps;
# the following cell plots its output next to the original distributions.
test_df = normalise_data(restricted_feature_df, verbose=True)

In [None]:
from matplotlib.pyplot import cm
import numpy as np

# One colour per sample label. The palette size and the label -> colour lookup are
# both derived from restricted_feature_df, so an index can never fall outside the
# palette, and the label list is built once instead of on every loop iteration.
plotted_labels = restricted_feature_df.sample_label.unique().tolist()
colors = cm.hsv(np.linspace(0, 1, len(plotted_labels)))
color_by_label = dict(zip(plotted_labels, colors))

# (title suffix, source frame) for the two panels, left to right.
comparison_stages = [
    ('', restricted_feature_df),
    (' after Normalization', test_df),
]

for variable in variables_to_normalize:
    fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
    # Raises KeyError if `variable` has no rows in restricted_feature_df.
    color = color_by_label[variable]

    # Draw the original distribution next to the output of normalise_data.
    for ax, (title_suffix, stage_df) in zip(axes, comparison_stages):
        sns.histplot(stage_df.loc[stage_df.sample_label == variable, 'value'], bins=50, ax=ax, color=color)
        ax.set_xlabel(variable)
        ax.set_title(f'Distribution of {variable}{title_suffix}')

    plt.show()
    # Release the figure explicitly so figures do not accumulate over the loop.
    plt.close(fig)