# Handling missing data

> Missing values, including absent datapoints due to up-sampling, were imputed by last observation carried forward (LOCF). Population medians in the datasets were used for missing values occurring before the first actual measurement.

Imputation of missing values should be done before normalisation (but after dummy encoding of the categorical variables).

In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
# Toggle for the progress/status messages printed by the imputation cells below
verbose = True

In [None]:
# Machine-specific absolute input paths; all three are passed to assemble_variable_database below
data_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815'
admission_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'
patient_selection_path = '/Users/jk1/temp/opsum_extraction_output/high_frequency_data_patient_selection_with_details.csv'

In [None]:
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# Build the feature dataframe from the three input paths configured above
feature_df = assemble_variable_database(data_path, admission_data_path, patient_selection_path)

In [None]:
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps
from preprocessing.encoding_categorical_variables.encode_categorical_variables import encode_categorical_variables

# Step 1: switch to relative timestamps (old columns are kept, data is restricted to the time range)
restricted_feature_df = transform_to_relative_timestamps(feature_df, drop_old_columns=False, restrict_to_time_range=True)
# Step 2: encode the categorical variables; imputation further below has to run after this encoding
cat_encoded_restricted_feature_df = encode_categorical_variables(restricted_feature_df)

In [None]:
from preprocessing.resample_to_time_bins.resample_to_hourly_features import resample_to_hourly_features

# Resample the encoded features to hourly features; `resampled_df` is the input of every imputation step below
resampled_df = resample_to_hourly_features(cat_encoded_restricted_feature_df)

In [None]:
resampled_df.head(500000)

In [None]:
# Print every distinct sample_label as a quoted, comma-terminated line
# (a format that can be pasted straight into a list literal)
for label in resampled_df['sample_label'].unique():
    print(f"'{label}',")

In [None]:
# sample_labels that hold (dummy-encoded) categorical variables; every other
# sample_label is handled as a continuous variable in the cells below
categorical_vars = [
    # sex and referral
    'sex_male',
    'referral_in-hospital_event',
    'referral_other_hospital',
    'referral_self_referral_or_gp',
    # prestroke disability (rankin)
    'prestroke_disability_(rankin)_1.0',
    'prestroke_disability_(rankin)_2.0',
    'prestroke_disability_(rankin)_3.0',
    'prestroke_disability_(rankin)_4.0',
    'prestroke_disability_(rankin)_5.0',
    # drugs
    'antihypert._drugs_pre-stroke_yes',
    'lipid_lowering_drugs_pre-stroke_yes',
    'antiplatelet_drugs_yes',
    'anticoagulants_yes',
    # medical history
    'medhist_hypertension_yes',
    'medhist_diabetes_yes',
    'medhist_hyperlipidemia_yes',
    'medhist_smoking_yes',
    'medhist_atrial_fibr._yes',
    'medhist_chd_yes',
    'medhist_pad_yes',
    'medhist_cerebrovascular_event_true',
    # onset to admission time
    'categorical_onset_to_admission_time_541-1440min',
    'categorical_onset_to_admission_time_<270min',
    'categorical_onset_to_admission_time_>1440min',
    'categorical_onset_to_admission_time_intra_hospital',
    'categorical_onset_to_admission_time_onset_unknown',
    'wake_up_stroke_true',
    # ivt / iat timing categories
    'categorical_ivt_91-270min',
    'categorical_ivt_<90min',
    'categorical_ivt_>540min',
    'categorical_ivt_no_ivt',
    'categorical_iat_<270min',
    'categorical_iat_>540min',
    'categorical_iat_no_iat',
]

In [None]:
# Print the sample_labels that are not listed in categorical_vars,
# in the same quoted, comma-terminated format
for label in resampled_df['sample_label'].unique():
    if label in categorical_vars:
        continue
    print(f"'{label}',")

## First hour missing values

In [None]:
len(resampled_df.case_admission_id.unique())

In [None]:
# Non-null entries per column for each sample_label, restricted to the first hourly timebin
(
    resampled_df[resampled_df['relative_sample_date_hourly_cat'] == 0]
    .groupby('sample_label')
    .count()
)

In [None]:
# Continuous variables (sample_labels not in categorical_vars):
# population mean of the first-hour values per sample_label
(
    resampled_df[
        (resampled_df['relative_sample_date_hourly_cat'] == 0)
        & ~resampled_df['sample_label'].isin(categorical_vars)
    ]
    .groupby('sample_label')['value']
    .mean()
)

In [None]:
# Continuous variables (sample_labels not in categorical_vars):
# population median of the first-hour values per sample_label
(
    resampled_df[
        (resampled_df['relative_sample_date_hourly_cat'] == 0)
        & ~resampled_df['sample_label'].isin(categorical_vars)
    ]
    .groupby('sample_label')['value']
    .median()
)

Median seems to be a better imputation method than mean.

In [None]:
# Categorical variables (sample_labels in categorical_vars):
# population mode of the first-hour values per sample_label
# (the first mode is taken when several values are equally frequent)
(
    resampled_df[
        (resampled_df['relative_sample_date_hourly_cat'] == 0)
        & resampled_df['sample_label'].isin(categorical_vars)
    ]
    .groupby('sample_label')['value']
    .apply(lambda values: values.mode()[0])
)

Imputing missing values after categorical encoding seems to be ok, as mutual exclusivity is not violated.

In [None]:
imputed_missing_df = resampled_df.copy()

In [None]:
# Count the case_admission_ids that have no FIO2 sample in the first hourly timebin:
# all ids minus the ids with at least one FIO2 row at timebin 0
n_subj_noFIO2 = len(
    set(imputed_missing_df.case_admission_id.unique())
    - set(
        imputed_missing_df.loc[
            (imputed_missing_df.sample_label == 'FIO2')
            & (imputed_missing_df.relative_sample_date_hourly_cat == 0),
            'case_admission_id',
        ].unique()
    )
)
print(f'{n_subj_noFIO2} subjects with no FiO2 in first hour. Value will be replaced with 21%')

In [None]:
# Handle first missing values (timebin 0)
# -> fill with the population median (numerical vars) or mode (categorical vars);
#    FIO2 is filled with 21% instead of a population statistic
if verbose:
    print('Fill first missing values via population median/mode.')

# Loop invariants: imputed rows only reuse existing case_admission_ids and every iteration only
# concerns its own sample_label, so both can be computed once on the data before imputation.
all_case_admission_ids = set(imputed_missing_df.case_admission_id.unique())
first_timebin_df = imputed_missing_df[imputed_missing_df.relative_sample_date_hourly_cat == 0]

# Imputed rows are collected and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call).
imputed_tp0_rows = []
for sample_label in tqdm(imputed_missing_df.sample_label.unique()):
    sample_label_tp0_df = first_timebin_df[first_timebin_df.sample_label == sample_label]

    # find case_admission_ids with no value for sample_label in first timebin
    patients_with_no_sample_label_tp0 = all_case_admission_ids.difference(
        sample_label_tp0_df.case_admission_id.unique())

    if sample_label == 'FIO2':
        # for FIO2, impute with 21.0%
        imputed_tp0_value = 21.0
    elif sample_label in categorical_vars:
        # for categorical vars, impute with mode
        imputed_tp0_value = sample_label_tp0_df.value.mode()[0]
    else:
        # for numerical vars, impute with median
        imputed_tp0_value = sample_label_tp0_df.value.median()
    if verbose:
        print(
            f'{len(patients_with_no_sample_label_tp0)} patients with no {sample_label} in first timebin for which {imputed_tp0_value} was imputed')

    if not patients_with_no_sample_label_tp0:
        # nothing to impute for this sample_label
        continue

    # tag imputed rows with the most frequent source of this sample_label
    sample_label_original_source = \
        imputed_missing_df[imputed_missing_df.sample_label == sample_label].source.mode(dropna=True)[0]

    imputed_tp0_rows.append(pd.DataFrame({'case_admission_id': list(patients_with_no_sample_label_tp0),
                                          'sample_label': sample_label,
                                          'relative_sample_date_hourly_cat': 0,
                                          'source': f'{sample_label_original_source}_pop_imputed',
                                          'value': imputed_tp0_value}))

# impute missing values for all sample_labels in first timebin
imputed_missing_df = pd.concat([imputed_missing_df, *imputed_tp0_rows], ignore_index=True)




## Following Missing timebins

> Fill missing timebin values by last observation carried forward

In [None]:
locf_imputed_missing_df = imputed_missing_df.copy()

In [None]:

temp = locf_imputed_missing_df[(locf_imputed_missing_df.case_admission_id == '571703_7379') & (locf_imputed_missing_df.sample_label == 'FIO2')]
temp

In [None]:
temp[(temp.case_admission_id == '571703_7379') & (temp.sample_label == 'FIO2')]

In [None]:
# Preview LOCF on the single series selected above: one row per timebin 0..69, gaps forward-filled
# (.ffill() replaces the deprecated fillna(method='ffill'))
temp.set_index('relative_sample_date_hourly_cat').reindex(range(0, 70)).ffill()

In [None]:
import numpy as np

In [None]:
temp = locf_imputed_missing_df[(locf_imputed_missing_df.case_admission_id.isin(['571703_7379', '100023_4784']))]

In [None]:
temp

In [None]:
# following missing values (timebin > 0)
# -> Fill missing timebin values by last observation carried forward
if verbose:
    print('Fill missing values via LOCF.')

# NOTE(review): this cell runs on `temp` (the two-patient subset selected above),
# not on the full dataframe - confirm this is intended.
# Give every (case_admission_id, sample_label) series one row per hourly timebin 0..71;
# timebins without an observation become rows filled with NaN.
locf_imputed_missing_df = temp.groupby(['case_admission_id', 'sample_label']).apply(
    lambda x: x.set_index('relative_sample_date_hourly_cat').reindex(range(0, 72)))

# The forward fill runs over the whole stacked frame, so it relies on every group having a row in
# timebin 0 (as produced by the population imputation step); otherwise values of the previous
# group would be carried over into the next one.
locf_imputed_missing_df.value = locf_imputed_missing_df.value.ffill()
locf_imputed_missing_df.sample_label = locf_imputed_missing_df.sample_label.ffill()
locf_imputed_missing_df.case_admission_id = locf_imputed_missing_df.case_admission_id.ffill()

# Rows created by the reindex have no (string) source yet: they get the carried-forward source
# plus the '_locf_imputed' suffix, while observed rows keep their source unchanged.
source_suffix = locf_imputed_missing_df.source.apply(
    lambda x: '' if isinstance(x, str) else '_locf_imputed')
locf_imputed_missing_df.source = locf_imputed_missing_df.source.ffill() + source_suffix

# reset relative_sample_date_hourly_cat as column
locf_imputed_missing_df.reset_index(level=2, inplace=True)
# drop groupby index
locf_imputed_missing_df.reset_index(inplace=True, drop=True)

In [None]:
locf_imputed_missing_df.source.unique()

In [None]:
locf_imputed_missing_df

## Testing final function

In [None]:
log_dir = '/Users/jk1/temp/opsum_prepro_output/temp_output'

In [None]:
from preprocessing.handling_missing_values.impute_missing_values import impute_missing_values

# Run the packaged imputation function on the full resampled dataframe
# (presumably the library version of the steps prototyped above - verify against the module);
# log_dir is forwarded to the function.
imputed_resampled_df = impute_missing_values(resampled_df, log_dir=log_dir)

In [None]:
temp = imputed_resampled_df.head(5000)

In [None]:
def assert_selected_variables_presence(df: pd.DataFrame, variable_selection_path: str) -> bool:
    """
    Asserts that all variables from the variable selection file are present in the dataframe.

    A variable counts as present if its name, either verbatim or lower-cased with spaces replaced
    by underscores, is a substring of at least one value of df.sample_label.

    :param df: the dataframe to be checked (needs a sample_label column)
    :param variable_selection_path: the path to the variable selection file (Excel file with an
        'included' column listing the selected variables)
    :return: True if every selected variable is present
    :raises ValueError: if at least one selected variable matches no sample_label
    """
    selected_variables = pd.read_excel(variable_selection_path)['included']
    # the labels do not depend on the variable being checked: compute them once
    sample_labels = df.sample_label.unique()

    # NOTE: an exact set difference is not used here, because a label only has to contain
    # the variable name (substring match) to count as present
    missing_variables = []
    for variable in selected_variables:
        normalised_variable = variable.lower().replace(' ', '_')
        if not any(variable in label or normalised_variable in label for label in sample_labels):
            missing_variables.append(variable)

    if missing_variables:
        raise ValueError(f'The following variables are missing from the dataframe: {missing_variables}')

    return True

In [None]:
desired_time_range = 72

In [None]:
# from preprocessing.variable_assembly.variable_selection import assert_selected_variables_presence

# one entry per (case_admission_id, time bin) check
all_variables_present = []

selected_variables_path = '/Users/jk1/stroke_research/OPSUM/preprocessing/variable_assembly/selected_variables.xlsx'
# selected_variables_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'variable_assembly/selected_variables.xlsx')

# Check every hourly time bin of every patient for the presence of all selected variables.
# NOTE: assert_selected_variables_presence raises ValueError on the first incomplete time bin, so the
# loop only runs to completion when every check passes; it also re-reads the Excel file on each call.
# The loop variables temp_cid_df and time_bin stay defined afterwards and are reused in a later cell.
for cid in tqdm(imputed_resampled_df.case_admission_id.unique()):
    temp_cid_df = imputed_resampled_df[(imputed_resampled_df.case_admission_id == cid)]
    for time_bin in range(desired_time_range):
        all_variables_present.append(assert_selected_variables_presence(temp_cid_df[temp_cid_df.relative_sample_date_hourly_cat == time_bin], selected_variables_path))

In [None]:
all(all_variables_present)

In [None]:
temp_cid_df[temp_cid_df.relative_sample_date_hourly_cat == time_bin]

In [None]:

# Run the presence check on every (case_admission_id, hourly timebin) group of the subset.
# The function expects a dataframe and the variable selection path, so it is applied per group
# instead of being called on the GroupBy object without a path.
temp.groupby(['case_admission_id', 'relative_sample_date_hourly_cat']).apply(
    lambda time_bin_df: assert_selected_variables_presence(time_bin_df, selected_variables_path))