In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt


In [None]:
# Input / output locations used throughout the notebook.
# Each value can be overridden with an environment variable so the notebook can run on
# another machine without editing this cell; the defaults are the original local paths.
stroke_registry_data_path = os.environ.get(
    'STROKE_REGISTRY_DATA_PATH',
    '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx')
output_path = os.environ.get('OPSUM_OUTPUT_PATH', '/Users/jk1/temp/opsum_output')
patient_selection = os.environ.get(
    'OPSUM_PATIENT_SELECTION_PATH',
    '/Users/jk1/temp/opsum_extraction_output/high_frequency_data_patient_selection_with_details.csv')

In [None]:
# Candidate admission-time columns of the stroke registry; they are selected by name from
# the registry dataframe further down, so every entry must match a registry column label exactly.
all_admission_data_columns = [
"Non-Swiss",
"Arrival time",
"Age (calc.)",
"Sex",
"Ethnicity",
"Other ethnicity",
"Onset date",
"Onset time",
"Wake-up date",
"Wake-up time",
"Referral",
"Patient referred to",
"Transport",
"Prestroke living situation",
"Prestroke disability (Rankin)",
"Stroke syndrom (Bamford)",
"NIH on admission",
"GCS on admission",
"1st syst. bp",
"1st diast. bp",
"Height",
"Weight",
"BMI",
"Aspirin pre-stroke",
"Clopidogrel pre-stroke",
"Prasugrel pre-stroke",
"Ticagrelor pre-stroke",
"Dipyridamole pre-stroke",
"Vit. K ag pre-stroke",
"Vit. K ag INR",
"Rivaroxaban pre-stroke",
"Dabigatran pre-stroke",
"Apixaban pre-stroke",
"Edoxaban pre-stroke",
"Parenteral ac pre-stroke",
"Antihypert. drugs pre-stroke",
"Lipid lowering drugs pre-stroke",
"Hormone repl. or contracept.",
"Antiplatelet drugs",
"Anticoagulants",
"Initial hospitalization",
"MedHist Stroke",
"MedHist TIA",
"MedHist ICH",
"MedHist Hypertension",
"MedHist Diabetes",
"MedHist Hyperlipidemia",
"MedHist Smoking",
"MedHist Atrial Fibr.",
"MedHist CHD",
"MedHist Prost. heart valves",
"MedHist Low ejection fract.",
"MedHist PAD",
"Average sleep",
"Last night sleep",
"Snoring",
"Daytime sleepiness",
"1st glucose",
"1st cholesterol total",
"1st cholesterol LDL",
"1st creatinine",
]

In [None]:
# Load the patient selection; dtype=str reads every column as text.
# NOTE(review): presumably so that patient_id / EDS_last_4_digits keep their exact formatting — confirm.
patient_selection_df = pd.read_csv(patient_selection, dtype=str)

In [None]:
# Load the stroke registry and derive the identifiers used to match admissions.
full_data_df = pd.read_excel(stroke_registry_data_path)

# 'Case ID' is sliced positionally: characters 8..-4 are used as the patient id,
# the last 4 characters as the EDS suffix.
case_ids = full_data_df['Case ID']
full_data_df['patient_id'] = case_ids.apply(lambda case_id: case_id[8:-4])
full_data_df['EDS_last_4_digits'] = case_ids.apply(lambda case_id: case_id[-4:])

# case_admission_id = <patient_id><EDS_last_4_digits>_<arrival date formatted as ddmmYYYY>
registry_arrival = pd.to_datetime(full_data_df['Arrival at hospital'], format='%Y%m%d')
registry_arrival_label = registry_arrival.dt.strftime('%d%m%Y').astype(str)
full_data_df['case_admission_id'] = (
    full_data_df['patient_id'].astype(str)
    + full_data_df['EDS_last_4_digits'].astype(str)
    + '_' + registry_arrival_label
)

In [None]:
# Build the case_admission_id key for the patient selection with the same recipe as for
# the registry: <patient_id><EDS_last_4_digits>_<arrival date formatted as ddmmYYYY>.
selection_arrival = pd.to_datetime(patient_selection_df['Arrival at hospital'], format='%Y%m%d')
patient_selection_df['case_admission_id'] = (
    patient_selection_df['patient_id'].astype(str)
    + patient_selection_df['EDS_last_4_digits'].astype(str)
    + '_' + selection_arrival.dt.strftime('%d%m%Y').astype(str)
)

# Keep only the registry rows whose admission is part of the patient selection.
selected_case_admission_ids = patient_selection_df['case_admission_id'].tolist()
is_selected_admission = full_data_df['case_admission_id'].isin(selected_case_admission_ids)
selected_full_data_df = full_data_df[is_selected_admission]


In [None]:
# Preview the first registry rows retained by the patient selection.
selected_full_data_df.head()

In [None]:
# Keep the candidate admission columns plus the case_admission_id key.
all_admission_data_df = selected_full_data_df[all_admission_data_columns+['case_admission_id']]

In [None]:
# Number of missing (NaN) values in each column of all_admission_data_df
all_admission_data_df.isna().sum()

In [None]:
# Horizontal bar chart of the share (in %) of non-missing values per column.
n_admissions = all_admission_data_df.shape[0]
n_missing_per_column = all_admission_data_df.isna().sum()
percent_present = (n_admissions - n_missing_per_column) / n_admissions * 100
percent_present.plot.barh(figsize=(10,15), title='Percentage of present values in each column')
plt.show()

In [None]:
# Subset of all_admission_data_columns kept for further processing.
# Entries are selected by name, so they must match the dataframe column labels exactly.
selected_admission_data_columns = [
"Age (calc.)",
"Sex",
"Referral",
"Prestroke disability (Rankin)",
"NIH on admission",
"1st syst. bp",
"1st diast. bp",
"Weight",
"Antihypert. drugs pre-stroke",
"Lipid lowering drugs pre-stroke",
"Hormone repl. or contracept.",
"Antiplatelet drugs",
"Anticoagulants",
"MedHist Stroke",
"MedHist TIA",
"MedHist ICH",
"MedHist Hypertension",
"MedHist Diabetes",
"MedHist Hyperlipidemia",
"MedHist Smoking",
"MedHist Atrial Fibr.",
"MedHist CHD",
"MedHist Prost. heart valves",
"MedHist PAD",
"1st glucose",
"1st cholesterol total",
"1st cholesterol LDL",
"1st creatinine",
]

In [None]:
# Keep the selected admission columns plus the case_admission_id key.
# An explicit copy is taken because this dataframe is modified in later cells (columns
# dropped/renamed, values overwritten); without it pandas may treat it as a slice of
# all_admission_data_df and emit SettingWithCopyWarning on those modifications.
selected_admission_data_df = all_admission_data_df[selected_admission_data_columns+['case_admission_id']].copy()

In [None]:
# Horizontal bar chart of the share (in %) of non-missing values per selected column.
n_selected_rows = selected_admission_data_df.shape[0]
n_missing_selected = selected_admission_data_df.isna().sum()
percent_present_selected = (n_selected_rows - n_missing_selected) / n_selected_rows * 100
percent_present_selected.plot.barh(figsize=(10,10), title='Percentage of present values in each column')
plt.show()

In [None]:
# Columns to drop because of insufficient data or irrelevance
# (both are entries of selected_admission_data_columns).
admission_data_to_drop = [
    'MedHist Prost. heart valves',
    'Hormone repl. or contracept.'
]

In [None]:
# Remove the columns listed in admission_data_to_drop.
# The result is reassigned instead of dropping in place, and errors='ignore' skips columns
# that are already gone, so re-running this cell does not raise a KeyError.
selected_admission_data_df = selected_admission_data_df.drop(columns=admission_data_to_drop, errors='ignore')

# Restricting to plausible ranges

In [None]:
# Table of plausible ranges per variable (read below through its 'variable_label', 'Min' and 'Max' columns).
# os.path.abspath('') resolves to the current working directory; the two dirname() calls
# go two levels up from it before descending into 'preprocessing/'.
# NOTE(review): this depends on the directory the kernel was started from — presumably the
# notebook's own folder; confirm.
possible_value_ranges_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(''))),
                                          'preprocessing', 'possible_ranges_for_variables.xlsx')
possible_value_ranges = pd.read_excel(possible_value_ranges_file)

In [None]:
def restrict_variable_to_possible_ranges(df, variable_name, possible_value_ranges, verbose=False):
    """
    Restricts a variable to the possible ranges in the possible_value_ranges dataframe.

    Values strictly below 'Min' or strictly above 'Max' are replaced by NaN. Values that
    are already missing in `df` are left untouched and are neither counted nor returned
    as excluded observations.

    :param df: dataframe containing a column named `variable_name`; it is not modified
    :param variable_name: column of `df` to restrict; must also appear in the
        'variable_label' column of `possible_value_ranges`
    :param possible_value_ranges: dataframe with columns 'variable_label', 'Min' and 'Max';
        the first row matching `variable_name` is used
    :param verbose: if True, print the number of out-of-range observations
    :return: tuple (clean_df, excluded_df) where clean_df is a copy of `df` with
        out-of-range values set to NaN, and excluded_df holds the original rows of `df`
        whose value was out of range
    :raises IndexError: if `variable_name` has no entry in `possible_value_ranges`
    """
    variable_range = possible_value_ranges[possible_value_ranges['variable_label'] == variable_name]
    variable_range = variable_range.iloc[0]

    # Comparisons against NaN evaluate to False, so already-missing values never enter the mask.
    values = df[variable_name]
    out_of_range = (values < variable_range['Min']) | (values > variable_range['Max'])

    clean_df = df.copy()
    # set score to np.nan if outside of range
    clean_df.loc[out_of_range, variable_name] = np.nan

    if verbose:
        print(f'Excluding {out_of_range.sum()} observations because out of range')
    excluded_df = df[out_of_range]
    return clean_df, excluded_df

In [None]:
# Print the value distribution of every categorical (object dtype) admission variable.
# selected_admission_data_columns still lists columns that an earlier cell removes from the
# dataframe (admission_data_to_drop), so columns that are no longer present are skipped
# instead of failing with a KeyError.
for admission_column in selected_admission_data_columns:
    if admission_column not in selected_admission_data_df.columns:
        continue
    if (selected_admission_data_df[admission_column].dtype == 'object'):
        print(admission_column)
        print(selected_admission_data_df[admission_column].value_counts())
        print('\n')

In [None]:
# Print summary statistics of every non-categorical (non-object dtype) admission variable.
# selected_admission_data_columns still lists columns that an earlier cell removes from the
# dataframe (admission_data_to_drop), so columns that are no longer present are skipped
# instead of failing with a KeyError.
for admission_column in selected_admission_data_columns:
    if admission_column not in selected_admission_data_df.columns:
        continue
    if (selected_admission_data_df[admission_column].dtype != 'object'):
        print(admission_column)
        print(selected_admission_data_df[admission_column].describe())
        print('\n')

In [None]:
# Weight: rename to the label looked up in possible_value_ranges, then set implausible values to NaN.
selected_admission_data_df = selected_admission_data_df.rename(columns={'Weight': 'weight'})
selected_admission_data_df, excluded_weight_df = restrict_variable_to_possible_ranges(
    selected_admission_data_df, 'weight', possible_value_ranges, verbose=True)

In [None]:
# Age: rename to the label looked up in possible_value_ranges, then set implausible values to NaN.
selected_admission_data_df = selected_admission_data_df.rename(columns={'Age (calc.)': 'age'})
selected_admission_data_df, excluded_age_df = restrict_variable_to_possible_ranges(
    selected_admission_data_df, 'age', possible_value_ranges, verbose=True)

In [None]:
# Systolic blood pressure: temporarily named 'sys' for the range lookup, then given its final name.
selected_admission_data_df = selected_admission_data_df.rename(columns={'1st syst. bp': 'sys'})
selected_admission_data_df, excluded_sys_df = restrict_variable_to_possible_ranges(
    selected_admission_data_df, 'sys', possible_value_ranges, verbose=True)
selected_admission_data_df = selected_admission_data_df.rename(columns={'sys': 'systolic_blood_pressure'})

In [None]:
# Diastolic blood pressure: temporarily named 'dia' for the range lookup, then given its final name.
selected_admission_data_df = selected_admission_data_df.rename(columns={'1st diast. bp': 'dia'})
selected_admission_data_df, excluded_dia_df = restrict_variable_to_possible_ranges(
    selected_admission_data_df, 'dia', possible_value_ranges, verbose=True)
selected_admission_data_df = selected_admission_data_df.rename(columns={'dia': 'diastolic_blood_pressure'})

In [None]:
# Glucose: rename to the label looked up in possible_value_ranges, then set implausible values to NaN.
selected_admission_data_df = selected_admission_data_df.rename(columns={'1st glucose': 'glucose'})
selected_admission_data_df, excluded_glucose_df = restrict_variable_to_possible_ranges(
    selected_admission_data_df, 'glucose', possible_value_ranges, verbose=True)

In [None]:
# Creatinine: rename to the label looked up in possible_value_ranges, then set implausible values to NaN.
selected_admission_data_df = selected_admission_data_df.rename(columns={'1st creatinine': 'creatinine'})
selected_admission_data_df, excluded_creatinine_df = restrict_variable_to_possible_ranges(
    selected_admission_data_df, 'creatinine', possible_value_ranges, verbose=True)

Restrict to fewer categories
- Group MedHist Stroke/TIA/ICH
- Group Referral subgroups


In [None]:
# A patient has a previous cerebrovascular event when at least one of the
# stroke / TIA / ICH history columns equals 'yes'.
cerebrovascular_history_columns = ['MedHist Stroke', 'MedHist TIA', 'MedHist ICH']
has_cerebrovascular_history = selected_admission_data_df[cerebrovascular_history_columns].eq('yes').any(axis=1)
selected_admission_data_df['MedHist cerebrovascular_event'] = has_cerebrovascular_history
# Show the new flag next to its three source columns as a sanity check.
selected_admission_data_df[['MedHist cerebrovascular_event'] + cerebrovascular_history_columns].head()

In [None]:
# The three source columns are now summarised by 'MedHist cerebrovascular_event'; remove them.
selected_admission_data_df = selected_admission_data_df.drop(
    ['MedHist Stroke', 'MedHist TIA', 'MedHist ICH'], axis=1)

In [None]:
# Merge referral subgroups into broader categories (original label -> grouped label).
referral_regrouping = {
    'Other Stroke Unit or Stroke Center': 'Other hospital',
    'General Practitioner': 'Self referral',
}
for original_label, grouped_label in referral_regrouping.items():
    has_original_label = selected_admission_data_df['Referral'] == original_label
    selected_admission_data_df.loc[has_original_label, 'Referral'] = grouped_label

In [None]:
# Check the resulting distribution of referral categories after regrouping.
selected_admission_data_df['Referral'].value_counts()

In [None]:
# Horizontal bar chart of the share (in %) of non-missing values per column after cleaning.
n_cleaned_rows = selected_admission_data_df.shape[0]
n_missing_cleaned = selected_admission_data_df.isna().sum()
percent_present_cleaned = (n_cleaned_rows - n_missing_cleaned) / n_cleaned_rows * 100
percent_present_cleaned.plot.barh(figsize=(10,10), title='Percentage of present values in each column')
plt.show()

In [None]:
# Per column: number of distinct values (as returned by unique(), NaN included) and number of missing entries.
for column in selected_admission_data_df.columns:
    column_values = selected_admission_data_df[column]
    n_distinct = len(column_values.unique())
    n_nan = column_values.isna().sum()
    print(f'{column}: {n_distinct} of which nan {n_nan}')

### Dealing with missing values
- for variables with DPI overlap -> leave NaN for now (should be dealt with after fusion)
- for age -> check eds database (does not work)
- for variables with no DPI overlap -> fill with median (continuous variables) or most frequent value (all other variables)

In [None]:
# Location of the EDS extraction; can be overridden with the EDS_DATA_PATH environment
# variable so the notebook can run on another machine. The default is the original local path.
eds_df_path = os.environ.get(
    'EDS_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20211110/eds_j1.csv')

In [None]:
# Load the EDS extraction: semicolon-separated file, every column read as text (dtype=str).
eds_df = pd.read_csv(eds_df_path, sep=';', dtype=str)

In [None]:
# Preview the first rows of the EDS extraction.
eds_df.head()

In [None]:
# Project-local helper, imported here rather than in the import cell at the top of the notebook.
from preprocessing.utils import create_ehr_case_identification_column

# Add a case_admission_id column to the EDS data so it can be compared with the registry rows.
# NOTE(review): presumably the helper produces the same '<patient_id><EDS_last_4_digits>_<ddmmYYYY>'
# format as the registry key built above — its implementation is not visible here; confirm.
eds_df['case_admission_id'] = create_ehr_case_identification_column(eds_df)

In [None]:
# Age in years at 'begin_date', computed from 'DOB' as whole elapsed days / 365.25.
eds_begin_date = pd.to_datetime(eds_df['begin_date'], format='%d.%m.%Y %H:%M')
eds_date_of_birth = pd.to_datetime(eds_df['DOB'], format='%d.%m.%Y %H:%M')
eds_df['age'] = (eds_begin_date - eds_date_of_birth).dt.days / 365.25
eds_df[['age', 'DOB', 'begin_date']].head()

In [None]:
# For every admission without any age in the registry data, print the id and the
# age-related EDS rows found for the same case_admission_id.
for case_admission_id in selected_admission_data_df['case_admission_id'].unique():
    registry_rows = selected_admission_data_df[selected_admission_data_df['case_admission_id'] == case_admission_id]
    if not registry_rows['age'].isna().all():
        continue
    matching_eds_rows = eds_df[eds_df['case_admission_id'] == case_admission_id]
    print(case_admission_id)
    print(matching_eds_rows[['age', 'DOB', 'begin_date']])

Impression: fetching age from eds database does not work

In [None]:
# Columns excluded from imputation below: the admission key and the variables listed as overlapping with DPI.
variables_with_dpi_overlap = [
    'case_admission_id',
    'systolic_blood_pressure',
    'diastolic_blood_pressure',
    'glucose',
    'creatinine',
    'NIH on admission',
    'weight',
]
print('Variables without DPI overlap')
admission_variable_names = set(selected_admission_data_df.columns)
dpi_overlap_names = set(variables_with_dpi_overlap)
print(admission_variable_names - dpi_overlap_names)

In [None]:
# Variables imputed with the median in the next cell; all others are imputed with the most frequent value.
continuous_variables = ['age']

In [None]:
# Impute missing values of the variables without DPI overlap:
# median for continuous variables, most frequent value (mode) for all others.
# The filled column is assigned back to the dataframe: calling fillna(inplace=True) on
# `df[col]` is chained assignment on an intermediate object and does not update the
# dataframe under pandas Copy-on-Write.
for variable in selected_admission_data_df.columns:
    if variable in variables_with_dpi_overlap:
        continue
    if variable in continuous_variables:
        fill_value = selected_admission_data_df[variable].median(skipna=True)
    else:
        observed_modes = selected_admission_data_df[variable].mode(dropna=True)
        if observed_modes.empty:
            # The column holds no observed value at all, so there is nothing to impute from.
            continue
        fill_value = observed_modes.iloc[0]
    selected_admission_data_df[variable] = selected_admission_data_df[variable].fillna(fill_value)

In [None]:
# After imputation: per column, number of distinct values (as returned by unique()) and remaining missing entries.
for column in selected_admission_data_df.columns:
    imputed_values = selected_admission_data_df[column]
    n_distinct_after = len(imputed_values.unique())
    n_nan_after = imputed_values.isna().sum()
    print(f'{column}: {n_distinct_after} of which nan {n_nan_after}')

In [None]:
# Reshape to long format, keeping case_admission_id as identifier and unpivoting every other column.
# The result is only displayed here; it is not assigned to a variable.
pd.melt(selected_admission_data_df, id_vars=['case_admission_id'])