In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np

**Warning:** in the scale files, `patient_id` appears to have been overwritten by `eds_final_patient_id`.

In [None]:
# Directory holding the extraction CSV files. Set the STROKE_UNIT_DATA_PATH
# environment variable to point at another location; when it is unset, the
# original local path is used.
data_path = os.environ.get(
    'STROKE_UNIT_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20211110',
)
# File-name prefix used to pick the scale files out of data_path
scales_file_start = 'scale'

In [None]:
# Read every file in data_path whose name starts with the scales prefix.
# The identifier columns are read as strings rather than inferred numerics.
scales_files = [
    pd.read_csv(
        os.path.join(data_path, file_name),
        sep=';',
        encoding='utf-8',
        dtype={"patient_id": "string", "eds_end_4digit": "string"},
    )
    for file_name in os.listdir(data_path)
    if file_name.startswith(scales_file_start)
]

In [None]:
# Stack the per-file frames into a single table; ignore_index=True discards the
# per-file row labels and renumbers the rows consecutively.
scales_df = pd.concat(scales_files, ignore_index=True)

In [None]:
# Load eds_j1.csv from the same extraction directory, reading all three
# identifier columns as strings.
eds_df = pd.read_csv(
    os.path.join(data_path, 'eds_j1.csv'),
    sep=';',
    encoding='utf-8',
    dtype={
        "patient_id": "string",
        "eds_end_4digit": "string",
        "eds_final_patient_id": "string",
    },
)

In [None]:
# Preview the first rows of the EDS table
eds_df.head()

In [None]:
# scales_df['patient_id'] is matched against eds_df['eds_final_patient_id'] to
# recover the patient_id and eds_end_4digit recorded in eds_df.
# A lookup table keyed by eds_final_patient_id is built once (keeping the first
# row per key, which mirrors a "first match" lookup) instead of re-scanning
# eds_df for every row of scales_df.
eds_lookup = (
    eds_df.dropna(subset=['eds_final_patient_id'])
    .drop_duplicates(subset='eds_final_patient_id', keep='first')
    .set_index('eds_final_patient_id')
)

# Fail loudly when an id cannot be resolved, rather than silently producing
# missing values that would later be turned into strings.
is_unmatched = ~scales_df['patient_id'].isin(eds_lookup.index)
if is_unmatched.any():
    raise ValueError(
        f'{is_unmatched.sum()} rows of scales_df have a patient_id '
        'with no matching eds_final_patient_id in eds_df'
    )

scales_df['original_patient_id'] = scales_df['patient_id'].map(eds_lookup['patient_id'])
scales_df['original_eds_last_4digits'] = scales_df['patient_id'].map(eds_lookup['eds_end_4digit'])


In [None]:
# Build the admission key: original patient id, immediately followed by the
# original 4-digit suffix, then '_' and the part of begin_date before the first
# space with its dots removed.
scales_df['case_admission_id'] = (
    scales_df['original_patient_id'].astype(str)
    + scales_df['original_eds_last_4digits'].astype(str)
    + '_'
    + scales_df['begin_date'].apply(lambda begin: begin.split(' ')[0].replace('.', ''))
)

In [None]:
# Raw columns to discard from scales_df (removed in place)
columns_to_drop = [
    'nr', 'patient_id', 'eds_end_4digit', 'eds_manual', 'DOB',
    'begin_date', 'end_date', 'death_date', 'death_hosp',
    'eds_final_id', 'eds_final_begin', 'eds_final_end',
    'eds_final_patient_id', 'eds_final_birth', 'eds_final_death',
    'eds_final_birth_str', 'date_from', 'date_to',
]
scales_df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Preview scales_df after the column drop
scales_df.head()



In [None]:
# Locate the ranges workbook relative to the current working directory:
# os.path.abspath('') resolves to the working directory, and the two dirname
# calls climb two levels up before descending into 'preprocessing'.
# NOTE(review): this only resolves correctly when the kernel is started from the
# expected folder — confirm the intended working directory.
possible_value_ranges_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(''))),
                                          'preprocessing', 'possible_ranges_for_variables.xlsx')
possible_value_ranges = pd.read_excel(possible_value_ranges_file)

In [None]:
def restrict_variable_to_possible_ranges(df, variable_name, possible_value_ranges, verbose=False):
    """
    Remove the observations of one scale whose score lies outside its allowed range.

    The allowed range is read from the first row of ``possible_value_ranges`` whose
    'variable_label' equals ``variable_name`` (columns 'Min' and 'Max', both inclusive).

    :param df: dataframe with at least the columns 'scale' and 'score'; it is not modified
    :param variable_name: value of the 'scale' column to restrict
    :param possible_value_ranges: dataframe with columns 'variable_label', 'Min' and 'Max'
    :param verbose: if True, print the number of out-of-range observations
    :return: (clean_df, excluded_df) where excluded_df holds the original rows of
        ``variable_name`` that were out of range, and clean_df is a copy of ``df``
        without those rows. As before, clean_df also drops every row that contains
        a NaN in any column.
    :raises ValueError: if no range is defined for ``variable_name``
    """
    matching_ranges = possible_value_ranges[possible_value_ranges['variable_label'] == variable_name]
    if matching_ranges.empty:
        raise ValueError(f'No possible value range defined for variable "{variable_name}"')
    variable_range = matching_ranges.iloc[0]

    # Build the mask from the dataframe that was passed in (not from a notebook
    # global), so the function works on any frame and the index always aligns.
    is_variable = df['scale'] == variable_name
    out_of_range = is_variable & (
        (df['score'] < variable_range['Min']) | (df['score'] > variable_range['Max'])
    )

    clean_df = df.copy()
    # set score to np.nan if outside of range
    clean_df.loc[out_of_range, 'score'] = np.nan
    if verbose:
        # Count only the rows that are actually out of range, not scores that were already missing
        print(f'Excluding {out_of_range.sum()} observations because out of range')
    excluded_df = df[out_of_range]
    clean_df = clean_df.dropna()
    return clean_df, excluded_df

In [None]:
# Count how often each scale label occurs before any labels are merged
scales_df.scale.value_counts()

In [None]:
# Labels that are all relabelled as 'Glasgow Coma Scale'
glasgow_equivalents = [
    'Glasgow + pupilles',
    'Glasgow + pupilles + sensibilité/motricité',
    'Glasgow',
    'Glasgow  urgence',
    'Neurologie - Glasgow',
]
scales_df.loc[scales_df['scale'].isin(glasgow_equivalents), 'scale'] = 'Glasgow Coma Scale'


In [None]:
# Both spellings of the long NIHSS label are relabelled as 'NIHSS'
NIHSS_equivalents = [
    'NIHSS - National Institute oh Health Stroke Scale',
    'NIHSS - National Institute of Health Stroke Scale',
]
scales_df.loc[scales_df['scale'].isin(NIHSS_equivalents), 'scale'] = 'NIHSS'

In [None]:
# Labels that are all relabelled as 'pain scale'
pain_scale_equivalents = ['Douleur - b - Echelle numérique','Douleur - a - EVA', 'Douleur - c - Echelle verbale']
scales_df.loc[scales_df['scale'].isin(pain_scale_equivalents), 'scale'] = 'pain scale'
# Drop every row whose scale label contains 'CPOT' (e.g. 'Douleur - h - CPOT'),
# as it is not comparable with other scales.
# na=False makes rows with a missing scale label count as "no match"; without it
# the mask would contain NaN and the boolean selection would raise.
scales_df.drop(scales_df[scales_df['scale'].str.contains('CPOT', na=False)].index, inplace=True)

In [None]:
# Count the scale labels again after the relabelling and the CPOT drop
scales_df.scale.value_counts()

In [None]:
# Apply the range restriction in two passes: first NIHSS on the full table,
# then Glasgow Coma Scale on the result of the first pass. Each pass also
# returns the rows it excluded.
cleaned_scales_df, NIHSS_excluded_df = restrict_variable_to_possible_ranges(
    scales_df, 'NIHSS', possible_value_ranges, verbose=True
)
cleaned_scales_df, glasgow_excluded_df = restrict_variable_to_possible_ranges(
    cleaned_scales_df, 'Glasgow Coma Scale', possible_value_ranges, verbose=True
)

In [None]:
# Preview the table after range cleaning
cleaned_scales_df.head()

In [None]:
# Histogram of the NIHSS rows after range cleaning. The mask is built from
# cleaned_scales_df itself: a mask taken from scales_df has a different index
# (rows were removed during cleaning) and only works through implicit reindexing.
cleaned_scales_df[cleaned_scales_df['scale'] == 'NIHSS'].plot.hist(bins=50, title='NIHSS')
plt.show()

In [None]:
# Histogram of the Glasgow Coma Scale rows after range cleaning. The mask is
# built from cleaned_scales_df itself: a mask taken from scales_df has a
# different index (rows were removed during cleaning) and only works through
# implicit reindexing.
cleaned_scales_df[cleaned_scales_df['scale'] == 'Glasgow Coma Scale'].plot.hist(bins=50, title='Glasgow Coma Scale')
plt.show()


In [None]:
# Histogram of the pain scale rows of the cleaned table. The mask is built from
# cleaned_scales_df itself: a mask taken from scales_df has a different index
# (rows were removed during cleaning) and only works through implicit reindexing.
cleaned_scales_df[cleaned_scales_df['scale'] == 'pain scale'].plot.hist(bins=50, title='pain scale')
plt.show()


In [None]:
# Summary statistics of the score for each scale label.
# NOTE(review): this summarises scales_df, i.e. the table before range cleaning;
# confirm whether cleaned_scales_df was intended here.
scales_df.groupby('scale')['score'].describe()

