In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np

**Warning:** in the scale files, `patient_id` seems to be overwritten by `eds_final_patient_id`.

In [None]:
# Location of the extraction. Defaults to the original local path, but can be
# overridden through the STROKE_UNIT_DATA_PATH environment variable so the
# notebook also runs on machines with a different directory layout.
data_path = os.environ.get(
    'STROKE_UNIT_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815',
)
# File-name prefix used to select the scale files inside data_path.
scales_file_start = 'scale'

In [None]:
# Load every file in data_path whose name starts with the scale prefix; all columns
# are read as strings. os.listdir returns entries in arbitrary order, so the names
# are sorted to keep the row order of the concatenated frame reproducible.
scales_files = [
    pd.read_csv(os.path.join(data_path, f), delimiter=';', encoding='utf-8', dtype=str)
    for f in sorted(os.listdir(data_path))
    if f.startswith(scales_file_start)
]

In [None]:
# Stack all loaded scale files into one frame; ignore_index=True assigns a fresh 0..n-1 index.
scales_df = pd.concat(scales_files, ignore_index=True)

In [None]:
# Load eds_j1.csv from the same extraction folder; dtype=str reads every column as a string.
eds_df = pd.read_csv(os.path.join(data_path, 'eds_j1.csv'), delimiter=';', encoding='utf-8', dtype=str)

In [None]:
eds_df.head()

## Verifying patient ids

Verify if patient_id was overwritten

__This has been automated__: /ehr_extraction_verification/detect_overwritten_patient_ids.py


In [None]:
# Rows of eds_df where patient_id differs from eds_final_patient_id.
# NOTE(review): rows with a missing id on either side may also show up here - confirm.
eds_df[eds_df.patient_id != eds_df.eds_final_patient_id]

In [None]:
# Same check on the scale files: rows where patient_id differs from eds_final_patient_id.
# NOTE(review): rows with a missing id on either side may also show up here - confirm.
scales_df[scales_df.patient_id != scales_df.eds_final_patient_id]

In [None]:
eds_df[eds_df.eds_final_patient_id == '279818']

In [None]:
scales_df[scales_df.eds_final_patient_id == '279818']

In [None]:
# Project-local helper; the markdown above states that it automates the manual checks.
from ehr_extraction_verification.detect_overwritten_patient_ids import detect_overwritten_patient_ids

detect_overwritten_patient_ids(scales_df, eds_df, verbose=True)

Patient id seems to have been overwritten by the final eds id

### Correct overwritten patient_id

__Method__:

Retrieve the correct patient id by matching 'patient_id', 'eds_end_4digit', 'eds_manual', 'patient_id_manual' on the scales_df side with 'eds_final_patient_id', 'eds_end_4digit', 'eds_manual', 'patient_id_manual' on the eds_df side

__This has been automated__: preprocessing.utils.correct_overwritten_patient_id

In [None]:
# Composite lookup key per scale row: the four identifying fields, stringified and joined with '/'.
retrieval_key_columns = ['patient_id', 'eds_end_4digit', 'eds_manual', 'patient_id_manual']
intermediate_retrieval_df = scales_df.apply(
    lambda row: '/'.join(str(row[column]) for column in retrieval_key_columns),
    axis=1,
)

In [None]:
# Keep a single copy of every key, then display how many distinct keys remain.
intermediate_retrieval_df = intermediate_retrieval_df.drop_duplicates()
intermediate_retrieval_df.nunique()

In [None]:
# Keep only the id columns needed for the lookup. An explicit copy is taken because
# later cells add a column to this frame and de-duplicate it in place, which on a
# plain slice of eds_df triggers SettingWithCopyWarning.
truncated_eds_df = eds_df[['patient_id', 'eds_end_4digit', 'eds_manual', 'patient_id_manual', 'eds_final_patient_id']].copy()

In [None]:
# Build the '/'-joined lookup key on the eds side, using eds_final_patient_id in the
# position where the scale side uses patient_id.
# The frame is detached from eds_df first: writing a new column into a slice of
# another frame triggers SettingWithCopyWarning.
truncated_eds_df = truncated_eds_df.copy()
truncated_eds_df['retrieval_id'] = truncated_eds_df.apply(
    lambda x: str(x['eds_final_patient_id']) + '/' + str(x['eds_end_4digit']) + '/' + str(x['eds_manual']) + '/' + str(x['patient_id_manual']),
    axis=1,
)

In [None]:
# Left-join the unique scale-side keys onto the eds-side keys. The key Series is
# converted with to_frame() and its single column is addressed as 0, hence left_on=0.
temp = pd.merge(intermediate_retrieval_df.to_frame(), truncated_eds_df, left_on=0, right_on='retrieval_id', how='left')

In [None]:
# retrieval_id values that appear more than once in the merge result
# (duplicated() marks the second and later occurrences).
# NOTE(review): keys without a match get a missing retrieval_id from the left join;
# repeated missing values may be flagged here as well - confirm.
duplicated_id = temp[temp.retrieval_id.duplicated()].retrieval_id

In [None]:
temp[temp.retrieval_id.isin(duplicated_id)]

Extra rows are due to duplicated entries in eds_df

In [None]:
# TODO Try merge left_on / right_on (the merge below already uses this form)
# De-duplicate the eds lookup rows, then attach them to scales_df by matching the
# scale-side patient_id against eds_final_patient_id (plus the three shared key columns).
# Overlapping non-key columns coming from the eds side receive the '_eds' suffix.
# drop_duplicates is rebound rather than run in place: truncated_eds_df is derived from
# a slice of eds_df, and modifying it in place triggers SettingWithCopyWarning.
# NOTE(review): if the retrieval_id helper column was added to truncated_eds_df, it is
# carried into scales_df by this merge - confirm that is intended.
truncated_eds_df = truncated_eds_df.drop_duplicates()
scales_df = pd.merge(
    scales_df,
    truncated_eds_df,
    left_on=['patient_id', 'eds_end_4digit', 'eds_manual', 'patient_id_manual'],
    right_on=['eds_final_patient_id', 'eds_end_4digit', 'eds_manual', 'patient_id_manual'],
    suffixes=('', '_eds'),
    how='left',
)

In [None]:
# Discard the overwritten patient_id (together with the eds-side copy of
# eds_final_patient_id and the 'nr' column), then promote the id recovered
# from eds_df to be the new patient_id.
scales_df = (
    scales_df
    .drop(columns=['patient_id', 'eds_final_patient_id_eds', 'nr'])
    .rename(columns={'patient_id_eds': 'patient_id'})
)

In [None]:
scales_df.head()

In [None]:
scales_df[scales_df.eds_final_patient_id == '279818'][['patient_id', 'eds_end_4digit', 'eds_manual', 'eds_final_patient_id']]

## Preprocessing

In [None]:
# Project-local helper that derives a case identification column from scales_df.
from preprocessing.utils import create_ehr_case_identification_column

# Store the derived identifier alongside the scale rows.
scales_df['case_admission_id'] = create_ehr_case_identification_column(scales_df)

In [None]:
# Columns removed from scales_df before the scale-specific preprocessing.
columns_to_drop = [
    'patient_id', 'eds_end_4digit', 'eds_manual', 'DOB', 'begin_date',
    'end_date', 'death_date', 'death_hosp', 'eds_final_id',
    'eds_final_begin', 'eds_final_end', 'eds_final_patient_id',
    'eds_final_birth', 'eds_final_death', 'eds_final_birth_str',
    'date_from', 'date_to', 'patient_id_manual', 'stroke_onset_date',
    'Referral', 'match_by', 'multiple_id',
]
scales_df = scales_df.drop(columns=columns_to_drop)

In [None]:
scales_df.head()

In [None]:
# os.path.abspath('') resolves to the current working directory; the spreadsheet is
# looked up in the 'preprocessing' folder two directory levels above it, so this cell
# depends on where the notebook is started from.
possible_value_ranges_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(''))),
                                          'preprocessing', 'possible_ranges_for_variables.xlsx')
# Table of allowed ranges for the variables.
possible_value_ranges = pd.read_excel(possible_value_ranges_file)

In [None]:
def restrict_variable_to_possible_ranges(df, variable_name, possible_value_ranges, verbose=False):
    """
    Restricts a variable to the possible ranges in the possible_value_ranges dataframe.

    Rows of `df` whose 'scale' equals `variable_name` and whose 'score' lies below 'Min'
    or above 'Max' (taken from the first row of `possible_value_ranges` whose
    'variable_label' matches) are removed.

    :param df: dataframe with a 'scale' column and a numeric 'score' column; not modified
    :param variable_name: value of 'scale' whose scores are restricted
    :param possible_value_ranges: dataframe with 'variable_label', 'Min' and 'Max' columns
    :param verbose: if True, print how many observations were out of range
    :return: (clean_df, excluded_df) - clean_df holds the rows that are within range and
        have a non-missing score; excluded_df holds the original rows that were out of range
    :raises IndexError: if `variable_name` has no entry in `possible_value_ranges`
    """
    variable_range = possible_value_ranges[possible_value_ranges['variable_label'] == variable_name]
    if variable_range.empty:
        # same exception type the bare .iloc[0] lookup would raise, but with a usable message
        raise IndexError(f'No possible value range defined for variable: {variable_name}')
    variable_range = variable_range.iloc[0]

    is_variable = df['scale'] == variable_name
    # comparisons against a missing score evaluate to False, so a missing score is never
    # classified as out of range
    out_of_range = is_variable & (
        (df['score'] < variable_range['Min']) | (df['score'] > variable_range['Max'])
    )

    if verbose:
        print(f'Excluding {out_of_range.sum()} observations because out of range')

    # Only rows that actually violate the range are reported as excluded. Rows whose score
    # was already missing are still dropped from clean_df (as before), but they are no
    # longer counted or returned as out-of-range observations.
    excluded_df = df[out_of_range]
    clean_df = df[~out_of_range].dropna(subset=['score'])
    return clean_df, excluded_df

In [None]:
scales_df.scale.value_counts()

In [None]:
# Collapse the different Glasgow labels present in the extraction into one canonical name.
glasgow_equivalents = [
    'Glasgow + pupilles',
    'Glasgow + pupilles + sensibilité/motricité',
    'Glasgow',
    'Glasgow  urgence',
    'Neurologie - Glasgow',
]
is_glasgow = scales_df['scale'].isin(glasgow_equivalents)
scales_df.loc[is_glasgow, 'scale'] = 'Glasgow Coma Scale'


In [None]:
# Both spellings of the long NIHSS label (one contains the typo 'oh') map to 'NIHSS'.
NIHSS_equivalents = [
    'NIHSS - National Institute oh Health Stroke Scale',
    'NIHSS - National Institute of Health Stroke Scale',
]
is_nihss = scales_df['scale'].isin(NIHSS_equivalents)
scales_df.loc[is_nihss, 'scale'] = 'NIHSS'

In [None]:
# Group the comparable pain scales under one label.
pain_scale_equivalents = ['EVA', 'Echelle douleur numérique', 'Douleur - b - Echelle numérique','Douleur - a - EVA', 'Douleur - c - Echelle verbale']
scales_df.loc[scales_df['scale'].isin(pain_scale_equivalents), 'scale'] = 'pain scale'
# drop rows with scale = 'Douleur - h - CPOT' as not comparable with other scales
# na=False: rows with a missing scale label count as "not CPOT" and are kept; without it
# str.contains returns NaN for those rows and the boolean indexing raises.
is_cpot = scales_df['scale'].str.contains('CPOT', na=False)
scales_df.drop(scales_df[is_cpot].index, inplace=True)

In [None]:
scales_df.scale.value_counts()

In [None]:
# Rows without a score are removed first.
scales_df.dropna(subset=['score'], inplace=True)
# Coerce once: anything that cannot be parsed becomes NaN, which marks the raw value
# as non-numerical.
numeric_scores = pd.to_numeric(scales_df['score'], errors='coerce')
remaining_non_numerical_values = scales_df.loc[numeric_scores.isnull(), 'score'].unique()
print('Remaining non-numerical values:', remaining_non_numerical_values)
# Stop here rather than silently turning unexpected strings into NaN.
if len(remaining_non_numerical_values) > 0:
    raise ValueError(f'Remaining non-numerical values: {remaining_non_numerical_values}')
scales_df['score'] = numeric_scores


In [None]:
scales_df[scales_df.scale == 'NIHSS'].score.describe()

In [None]:
# Apply the range restriction in sequence: NIHSS on the full frame first, then
# Glasgow Coma Scale on the already NIHSS-restricted frame. The second element of
# each result is the excluded_df returned by the helper for that step.
cleaned_scales_df, NIHSS_excluded_df = restrict_variable_to_possible_ranges(scales_df, 'NIHSS', possible_value_ranges, verbose=True)
cleaned_scales_df, glasgow_excluded_df = restrict_variable_to_possible_ranges(cleaned_scales_df, 'Glasgow Coma Scale', possible_value_ranges, verbose=True)

In [None]:
cleaned_scales_df.head()

In [None]:
# Histogram of the NIHSS scores after range restriction.
# The mask is built from cleaned_scales_df itself: a mask taken from scales_df has a
# different index (it still contains the excluded rows), which makes pandas reindex
# the key and emit a warning. Only the 'score' column is plotted.
cleaned_scales_df.loc[cleaned_scales_df['scale'] == 'NIHSS', 'score'].plot.hist(bins=50, title='NIHSS')
plt.show()

In [None]:
# Histogram of the Glasgow Coma Scale scores after range restriction.
# The mask is built from cleaned_scales_df itself: a mask taken from scales_df has a
# different index (it still contains the excluded rows), which makes pandas reindex
# the key and emit a warning. Only the 'score' column is plotted.
cleaned_scales_df.loc[cleaned_scales_df['scale'] == 'Glasgow Coma Scale', 'score'].plot.hist(bins=50, title='Glasgow Coma Scale')
plt.show()


In [None]:
# Histogram of the pain scale scores in the range-restricted frame.
# The mask is built from cleaned_scales_df itself: a mask taken from scales_df has a
# different index (it still contains the excluded rows), which makes pandas reindex
# the key and emit a warning. Only the 'score' column is plotted.
cleaned_scales_df.loc[cleaned_scales_df['scale'] == 'pain scale', 'score'].plot.hist(bins=50, title='pain scale')
plt.show()


In [None]:
# Per-scale summary statistics of score. Note: this uses scales_df, i.e. the scores
# before the range restriction that produced cleaned_scales_df.
scales_df.groupby('scale')['score'].describe()



## Testing final function

In [None]:
# Project-local function under test.
from preprocessing.scales_preprocessing.scales_preprocessing import preprocess_scales

# A copy of scales_df is passed in.
# NOTE(review): at this point scales_df has already been modified by the cells above
# (patient ids replaced, columns dropped, labels harmonised, score made numeric) -
# confirm preprocess_scales expects this state rather than the freshly loaded extraction.
preprocessed_scales = preprocess_scales(scales_df.copy(), eds_df, verbose=True)

In [None]:
preprocessed_scales.head()

In [None]:
preprocessed_scales.groupby('scale')['score'].describe()


In [None]:
# Histogram (50 bins) of the NIHSS rows of the frame returned by preprocess_scales.
preprocessed_scales[preprocessed_scales['scale'] == 'NIHSS'].plot.hist(bins=50)
plt.show()