In [None]:
import pandas as pd
import os
import numpy as np

from preprocessing.utils import create_ehr_case_identification_column

In [None]:
# Root directory of the raw extraction. The default is the original local path;
# set the STROKE_LAB_DATA_PATH environment variable to run the notebook against
# another location without editing this cell.
data_path = os.environ.get(
    'STROKE_LAB_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815',
)
# File-name prefix used to select the lab exports inside data_path.
lab_file_start = 'labo'

In [None]:
# Read every file in data_path whose name starts with lab_file_start into its
# own DataFrame (';'-separated, UTF-8, all columns kept as strings).
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the order of the loaded frames reproducible between runs and machines.
lab_files = [
    pd.read_csv(os.path.join(data_path, file_name), delimiter=';', encoding='utf-8', dtype=str)
    for file_name in sorted(os.listdir(data_path))
    if file_name.startswith(lab_file_start)
]

In [None]:
# Stack all loaded lab files row-wise into a single frame with a fresh 0..n-1 index.
lab_df = pd.concat(lab_files, ignore_index=True)

In [None]:
# Add a per-row case identifier computed by the project helper.
# NOTE(review): presumably combines patient and admission information — confirm in preprocessing.utils.
lab_df['case_admission_id'] = create_ehr_case_identification_column(lab_df)

In [None]:
# material_label values that are later relabelled to the single category 'any_blood'.
blood_material_equivalents = [
    'sga',
    'sgv',
    'sgvm',
    'sgc',
    'sgv ponction',
    'sgv cathéter',
    'sga cathéter',
    'cathéter artériel',
    'cathéter veineux',
    'plasma',
    'Sang',
    'sg cordon',
]

In [None]:
# Columns removed from lab_df before any further processing.
columns_to_drop = [
    'nr',
    'patient_id',
    'eds_end_4digit',
    'eds_manual',
    'DOB',
    'begin_date',
    'end_date',
    'death_date',
    'death_hosp',
    'eds_final_id',
    'eds_final_begin',
    'eds_final_end',
    'eds_final_patient_id',
    'eds_final_birth',
    'eds_final_death',
    'eds_final_birth_str',
    'date_from',
    'date_to',
    'patient_id_manual',
    'stroke_onset_date',
    'Referral',
    'match_by',
    'multiple_id',
]

# Rebind instead of mutating in place; a missing column still raises KeyError.
lab_df = lab_df.drop(columns=columns_to_drop)

In [None]:
# Show which columns remain after dropping the columns listed above.
lab_df.columns

### For lab files where columns are split by lab name

In [None]:
# Columns kept as-is for every lab; all other columns are split on '_' into a
# lab name (first part) and a field name (remainder) by the cells that follow.
identification_columns = ['case_admission_id', 'sample_date']


In [None]:
# Split every non-identification column name on '_': the first part is taken as
# the lab name, the remainder as the field name. Collect the distinct values of each.
lab_specific_columns = [c for c in lab_df.columns if c not in identification_columns]
lab_names = {c.split('_')[0] for c in lab_specific_columns}
new_lab_column_headers = {'_'.join(c.split('_')[1:]) for c in lab_specific_columns}

print(lab_names)

In [None]:
# split lab df into individual lab dfs for every lab name
lab_df_split_by_lab_name = []

for index, lab_name in enumerate(lab_names):
    print(index, lab_name)
    selected_columns = identification_columns + [c for c in lab_df.columns if c.split('_')[0] == lab_name]
    individual_lab_df = lab_df[selected_columns].dropna(subset=[f'{lab_name}_value'])
    individual_lab_df.columns = identification_columns + ['_'.join(c.split('_')[1:]) for c in individual_lab_df.columns if c.startswith(lab_name)]
    individual_lab_df['lab_name'] = lab_name
    lab_df_split_by_lab_name.append(individual_lab_df)

In [None]:
# Spot check: rebuild the per-lab frame for a single lab and display it.
lab_name = 'pO2'
selected_columns = identification_columns + [
    column for column in lab_df.columns if column.split('_')[0] == lab_name
]
# Rows without a value for this lab are discarded.
individual_lab_df = lab_df[selected_columns].dropna(subset=[f'{lab_name}_value'])
# Strip the "<lab_name>_" prefix from the lab-specific headers.
individual_lab_df.columns = identification_columns + [
    '_'.join(column.split('_')[1:])
    for column in individual_lab_df.columns
    if column.startswith(lab_name)
]
individual_lab_df = individual_lab_df.reset_index(drop=True)
individual_lab_df

In [None]:
# Stack the per-lab frames into one frame with a fresh 0..n-1 index.
reorganised_lab_df = pd.concat(lab_df_split_by_lab_name, ignore_index=True)

In [None]:
# Preview the first rows of the stacked frame.
reorganised_lab_df.head()

### For lab files where reorganisation can be skipped

In [None]:
# Work on an independent copy: later cells assign into equalized_reorganised_lab_df
# in place, which would otherwise silently modify lab_df as well and make
# re-running those cells depend on hidden state.
equalized_reorganised_lab_df = lab_df.copy()

In [None]:
# Table of equivalent dosage labels, read from the working directory. The next
# cell treats each column as one group: first non-null entry = label to keep,
# remaining entries = labels to be replaced by it.
equivalent_labels_df = pd.read_csv('equivalent_labels.csv')
equivalent_labels_df

In [None]:
# Collapse equivalent dosage labels onto one label per group: within each column
# of equivalent_labels_df the first non-null entry is the label to keep and the
# remaining non-null entries are aliases that get replaced by it.
for column in equivalent_labels_df.columns:
    equivalence_list = equivalent_labels_df[column].dropna().values
    if len(equivalence_list) == 0:
        # A completely empty column has no label to keep; indexing [0] would raise IndexError.
        continue
    canonical_label, aliases = equivalence_list[0], equivalence_list[1:]
    alias_rows = equalized_reorganised_lab_df['dosage_label'].isin(aliases)
    equalized_reorganised_lab_df.loc[alias_rows, 'dosage_label'] = canonical_label

In [None]:
# List the distinct dosage labels containing 'C-réactive' that remain after the
# equivalent labels were merged.
# na=False: a missing dosage_label would otherwise put NaN into the boolean mask,
# and pandas refuses to index with a mask that contains NA values.
equalized_reorganised_lab_df[equalized_reorganised_lab_df.dosage_label.str.contains('C-réactive', na=False)].dosage_label.unique()

In [None]:
# Number of distinct case admissions per dosage label, counting only rows whose
# material_label is one of blood_material_equivalents.
equalized_reorganised_lab_df[(equalized_reorganised_lab_df.material_label.isin(blood_material_equivalents))].groupby('dosage_label').case_admission_id.nunique()

In [None]:
# Dosage labels that are removed from the dataset entirely.
dosage_labels_to_exclude = [
    'érythrocytes agglutinés',
    'Type d\'érythrocytes',
    'Type des érythrocytes',
    'érythrocytes en rouleaux',
    'Cristaux cholestérol',
    'potassium débit',
    'urée débit',
    'sodium débit',
    'glucose débit',
    'protéine C-réactive, POCT',
    'activité anti-Xa (HBPM), autre posologie',
    'activité anti-Xa (HBPM), thérapeutique, 1x /jour',
]

# Keep every row whose label is not in the exclusion list.
has_excluded_label = equalized_reorganised_lab_df['dosage_label'].isin(dosage_labels_to_exclude)
equalized_reorganised_lab_df = equalized_reorganised_lab_df[~has_excluded_label]

In [None]:
# Row count per remaining dosage label, turned into a regular frame via reset_index.
dosage_labels = equalized_reorganised_lab_df['dosage_label'].value_counts().reset_index()

In [None]:
# Table of units per dosage label, read from the working directory. The next
# cell uses each column name as a dosage label and the column's entries as the
# units to keep for that label.
dosage_units_df = pd.read_csv('dosage_units.csv')
dosage_units_df

In [None]:
# For every dosage label that has a column in dosage_units_df, keep only the
# measurements whose unit_of_measure is listed in that column.
# Rows are removed with a boolean filter rather than an in-place drop by index
# label: the frame is itself the result of an earlier filter (an in-place drop on
# it triggers pandas' chained-assignment warning), and dropping by label would
# also remove unrelated rows if index labels were ever duplicated.
# NOTE(review): isin receives the raw column, including any empty (NaN) cells, so
# rows with a missing unit may be retained — confirm this is intended.
for dosage_label in dosage_units_df.columns:
    allowed_units = dosage_units_df[dosage_label]
    has_unlisted_unit = (
        (equalized_reorganised_lab_df['dosage_label'] == dosage_label)
        & ~equalized_reorganised_lab_df['unit_of_measure'].isin(allowed_units)
    )
    equalized_reorganised_lab_df = equalized_reorganised_lab_df[~has_unlisted_unit]


In [None]:
# Show the rows whose value is the literal string '----'.
equalized_reorganised_lab_df[equalized_reorganised_lab_df.value == '----']

In [None]:
# check that units correspond
for dosage_label in equalized_reorganised_lab_df['dosage_label'].unique():
    print(dosage_label, equalized_reorganised_lab_df[equalized_reorganised_lab_df['dosage_label'] == dosage_label]['unit_of_measure'].unique())

In [None]:
# Preview: among rows whose dosage label contains 'pO2', keep only those with
# material_label 'sga'; rows of every other dosage label are left untouched.
# na=False treats a missing dosage_label as "does not contain", so the masks stay
# strictly boolean and the final selection cannot fail on NaN entries.
dosage_label = 'pO2'
is_selected_label = equalized_reorganised_lab_df['dosage_label'].str.contains(dosage_label, na=False)
is_sga = equalized_reorganised_lab_df['material_label'] == 'sga'
# Equivalent to ~(is_selected_label & material_label != 'sga').
temp = equalized_reorganised_lab_df[~is_selected_label | is_sga]
temp[temp.dosage_label.str.contains(dosage_label, na=False)]

In [None]:
# Relabel every blood-equivalent material as 'any_blood'.
# The mask must be built from the frame being modified: using reorganised_lab_df
# here relied on a different frame (possibly undefined or differently indexed
# when the reorganisation section is skipped) and could relabel the wrong rows.
# NOTE(review): blood_material_equivalents contains 'sg cordon', which is also
# listed for exclusion in the following cell — once relabelled here it will no
# longer be excluded there; confirm which behaviour is intended.
is_blood_material = equalized_reorganised_lab_df['material_label'].isin(blood_material_equivalents)
equalized_reorganised_lab_df.loc[is_blood_material, 'material_label'] = 'any_blood'

In [None]:
# Sample materials whose rows are removed from the dataset.
material_to_exclude = [
    'LCR',
    'liqu. pleural',
    'épanchement',
    'sg cordon',
    'liqu. abdo.',
    'liqu. ascite',
    'liqu.',
]
# Listed for reference only; not applied by this cell.
material_to_maybe_exclude = ['urine']

has_excluded_material = equalized_reorganised_lab_df['material_label'].isin(material_to_exclude)
equalized_reorganised_lab_df = equalized_reorganised_lab_df[~has_excluded_material]

In [None]:
# filter non numerical values in value column
non_numerical_values = equalized_reorganised_lab_df[pd.to_numeric(equalized_reorganised_lab_df['value'], errors='coerce').isnull()]
non_numerical_values['value'].unique()

In [None]:
# remove non numerical values in value column
non_numerical_values_to_remove = ['ERROR', 'nan', 'SANS RES.', 'Hémolysé', 'sans resultat',
       'NON REALISE', 'NON INTERPRÉT.', 'COA', 'TAM']
equalized_reorganised_lab_df = equalized_reorganised_lab_df[~equalized_reorganised_lab_df['value'].isin(non_numerical_values_to_remove)]
equalized_reorganised_lab_df.dropna(subset=['value'], inplace=True)
print('Remaining non-numerical values:', equalized_reorganised_lab_df[pd.to_numeric(equalized_reorganised_lab_df['value'], errors='coerce').isnull()]['value'].unique())

In [None]:
# Preview the cleaned frame.
equalized_reorganised_lab_df.head()

In [None]:
# Preview the frame without the rows whose dosage label contains 'pO2' and whose
# material_label is not 'sga'.
# The negation has to be applied to the boolean mask; the original negated the
# already-indexed DataFrame, which raises a TypeError on string columns.
# na=False keeps the mask strictly boolean when a dosage_label is missing.
# NOTE(review): an earlier cell relabels 'sga' to 'any_blood'; if that cell has
# run, no row has material_label 'sga' any more and every 'pO2' row is removed
# here — confirm the intended material value.
dosage_label = 'pO2'
equalized_reorganised_lab_df[
    ~(
        equalized_reorganised_lab_df['dosage_label'].str.contains(dosage_label, na=False)
        & (equalized_reorganised_lab_df['material_label'] != 'sga')
    )
].head()

In [None]:
# get mean number of values per dosage label patient admission id
median_observations_per_case_admission_id = equalized_reorganised_lab_df.groupby(['case_admission_id', 'dosage_label'])['value'].count().reset_index()
median_observations_per_case_admission_id.groupby('dosage_label').median()

In [None]:
from matplotlib.dates import DateFormatter
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the values of one dosage label over time for a randomly chosen case admission.
# NOTE(review): the draw is unseeded and made over all case admissions, so the
# figure differs between runs and is empty whenever the drawn case has no value
# for the selected label. To inspect a specific case, assign its id to pa_id instead.
pa_id = np.random.choice(equalized_reorganised_lab_df['case_admission_id'].unique())
dosage_label = 'cholestérol HDL'

is_selected_case = equalized_reorganised_lab_df['case_admission_id'] == pa_id
is_selected_label = equalized_reorganised_lab_df['dosage_label'].isin([dosage_label])
temp = equalized_reorganised_lab_df[is_selected_case & is_selected_label].copy()

# Parse values and timestamps; unparseable values become NaN and are dropped.
temp['value'] = pd.to_numeric(temp['value'], errors='coerce')
temp['sample_date'] = pd.to_datetime(temp['sample_date'], format='%d.%m.%Y %H:%M')
temp = temp.dropna(subset=['value'])

fig, ax = plt.subplots()
sns.scatterplot(x='sample_date', y='value', data=temp, hue='value', legend=False, ax=ax)
# Show dates on the x axis as month-day-year, rotated for readability.
ax.xaxis.set_major_formatter(DateFormatter("%m-%d-%Y"))
ax.tick_params(axis="x", rotation=45)

plt.show()

In [None]:
# Number of distinct case admissions that have a non-null 'cholestérol HDL' value.
has_hdl_value = (
    (equalized_reorganised_lab_df['dosage_label'] == 'cholestérol HDL')
    & equalized_reorganised_lab_df['value'].notnull()
)
len(equalized_reorganised_lab_df[has_hdl_value].case_admission_id.unique())

In [None]:
# Inspect the rows of one specific case admission whose dosage label starts with 'LDL'.
equalized_reorganised_lab_df[(equalized_reorganised_lab_df['case_admission_id'] == '978287281437_02032018')
                             & (equalized_reorganised_lab_df['dosage_label'].str.startswith('LDL'))]


# Standardisation

Each dosage label is standardised by subtracting its median and dividing by its interquartile range (q75 - q25).

In [None]:
# Robust scaling per dosage label: (value - median) / (q75 - q25), written to a
# new 'standardised_value' column on a copy of the cleaned frame.
# NOTE(review): a label whose q75 equals its q25 has a zero denominator, so its
# standardised values come out as inf/NaN — decide how such labels should be handled.
standardised_labs = equalized_reorganised_lab_df.copy()

for dosage_label in standardised_labs['dosage_label'].unique():
    rows_for_label = standardised_labs['dosage_label'] == dosage_label
    # Values are stored as strings; anything unparseable becomes NaN.
    numeric_values = pd.to_numeric(standardised_labs.loc[rows_for_label, 'value'], errors='coerce')
    centre = numeric_values.median()
    spread = numeric_values.quantile(0.75) - numeric_values.quantile(0.25)
    standardised_labs.loc[rows_for_label, 'standardised_value'] = (numeric_values - centre) / spread

standardised_labs.head()

Edge cases:
- pH is missing for arterial blood
- lactate missing

Other remarks:
- urine samples should be excluded for now to simplify feature space

## Testing final script

In [None]:
# Root directory of the raw extraction used to test the packaged pipeline. The
# default is the original local path; set the STROKE_LAB_DATA_PATH environment
# variable to point the notebook at another location without editing this cell.
data_path = os.environ.get(
    'STROKE_LAB_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815',
)

In [None]:
from preprocessing.lab_preprocessing.lab_preprocessing import preprocess_labs
from preprocessing.patient_selection.filter_ehr_patients import filter_ehr_patients
from preprocessing.variable_assembly.variable_database_assembly import load_data_from_main_dir

# Run the packaged pipeline end to end on the same extraction: load the files
# starting with lab_file_start, filter patients, then preprocess the labs.
# Note that this rebinds lab_file_start and lab_df used by the exploratory cells above.
# NOTE(review): the exact behaviour of the three project helpers is not visible
# in this notebook — see their modules for details.
lab_file_start = 'labo'
lab_df = load_data_from_main_dir(data_path, lab_file_start)
lab_df = filter_ehr_patients(lab_df)
preprocessed_lab_df = preprocess_labs(lab_df, verbose=True)

In [None]:
# Preview the output of the packaged preprocessing pipeline.
preprocessed_lab_df.head()