In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Input locations handed to assemble_variable_database below.
# NOTE: these are absolute paths on the author's machine and must be adapted
# before the notebook can be run anywhere else.
data_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20211110'
admission_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'
patient_selection_path = '/Users/jk1/temp/opsum_extration_output/high_frequency_data_patient_selection.csv'

In [None]:
# Project-local helper that builds the feature table from the three input locations.
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# The rest of the notebook relies on feature_df having at least the columns
# 'case_admission_id' and 'sample_label'.
# NOTE(review): presumably one row per recorded value (long format) - confirm against the helper.
feature_df = assemble_variable_database(data_path, admission_data_path, patient_selection_path)

In [None]:
# Preview the first rows of the assembled feature table
feature_df.head()

In [None]:
# List every distinct sample_label present in the feature table
feature_df['sample_label'].unique().tolist()

In [None]:
# Number of non-null rows per (patient, label) pair ...
rows_per_patient_and_label = feature_df.groupby(['case_admission_id', 'sample_label'])['sample_label'].count()
# ... summarised per label (count / mean / std / quartiles across patients)
rows_per_patient_and_label.groupby('sample_label').describe()

In [None]:
# Count how many patients have at least one occurrence of each sample_label.
# The rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
n_patients = feature_df.case_admission_id.unique().shape[0]
count_records = []
for sample_label in feature_df.sample_label.unique():
    n_patients_with_sample_label = feature_df.query('sample_label == @sample_label').case_admission_id.unique().shape[0]
    # NOTE: the 'percentage_*' columns hold fractions in [0, 1], not values in percent
    count_records.append({'sample_label': sample_label,
                          'count': n_patients_with_sample_label,
                          'n_missing': n_patients - n_patients_with_sample_label,
                          'percentage_missing': (n_patients - n_patients_with_sample_label) / n_patients,
                          'percentage_present': n_patients_with_sample_label / n_patients})
# Explicit column list keeps the column order stable even when there are no labels
count_df = pd.DataFrame(count_records,
                        columns=['sample_label', 'count', 'n_missing', 'percentage_missing', 'percentage_present'])
count_df

In [None]:
# Append a row with the missingness of "any cholesterol": a patient counts as
# present if at least one of HDL, total or calculated LDL cholesterol is available.
patients_with_cholesterol_HDL = feature_df.query('sample_label == "cholesterol HDL"').case_admission_id.unique()
patients_with_total_cholesterol = feature_df.query('sample_label == "cholesterol total"').case_admission_id.unique()
patients_with_cholesterol_LDL = feature_df.query('sample_label == "LDL cholesterol calcule"').case_admission_id.unique()
patients_with_any_cholesterol = set(patients_with_cholesterol_HDL).union(set(patients_with_total_cholesterol)).union(set(patients_with_cholesterol_LDL))
n_patients_with_any_cholesterol = len(patients_with_any_cholesterol)
any_cholesterol_row = pd.DataFrame([{'sample_label': 'any_cholesterol',
                                     'count': n_patients_with_any_cholesterol,
                                     'n_missing': n_patients - n_patients_with_any_cholesterol,
                                     'percentage_missing': (n_patients - n_patients_with_any_cholesterol) / n_patients,
                                     'percentage_present': n_patients_with_any_cholesterol / n_patients}])
# pd.concat replaces DataFrame.append (removed in pandas 2.0). A previously added
# 'any_cholesterol' row is dropped first so re-running this cell does not duplicate it.
count_df = pd.concat([count_df[count_df['sample_label'] != 'any_cholesterol'], any_cholesterol_row],
                     ignore_index=True)

In [None]:
# Optional export of the per-label patient counts (disabled); uncomment both lines to write count_df to CSV.
# output_dir = '/Users/jk1/temp/opsum_extration_output'
# count_df.to_csv(os.path.join(output_dir, 'label_count_per_patient_df.csv'))

In [None]:
# Horizontal bar chart of label coverage. The plotted column 'percentage_present'
# is the fraction of patients (0-1) with at least one value for the label, so the
# axis label and title describe a fraction rather than an absolute patient count.
ax = count_df.plot.barh(x='sample_label', y='percentage_present', figsize=(10, 20), legend=False)
ax.set_xlabel('Fraction of patients with label')
ax.set_title('Fraction of patients per feature')
plt.show()

In [None]:
# Print every distinct sample label on its own line
distinct_sample_labels = feature_df['sample_label'].unique()
for sample_label in distinct_sample_labels:
    print(sample_label)


In [None]:
# list of all distinct case_admission_ids, in order of first appearance in feature_df
case_admission_ids = feature_df.case_admission_id.unique().tolist()

In [None]:
# load list of patients with wrong EDS
# Only the 'case_admission_id' column of this file is used further down.
# NOTE: absolute path on the author's machine - adapt before running elsewhere.
list_of_wrong_EDS_patients_path = '/Users/jk1/temp/opsum_extration_output/patients_with_wrong_eds_start.csv'
list_of_wrong_EDS_patients_df = pd.read_csv(list_of_wrong_EDS_patients_path)

In [None]:
# load list of patients with no data
# Only the 'case_admission_id' column of this file is used further down.
# NOTE: absolute path on the author's machine - adapt before running elsewhere.
list_of_no_data_patients_path = '/Users/jk1/temp/opsum_extration_output/patients_with_missing_data.csv'
list_of_no_data_patients_df = pd.read_csv(list_of_no_data_patients_path)

Find patients with missing LDL / HDL / cholesterol data

In [None]:
# Split the cohort by availability of each cholesterol measurement (HDL, calculated LDL, total).
patients_with_cholesterol_HDL = feature_df.query('sample_label == "cholesterol HDL"').case_admission_id.unique()
patients_with_total_cholesterol = feature_df.query('sample_label == "cholesterol total"').case_admission_id.unique()
patients_with_cholesterol_LDL = feature_df.query('sample_label == "LDL cholesterol calcule"').case_admission_id.unique()

# Hash-based lookups: `x not in <numpy array>` rescans the whole array for every
# patient, which makes the comprehensions below quadratic.
hdl_patient_ids = set(patients_with_cholesterol_HDL)
ldl_patient_ids = set(patients_with_cholesterol_LDL)
total_cholesterol_patient_ids = set(patients_with_total_cholesterol)

# Lists keep the order of case_admission_ids
patients_without_cholesterol_HDL = [x for x in case_admission_ids if x not in hdl_patient_ids]
patients_without_cholesterol_LDL = [x for x in case_admission_ids if x not in ldl_patient_ids]
patients_without_total_cholesterol = [x for x in case_admission_ids if x not in total_cholesterol_patient_ids]
# Patients for whom none of the three measurements is available (unordered set)
patients_without_any_cholesterol_value = set(case_admission_ids) - hdl_patient_ids - ldl_patient_ids - total_cholesterol_patient_ids

In [None]:
# Patients without any cholesterol value, excluding those already known to have a
# wrong EDS start or no data at all. The exclusion set is built once instead of
# calling .unique() on both frames for every candidate patient.
excluded_case_admission_ids = set(list_of_wrong_EDS_patients_df.case_admission_id.unique()) | set(list_of_no_data_patients_df.case_admission_id.unique())
restricted_patients_without_any_cholesterol_value = [x for x in patients_without_any_cholesterol_value if x not in excluded_case_admission_ids]

In [None]:
# Patients without total cholesterol, excluding those already known to have a
# wrong EDS start or no data at all. The exclusion set is built once instead of
# calling .unique() on both frames for every candidate patient.
excluded_case_admission_ids = set(list_of_wrong_EDS_patients_df.case_admission_id.unique()) | set(list_of_no_data_patients_df.case_admission_id.unique())
restricted_patients_without_total_cholesterol = [x for x in patients_without_total_cholesterol if x not in excluded_case_admission_ids]

In [None]:
# Patients for whom LDL, HDL and total cholesterol are all available.
# NOTE: despite its name, the variable also requires total cholesterol.
patients_with_both_LDL_and_HDL = set(patients_with_cholesterol_LDL).intersection(
    set(patients_with_cholesterol_HDL),
    set(patients_with_total_cholesterol),
)
patients_with_both_LDL_and_HDL

In [None]:
# Display the restricted patients without any cholesterol value as a set
set(restricted_patients_without_any_cholesterol_value)

In [None]:
# Spot check: rows of one specific admission whose label starts with 'LDL'
is_inspected_admission = feature_df['case_admission_id'] == '5336975762_26102019'
is_ldl_label = feature_df['sample_label'].str.startswith('LDL')
feature_df[is_inspected_admission & is_ldl_label]

In [None]:
# Spot check: every row recorded for one specific admission
feature_df.loc[feature_df['case_admission_id'] == '5336975762_26102019']

In [None]:
# Spot check: every row whose case_admission_id starts with the given prefix
has_inspected_prefix = feature_df['case_admission_id'].str.startswith('97145347')
feature_df[has_inspected_prefix]

In [None]:
# Spot check: distinct case_admission_ids that start with the given prefix
has_inspected_prefix = feature_df['case_admission_id'].str.startswith('533697')
feature_df.loc[has_inspected_prefix, 'case_admission_id'].unique()

In [None]:
# Project-local preprocessing helpers, applied in sequence to the assembled feature table.
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps
from preprocessing.encoding_categorical_variables.encode_categorical_variables import encode_categorical_variables
from preprocessing.normalisation.normalisation import normalise_data

# Step 1: relative timestamps.
# NOTE(review): restrict_to_time_range=True presumably drops rows outside an observation window - confirm in the helper.
restricted_feature_df = transform_to_relative_timestamps(feature_df, drop_old_columns=False, restrict_to_time_range=True)
# Step 2: normalisation (verbose output enabled)
normalised_restricted_feature_df = normalise_data(restricted_feature_df, verbose=True)
# Step 3: encoding of categorical variables; the result still exposes
# 'case_admission_id' and 'sample_label', which the following cells rely on.
cat_encoded_normalised_restricted_feature_df = encode_categorical_variables(normalised_restricted_feature_df)

In [None]:
# Count, for the preprocessed feature table, how many patients have at least one
# occurrence of each sample_label. The rows are collected in a list and the frame
# is built once at the end: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every iteration.
n_patients = cat_encoded_normalised_restricted_feature_df.case_admission_id.unique().shape[0]
count_records = []
for sample_label in cat_encoded_normalised_restricted_feature_df.sample_label.unique():
    n_patients_with_sample_label = cat_encoded_normalised_restricted_feature_df.query('sample_label == @sample_label').case_admission_id.unique().shape[0]
    # NOTE: the 'percentage_*' columns hold fractions in [0, 1], not values in percent
    count_records.append({'sample_label': sample_label,
                          'count': n_patients_with_sample_label,
                          'n_missing': n_patients - n_patients_with_sample_label,
                          'percentage_missing': (n_patients - n_patients_with_sample_label) / n_patients,
                          'percentage_present': n_patients_with_sample_label / n_patients})
# Explicit column list keeps the column order stable even when there are no labels
count_df = pd.DataFrame(count_records,
                        columns=['sample_label', 'count', 'n_missing', 'percentage_missing', 'percentage_present'])

In [None]:
# Horizontal bar chart of label coverage after preprocessing. The plotted column
# 'percentage_present' is the fraction of patients (0-1) with at least one value
# for the label, so the axis label and title describe a fraction rather than an
# absolute patient count.
ax = count_df.plot.barh(x='sample_label', y='percentage_present', figsize=(10, 20), legend=False)
ax.set_xlabel('Fraction of patients with label')
ax.set_title('Fraction of patients per feature')
plt.show()