In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Locations of the raw inputs and of the notebook outputs.
# Each location can be redirected through an environment variable so the
# notebook can run on another machine without editing this cell; when a
# variable is unset, the original workstation path is used.
data_path = os.environ.get(
    'OPSUM_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815',
)
admission_data_path = os.environ.get(
    'OPSUM_ADMISSION_DATA_PATH',
    '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx',
)
patient_selection_path = os.environ.get(
    'OPSUM_PATIENT_SELECTION_PATH',
    '/Users/jk1/temp/opsum_extraction_output/high_frequency_data_patient_selection_with_details.csv',
)
output_path = os.environ.get(
    'OPSUM_OUTPUT_PATH',
    '/Users/jk1/temp/opsum_prepro_output/temp_output',
)
# The log directory is the same as the output directory.
log_dir = output_path
# Passed as the `verbose` argument of the preprocessing calls further down.
verbose = True

In [None]:
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# Assemble the variable database from the configured input paths.
# Verbosity follows the notebook-level `verbose` setting, like the other
# preprocessing calls in this notebook, instead of a hard-coded True.
feature_database = assemble_variable_database(data_path, admission_data_path, patient_selection_path, verbose=verbose, log_dir=log_dir)


In [None]:
# Preview the first five rows of the assembled database.
feature_database.head(n=5)

In [None]:
# Count the distinct (non-null) case_admission_id values in the database.
feature_database['case_admission_id'].nunique()

In [None]:
# Keep every row that belongs to one specific case for closer inspection.
temp = feature_database.loc[feature_database['case_admission_id'].eq('98535071_6410')]

In [None]:
# Day-first timestamp layout: day.month.year hour:minute.
datatime_format = '%d.%m.%Y %H:%M'

# Parse the sample dates of the inspected case's stroke-registry rows with that layout.
pd.to_datetime(
    temp.loc[temp['source'] == 'stroke_registry', 'sample_date'],
    format=datatime_format,
)

In [None]:
# Display the rows selected for the inspected case.
temp

In [None]:
# Disabled check: rows of `temp` whose `sample_date_dt` value is missing.
# NOTE(review): no visible cell creates a `sample_date_dt` column — confirm it exists before re-enabling.
# temp[temp['sample_date_dt'].isnull()]

In [None]:
# Parse the sample_date column of the whole database with the day-first layout
# (day.month.year hour:minute) and store the result back in the same column.
feature_database['sample_date'] = pd.to_datetime(
    feature_database['sample_date'],
    format='%d.%m.%Y %H:%M',
)

In [None]:
# Earliest sample date per case, as a frame with the columns
# 'case_admission_id' and 'first_sample_date'.
first_sample_date = (
    feature_database.groupby('case_admission_id')['sample_date']
    .min()
    .rename('first_sample_date')
    .reset_index()
)

In [None]:
# Latest sample date per case, as a frame with the columns
# 'case_admission_id' and 'last_sample_date'.
last_sample_date = (
    feature_database.groupby('case_admission_id')['sample_date']
    .max()
    .rename('last_sample_date')
    .reset_index()
)

In [None]:
# Put the first and last sample date of each case side by side.
merged_data = first_sample_date.merge(last_sample_date, on='case_admission_id')
# Time covered by the samples of each case (last minus first sample date).
merged_data['sample_range'] = (
    pd.to_datetime(merged_data['last_sample_date'])
    - pd.to_datetime(merged_data['first_sample_date'])
)

In [None]:
# Display the first/last sample date and the resulting sample range of each case.
merged_data

In [None]:
# Cases whose samples span less than one day.
merged_data.loc[merged_data['sample_range'].lt(pd.Timedelta(days=1))]

## Testing restriction to patients within the stroke registry

In [None]:
# Load the patient selection table, reading every column with dtype str.
patient_selection_df = pd.read_csv(
    patient_selection_path,
    dtype=str,
)


In [None]:
# Build the case key as <patient_id><EDS_last_4_digits>_<arrival date as ddmmYYYY>.
# The arrival date is read as YYYYmmdd and re-rendered in day-first order.
patient_selection_df['case_admission_id'] = (
    patient_selection_df['patient_id'].astype(str)
    + patient_selection_df['EDS_last_4_digits'].astype(str)
    + '_'
    + pd.to_datetime(patient_selection_df['Arrival at hospital'], format='%Y%m%d')
        .dt.strftime('%d%m%Y')
        .astype(str)
)

In [None]:
# Keep only the database rows whose case id appears in the patient selection.
restricted_to_registry_df = feature_database.loc[
    feature_database['case_admission_id'].isin(patient_selection_df['case_admission_id'])
]

In [None]:
# Number of distinct case ids left after the restriction (missing values counted too).
restricted_to_registry_df['case_admission_id'].nunique(dropna=False)

In [None]:
from preprocessing.patient_selection.restrict_to_patient_selection import restrict_to_patient_selection

# Apply the project's restriction helper to the full database, using the patient selection file.
# NOTE(review): presumably the packaged equivalent of a manual case_admission_id filter — verify against its implementation.
functional_restricted_df = restrict_to_patient_selection(feature_database, patient_selection_path, verbose=verbose)

In [None]:
# Look up every database row of one specific case.
feature_database.loc[feature_database['case_admission_id'].eq('1005030884_08112018')]

In [None]:
# Case ids listed in patient_selection_df that do not appear in functional_restricted_df.
case_admission_ids_with_missing_data = set(
    patient_selection_df['case_admission_id'].unique()
).difference(functional_restricted_df['case_admission_id'].unique())
# Patient selection rows belonging to those cases.
missing_patients = patient_selection_df.loc[
    patient_selection_df['case_admission_id'].isin(case_admission_ids_with_missing_data)
]
missing_patients

In [None]:
# Disabled export: would write `missing_patients` to 'patients_with_missing_data.csv' inside `output_path`.
# missing_patients.to_csv(os.path.join(output_path, 'patients_with_missing_data.csv'), index=False)

## Testing fusion with admission data

In [None]:

from preprocessing.admission_params_preprocessing.admission_params_preprocessing import preprocess_admission_data

# `admission_data_path` may name either a directory of registry exports
# (workbooks starting with 'SSR_cases_of') or a single workbook. Calling
# os.listdir() on a workbook path raises NotADirectoryError, so both layouts
# are handled explicitly.
if os.path.isdir(admission_data_path):
    # Sorted so the concatenation order does not depend on the filesystem listing order.
    admission_data_files = sorted(
        file_name for file_name in os.listdir(admission_data_path)
        if file_name.startswith('SSR_cases_of')
    )
    if not admission_data_files:
        raise FileNotFoundError(
            f"No file starting with 'SSR_cases_of' found in {admission_data_path}"
        )
    # Lines 0-5 and 7 of each sheet are skipped, which leaves line 6 as the header row.
    admission_data_tables = [
        pd.read_excel(os.path.join(admission_data_path, file_name), skiprows=[0, 1, 2, 3, 4, 5, 7])
        for file_name in admission_data_files
    ]
else:
    # NOTE(review): a single workbook is read with pandas' default header row —
    # confirm that this file does not need the same skiprows as the raw exports.
    admission_data_files = [os.path.basename(admission_data_path)]
    admission_data_tables = [pd.read_excel(admission_data_path)]

admission_data_df = pd.concat(admission_data_tables)
# Hand the raw registry table to the project's admission preprocessing together with the patient selection.
admission_data_df = preprocess_admission_data(admission_data_df, patient_selection_df, verbose=verbose)

In [None]:
# Case ids present in the admission data but absent from the feature database.
set(admission_data_df['case_admission_id']) - set(feature_database['case_admission_id'])

In [None]:
# Case ids present in the feature database but absent from the admission data.
set(feature_database['case_admission_id']) - set(admission_data_df['case_admission_id'])

In [None]:
# Feature database rows whose case_admission_id starts with '2001'.
feature_database.loc[feature_database['case_admission_id'].str.startswith('2001')]

In [None]:
# Admission data rows whose case_admission_id starts with '2001'.
admission_data_df.loc[admission_data_df['case_admission_id'].str.startswith('2001')]

In [None]:
# Per case: first sample_date minus first begin_date, both taken from a single grouping pass.
(
    feature_database.groupby('case_admission_id')[['sample_date', 'begin_date']]
    .first()
    .pipe(lambda firsts: pd.to_datetime(firsts['sample_date']) - pd.to_datetime(firsts['begin_date']))
)