In [None]:
import pandas as pd
import numpy as np
from preprocessing.geneva_stroke_unit_preprocessing.utils import create_registry_case_identification_column

In [None]:
# Input locations: the preprocessed outcomes table (CSV) and the pre-extracted imaging
# spreadsheet (XLSX).
# NOTE(review): both are absolute, machine-specific paths — adjust before running elsewhere.
preprocessed_gsu_dataset_outcomes_path = (
    '/Users/jk1/temp/opsum_prepro_output/gsu_prepro_01012023_233050/'
    'preprocessed_outcomes_01012023_233050.csv'
)
pre_extracted_imaging_path = (
    '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/stroke_research/geneva_stroke_dataset/'
    'volumetric_perfusion_data/original/Total2016_2019IVTEVT_RAPID_IMAGE.xlsx'
)

In [None]:
# Name of the outcome column; used below to keep only patients for whom it is recorded.
# NOTE(review): presumably the binarised 3-month mRS (0-2) — confirm against the preprocessing output.
outcome = '3M mRS 0-2'

In [None]:
# Load the preprocessed outcomes table (CSV) and the pre-extracted imaging table (Excel).
preprocessed_gsu_dataset_outcomes_df = pd.read_csv(preprocessed_gsu_dataset_outcomes_path)
pre_extracted_imaging_df = pd.read_excel(pre_extracted_imaging_path)

## Generate random subset of patients to test with imaging data

Size: 10%

- up to 2019 (as imaging data already partly extracted up to 2019)
- with all years (no restriction on discharge date)

In [None]:
# Preview the first rows of the outcomes table.
preprocessed_gsu_dataset_outcomes_df.head()

In [None]:
# Keep only the rows in which the selected outcome column is recorded (non-null).
has_outcome = preprocessed_gsu_dataset_outcomes_df[outcome].notna()
preprocessed_gsu_dataset_outcomes_df = preprocessed_gsu_dataset_outcomes_df.loc[has_outcome]

In [None]:
# Size of the random subset: 10% of the rows currently in the outcomes table,
# truncated to an integer.
n_rows_in_outcomes_table = len(preprocessed_gsu_dataset_outcomes_df)
random_subset_size = int(n_rows_in_outcomes_table * 0.1)
random_subset_size

In [None]:
# Draw two reproducible random subsets (fixed seed) of the same size:
#   - one from all rows of the outcomes table,
#   - one restricted to rows whose 'Discharge date' is below 20200000.
# NOTE(review): the date comparison assumes a numeric YYYYMMDD encoding — confirm.
# NOTE(review): both subsets use the same n (derived from the full table), not 10% of
# the pre-2020 rows.
random_subset_all_years_df = preprocessed_gsu_dataset_outcomes_df.sample(
    n=random_subset_size,
    random_state=42,
)

discharged_before_2020 = preprocessed_gsu_dataset_outcomes_df['Discharge date'] < 20200000
random_subset_before_2020_df = (
    preprocessed_gsu_dataset_outcomes_df
    .loc[discharged_before_2020]
    .sample(n=random_subset_size, random_state=42)
)

## Find number of patients with imaging already extracted 

Goal: estimate workload for extracting imaging data for the random subset of patients

In [None]:
# Normalise Case IDs so that each one carries the 'SSR-HUG-' prefix, then derive the
# case_admission_id column that the merges below use as join key.
# NOTE(review): a missing Case ID is stringified and becomes 'SSR-HUG-nan' — confirm
# that this is acceptable for the identification helper.
def _ensure_case_id_prefix(case_id):
    """Return case_id unchanged if its string form starts with 'SSR-HUG-', else prepend the prefix."""
    if str(case_id).startswith('SSR-HUG-'):
        return case_id
    return 'SSR-HUG-' + str(case_id)


pre_extracted_imaging_df['Case ID'] = pre_extracted_imaging_df['Case ID'].apply(_ensure_case_id_prefix)
pre_extracted_imaging_df['case_admission_id'] = create_registry_case_identification_column(pre_extracted_imaging_df)

In [None]:
# Flag, per imaging row, whether a CTP result is available:
#   0   -> Comment is exactly 'pas de CTP' (takes precedence over a CBF value),
#   1   -> a CBF value is present,
#   NaN -> neither applies (status unknown, still to be looked up).
# Computed in one vectorised step instead of two row-wise DataFrame.apply passes; this
# also works on an empty frame and always yields a float column.
ctp_explicitly_absent = pre_extracted_imaging_df['Comment'].eq('pas de CTP').to_numpy()
cbf_available = pre_extracted_imaging_df['CBF'].notna().to_numpy()
# np.select picks the first matching condition, so the comment overrides the CBF check.
pre_extracted_imaging_df['CTP_lookup'] = np.select(
    [ctp_explicitly_absent, cbf_available],
    [0.0, 1.0],
    default=np.nan,
)

In [None]:
# Inspect the imaging table, which now includes the case_admission_id and CTP_lookup columns.
pre_extracted_imaging_df

In [None]:
# Imaging columns carried over into the merges: the join key, the CTP flag, the
# T10/T8/T6/T4 and CBF columns, and the free-text comment.
columns_to_keep = [
    'case_admission_id',
    'CTP_lookup',
    'T10', 'T8', 'T6', 'T4',
    'CBF',
    'Comment',
]

In [None]:
# Show the retained imaging columns alongside the 'Acute perf. imaging type' column.
columns_to_inspect = [*columns_to_keep, 'Acute perf. imaging type']
pre_extracted_imaging_df[columns_to_inspect]

Merge the pre-extracted imaging data with all preprocessed patients.

In [None]:
# Left-join the retained imaging columns onto the outcomes table via case_admission_id:
# every outcome row is kept, and rows without an imaging match get NaN imaging columns.
imaging_columns_df = pre_extracted_imaging_df[columns_to_keep]
preprocessed_gsu_dataset_outcomes_with_imaging_data = pd.merge(
    preprocessed_gsu_dataset_outcomes_df,
    imaging_columns_df,
    how='left',
    on='case_admission_id',
)

In [None]:
# Distribution of the CTP flag over all merged rows (NaN rows are not counted by value_counts).
preprocessed_gsu_dataset_outcomes_with_imaging_data['CTP_lookup'].value_counts()

Join the imaging data with the random subsets of patients.

In [None]:
# Attach the retained imaging columns to both random subsets (left join on case_admission_id).
# NOTE: this cell overwrites its own inputs; re-running it without re-sampling first
# merges the imaging columns a second time.
imaging_subset_df = pre_extracted_imaging_df[columns_to_keep]
random_subset_all_years_df = pd.merge(
    random_subset_all_years_df,
    imaging_subset_df,
    how='left',
    on='case_admission_id',
)
random_subset_before_2020_df = pd.merge(
    random_subset_before_2020_df,
    imaging_subset_df,
    how='left',
    on='case_admission_id',
)

In [None]:
# Workload estimate for the pre-2020 random subset: rows whose CTP_lookup is NaN have
# neither a CBF value nor a 'pas de CTP' comment in the merged imaging data.
n_remaining_to_extract = random_subset_before_2020_df.CTP_lookup.isna().sum()
print(f'Number of patients with missing imaging data: {n_remaining_to_extract}')
# Kept as the last expression so that the notebook renders the counts; as a non-final
# bare expression the result was silently discarded.
random_subset_before_2020_df.CTP_lookup.value_counts()

In [None]:
# Workload estimate for the all-years random subset: rows whose CTP_lookup is NaN have
# neither a CBF value nor a 'pas de CTP' comment in the merged imaging data.
n_remaining_to_extract = random_subset_all_years_df.CTP_lookup.isna().sum()
print(f'Number of patients with missing imaging data: {n_remaining_to_extract}')
# Kept as the last expression so that the notebook renders the counts; as a non-final
# bare expression the result was silently discarded.
random_subset_all_years_df.CTP_lookup.value_counts()