In [None]:
import pandas as pd
from utils import load_encrypted_xlsx

In [None]:
# Input and output locations. The registry and PDMS exports share one data
# folder, so their paths are assembled from that common root.
_dci_data_root = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/dci_sah/data'
_pdms_dir = f'{_dci_data_root}/pdms_data'
_registry_dir = f'{_dci_data_root}/sos_sah_data'
_registry_basename = 'post_hoc_modified_aSAH_DATA_2009_2023_24122023'

# Normalised pupillometry table (plain CSV).
data_path = '/Users/jk1/temp/cereblink/data_saving/exclude_nan_outcome_False/DCI_ischemia_normalised_pupillometry_df.csv'

# PDMS exports: GCS records and the registry <-> PDMS correspondence table.
gcs_path = f'{_pdms_dir}/Transfer Urs.pietsch@kssg.ch 22.01.24, 15_34/20240117_SAH_SOS_GCS.csv'
registry_pdms_correspondence_path = f'{_pdms_dir}/registry_pdms_correspondence.csv'

# Registry workbooks: current version, version at time of publication, outcomes.
registry_data_path = f'{_registry_dir}/{_registry_basename}.xlsx'
registry_ids_data_path = f'{_registry_dir}/saved_versions/{_registry_basename}_version_at_time_of_publication.xlsx'
outcomes_data_path = f'{_registry_dir}/original_data/outcomes_aSAH_DATA_2009_2024_17022024.xlsx'

# Directory the summary CSV is written to; the trailing slash is required
# because the file name is appended by plain string concatenation.
output_dir = '/Users/jk1/Downloads/'

In [None]:
# Name of the outcome column in registry_ids_df; it is compared against 1 and 0
# further down to build the DCI and no-DCI groups.
target = 'DCI_ischemia'

In [None]:
# CSV inputs are read directly; the three registry workbooks go through the
# project's load_encrypted_xlsx helper. The original load order is preserved.
pupillometry_df = pd.read_csv(data_path)
registry_df, registry_ids_df, outcomes_df = [
    load_encrypted_xlsx(workbook_path)
    for workbook_path in (registry_data_path, registry_ids_data_path, outcomes_data_path)
]
registry_pdms_correspondence_df = pd.read_csv(registry_pdms_correspondence_path)
# The GCS export is semicolon-separated and uses '.' as the decimal mark.
gcs_df = pd.read_csv(gcs_path, sep=';', decimal='.')


In [None]:
# Quick look at the first rows of the pupillometry table.
pupillometry_df.head()

In [None]:
measures = [
    'NPI_r_value_normalised',
    'NPI_l_value_normalised',
    'CV_r_value_normalised',
    'CV_l_value_normalised',
]
# Keep a row only if at least one of the normalised measures is present.
has_any_measure = pupillometry_df[measures].notna().any(axis=1)
pupillometry_df = pupillometry_df[has_any_measure]

In [None]:
# Admissions (pNr) that still have pupillometry data.
# Missing pNr values are counted separately and kept OUT of the inclusion list:
# unique() would otherwise return NaN as one more "admission", and because
# Series.isin matches NaN against NaN, the later registry filter would let
# through rows that received no pNr from the left merge.
n_pnr_nan = pupillometry_df.pNr.isna().sum()
included_admissions = pupillometry_df.pNr.dropna().unique()
print(f'Number of included admissions: {len(included_admissions)}')
print(f'Number of admissions with missing pNr: {n_pnr_nan}')

In [None]:
# Bring the correspondence table in line with the registry tables: same name
# column and a datetime Date_birth, so all three key columns can be joined on.
registry_pdms_correspondence_df = registry_pdms_correspondence_df.rename(columns={'JoinedName': 'Name'})
registry_pdms_correspondence_df['Date_birth'] = pd.to_datetime(
    registry_pdms_correspondence_df['Date_birth'], format='%Y-%m-%d'
)

# Left joins keep every registry row; rows without a correspondence entry get
# NaN in the columns added from the correspondence table.
patient_keys = ['SOS-CENTER-YEAR-NO.', 'Name', 'Date_birth']
registry_df = registry_df.merge(registry_pdms_correspondence_df, on=patient_keys, how='left')
registry_ids_df = registry_ids_df.merge(registry_pdms_correspondence_df, on=patient_keys, how='left')
outcomes_df = outcomes_df.merge(registry_pdms_correspondence_df, on=patient_keys, how='left')

In [None]:
# Restrict the registry-derived tables to admissions that have pupillometry data.
in_registry = registry_df['pNr'].isin(included_admissions)
in_registry_ids = registry_ids_df['pNr'].isin(included_admissions)
in_outcomes = outcomes_df['pNr'].isin(included_admissions)

# Registry and outcomes keep one row per pNr (the first one encountered);
# registry_ids_df is filtered only, not de-duplicated.
registry_df = registry_df[in_registry].drop_duplicates(subset='pNr')
registry_ids_df = registry_ids_df[in_registry_ids]
outcomes_df = outcomes_df[in_outcomes].drop_duplicates(subset='pNr')

In [None]:
# Number of distinct admissions (pNr) currently in the registry table.
registry_df.pNr.nunique()

In [None]:
# First rows of the registry table.
registry_df.head()

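Variables to extract:
- age
- sex
- hypertension (HTA)
- diabetes mellitus (DM)
- aneurysm location
- aneurysm treatment
- admission GCS
- admission WFNS
- admission Fisher
- mortality
- length of stay (LOS): ICU / hospital
- 3 month mRS / GOS (note: the table computed below currently reports the 1 year mRS, `mRS_FU_1y`)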

Reported for the following groups:
- all
- DCI
- no DCI

In [None]:
# Show the registry table for inspection.
# NOTE(review): Name and Date_birth are columns of this frame (they are used as
# merge keys), so the rendered output contains patient identifiers — clear the
# output before sharing the notebook.
registry_df

In [None]:
# Aneurysm_Artery_Code values that make up each location group; one boolean
# column is added per group.
# NOTE(review): code 30 is not part of any group, and the summary's
# "location missing" count only covers NaN codes — confirm this is intended.
location_codes = {
    'an_loc_acoma': [8],
    'an_loc_aca': [9, 22, 24],
    'an_loc_mca': [7, 20, 21],
    'an_loc_ica': [1, 2, 3, 4, 5, 6, 18, 19, 25, 26, 27, 29, 31],
    'an_loc_vert_bas_branches': [10, 11, 12, 13, 14, 15, 16, 17],
    'an_loc_pca': [23, 28],
}
for location_column, artery_codes in location_codes.items():
    registry_df[location_column] = registry_df['Aneurysm_Artery_Code'].isin(artery_codes)

In [None]:
# preprocess registry data
# Encode Sex numerically: 'M' -> 0, 'F' and 'W' -> 1 (case-insensitive).
# Any other value becomes NaN. The column sum is later used as the female count.
sex_codes = {'M': 0, 'F': 1, 'W': 1}
sex_upper = registry_df['Sex'].str.upper()
registry_df['Sex'] = sex_upper.map(sex_codes)

In [None]:
# Total GCS from its three components.
gcs_df['GCS'] = gcs_df.eyes + gcs_df.verbal + gcs_df.movement

# Earliest GCS record per admission.
# Sorting once and keeping the first row per pNr replaces
# groupby('pNr').apply(lambda ...).reset_index(drop=True). That form relied on
# the grouping column being passed into the applied function (deprecated in
# pandas 2.2, removed in 3.0), without which 'pNr' is dropped together with the
# index and the column selection below fails. It also ran Python code per group.
# Rows without a pNr are dropped explicitly, as groupby did implicitly.
# NOTE(review): timeGCS is sorted as loaded from the CSV; if it is a string that
# is not ISO-formatted, parse it with pd.to_datetime first — confirm.
first_gcs_df = (
    gcs_df
    .dropna(subset=['pNr'])
    .sort_values(['pNr', 'timeGCS'], ascending=True)
    .drop_duplicates(subset='pNr', keep='first')
    .reset_index(drop=True)
    .rename(columns={'GCS': 'GCS_pdms', 'intubated': 'intubated_pdms'})
)

# PDMS values only fill gaps: registry values win wherever they are present.
registry_df = registry_df.merge(first_gcs_df[['pNr', 'GCS_pdms', 'intubated_pdms']], on='pNr', how='left')
registry_df['GCS_admission'] = registry_df['GCS_admission'].fillna(registry_df['GCS_pdms'])
registry_df['Intubated_on_admission_YN'] = registry_df['Intubated_on_admission_YN'].fillna(registry_df['intubated_pdms'])

In [None]:
# Convert the Fisher grade to numeric under the column's real name. The previous
# assignment wrote to a misspelled 'Fischer_Score' column, which left
# 'Fisher_Score' itself unconverted.
registry_df['Fisher_Score'] = pd.to_numeric(registry_df['Fisher_Score'])

In [None]:
# Lengths of stay in whole days. Both are counted from Date_admission: up to
# hospital discharge ('los') and up to ICU discharge ('los_icu').
admission_date = pd.to_datetime(registry_df['Date_admission'])
hospital_discharge_date = pd.to_datetime(registry_df['Date_Discharge'])
icu_discharge_date = pd.to_datetime(registry_df['Date_discharge_ICU'])

registry_df['los'] = (hospital_discharge_date - admission_date).dt.days
registry_df['los_icu'] = (icu_discharge_date - admission_date).dt.days

In [None]:
# Show the registry table including the columns derived so far.
# NOTE(review): the frame still carries Name and Date_birth (merge keys), so
# the rendered output contains patient identifiers — clear it before sharing.
registry_df

In [None]:
# Share of admissions with neither coiling nor clipping recorded
# (Coiling == 0 and Clipping == 0), relative to the number of unique pNr.
no_coil_no_clip = registry_df['Coiling'].eq(0) & registry_df['Clipping'].eq(0)
len(registry_df.loc[no_coil_no_clip]) / registry_df.pNr.nunique()

In [None]:
def _median_iqr_columns(prefix, values, decimals=0):
    """Build the '<prefix>_median', '_q1', '_q3' and '_str' entries for one numeric Series."""
    median = values.median()
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    return {
        f'{prefix}_median': median,
        f'{prefix}_q1': q1,
        f'{prefix}_q3': q3,
        f'{prefix}_str': f'{median:.{decimals}f} ({q1:.{decimals}f}-{q3:.{decimals}f})',
    }


def _count_pct_columns(name, count, n_total, count_key=None):
    """Build the count, proportion and 'count (pct%)' entries for one characteristic."""
    proportion = count / n_total
    return {
        # The count column is 'n_<name>' unless the caller overrides it.
        count_key or f'n_{name}': count,
        f'p_{name}': proportion,
        f'{name}_str': f'{count:.0f} ({proportion * 100:.1f}%)',
    }


def get_population_stats(registry_df, outcomes_df, pupillometry_df):
    """Summarise baseline characteristics and outcomes of one patient group.

    Parameters
    ----------
    registry_df : pandas.DataFrame
        One row per admission. Columns read: pNr, Age, Sex, HTN, DM, the
        an_loc_* flags, Aneurysm_Artery_Code, GCS_admission, WFNS,
        Fisher_Score, Coiling, Stenting, Clipping, los_icu, los, Death.
    outcomes_df : pandas.DataFrame
        Follow-up table; only mRS_FU_1y is read.
    pupillometry_df : pandas.DataFrame
        Pupillometry table; only used to count unique pNr for 'n_patients'.

    Returns
    -------
    pandas.DataFrame
        A single-row frame. Continuous variables contribute
        '<name>_median', '<name>_q1', '<name>_q3' and a formatted '<name>_str';
        binary variables contribute a count, a proportion 'p_<name>' and a
        formatted '<name>_str'.

    Notes
    -----
    Proportions use the number of unique pNr in ``registry_df`` as denominator,
    whereas 'n_patients' is counted on ``pupillometry_df``.
    """
    n_registry = registry_df.pNr.nunique()
    stats = {'n_patients': pupillometry_df.pNr.nunique()}

    stats.update(_median_iqr_columns('age', registry_df.Age, decimals=1))

    # Counts are always formatted without decimals: a column holding NaN sums
    # to a float, which previously leaked into the female string as e.g. '12.0'.
    stats.update(_count_pct_columns('female', registry_df.Sex.sum(), n_registry))
    stats.update(_count_pct_columns('hta', registry_df.HTN.sum(), n_registry))
    stats.update(_count_pct_columns('dm', registry_df.DM.sum(), n_registry))

    # Aneurysm location, in the column order of the summary table.
    for location in ('acoma', 'aca', 'mca', 'pca', 'ica', 'vert_bas_branches'):
        stats.update(_count_pct_columns(location, registry_df[f'an_loc_{location}'].sum(), n_registry))
    stats.update(_count_pct_columns(
        'loc_missing', registry_df.Aneurysm_Artery_Code.isna().sum(), n_registry, count_key='loc_missing'
    ))

    stats.update(_median_iqr_columns('gcs_admission', registry_df.GCS_admission))
    stats.update(_median_iqr_columns('wfns', registry_df.WFNS))
    stats.update(_median_iqr_columns('fisher', pd.to_numeric(registry_df['Fisher_Score'])))

    # coiling or stenting
    # Missing values count as "not performed". Adding the two columns and
    # casting to bool would turn NaN into True and inflate the count.
    endovascular = (registry_df.Coiling.fillna(0) != 0) | (registry_df.Stenting.fillna(0) != 0)
    stats.update(_count_pct_columns('coiling', endovascular.sum(), n_registry))
    stats.update(_count_pct_columns('clipping', registry_df.Clipping.sum(), n_registry))

    stats.update(_median_iqr_columns('los_icu', registry_df.los_icu))
    stats.update(_median_iqr_columns('los', registry_df.los))

    stats.update(_count_pct_columns('mortality', registry_df.Death.sum(), n_registry))

    # Median and quartiles are all computed on the numeric-coerced column
    # (previously only the quartiles were coerced).
    stats.update(_median_iqr_columns('1y_mrs', pd.to_numeric(outcomes_df['mRS_FU_1y'])))

    return pd.DataFrame([stats])

In [None]:
# Summary over all included admissions (no group filter applied).
overall_population_df = get_population_stats(registry_df, outcomes_df, pupillometry_df)
overall_population_df

In [None]:
# registry_ids_df is used to identify the target group reflecting the exact inclusion population at the moment of the study (DB has been updated since)
# Rows without a pNr are skipped before the integer cast, which raises on NaN.
dci_pnr = registry_ids_df.loc[registry_ids_df[target] == 1, 'pNr'].dropna().astype(int).unique()
dci_population_df = get_population_stats(
    registry_df[registry_df.pNr.isin(dci_pnr)],
    outcomes_df[outcomes_df.pNr.isin(dci_pnr)],
    pupillometry_df[pupillometry_df.pNr.isin(dci_pnr)],
)
dci_population_df

In [None]:
# Admissions whose target column equals 0. Rows without a pNr are skipped before
# the integer cast, which raises on NaN. Admissions whose target value is
# neither 0 nor 1 (e.g. missing) end up in neither the DCI nor the no-DCI group.
no_dci_pnr = registry_ids_df.loc[registry_ids_df[target] == 0, 'pNr'].dropna().astype(int).unique()
no_dci_population_df = get_population_stats(
    registry_df[registry_df.pNr.isin(no_dci_pnr)],
    outcomes_df[outcomes_df.pNr.isin(no_dci_pnr)],
    pupillometry_df[pupillometry_df.pNr.isin(no_dci_pnr)],
)
no_dci_population_df

In [None]:
# Stack the three one-row summaries, label each row with its group, then
# transpose so that statistics become rows and groups become columns.
group_frames = [overall_population_df, dci_population_df, no_dci_population_df]
group_labels = ['overall', 'dci', 'no_dci']
stacked_summaries = pd.concat(group_frames, keys=group_labels)
full_population_df = stacked_summaries.droplevel(1).transpose()

In [None]:
# Full summary: one row per statistic, one column per group.
full_population_df

In [None]:
# Rows that go into the exported table: the patient count followed by the
# formatted '*_str' entries, in display order.
summary_rows = [
    'n_patients', 'age_str', 'female_str',
    'hta_str', 'dm_str',
    'acoma_str', 'aca_str', 'mca_str', 'pca_str', 'ica_str', 'vert_bas_branches_str', 'loc_missing_str',
    'gcs_admission_str', 'wfns_str', 'fisher_str',
    'coiling_str', 'clipping_str',
    'los_icu_str', 'los_str', 'mortality_str', '1y_mrs_str',
]
str_pop_df = full_population_df.loc[summary_rows]

In [None]:
# rename indices
# The labels are assigned by position, so they must stay in the same order as
# the rows selected into str_pop_df.
row_labels = [
    'Number of patients', 'Age', 'Sex (Female)',
    'Hypertension', 'Diabetes',
    'Ant. communicating artery', 'Ant. cerebral artery', 'Middle cerebral artery', 'Post. cerebral artery', 'Internal carotid artery', 'Vertebral/basilar artery', 'Location unspecified',
    'Admission GCS', 'Admission WFNS', 'Admission Fisher',
    'Coiling', 'Clipping',
    'ICU length of stay', 'Hospital length of stay', 'Hospital mortality', '1 year mRS',
]
column_labels = {'overall': 'Overall population', 'dci': 'DCI', 'no_dci': 'No DCI'}

str_pop_df.index = row_labels
str_pop_df = str_pop_df.rename(columns=column_labels)
str_pop_df

In [None]:
# Write the formatted table next to the other downloads. The file name is
# appended directly to output_dir, which therefore has to end with a slash.
population_stats_path = f'{output_dir}population_stats.csv'
str_pop_df.to_csv(population_stats_path)