In [None]:
import numpy as np
import pandas as pd

from meta_data.long_term_outcomes.geneva_stroke_unit_patient_characteristics import extract_patient_characteristics
from preprocessing.geneva_stroke_unit_preprocessing.utils import create_registry_case_identification_column

In [None]:
# Output directory of the preprocessing run that provides the case ids and the outcomes
preprocessing_run_dir = '/Users/jk1/temp/opsum_end/preprocessing/gsu_Extraction_20220815_prepro_08062024_083500'
cids_path = preprocessing_run_dir + '/case_admission_ids.csv'
outcomes_path = preprocessing_run_dir + '/preprocessed_outcomes_short_term_08062024_083500.csv'

# Stroke registry spreadsheet
registry_path = (
    '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/stroke_research/'
    'geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/'
    'stroke_registry_post_hoc_modified.xlsx'
)

In [None]:
# Load the three inputs from the configured paths: the case identifiers (CSV),
# the short-term outcome rows (CSV) and the stroke registry (Excel).
cids_df = pd.read_csv(cids_path)
outcomes_df = pd.read_csv(outcomes_path)
registry_df = pd.read_excel(registry_path)

In [None]:
# Derive the case identifier for every registry row, then keep only the rows
# whose identifier is listed in cids_df.
registry_df['case_admission_id'] = create_registry_case_identification_column(registry_df)
# .copy() yields an independent frame: later cells add columns to registry_df, and
# assigning into a boolean-filtered slice raises SettingWithCopyWarning on pandas
# versions without copy-on-write.
registry_df = registry_df[registry_df.case_admission_id.isin(cids_df.case_admission_id)].copy()

In [None]:
# Add an END column to registry_df (1 for every case present in outcomes_df, 0 otherwise)
# and an END_timing column taken from relative_sample_date in outcomes_df.
# If a case has several outcome rows, the last one wins.
end_timing_by_case = (
    outcomes_df
    .dropna(subset=['case_admission_id'])
    .drop_duplicates(subset='case_admission_id', keep='last')
    .set_index('case_admission_id')['relative_sample_date']
)
# assign() builds a new frame, so this cell can be re-run safely; cases without an
# outcome row get a missing END_timing from map() (no np.NAN, which NumPy 2.0 removed).
registry_df = registry_df.assign(
    END=registry_df.case_admission_id.isin(outcomes_df.case_admission_id).astype(int),
    END_timing=registry_df.case_admission_id.map(end_timing_by_case),
)


In [None]:
# Inspect how many registry rows fall into each 'Etiology TOAST' category
# (value_counts leaves out missing values by default).
registry_df['Etiology TOAST'].value_counts()

In [None]:
# One 0/1 indicator column per TOAST etiology of interest; rows with any other
# (or a missing) etiology get 0.
for toast_label in ('Cardiac embolism', 'Large artery atherosclerosis', 'Small vessel disease'):
    registry_df[f'Etiology - {toast_label}'] = (registry_df['Etiology TOAST'] == toast_label).astype(int)

In [None]:
# Registry columns summarised as continuous variables
CONTINUOUS_CHARACTERISTICS = [
    'Age (calc.)', 'Prestroke disability (Rankin)', 'NIH on admission', 'BMI',
]

# Registry columns summarised as categorical variables
CATEGORICAL_CHARACTERISTICS = [
    # demographics and acute treatment
    'Sex', 'IVT with rtPA', 'IAT',
    # medical history
    'MedHist Hypertension', 'MedHist Diabetes', 'MedHist Hyperlipidemia', 'MedHist Atrial Fibr.',
    # etiology indicator columns
    'Etiology - Cardiac embolism',
    'Etiology - Large artery atherosclerosis',
    'Etiology - Small vessel disease',
]

In [None]:
# Function that takes a registry_df-like dataframe and outputs a population descriptive table

# (reported category, complementary category) pairs, in order of preference.
# NOTE(review): 'Male' is assumed to be the complementary label of 'Female' — confirm against the registry.
_BINARY_DISPLAY_CATEGORIES = (('yes', 'no'), (1, 0), ('Female', 'Male'))


def _select_display_category(values, characteristic):
    """Pick the category whose count is reported for one categorical column."""
    present = set(values.dropna().unique())
    if not present:
        raise ValueError(f'Cannot summarise "{characteristic}": the column has no non-missing values')

    # A preferred category that is present is always reported.
    for reported, _ in _BINARY_DISPLAY_CATEGORIES:
        if reported in present:
            return reported

    # Only the complementary category is present (e.g. a stratum where everyone is 'no'):
    # still report the preferred category, with a count of 0, so that the labels stay
    # identical across the strata that are later placed side by side.
    for reported, complement in _BINARY_DISPLAY_CATEGORIES:
        if present == {complement}:
            return reported

    # Any other vocabulary: report the most frequent category.
    return values.value_counts().idxmax()


def create_population_table(df, continuous_characteristics, categorical_characteristics, count_nan=False):
    """
    Create a population descriptive table from a dataframe.

    Continuous characteristics are summarised as median with first and third quartile.
    Categorical characteristics are summarised as count and percentage of one displayed
    category: 'yes', 1 or 'Female' when the column uses that vocabulary, otherwise the
    most frequent category.

    Parameters:
    df (pd.DataFrame): Dataframe containing the data. Must have a case_admission_id column
        of strings; the part before the first '_' is counted as the patient.
    continuous_characteristics (list): Names of the columns to summarise as continuous.
    categorical_characteristics (list): Names of the columns to summarise as categorical.
    count_nan (bool): If False (default), percentages are relative to the non-missing
        values of the column; if True, relative to all rows of df.

    Returns:
    tuple: (population_df, population_str_df), two single-row dataframes.
        population_df holds the raw statistics (counts, quantiles, percentages, number
        of missing values); population_str_df holds one formatted string per characteristic.

    Raises:
    ValueError: If a categorical characteristic has no non-missing values.
    """
    n_cases = df.case_admission_id.nunique()
    n_patients = df.case_admission_id.str.split('_').str[0].nunique()

    # Collect everything in dicts (insertion order = column order) and build each frame once.
    stats = {'n cases': n_cases, 'n patients': n_patients}
    formatted = {'n patients': n_patients, 'n cases': n_cases}

    for characteristic in continuous_characteristics:
        values = df[characteristic]
        median, q25, q75 = values.median(), values.quantile(0.25), values.quantile(0.75)
        stats[f'median {characteristic}'] = median
        stats[f'Q25 {characteristic}'] = q25
        stats[f'Q75 {characteristic}'] = q75
        stats[f'n missing {characteristic}'] = values.isnull().sum()
        formatted[f'{characteristic}'] = f'{median:.1f} ({q25:.1f}-{q75:.1f})'

    for characteristic in categorical_characteristics:
        values = df[characteristic]
        display_category = _select_display_category(values, characteristic)

        # Comparing instead of indexing value_counts() yields 0 (not a KeyError)
        # when the displayed category does not occur in this dataframe.
        n_displayed = int((values == display_category).sum())
        # Percentage of all rows (count_nan) or of the non-missing values only.
        denominator = len(df) if count_nan else values.count()
        percentage = n_displayed / denominator * 100

        stats[f'{characteristic} {display_category}'] = n_displayed
        stats[f'p {characteristic} {display_category}'] = percentage
        stats[f'n missing {characteristic}'] = values.isnull().sum()

        # 'yes' and 1 are self-explanatory; any other category is named in the label.
        if display_category == 'yes' or display_category == 1:
            label = f'{characteristic}'
        else:
            label = f'{characteristic} ({display_category})'
        formatted[label] = f'{n_displayed} ({percentage:.1f}%)'

    population_df = pd.DataFrame([stats])
    population_str_df = pd.DataFrame([formatted])
    return population_df, population_str_df

In [None]:
# Split the cohort by END status, then build the descriptive tables for the
# whole cohort and for each stratum.
no_end_cases = registry_df[registry_df.END == 0]
end_cases = registry_df[registry_df.END == 1]

overall_population_df, overall_population_str_df = create_population_table(
    registry_df, CONTINUOUS_CHARACTERISTICS, CATEGORICAL_CHARACTERISTICS
)
no_end_population_df, no_end_population_str_df = create_population_table(
    no_end_cases, CONTINUOUS_CHARACTERISTICS, CATEGORICAL_CHARACTERISTICS
)
end_population_df, end_population_str_df = create_population_table(
    end_cases, CONTINUOUS_CHARACTERISTICS, CATEGORICAL_CHARACTERISTICS
)

In [None]:
# Side-by-side table: one row per characteristic, one column per END stratum
stratum_columns = [end_population_str_df.T, no_end_population_str_df.T]
comparison_table_df = pd.concat(stratum_columns, axis=1).set_axis(['END', 'No END'], axis=1)

In [None]:
# Display the END vs. No END comparison table.
comparison_table_df

In [None]:
# Export the comparison table as CSV; to_csv also writes the row index by default.
comparison_table_df.to_csv('/Users/jk1/Downloads/end_table1.csv')