# Timeseries decomposition

Goal: decompose the timeseries into timebins of X hours and separate the timebins by whether or not they end with a CT showing DCI/vasospasm.

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Locations of the two Excel workbooks read below: the pupillometry measurements
# (sheet 'Pupillometrie') and the combined per-patient database (sheet 'Database').
# NOTE(review): absolute, user-specific paths — they must be adapted to run this elsewhere.
pupillometry_data_path = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/pupillometry_sah/data/Pupillometry - Datenbank Päddy.xls'
combined_data_path = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/pupillometry_sah/data/Pupillometry - Datenbank Joana.xlsx'


In [None]:
# Length, in hours, of the look-back window that defines a timebin
# (used as pd.Timedelta(timebin_hours, unit='h') before each CT / measurement time).
timebin_hours = 6

In [None]:
# When True, the final dataset drops every row of a patient whose timebin ends
# after that patient's first positive CT.
censure_data_after_first_positive_CT = True

In [None]:
# Read both workbooks and blank out the 999 placeholder right away.
# NOTE(review): 999 is presumably the missing-value code of the sheets — confirm;
# replace() turns every cell equal to 999 into NaN, whatever the column.
pupillometry_df = (
    pd.read_excel(pupillometry_data_path, sheet_name='Pupillometrie')
    .replace(999, np.nan)
)
combined_data_df = (
    pd.read_excel(combined_data_path, sheet_name='Database')
    .replace(999, np.nan)
)


Restructure combined data (so that each row is a scan)

In [None]:
identifier_columns = ["patID", "pNr"]
constants_columns = [
    "Chronic_heart_failure",
    "Diabetes",
    "HbA1c",
    "COPD/Asthma",
    "Chronic_hemodialysis",
    "Liver_cirrhosis",
    "HIV",
    "Cancer",
    "Alcohol",
    "Time_symptoms",
    "Initial_GCS",
    "Time_initial_GCS",
    "Fisher_Scale",
    "Rad_intervention",
    "Surgery",
    "Surgery_type",
    "Mortality",
    "30 day mortality",
    "GOS_Admission",
    "GOS_ICU_Discharge",
    "GOS_Hospital_Discharge",
    "GOS_30 day"
]

# The workbook holds one row per patient and one group of "<name>_<scan number>" columns per scan.
# It is reshaped to one row per (patient, scan); identifiers and constants are repeated on every row.
per_patient_columns = identifier_columns + constants_columns

# every column that is neither an identifier nor a constant is treated as a per-scan column
non_constant_columns = [column_name for column_name in combined_data_df.columns
                        if column_name not in per_patient_columns]

# the scan number is the integer after the last underscore of a per-scan column name
scan_idxs = [int(col.split('_')[-1]) for col in non_constant_columns]
max_number_of_scans = np.max(scan_idxs)

# The column group of a scan is the same for every patient, so it is resolved once here
# rather than once per (patient, scan) pair.
columns_by_scan = {
    scan_idx: [col for col in non_constant_columns if col.endswith(f'_{scan_idx}')]
    for scan_idx in range(1, max_number_of_scans + 1)
}

# Rows are collected in a plain list and turned into a frame once at the end;
# concatenating onto a growing DataFrame inside the loop copies the whole frame on every scan.
scan_records = []
for index, row in combined_data_df.iterrows():
    patient_values = row[per_patient_columns].to_dict()

    for scan_idx, scan_columns in columns_by_scan.items():
        associated_scan_data = row[scan_columns]
        # a scan whose columns are all empty was not performed for this patient
        if associated_scan_data.isna().all():
            continue

        new_row = dict(patient_values)
        new_row['scan_idx'] = scan_idx

        # store each per-scan value under its column name without the "_<scan number>" suffix
        for col in scan_columns:
            new_row['_'.join(col.split('_')[:-1]).strip()] = associated_scan_data[col]

        scan_records.append(new_row)

# dtype=object leaves every column untyped, so the explicit casts applied to this frame
# later on (e.g. CT_Result.astype(int)) start from the raw cell values
reorganized_data_df = pd.DataFrame(scan_records, dtype=object)



CT_result categories:	
- 0: Without vasospasm, perfusion deficit or edema
- 1: With vasospasm
- 2: With perfusion deficit
- 3: With cerebral edema
- 4: With vasospasm and perfusion deficit
- 5: With vasospasm and edema
- 6: With perfusion deficit and edema
	
Simplified
- With vasospasm or perfusion deficit: 1, 2, 4, 5, 6
- Without vasospasm or perfusion deficit: 0, 3

In [None]:
# Cast the CT result codes to integers, then derive a binary flag:
# 1 for the codes listed as "with vasospasm or perfusion deficit" (1, 2, 4, 5, 6), 0 for the others.
positive_ct_result_codes = [1, 2, 4, 5, 6]
ct_result_codes = reorganized_data_df['CT_Result'].astype(int)
reorganized_data_df['CT_Result'] = ct_result_codes
reorganized_data_df['CT_Result_dichotomized'] = ct_result_codes.isin(positive_ct_result_codes).astype(int)

## Label all pupillometry data with CT results

In [None]:
# Display the restructured table (one row per patient and scan) for visual inspection.
reorganized_data_df

In [None]:
# Display the pupillometry measurements for visual inspection.
pupillometry_df

In [None]:
# Parse the measurement timestamps so they can be compared with the timebin bounds below.
pupillometry_df['Pupillometrie_Zeitpunkt'] = pd.to_datetime(pupillometry_df['Pupillometrie_Zeitpunkt'])

In [None]:
# Label every pupillometry measurement that falls into the look-back window of a CT scan.
# The window of a scan is [CT_Time - timebin_hours, CT_Time], both bounds included.
#
# Columns written to pupillometry_df:
# - within_CT_timebin:          1 if the measurement lies in the window of at least one CT, else 0
# - associated_CT_result/_time: result code and time of the last CT (in reorganized_data_df
#                               row order) whose window contains the measurement
# - within_positive_CT_timebin: 1 if the measurement lies in the window of at least one CT
#                               with CT_Result_dichotomized == 1, else 0
#
# Both flags are accumulated over all CTs. Writing the flag of each CT directly would let a
# later negative CT with an overlapping window reset measurements that precede a positive CT.
measurement_pnrs = pupillometry_df['pNr']
measurement_times = pupillometry_df['Pupillometrie_Zeitpunkt']
known_pnrs = measurement_pnrs.values
timebin_length = pd.Timedelta(timebin_hours, unit='h')

in_any_ct_timebin = np.zeros(len(pupillometry_df), dtype=bool)
in_positive_ct_timebin = np.zeros(len(pupillometry_df), dtype=bool)

# created up front so the column exists, in this position, even if no CT matches any measurement
pupillometry_df['within_CT_timebin'] = 0

for index, row in reorganized_data_df.iterrows():
    # skip CTs of patients without any pupillometry data
    if row['pNr'] not in known_pnrs:
        continue

    timebin_end = pd.to_datetime(row['CT_Time'])
    timebin_begin = timebin_end - timebin_length

    # measurements of this patient inside the window; computed once and reused for every column
    in_timebin = ((measurement_pnrs == row['pNr']) &
                  (measurement_times >= timebin_begin) &
                  (measurement_times <= timebin_end))

    pupillometry_df.loc[in_timebin, 'associated_CT_result'] = row['CT_Result']
    pupillometry_df.loc[in_timebin, 'associated_CT_time'] = row['CT_Time']

    in_any_ct_timebin |= in_timebin.to_numpy()
    if row['CT_Result_dichotomized'] == 1:
        in_positive_ct_timebin |= in_timebin.to_numpy()

pupillometry_df['within_CT_timebin'] = in_any_ct_timebin.astype(int)
pupillometry_df['within_positive_CT_timebin'] = in_positive_ct_timebin.astype(int)

In [None]:
# Exploratory check: ask pandas whether the measurement timestamps follow one regular frequency.
# NOTE(review): the result is only displayed, nothing below uses it.
pd.infer_freq(pupillometry_df['Pupillometrie_Zeitpunkt'])

### For every pupillometry entry add metrics for the timebin it ends

Gist: every new measure represents the end of a timebin of X hours

Metrics: 
- For every two-sided measure (one value per eye): mean, min, max, delta (absolute difference between the two eyes)
- Over time (within a timebin): median, min, max, span (max − min)


In [None]:
# Per-measurement inter-eye summaries. Each measure is stored in a pair of columns
# ('<name>' and '<name>.1'); for every pair the row-wise mean, minimum and maximum
# and the absolute difference between the two columns are added.
# NOTE(review): presumably the two columns of a pair are the two eyes — confirm against the workbook.
eye_columns_by_measure = {
    'NPi': ('NPi_r_wert', 'NPi_r_wert.1'),
    'CV': ('CV_r_wert', 'CV_r_wert.1'),
    'Size': ('Si_r_wert', 'Si_r_wert.1'),
}

for measure_name, (first_column, second_column) in eye_columns_by_measure.items():
    both_columns = pupillometry_df[[first_column, second_column]]
    pupillometry_df[f'{measure_name}_inter_eye_mean'] = both_columns.mean(axis=1)
    pupillometry_df[f'{measure_name}_inter_eye_min'] = both_columns.min(axis=1)
    pupillometry_df[f'{measure_name}_inter_eye_max'] = both_columns.max(axis=1)
    pupillometry_df[f'{measure_name}_inter_eye_delta'] = np.abs(
        pupillometry_df[first_column] - pupillometry_df[second_column]
    )


In [None]:
# Names of the per-measurement columns: one per (measure, inter-eye statistic) pair,
# with the statistic varying fastest.
pupillometry_metrics = ['NPi', 'CV', 'Size']
inter_eye_metrics = ['mean', 'min', 'max', 'delta']

single_timepoint_metrics = []
for pupillometry_metric in pupillometry_metrics:
    for inter_eye_metric in inter_eye_metrics:
        single_timepoint_metrics.append(f'{pupillometry_metric}_inter_eye_{inter_eye_metric}')

In [None]:
# Statistics that summarise a per-measurement metric over a timebin.
over_time_metrics = ['max', 'min', 'median', 'span']

# one column name per (per-measurement metric, statistic) pair, with the statistic varying fastest
timebin_metrics = []
for single_timepoint_metric in single_timepoint_metrics:
    for over_time_metric in over_time_metrics:
        timebin_metrics.append(f'{single_timepoint_metric}_timebin_{over_time_metric}')
timebin_metrics

In [None]:
# For every pupillometry measurement, summarise each per-measurement metric over the timebin
# that ends at that measurement: [measurement time - timebin_hours, measurement time], both
# bounds included, same patient only. Median, min, max and span (max - min) are written to
# '<metric>_timebin_<statistic>' columns of the row(s) with that patient number and timestamp.
measurement_times = pupillometry_df['Pupillometrie_Zeitpunkt']
timebin_length = pd.Timedelta(timebin_hours, unit='h')

for index, row in tqdm(pupillometry_df.iterrows(), total=len(pupillometry_df)):
    timebin_end = pd.to_datetime(row['Pupillometrie_Zeitpunkt'])
    timebin_begin = timebin_end - timebin_length

    # Both row selections only depend on the patient and the timestamp, not on the metric,
    # so they are built once per measurement instead of once per metric and statistic.
    same_patient = pupillometry_df['pNr'] == row['pNr']
    in_timebin = same_patient & (measurement_times >= timebin_begin) & (measurement_times <= timebin_end)

    # if no values within timebin, skip
    if not in_timebin.any():
        continue

    target_rows = same_patient & (measurement_times == row['Pupillometrie_Zeitpunkt'])

    # compute timebin metrics for every single timepoint metric
    for metric in single_timepoint_metrics:
        values_within_timebin = pupillometry_df.loc[in_timebin, metric]
        timebin_min = values_within_timebin.min()
        timebin_max = values_within_timebin.max()

        pupillometry_df.loc[target_rows, f'{metric}_timebin_median'] = values_within_timebin.median()
        pupillometry_df.loc[target_rows, f'{metric}_timebin_min'] = timebin_min
        pupillometry_df.loc[target_rows, f'{metric}_timebin_max'] = timebin_max
        pupillometry_df.loc[target_rows, f'{metric}_timebin_span'] = timebin_max - timebin_min
        
        
    
    
    

In [None]:
# Negative samples: every measurement that does not lie in the timebin of a positive CT.
# Each one is labelled 0 and its own timestamp is the end of its timebin.
# .copy() makes this an independent frame: adding columns to a filtered slice raises
# SettingWithCopyWarning and depends on whether pandas returned a view or a copy.
negative_pupillometry_df = pupillometry_df[pupillometry_df['within_positive_CT_timebin'] == 0].copy()
negative_pupillometry_df['label'] = 0
negative_pupillometry_df['timebin_end'] = pd.to_datetime(negative_pupillometry_df['Pupillometrie_Zeitpunkt'])

### Build positive pupillometry dataset (only pupillometry data within CT timebin)

Loop through the CTs and, for each CT, summarise all pupillometry data that falls within its timebin.


In [None]:
# For every CT scan, summarise the pupillometry data of the same patient inside the timebin
# that ends at the scan: [CT_Time - timebin_hours, CT_Time], both bounds included.
#
# Columns written to reorganized_data_df:
# - pupillometry_available: 1 if the patient has any pupillometry data at all, else 0
# - '<metric>_timebin_<statistic>': median, min, max and span (max - min) of each
#   per-measurement metric over the timebin; only written when the timebin is not empty
known_pnrs = pupillometry_df['pNr'].values
measurement_times = pupillometry_df['Pupillometrie_Zeitpunkt']
timebin_length = pd.Timedelta(timebin_hours, unit='h')

for index, row in tqdm(reorganized_data_df.iterrows(), total=len(reorganized_data_df)):
    # Row(s) of reorganized_data_df describing this scan (same patient, same CT time).
    # Built once per scan and reused for every assignment below.
    this_scan = ((reorganized_data_df['pNr'] == row['pNr']) &
                 (reorganized_data_df['CT_Time'] == row['CT_Time']))

    # verify that patient is in pupillometry data
    if row['pNr'] not in known_pnrs:
        reorganized_data_df.loc[this_scan, 'pupillometry_available'] = 0
        continue

    reorganized_data_df.loc[this_scan, 'pupillometry_available'] = 1

    timebin_end = pd.to_datetime(row['CT_Time'])
    timebin_begin = timebin_end - timebin_length

    # all measurements of this patient within the timebin
    values_within_timebin = pupillometry_df.loc[(pupillometry_df['pNr'] == row['pNr']) &
                                                (measurement_times >= timebin_begin) &
                                                (measurement_times <= timebin_end)]

    # if no values within timebin, skip (the metric columns of this scan stay empty)
    if len(values_within_timebin) == 0:
        continue

    for metric in single_timepoint_metrics:
        metric_values = values_within_timebin[metric]
        timebin_min = metric_values.min()
        timebin_max = metric_values.max()

        reorganized_data_df.loc[this_scan, f'{metric}_timebin_median'] = metric_values.median()
        reorganized_data_df.loc[this_scan, f'{metric}_timebin_min'] = timebin_min
        reorganized_data_df.loc[this_scan, f'{metric}_timebin_max'] = timebin_max
        reorganized_data_df.loc[this_scan, f'{metric}_timebin_span'] = timebin_max - timebin_min
        

In [None]:
# Positive samples: one row per positive CT of a patient who has pupillometry data.
# Each one is labelled 1 and the CT time is the end of its timebin.
# .copy() makes this an independent frame: adding columns to a filtered slice raises
# SettingWithCopyWarning and depends on whether pandas returned a view or a copy.
positive_pupillometry_df = reorganized_data_df[(reorganized_data_df['pupillometry_available'] == 1) &
                                               (reorganized_data_df['CT_Result_dichotomized'] == 1)].copy()
positive_pupillometry_df['label'] = 1
positive_pupillometry_df['timebin_end'] = pd.to_datetime(positive_pupillometry_df['CT_Time'])

In [None]:
# Stack the positive samples on top of the negative ones, keeping only the patient number,
# the label, the end of the timebin and the timebin metrics.
output_columns = ['pNr', 'label', 'timebin_end'] + timebin_metrics
positive_samples = positive_pupillometry_df[output_columns]
negative_samples = negative_pupillometry_df[output_columns]
reassembled_pupillometry_df = pd.concat([positive_samples, negative_samples])

For every subject with a positive CT, censor (drop) all data recorded after the first positive CT.

In [None]:
# For every patient with at least one positive sample, drop all rows whose timebin ends
# strictly after that patient's first positive CT. Rows ending exactly at that time are kept.
if censure_data_after_first_positive_CT:
    # get all pnrs with a positive CT
    pnrs_with_positive_ct = reassembled_pupillometry_df[reassembled_pupillometry_df['label'] == 1]['pNr'].unique()

    # Rows to drop are collected in a positional boolean array. Unlike a helper column that is
    # only created inside the loop, it also exists when no patient has a positive CT, and it
    # does not rely on the index, which still has repeated labels after the concatenation.
    is_censored = np.zeros(len(reassembled_pupillometry_df), dtype=bool)

    for pnr in tqdm(pnrs_with_positive_ct):
        of_patient = reassembled_pupillometry_df['pNr'] == pnr

        # get time of first positive CT
        time_of_first_positive_ct = reassembled_pupillometry_df[of_patient & (reassembled_pupillometry_df['label'] == 1)]['timebin_end'].min()

        # censure all data after time of first positive CT
        is_censored |= (of_patient & (reassembled_pupillometry_df['timebin_end'] > time_of_first_positive_ct)).to_numpy()

    reassembled_pupillometry_df = reassembled_pupillometry_df[~is_censored]

In [None]:
# reassembled_pupillometry_df.to_csv(f'/Users/jk1/Downloads/pupillometry_data_{timebin_hours}h_timebin.csv', index=False)

Further ideas:
- normalize by first measure (or best measure)

In [None]:
# Number of samples per label (0 = negative, 1 = positive) in the final dataset.
reassembled_pupillometry_df.label.value_counts()

# Plotting

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# NOTE(review): bare display of `metric`, the loop variable left over from the metric loops
# above; looks like a debugging leftover. It only works if one of those loops ran in this kernel.
metric

In [None]:
# Replace the index (which still carries labels from the two stacked frames) with a fresh 0..n-1 range.
reassembled_pupillometry_df = reassembled_pupillometry_df.reset_index(drop=True)

In [None]:
# Make sure the label is stored as an integer before it is used as the plot hue.
reassembled_pupillometry_df['label'] = reassembled_pupillometry_df['label'].astype(int)

In [None]:
# One panel per timebin metric: a split violin plot of the metric's distribution,
# one half per label (0 vs 1).
n_columns = 4
n_rows = int(np.ceil(len(timebin_metrics) / n_columns))

# squeeze=False keeps `axes` two-dimensional even when there is a single row of panels,
# so the [row, column] indexing below is always valid
fig, axes = plt.subplots(n_rows, n_columns, figsize=(20, 60), squeeze=False)

for i, metric in enumerate(timebin_metrics):
    ax = axes[i // n_columns, i % n_columns]
    sns.violinplot(data=reassembled_pupillometry_df, y=metric, hue='label', palette='pastel', split=True, gap=0.1, ax=ax)
    ax.set_title(metric)
    # the metric name is already shown as the panel title, so the y-axis label is left empty
    ax.set_ylabel('')

# hide the panels of the last row that have no metric to show
for unused_ax in axes.ravel()[len(timebin_metrics):]:
    unused_ax.set_visible(False)

In [None]:
# save figure
# fig.savefig(f'/Users/jk1/Downloads/pupillometry_data_{timebin_hours}h_timebin.png', dpi=300)