In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)

### A) Expected data format for SSAML code.  
Make sure you have a .csv file with the following columns and row formats. The format differs for non-survival-analysis and survival-analysis tasks. Within runner_power.sh you will find a boolean parameter survivalTF to be set (True for survival analysis, False for non-survival analysis), and a boolean parameter peopleTF (True for patient-level analysis and False for event-level analysis). This notebook is therefore a preprocessing guide/tutorial to re-format existing data to make it ready for SSAML algorithm and the runner_power.sh code. The analysis method is not determined here but with the aforementioned parameters in runner_power.sh.

1. 'regular', non-survival analysis model.  
    columns:  
    -- ID: unique patient identifier (integers)  
    -- event: ground truth / label (integers)  
    -- p: model output, event probability  

   rows are data observations (i.e. one row per event/patient)
   
2. survival analysis model.  
    columns:   
    -- ID: unique patient identifier (integers)  @ Daniel correct: or does this somehow have the censorship data? I'm not clear yet where the censorship data is contained.  
    -- C: event observation (i.e. 1 for event observed, 0 for not observed)  
    -- T: time to event  

   rows are data observations (i.e. one row per event/patient)

### B) sample datasets, as presented in the paper

here, we present the format for three distinct tasks, as presented in the paper.

#### B.1) seizure risk prediction ('seizure tracker (ST) data')

# @ Daniel: I don't have those files, double check if expected format is correct.

In [2]:
# c = pd.read_csv(big_file,sep=',',names=['ID','szTF','AI','RMR'])
# uids = pd.unique(c.ID)
# c.rename(columns={'szTF':'event'},inplace=True)
# c.rename(columns={'AI':'p'},inplace=True)
# peopleTF=True
# survivalTF=False

In [3]:
data = pd.DataFrame(columns=['ID', 'event', 'p'])
data['ID'] = np.arange(100)
data['event'] = np.random.randint(0, high=2, size=data.shape[0]) # binary outcome, high excluded
data['p'] = np.random.rand(data.shape[0]) # model output, probability values between 0 and 1

In [4]:
print(f'data shape: {data.shape}')
print(f'events contained: \n{data.event.value_counts()}')
data.head()

data shape: (100, 3)
events contained: 
1    56
0    44
Name: event, dtype: int64


Unnamed: 0,ID,event,p
0,0,0,0.969585
1,1,1,0.775133
2,2,0,0.939499
3,3,0,0.894827
4,4,0,0.5979


In [5]:
data.to_csv('sample_data_st.csv', index=False)

#### B.2) covid hospitalization risk prediciton ('COVA dataset')

In [6]:
data['ID'] = np.arange(100)
data['event'] = np.random.randint(0, high=2, size=data.shape[0]) # binary outcome, high excluded
data['p'] = np.random.rand(data.shape[0]) # model output, probability values between 0 and 1

In [7]:
# COVA datafile
data_raw = pd.read_csv('COVA-FAKE.csv', sep=',')
data = pd.DataFrame()
data['ID'] = np.array(range(data_raw.shape[0]))
event_categories = ['Prob-dead','Prob-ICU-MV','Prob-Hosp']
data['p'] = (data_raw[event_categories[0]] + data_raw[event_categories[1]] + data_raw[event_categories[2]])/100
data['event'] = 0.0 + (data_raw['actual']>0)

In [8]:
print(f'data shape: {data.shape}')
print(f'events contained: \n{data.event.value_counts()}')
data.head()

data shape: (2000, 3)
events contained: 
0.0    1263
1.0     737
Name: event, dtype: int64


Unnamed: 0,ID,p,event
0,0,0.8843,0.0
1,1,0.90456,1.0
2,2,0.87122,0.0
3,3,0.96559,0.0
4,4,0.02583,0.0


In [9]:
data.to_csv('sample_data_cova.csv', index=False)

#### B.3) Brain age - mortality risk prediction (survival analysis)

# @ Daniel: I don't have those files, I think I am missing sth. here

In [10]:
# c = pd.read_csv(big_file,sep=',')
# uids =  uids = np.array(range(c.shape[0]))
# c['ID'] = uids
# peopleTF=True
# survivalTF=True

In [11]:
data = pd.DataFrame(columns=['ID', 'T', 'C'])
data['ID'] = np.arange(100)
data['T'] = np.random.randint(0, high=21, size=data.shape[0]) # random integer values for time to event
data['C'] = np.random.randint(0, 2, size=data.shape[0]) # random binary event observation Yes/No

In [12]:
print(f'data shape: {data.shape}')
print(f'events contained: \n{data.C.value_counts()}')
data.head()

data shape: (100, 3)
events contained: 
0    59
1    41
Name: C, dtype: int64


Unnamed: 0,ID,T,C
0,0,12,0
1,1,19,0
2,2,14,1
3,3,2,1
4,4,7,0


In [13]:
data.to_csv('sample_data_bai_mortality.csv', index=False)