In [1]:
import pandas as pd
import numpy as np
import os
np.random.seed(42)

### A) Expected data format for SSAML code.  
Make sure you have a .csv file with the following columns and row formats. The format differs for non-survival-analysis and survival-analysis tasks. Within runner_power.sh you will find a boolean parameter survivalTF to be set (True for survival analysis, False for non-survival analysis), and a boolean parameter peopleTF (True for patient-level analysis and False for event-level analysis). This notebook is therefore a preprocessing guide/tutorial to re-format existing data to make it ready for SSAML algorithm and the runner_power.sh code. The analysis method is not determined here but with the aforementioned parameters in runner_power.sh.

1. 'regular', non-survival analysis model.  
    columns:  
    -- ID: unique patient identifier (integers)  
    -- event: ground truth / label (integers)  
    -- p: model output, event probability  

   rows are data observations (i.e. one row per event/patient)
   
2. survival analysis model.  
    columns:   
    -- ID: unique patient identifier (integers)  
    -- C: censhorhip information (i.e. 1 for censored, 0 for not censored)  
    -- z is the z-score value, a covariate for Cox proportional hazard.
    -- T: time to event  

   rows are data observations (i.e. one row per event/patient)

### B) sample datasets, as presented in the paper

here, we present the format for three distinct tasks, as presented in the paper.

#### B.1) seizure risk prediction ('seizure tracker (ST) data')

In [2]:
# c = pd.read_csv(big_file,sep=',',names=['ID','szTF','AI','RMR'])
# uids = pd.unique(c.ID)
# c.rename(columns={'szTF':'event'},inplace=True)
# c.rename(columns={'AI':'p'},inplace=True)
# peopleTF=True
# survivalTF=False

In [3]:
data = pd.DataFrame(columns=['ID', 'event', 'p'])
data['ID'] = np.arange(100)
data['event'] = np.random.randint(0, high=2, size=data.shape[0]) # binary outcome, high excluded
data['p'] = np.random.rand(data.shape[0]) # model output, probability values between 0 and 1

In [4]:
print(f'data shape: {data.shape}')
print(f'events contained: \n{data.event.value_counts()}')
data.head()

data shape: (100, 3)
events contained: 
1    56
0    44
Name: event, dtype: int64


Unnamed: 0,ID,event,p
0,0,0,0.969585
1,1,1,0.775133
2,2,0,0.939499
3,3,0,0.894827
4,4,0,0.5979


In [5]:
data.to_csv('sample_data_st.csv', index=False)

#### B.2) covid hospitalization risk prediciton ('COVA dataset')

In [6]:
data['ID'] = np.arange(100)
data['event'] = np.random.randint(0, high=2, size=data.shape[0]) # binary outcome, high excluded
data['p'] = np.random.rand(data.shape[0]) # model output, probability values between 0 and 1

In [8]:
# COVA datafile
COVAfile = 'COVA-FAKE.csv'
# First make fake COVA data
how_many = 3000
COVA = pd.DataFrame(columns=('actual','CoVAScore','Prob-none','Prob-Hosp','Prob-ICU-MV','Prob-dead'))
isTrue = np.random.randint(low=0,high=2,size=how_many) # random number 0 or 1
COVA['actual'] = isTrue*np.random.randint(low=0,high=4,size=how_many)    # a random integer from 0 to 3, nonzero if isTrue is 1
COVA['CoVAScore'] = np.random.random(how_many)*4    # a random floating number 0 to 4... we don't use this column anyway
#goodPred = (np.random.random(how_many)<.9)*1.0 # 90% good preds
goodPred = np.ones(how_many)
noise = np.random.random(how_many)*.3 # a little bit of noise
noisyTrue = (isTrue==1)*(1-noise) + (isTrue==0)*noise
thePred = noisyTrue*(goodPred==1) + np.random.random(how_many)*(goodPred==0)
prob =  (goodPred==1)*(1-noise) + (goodPred==0)*noise
COVA['Prob-Hosp'] = prob*100
COVA['Prob-none'] = np.zeros(how_many)
COVA['Prob-ICU-MV'] = np.zeros(how_many)
COVA['Prob-dead'] = np.zeros(how_many)
COVA.to_csv(COVAfile,index=False,float_format='%.3f')


In [9]:

data_raw = pd.read_csv('COVA-FAKE.csv', sep=',')
data = pd.DataFrame()
data['ID'] = np.array(range(data_raw.shape[0]))
event_categories = ['Prob-dead','Prob-ICU-MV','Prob-Hosp']
data['p'] = (data_raw[event_categories[0]] + data_raw[event_categories[1]] + data_raw[event_categories[2]])/100
data['event'] = 0.0 + (data_raw['actual']>0)

In [10]:
print(f'data shape: {data.shape}')
print(f'events contained: \n{data.event.value_counts()}')
data.head()

data shape: (3000, 3)
events contained: 
0.0    1818
1.0    1182
Name: event, dtype: int64


Unnamed: 0,ID,p,event
0,0,0.87711,0.0
1,1,0.87459,0.0
2,2,0.90634,1.0
3,3,0.75444,0.0
4,4,0.96893,0.0


In [7]:
data.to_csv('sample_data_cova.csv', index=False)

#### B.3) Brain age - mortality risk prediction (survival analysis)

This database file has the following columns: 'z','T','C', reflecting a z score (output of ML), T=time, and C=censored yes=1, no=0
The ID numbers were not supplied, so row number can be used to produce a sequential ID number here in preprocessing.

In [10]:
# c = pd.read_csv(big_file,sep=',')
# uids =  uids = np.array(range(c.shape[0]))
# c['ID'] = uids
# peopleTF=True
# survivalTF=True

In [11]:
data = pd.DataFrame(columns=['ID','z', 'T', 'C'])
data['ID'] = np.arange(100)
data['T'] = np.random.randint(0, high=21, size=data.shape[0]) # random integer values for time to event
data['C'] = np.random.randint(0, 2, size=data.shape[0]) # random binary censorship information Yes/No
data['z'] = np.random.normal(loc=0, scale=1, size=data.shape[0]) # random z-scored confounding variable.


In [12]:
print(f'data shape: {data.shape}')
print(f'events contained: \n{data.C.value_counts()}')
data.head()

data shape: (100, 4)
events contained: 
0    59
1    41
Name: C, dtype: int64


Unnamed: 0,ID,T,C,z
0,0,12,0,0.055572
1,1,19,0,0.060233
2,2,14,1,-1.439462
3,3,2,1,-0.383488
4,4,7,0,0.886202


In [13]:
data.to_csv('sample_data_bai_mortality.csv', index=False)