In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm

In [13]:
def extract_datetime_features(df, column):
        df[column] = pd.to_datetime(df[column], errors='coerce')
        df[column + '_year'] = df[column].dt.year
        df[column + '_month'] = df[column].dt.month
        df[column + '_day'] = df[column].dt.day
        df[column + '_hour'] = df[column].dt.hour
        df[column + '_weekday'] = df[column].dt.weekday
        return df

def label_preprocessing(df):
    df['datetime'] = pd.to_datetime(df['measurement_datetime'])
    df.drop(columns=['measurement_datetime'], inplace=True)   
    
    df.dropna(inplace=True)
    #df = extract_datetime_features(df, 'datetime')
    return df

train_labels = pd.read_csv('training_data/SepsisLabel_train.csv')
test_labels = pd.read_csv('testing_data/SepsisLabel_test.csv')

train_labels = label_preprocessing(train_labels)
test_labels = label_preprocessing(test_labels)

print(train_labels.shape, test_labels.shape)
train_labels.head()

(331637, 3) (130483, 2)


Unnamed: 0,person_id,SepsisLabel,datetime
0,274096387,0,2024-12-03 20:00:00
1,1719359031,0,2024-04-20 09:00:00
2,2024544816,0,2021-07-14 07:00:00
3,213710896,0,2022-05-24 07:00:00
4,1335786468,0,2024-08-25 22:00:00


In [14]:
dev_train = pd.read_csv('./training_data/devices_train.csv')
dev_test = pd.read_csv('./testing_data/devices_test.csv')

dev_train.drop(columns='visit_occurrence_id', inplace = True)
dev_test.drop(columns='visit_occurrence_id', inplace = True)

dev_train['datetime'] = pd.to_datetime(dev_train['device_datetime_hourly']) 
dev_test['datetime'] = pd.to_datetime(dev_test['device_datetime_hourly']) 

dev_train.drop(columns='device_datetime_hourly', inplace = True)
dev_test.drop(columns='device_datetime_hourly', inplace = True)

print(dev_train.shape, dev_test.shape)
dev_train.head()

(750878, 3) (320919, 3)


Unnamed: 0,person_id,device,datetime
0,1553934216,Arterial blood pressure catheter,2024-10-17 10:00:00
1,1553934216,Arterial blood pressure catheter,2024-10-17 11:00:00
2,1553934216,Arterial blood pressure catheter,2024-10-17 12:00:00
3,1553934216,Arterial blood pressure catheter,2024-10-17 13:00:00
4,1553934216,Arterial blood pressure catheter,2024-10-17 14:00:00


In [15]:
dev_train['device'].value_counts()

device
Urinary catheter                      198542
Central venous catheter               188955
Arterial blood pressure catheter      173702
Nasogastric/orogastric tube stylet    169397
Endotracheal tube                      20282
Name: count, dtype: int64

In [16]:
train_labels = train_labels.sort_values(['person_id', 'datetime'])
dev_train = dev_train.sort_values(['person_id', 'datetime'])
time_window = pd.Timedelta(days = 7)

train_tuples = []

for idx, (pid, lbl, dt) in tqdm(train_labels.iterrows(), total = train_labels.shape[0]):
    dev = dev_train[(dev_train['person_id'] == pid) & (dev_train['datetime'] >= dt - time_window) & (dev_train['datetime'] <= dt)]
    tup = (pid, dt, ", ".join(dev['device'].dropna()))
    train_tuples.append(tup)

100%|██████████| 331637/331637 [37:28<00:00, 147.47it/s]


In [17]:
train_df = pd.DataFrame(train_tuples, columns=['person_id', 'datetime', 'aggregated_devices'])
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(','), analyzer = 'word', max_features = 5)
device_vecs_train = tfidf.fit_transform(train_df['aggregated_devices'])
device_vec_df = pd.DataFrame(device_vecs_train.toarray(), columns=tfidf.get_feature_names_out())
train_df = pd.concat([train_df[['person_id', 'datetime']], device_vec_df.reset_index(drop=True)], axis = 1)
train_df.to_csv('device_train.csv', index = False)

del train_df
del train_tuples
del device_vecs_train



In [18]:
test_labels = test_labels.sort_values(['person_id', 'datetime'])
dev_test = dev_test.sort_values(['person_id', 'datetime'])
time_window = pd.Timedelta(days = 7)

test_tuples = []

for idx, (pid, dt) in tqdm(test_labels.iterrows(), total = test_labels.shape[0]):
    dev = dev_test[(dev_test['person_id'] == pid) & (dev_test['datetime'] >= dt - time_window) & (dev_test['datetime'] <= dt)]
    tup = (pid, dt, ", ".join(dev['device'].dropna()))
    test_tuples.append(tup)

100%|██████████| 130483/130483 [03:13<00:00, 673.51it/s]


In [19]:
test_df = pd.DataFrame(test_tuples, columns=['person_id', 'datetime', 'aggregated_devices'])
device_vecs_test = tfidf.transform(test_df['aggregated_devices'])
device_vec_df = pd.DataFrame(device_vecs_test.toarray(), columns=tfidf.get_feature_names_out())
test_df = pd.concat([test_df[['person_id', 'datetime']], device_vec_df.reset_index(drop=True)], axis = 1)
test_df.to_csv('device_test.csv', index = False)

In [20]:
print(test_df.shape)
test_df.head()

(130483, 7)


Unnamed: 0,person_id,datetime,arterial blood pressure catheter,central venous catheter,endotracheal tube,nasogastric/orogastric tube stylet,urinary catheter
0,3858662,2019-11-29 01:00:00,0.0,0.0,0.0,0.0,0.0
1,3858662,2019-11-29 03:00:00,0.0,0.0,0.0,0.0,0.0
2,3858662,2019-11-29 05:00:00,0.0,0.0,0.0,0.0,0.0
3,3858662,2019-11-29 06:00:00,0.0,0.0,0.0,0.0,0.0
4,3858662,2019-11-29 07:00:00,0.0,0.0,0.0,0.0,0.0


In [21]:
test_df.describe()

Unnamed: 0,person_id,datetime,arterial blood pressure catheter,central venous catheter,endotracheal tube,nasogastric/orogastric tube stylet,urinary catheter
count,130483.0,130483,130483.0,130483.0,130483.0,130483.0,130483.0
mean,1088774000.0,2022-08-30 09:45:00.158641408,0.154935,0.086977,0.024061,0.11018,0.214659
min,3858662.0,2019-04-16 14:00:00,0.0,0.0,0.0,0.0,0.0
25%,528693800.0,2021-06-22 14:00:00,0.0,0.0,0.0,0.0,0.0
50%,1112701000.0,2022-11-10 20:00:00,0.0,0.0,0.0,0.0,0.0
75%,1657864000.0,2024-02-09 09:30:00,0.0,0.0,0.0,0.0,0.075065
max,2139901000.0,2025-01-10 10:00:00,1.0,1.0,1.0,1.0,1.0
std,625821700.0,,0.353391,0.271146,0.130816,0.301638,0.393045
