In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

%matplotlib notebook
import matplotlib.pyplot as plt

<h1> Load Dataset </h1>

In [2]:
## CSV from: https://www.kaggle.com/code/salikhussaini49/read-data-from-directory
df = pd.read_csv('data/dataset.csv')
df.drop(['Unnamed: 0', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets'], axis=1, inplace=True)

In [3]:
df.head()

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,Glucose,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
0,0,,,,,,,,,68.54,0,,,-0.02,1,0,17072
1,1,65.0,100.0,,,72.0,,16.5,,68.54,0,,,-0.02,2,0,17072
2,2,78.0,100.0,,,42.5,,,,68.54,0,,,-0.02,3,0,17072
3,3,73.0,100.0,,,,,17.0,,68.54,0,,,-0.02,4,0,17072
4,4,70.0,100.0,,129.0,74.0,69.0,14.0,161.0,68.54,0,,,-0.02,5,0,17072


In [4]:
df.describe()

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,Glucose,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
count,1552210.0,1398811.0,1349474.0,525226.0,1325945.0,1358940.0,1065656.0,1313875.0,265516.0,1552210.0,1552210.0,940250.0,940250.0,1552202.0,1552210.0,1552210.0,1552210.0
mean,25.49274,84.58144,97.19395,36.977228,123.7505,82.4001,63.83056,18.7265,136.932283,62.00947,0.559269,0.496571,0.503429,-56.12512,26.99499,0.01798468,59201.48
std,28.88256,17.32524,2.936924,0.770014,23.23156,16.34175,13.95601,5.098194,51.310728,16.38622,0.4964749,0.499989,0.499989,162.2569,29.00542,0.1328956,50248.19
min,0.0,20.0,20.0,20.9,20.0,20.0,20.0,1.0,10.0,14.0,0.0,0.0,0.0,-5366.86,1.0,0.0,1.0
25%,9.0,72.0,96.0,36.5,107.0,71.0,54.0,15.0,106.0,51.68,0.0,0.0,0.0,-47.05,11.0,0.0,9990.0
50%,19.0,83.5,98.0,37.0,121.0,80.0,62.0,18.0,127.0,64.0,1.0,0.0,1.0,-6.03,21.0,0.0,19965.0
75%,33.0,95.5,99.5,37.5,138.0,92.0,72.0,21.5,153.0,74.0,1.0,1.0,1.0,-0.04,34.0,0.0,109878.0
max,335.0,280.0,100.0,50.0,300.0,300.0,300.0,100.0,988.0,100.0,1.0,1.0,1.0,23.99,336.0,1.0,120000.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1552210 entries, 0 to 1552209
Data columns (total 17 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Hour         1552210 non-null  int64  
 1   HR           1398811 non-null  float64
 2   O2Sat        1349474 non-null  float64
 3   Temp         525226 non-null   float64
 4   SBP          1325945 non-null  float64
 5   MAP          1358940 non-null  float64
 6   DBP          1065656 non-null  float64
 7   Resp         1313875 non-null  float64
 8   Glucose      265516 non-null   float64
 9   Age          1552210 non-null  float64
 10  Gender       1552210 non-null  int64  
 11  Unit1        940250 non-null   float64
 12  Unit2        940250 non-null   float64
 13  HospAdmTime  1552202 non-null  float64
 14  ICULOS       1552210 non-null  int64  
 15  SepsisLabel  1552210 non-null  int64  
 16  Patient_ID   1552210 non-null  int64  
dtypes: float64(12), int64(5)
memory usage: 201.3 M

<h1> Pre-processing </h1>

<h3> Replace missing values </h3>

In [6]:
"""Méthode qui permet de fill les NaN avec une méthode qui s'appel le ForwardFill
et qui fait aussi du Backward fill"""

def impute_missing_vals(df, attributes):
    df_clean = df.copy()
    for att in attributes:
        if df_clean[att].isnull().sum() == len(df_clean):
            df_clean[att] = df_clean[att].fillna(0) # On remplie les cases par des 0
        elif df_clean[att].isnull().sum() == len(df_clean) - 1:
            df_clean[att] = df_clean[att].ffill().bfill() # On remplie les cases à l'aide d'un Forward Fill et d'un Backward Fill
        else:
            df_clean[att] = df_clean[att].interpolate(method='nearest', limit_direction='both') # On utilise une méthode d'interpolation pour remplir avec des valeurs non nulles.
            df_clean[att] = df_clean[att].ffill().bfill()
    
    return df_clean

In [7]:
df = impute_missing_vals(df, df.columns)

<h3> Standardization </h3>

In [8]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
scaled_data = min_max_scaler.fit_transform(df.loc[:, df.columns != 'Patient_ID'].to_numpy())
df.loc[:, df.columns != 'Patient_ID'] = scaled_data

In [9]:
df.head()

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,Glucose,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
0,0.0,0.173077,1.0,0.51134,0.389286,0.185714,0.175,0.156566,0.154397,0.634186,0.0,1.0,0.0,0.995546,0.0,0.0,17072
1,0.002985,0.173077,1.0,0.51134,0.389286,0.185714,0.175,0.156566,0.154397,0.634186,0.0,1.0,0.0,0.995546,0.002985,0.0,17072
2,0.00597,0.223077,1.0,0.51134,0.389286,0.080357,0.175,0.156566,0.154397,0.634186,0.0,1.0,0.0,0.995546,0.00597,0.0,17072
3,0.008955,0.203846,1.0,0.51134,0.389286,0.080357,0.175,0.161616,0.154397,0.634186,0.0,1.0,0.0,0.995546,0.008955,0.0,17072
4,0.01194,0.192308,1.0,0.51134,0.389286,0.192857,0.175,0.131313,0.154397,0.634186,0.0,1.0,0.0,0.995546,0.01194,0.0,17072


In [10]:
df.describe()

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,Glucose,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
count,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0,1552210.0
mean,0.07609774,0.24766,0.9644136,0.5485283,0.3705302,0.2239183,0.1558019,0.17857,0.1252508,0.5582496,0.559269,0.49952,0.50048,0.9851386,0.07759699,0.01798468,59201.48
std,0.08621659,0.0667358,0.03863969,0.02491878,0.08290813,0.05851399,0.05019481,0.05185881,0.05000579,0.1905374,0.4964749,0.4999999,0.4999999,0.03009855,0.08658334,0.1328956,50248.19
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.02686567,0.2,0.95,0.532646,0.3107143,0.1821429,0.1214286,0.1464646,0.09509202,0.4381395,0.0,0.0,0.0,0.9868221,0.02985075,0.0,9990.0
50%,0.05671642,0.2423077,0.975,0.5474227,0.3607143,0.2178571,0.15,0.1717172,0.1145194,0.5813953,1.0,0.0,1.0,0.9944313,0.05970149,0.0,19965.0
75%,0.09850746,0.2884615,0.9875,0.5635739,0.4214286,0.2571429,0.1857143,0.2020202,0.1411043,0.6976744,1.0,1.0,1.0,0.9955424,0.09850746,0.0,109878.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,120000.0


<h3> Split dataset per patient </h3>

In [11]:
data = []
label = []
window_size = 6

for patientId in df['Patient_ID'].unique():
    tmp_data = df[df['Patient_ID'] == patientId]
    if(len(tmp_data) >= window_size):
        data.append(tmp_data.drop(['Hour', 'Patient_ID', 'SepsisLabel'], axis=1).to_numpy())
        label.append(tmp_data['SepsisLabel'].to_numpy())

In [31]:
data

[array([[0.17307692, 1.        , 0.51134021, 0.38928571, 0.18571429,
         0.175     , 0.15656566, 0.15439673, 0.63418605, 0.        ,
         1.        , 0.        , 0.99554616, 0.        ],
        [0.17307692, 1.        , 0.51134021, 0.38928571, 0.18571429,
         0.175     , 0.15656566, 0.15439673, 0.63418605, 0.        ,
         1.        , 0.        , 0.99554616, 0.00298507],
        [0.22307692, 1.        , 0.51134021, 0.38928571, 0.08035714,
         0.175     , 0.15656566, 0.15439673, 0.63418605, 0.        ,
         1.        , 0.        , 0.99554616, 0.00597015],
        [0.20384615, 1.        , 0.51134021, 0.38928571, 0.08035714,
         0.175     , 0.16161616, 0.15439673, 0.63418605, 0.        ,
         1.        , 0.        , 0.99554616, 0.00895522],
        [0.19230769, 1.        , 0.51134021, 0.38928571, 0.19285714,
         0.175     , 0.13131313, 0.15439673, 0.63418605, 0.        ,
         1.        , 0.        , 0.99554616, 0.0119403 ],
        [0.16153846,

In [58]:
# One patient per batch
train_loader = []

for i in range(len(data)):
    patient_data = data[i]
    labels = label[i]
    X_data = []
    Y_data = []
    
    for j in range(len(patient_data) - (window_size - 1)):
        X_data.append(patient_data[j:(j + window_size)])
        Y_data.append([labels[(j + window_size - 1)]])
    
    train_loader.append([torch.Tensor(X_data), torch.Tensor(Y_data)])

<h1> TimeSeriesDataset </h1>

In [59]:
# OLD METHOD (BATCH WITH MULTIPLE PATIENT)
'''
from torch.utils.data import TensorDataset, DataLoader

tensor_x = torch.Tensor(X_data) # transform to torch tensor
tensor_y = torch.Tensor(Y_data)

train_dataset = TensorDataset(tensor_x, tensor_y) # create your datset
train_loader = DataLoader(train_dataset) # create your dataloader, on can add the batch size here
'''

'\nfrom torch.utils.data import TensorDataset, DataLoader\n\ntensor_x = torch.Tensor(X_data) # transform to torch tensor\ntensor_y = torch.Tensor(Y_data)\n\ntrain_dataset = TensorDataset(tensor_x, tensor_y) # create your datset\ntrain_loader = DataLoader(train_dataset) # create your dataloader, on can add the batch size here\n'

<h1> Transfomer </h1>

In [60]:
# SOURCE: https://github.com/LiamMaclean216/Pytorch-Transfomer/blob/master/Transformer.ipynb
from utils import *
from Network import *

#hyperparams
enc_seq_len = 6 # 6 # length of input given to encoder. Can have any integer value.
dec_seq_len = 6 # 2 # length of input given to decoder. Can have any integer value.
output_sequence_length = 1 # 1 # Length of the target sequence, i.e. how many time steps should your forecast cover

input_size = 14 # Multivariate forecasting.
dim_val = 10 # This can be any value divisible by n_heads. 512 is used in the original transformer paper.
dim_attn = 5

lr = 0.002
epochs = 20 # 20

n_heads = 4 # The number of attention heads (aka parallel attention layers). dim_val must be divisible by this number

n_decoder_layers = 1
n_encoder_layers = 3

#batch_size = 256

#init network and optimizer
t = Transformer(dim_val, dim_attn, input_size, dec_seq_len, output_sequence_length, n_decoder_layers, n_encoder_layers, n_heads)
optimizer = torch.optim.Adam(t.parameters(), lr=lr)

In [62]:
#keep track of loss for graph
losses = []

#build live matplotlib fig
fig = plt.figure(figsize=(8,6))

ax = fig.add_subplot(111)
plt.ion()

fig.show()
fig.canvas.draw()

    
for e in range(epochs):
    print("Starting epoch: " + str(e))
    out = []
    
    #for b in range(-10- enc_seq_len, 10 - enc_seq_len):
    for b in train_loader:
        optimizer.zero_grad()
        X, Y = b
        
        #Forward pass and calculate loss
        net_out = t(X)
        #print(net_out.shape,Y.shape)
        
        print(Y)
        
        loss = torch.mean((net_out - Y) ** 2)

        #backwards pass
        loss.backward()
        optimizer.step()
        
        #Track losses and draw rgaph
        out.append([net_out.detach().numpy(), Y])
        losses.append(loss.detach().numpy())

        ax.clear()
        ax.plot(losses)
        ax.set_title("Mean Squared Error")
        fig.canvas.draw()

<IPython.core.display.Javascript object>

Starting epoch: 0
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],


tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])


KeyboardInterrupt: 