In [5]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.model_selection import train_test_split
import torch.optim as optim
import pandas as pd
import pickle

In [4]:
# Read the DataFrames back from the pickled dictionary.
# NOTE(review): the directory below is an absolute path on one machine;
# adjust it before running the notebook anywhere else.
path_to_read = '/Users/luisescobar/Documents/Thesis/DataSets/Dictionary'
file_name = 'completo1007_(edit)_clotting.pkl'
name_to_read = f'{path_to_read}/{file_name}'

# Unpickling can execute arbitrary code, so only load files from a trusted source.
with open(name_to_read, mode='rb') as pkl_handle:
    loaded_dict = pickle.load(pkl_handle)

In [8]:
# Split data into train and test
#
# Idea for later: it may be better to first separate the final DataFrames into
# clotting and no-clotting groups. With that separation in hand, the train/test
# ratio could be chosen more sensibly.

# Collect the dictionary keys
keys = [*loaded_dict]

# 80% of the keys go to train, the remaining 20% to test
train_keys, test_keys = train_test_split(keys, random_state=42, test_size=0.2)

# Rebuild one dictionary per split from the selected keys
train_dict = dict((key, loaded_dict[key]) for key in train_keys)
test_dict = dict((key, loaded_dict[key]) for key in test_keys)

print("Train Dictionary:", len(train_dict))
print("Test Dictionary:", len(test_dict))

Train Dictionary: 148
Test Dictionary: 37


In [None]:
# Full column list noted for reference:
#   "Date__Heure", "P_Access", "P_Filter", "P_Effluent", "P_Return", "Q_Blood_Pump",
#   "Q_Replacement", "Q_Dialysate", "Q_PBP", "Q_Patient_Fluid_Removal", "DeltaP",
#   "TMP", "TMPa", "trt", "Patient_weight__Kg_", "Set"
#
# Candidates to leave out of training:
#   0   Date__Heure          -> might reveal seasonality, but the data must be transformed first
#   13  trt
#   14  Patient_weight__Kg_  -> could be included in other models to see whether predictions change
#   15  Set
#
# Feature columns actually fed to the model:
columns = [
    "P_Access",
    "P_Filter",
    "P_Effluent",
    "P_Return",
    "Q_Blood_Pump",
    "Q_Replacement",
    "Q_Dialysate",
    "Q_PBP",
    "Q_Patient_Fluid_Removal",
    "DeltaP",
    "TMP",
    "TMPa",
    "Condition_1",
    "Condition_2",
    "Delta_P_ref",
    "TMP_ref",
]

In [None]:
# Step 1: Prepare the data
# Build one (time_steps, n_features) tensor and one scalar label per entry of
# the training split, so the held-out `test_dict` entries are never used to
# fit the model.
sequences = []
label_values = []
lengths = []

for patient_id, df in train_dict.items():
    if len(df) == 0:
        # An empty series has no label to read and cannot be packed for the GRU.
        raise ValueError(f"Entry {patient_id!r} has no rows; cannot build a sequence")

    # Explanatory variables: every feature listed in `columns`, one row per time step
    sequence = torch.tensor(df[columns].values, dtype=torch.float32)

    # Target variable (y): only the first row is read.
    # NOTE(review): this assumes 'Clotting' is constant over the whole series - confirm.
    label_values.append(float(df['Clotting'].values[0]))

    sequences.append(sequence)
    lengths.append(len(sequence))

# One float label per sequence, shape (n_sequences,)
labels = torch.tensor(label_values, dtype=torch.float32)

# Zero-pad every sequence to the longest one: (n_sequences, max_len, n_features)
padded_sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True)

# True (unpadded) lengths, needed later to pack the padded batch
lengths = torch.tensor(lengths)

In [None]:
# Step 2: Define the GRU model
class GRUModel(nn.Module):
    """Single-layer GRU classifier for zero-padded, variable-length sequences.

    The sequence is summarised by the GRU hidden state at its last real
    (unpadded) time step, which is passed through a linear layer and a sigmoid.

    Args:
        input_size: number of features per time step.
        hidden_size: number of GRU hidden units.
        output_size: number of sigmoid outputs per sequence (default 1).
    """

    def __init__(self, input_size, hidden_size, output_size=1):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        """Return per-sequence probabilities of shape (batch, output_size).

        Args:
            x: padded batch of shape (batch, max_len, input_size).
            lengths: true length of each sequence, as a 1-D tensor or a list
                of ints; every length must be at least 1.
        """
        # pack_padded_sequence needs the lengths on the CPU, whatever device x is on
        cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
        packed_input = pack_padded_sequence(
            x, cpu_lengths, batch_first=True, enforce_sorted=False
        )

        # For packed input the GRU's final hidden state is already the state at
        # each sequence's last valid step, returned in the original batch order,
        # so no unpacking or index gathering is required.
        _, final_hidden = self.gru(packed_input)
        last_output = final_hidden[-1]

        return torch.sigmoid(self.fc(last_output))

# Instantiate the model
# Seed the weight initialisation so repeated runs start from the same model
# (same seed value as the train/test split).
torch.manual_seed(42)
input_size = padded_sequences.size(2)  # Number of features per time step (one per entry in `columns`)
hidden_size = 64  # Number of GRU units
model = GRUModel(input_size, hidden_size)

In [None]:
# Step 3: Define loss function and optimizer
# Adam updates every model parameter; binary cross-entropy scores the
# sigmoid probabilities the model returns.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

In [None]:
# Step 4: Training Loop
# Full-batch training: every epoch runs all padded sequences through the model once.
num_epochs = 100

# Training mode never changes inside the loop, so set it once up front
model.train()

for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass
    outputs = model(padded_sequences, lengths)
    # Drop only the trailing output dimension: a bare squeeze() would also
    # remove the batch dimension when there is a single sequence, and the
    # loss would then see mismatched shapes.
    loss = criterion(outputs.squeeze(-1), labels)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    # Report the loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


In [None]:
# Step 5: Test the model (optional)
# Score the held-out `test_dict` split rather than the tensors the model was
# fitted on, so the result reflects unseen series.
test_sequences = []
test_label_values = []
test_lengths = []

for patient_id, df in test_dict.items():
    test_sequences.append(torch.tensor(df[columns].values, dtype=torch.float32))
    # NOTE(review): as in Step 1, assumes 'Clotting' is constant over the series - confirm.
    test_label_values.append(float(df['Clotting'].values[0]))
    test_lengths.append(len(df))

test_padded = nn.utils.rnn.pad_sequence(test_sequences, batch_first=True)
test_lengths = torch.tensor(test_lengths)
test_labels = torch.tensor(test_label_values, dtype=torch.float32)

model.eval()
with torch.no_grad():
    test_outputs = model(test_padded, test_lengths)
    # squeeze(-1) keeps the batch dimension even for a single test sequence
    predictions = (test_outputs.squeeze(-1) > 0.5).float()
    accuracy = (predictions == test_labels).float().mean().item()
    print(f'Predictions: {predictions}')
    print(f'Test accuracy: {accuracy:.4f}')

Problems to address
How will the dictionary be used in Step 1?
Should we normalize the values?
Double-check the step "# Extract the target variable (y)", since it assumes the same label for the entire series.
What does this mean?
