In [1]:
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

from model.rnn import GRUDecoder
from model.autoencoder import AutoEncoder
from data_processing.pipeline import encoding_pipeline

In [2]:
def get_device():
    # Check if CUDA is available
    if torch.cuda.is_available():
        # If CUDA is available, select the first CUDA device
        device = torch.device("cuda:0")
        print("Using CUDA device:", torch.cuda.get_device_name(0))
    # Check for MPS availability on supported macOS devices (requires PyTorch 1.12 or newer)
    elif torch.backends.mps.is_available():
        # If MPS is available, use MPS device
        device = torch.device("mps")
        print("Using MPS (Metal Performance Shaders) device")
    else:
        # Fallback to CPU if neither CUDA nor MPS is available
        device = torch.device("cpu")
        print("Using CPU")
    return device
device = get_device()

Using MPS (Metal Performance Shaders) device


# Read the data

Right now the notebook is set to work with fake data. This can be changed once the pipeline works.

The data is stored as a Dict[person_id, Sequences] where Sequences is a Dict[year, survery_wave_response]

In [3]:
# read in data and prepare transformations
data = pd.read_csv('data/other_data/PreFer_fake_data.csv')
targets = pd.read_csv('data/other_data/PreFer_fake_outcome.csv')
codebook = pd.read_csv('data/codebooks/PreFer_codebook.csv')

In [4]:
sequences = encoding_pipeline(data, codebook)

# Train the autoencoder

In [5]:
from model.dataset import PretrainingDataset

pretrain_dataset = PretrainingDataset(sequences)

In [6]:
### Initialization of the Autoencoder 
HIDDEN_DIM = 512
ENCODING_SIZE = 64
BATCH_SIZE = 64
num_epochs_autoencoder = 10
learning_rate_autoencoder = 0.001

SEQ_LEN = pretrain_dataset.get_seq_len()
vocab_size = pretrain_dataset.get_vocab_size()

train_dataloader = DataLoader(pretrain_dataset, batch_size=BATCH_SIZE, shuffle=True)
autoencoder = AutoEncoder(vocab_size=vocab_size, embedding_size=HIDDEN_DIM, encoding_size=ENCODING_SIZE, sequence_len=SEQ_LEN).to(device)

error = nn.MSELoss()
optimizer = optim.Adam( autoencoder.parameters())



In [7]:
autoencoder.train()
device = torch.device("cpu") # mps is not working for Me (Mikkel)
autoencoder.to(device)
for epoch in range(num_epochs_autoencoder):
    for year, seq in train_dataloader:
        year = year.to(device)
        seq = seq.to(device)
        
        optimizer.zero_grad()
        xx = autoencoder(year, seq)
        loss = nn.functional.mse_loss(xx, autoencoder.embedding(year, seq))
        
        loss.backward()
        
        optimizer.step()
    print(f'epoch {epoch} \t Loss: {loss.item():.4g}')

epoch 0 	 Loss: 2.576
epoch 1 	 Loss: 2.435
epoch 2 	 Loss: 2.312
epoch 3 	 Loss: 2.229
epoch 4 	 Loss: 2.139
epoch 5 	 Loss: 2.053
epoch 6 	 Loss: 1.875


KeyboardInterrupt: 

# Train the RNN

First we need to create Dataset class that can hold both the target (stored in a pd.DataFrame) and the sequences.

The sequences will be of dimension 14 x encoding_dimension, because we have 14 years of surveys.

I have created some code for getting the data into the right format, but it might not be useful.

## Regarding masks
Right now the masking is done already in the encoding. I haven't found exactly where Mikkel implemented this.
So for now, assume that nothing is padded, and then we'll figure it out with Mikkel.

In [46]:
class SequencesWithTarget(Dataset):
    def __init__(self, sequences:dict, target: pd.DataFrame):
        self.sequences = sequences 
        self.target = targets.set_index(keys = 'nomem_encr').squeeze().to_dict()
        self.keys = list(sequences.keys())

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, index):
        person_id = self.keys[index]
        
        target = self.target[person_id]
        sequence = self.sequences[person_id]

        return target, sequence

In [47]:
# its not everyone we have a target for, so we do restrict the data to 
# the ones with known outcomes

train_person_ids, test_person_ids = train_test_split(targets['nomem_encr'], test_size=0.2, random_state=42)

In [65]:
torch.BoolTensor([1,0,1])

tensor([ True, False,  True])

In [48]:
# structure the data as a Dict[person_id, survey_embedding_sequence] 
# where survey_embedding_sequence is a tensor of size 14 x embedding_dimension
rnn_data = {person_id:
                autoencoder.encode(
                    torch.tensor(
                        [ wave_response for _, wave_response in wave_responses.items()]
                    ).to(device)
                )
            for person_id, wave_responses in sequences.items()
           }

In [49]:
# split data based on the splits made for the target
train_data = {person_id: rnn_data[person_id] for person_id in train_person_ids}
test_data = {person_id: rnn_data[person_id] for person_id in test_person_ids}


In [24]:
train_dataset = SequencesWithTarget(train_data, target = targets)
test_dataset = SequencesWithTarget(test_data, target = targets)

rnn_batch_size = 64

train_dataloader = DataLoader(train_dataset, batch_size=rnn_batch_size, shuffle=True)
test_dataloader  = DataLoader(test_dataset,  batch_size=rnn_batch_size)

In [62]:
HIDDEN_SIZE = 10

num_epochs_rnn = 2
learning_rate_rnn = 0.001

rnn_model = GRUDecoder(
    input_size=ENCODING_SIZE,
    hidden_size=HIDDEN_SIZE,
    max_seq_len=14
).to(device)

# assume that all 14 years are observed for everyone
single_mask = torch.BoolTensor([True]*14).to(device) 

# Define loss function and optimizer for RNN
rnn_loss = torch.nn.BCELoss()
rnn_optimizer = torch.optim.Adam(rnn_model.parameters(), lr=learning_rate_rnn)
# Training loop

rnn_model.train()
for epoch in range(num_epochs_rnn):
    print(epoch)
    running_loss = 0

    for batch in train_dataloader:
        labels, inputs = batch
        labels, inputs = labels.to(torch.float).to(device), inputs.to(device)

        rnn_optimizer.zero_grad() 

        # Forward pass
        mask = torch.stack([single_mask]*len(labels) ) #  not correct masking

        xx = rnn_model(inputs, mask)
        outputs = torch.nn.functional.sigmoid(xx)

        loss = rnn_loss(torch.flatten(outputs), labels)  

        #loss.backward(retain_graph=True)
        loss.backward()
        rnn_optimizer.step()

        running_loss += loss.item() * inputs.size(0)

    # Calculate average loss for the epoch
    epoch_loss = running_loss / len(train_dataloader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs_rnn}, Loss: {epoch_loss:.4f}")j
    

0


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [51]:
import torch
from sklearn.metrics import precision_recall_fscore_support

def evaluate_model(model, dataloader, single_mask):
    model.eval()  # Set the model to evaluation mode
    predictions, actuals = [], []
    
    with torch.no_grad():
        for batch in dataloader:
            labels, inputs = batch
            labels, inputs = labels.to(torch.float).to(device), inputs.to(device)
            
            # Forward pass
            mask = torch.stack([single_mask]*len(labels))
            outputs = model(inputs, mask)
            outputs = torch.nn.functional.sigmoid(outputs)
            
            # Convert outputs to binary predictions
            predicted_labels = (outputs > 0.5).float()  # Threshold predictions
            print(labels)
            print(predicted_labels.flatten())
            print('')
            
            # Store predictions and actual labels
            predictions.append(predicted_labels.flatten())
            actuals.append(labels.flatten())
    
    # Concatenate all the batches
    predictions = torch.cat(predictions)
    actuals = torch.cat(actuals)
    
    # Calculate precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(actuals.cpu().numpy(), predictions.cpu().numpy(), average='binary')
    
    return precision, recall, f1


In [52]:
precision, recall, f1 = evaluate_model(rnn_model, test_dataloader, single_mask)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

tensor([0., 0., 0., 1., 0., 1.], device='mps:0')
tensor([0., 0., 0., 0., 0., 0.], device='mps:0')

Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
