In [27]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

from model.rnn import GRUDecoder
from model.autoencoder import AutoEncoder
from data_processing.pipeline import encoding_pipeline

In [15]:
def get_device():
    # Check if CUDA is available
    if torch.cuda.is_available():
        # If CUDA is available, select the first CUDA device
        device = torch.device("cuda:0")
        print("Using CUDA device:", torch.cuda.get_device_name(0))
    # Check for MPS availability on supported macOS devices (requires PyTorch 1.12 or newer)
    elif torch.backends.mps.is_available():
        # If MPS is available, use MPS device
        device = torch.device("mps")
        print("Using MPS (Metal Performance Shaders) device")
    else:
        # Fallback to CPU if neither CUDA nor MPS is available
        device = torch.device("cpu")
        print("Using CPU")
    return device
device = get_device()

Using MPS (Metal Performance Shaders) device


# Read the data

Right now the notebook is set to work with fake data. This can be changed once the pipeline works.

The data is stored as a Dict[person_id, Sequences] where Sequences is a Dict[year, survery_wave_response]

In [16]:
# read in data and prepare transformations
data = pd.read_csv('data/other_data/PreFer_fake_data.csv')
targets = pd.read_csv('data/other_data/PreFer_fake_outcome.csv')
codebook = pd.read_csv('data/codebooks/PreFer_codebook.csv')

In [17]:
sequences = encoding_pipeline(data, codebook)

# Train the autoencoder

In [18]:
from model.dataset import PretrainingDataset

pretrain_dataset = PretrainingDataset(sequences)

In [23]:
### Initialization of the Autoencoder 
HIDDEN_DIM = 256
ENCODING_SIZE = 64
BATCH_SIZE = 32
num_epochs_autoencoder = 100
learning_rate_autoencoder = 5e-3

SEQ_LEN = pretrain_dataset.get_seq_len()
vocab_size = pretrain_dataset.get_vocab_size()

train_dataloader = DataLoader(pretrain_dataset, batch_size=BATCH_SIZE, shuffle=True)
autoencoder = AutoEncoder(vocab_size=vocab_size, embedding_size=HIDDEN_DIM, encoding_size=ENCODING_SIZE, sequence_len=SEQ_LEN).to(device)

loss_f = nn.HuberLoss(delta=0.5)
optimizer = optim.RAdam( autoencoder.parameters(), lr = learning_rate_autoencoder, weight_decay=1e-3, decoupled_weight_decay=True)



In [29]:
autoencoder.train()
# device = torch.device("cpu") # mps is not working for Me (Mikkel)
autoencoder.to(device)
loss_metric = []
for epoch in range(num_epochs_autoencoder):
    loss_epoch_metric = []
    for year, seq in train_dataloader:
        optimizer.zero_grad()
        year = year.to(device)
        seq = seq.to(device)

        xx = autoencoder(year, seq)
        loss = loss_f(xx, autoencoder.embedding(year, seq))
        loss_epoch_metric.append(loss.detach().cpu().numpy())
        loss.backward()
        optimizer.step()
    loss_metric.append(np.mean(loss_epoch_metric))
    print(f'epoch {epoch} \t Loss: {loss_metric[-1]:.4g}')

epoch 0 	 Loss: 0.5084
epoch 1 	 Loss: 0.5025
epoch 2 	 Loss: 0.5087
epoch 3 	 Loss: 0.5129
epoch 4 	 Loss: 0.5018
epoch 5 	 Loss: 0.4916


# Train the RNN

First we need to create Dataset class that can hold both the target (stored in a pd.DataFrame) and the sequences.

The sequences will be of dimension 14 x encoding_dimension, because we have 14 years of surveys.

I have created some code for getting the data into the right format, but it might not be useful.

## Regarding masks
Right now the masking is done already in the encoding. I haven't found exactly where Mikkel implemented this.
So for now, assume that nothing is padded, and then we'll figure it out with Mikkel.

In [None]:
# its not everyone we have a target for, so we do restrict the data to 
# the ones with known outcomes

train_person_ids, test_person_ids = train_test_split(targets['nomem_encr'], test_size=0.2, random_state=42)

In [None]:
# structure the data as a Dict[person_id, survey_embedding_sequence] 
# where survey_embedding_sequence is a tensor of size 14 x embedding_dimension
with torch.no_grad():
    rnn_data = {person_id:
                    autoencoder(
                        torch.tensor(
                            [year-2007 for year, _ in wave_responses.items()]
                        ).to(device),
                        torch.tensor(
                            [ wave_response for _, wave_response in wave_responses.items()]
                        ).to(device),
                        encode_only=True,
                    )
                for person_id, wave_responses in sequences.items()
            }

In [None]:
# split data based on the splits made for the target
train_data = {person_id: rnn_data[person_id] for person_id in train_person_ids}
test_data = {person_id: rnn_data[person_id] for person_id in test_person_ids}


In [None]:
from model.dataset import FinetuningDataset
train_dataset = FinetuningDataset(train_data, targets = targets)
test_dataset = FinetuningDataset(test_data, targets = targets)

rnn_batch_size = 64

train_dataloader = DataLoader(train_dataset, batch_size=rnn_batch_size, shuffle=True)
test_dataloader  = DataLoader(test_dataset,  batch_size=rnn_batch_size)

In [None]:
HIDDEN_SIZE = 10

num_epochs_rnn = 10
learning_rate_rnn = 0.001

rnn_model = GRUDecoder(
    input_size=ENCODING_SIZE,
    hidden_size=HIDDEN_SIZE,
    max_seq_len=14
).to(device)

# Define loss function and optimizer for RNN
rnn_loss = torch.nn.BCELoss()
rnn_optimizer = torch.optim.Adam(rnn_model.parameters(), lr=learning_rate_rnn)

# Training loop
rnn_model.train()
for epoch in range(num_epochs_rnn):
    print(epoch)
    running_loss = 0

    for batch in train_dataloader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(torch.float).to(device)

        rnn_optimizer.zero_grad() 

        # Forward pass
        xx = rnn_model(inputs)
        outputs = torch.nn.functional.sigmoid(xx)

        loss = rnn_loss(torch.flatten(outputs), labels)  

        running_loss += loss.item() * inputs.size(0)
        #loss.backward(retain_graph=True)
        loss.backward()
        rnn_optimizer.step()


    # Calculate average loss for the epoch
    epoch_loss = running_loss / len(train_dataloader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs_rnn}, Loss: {epoch_loss:.4f}")
    

The model is going to set all input MASK to None
0
Epoch 1/10, Loss: 1.1433
1
Epoch 2/10, Loss: 0.9728
2
Epoch 3/10, Loss: 0.8092
3
Epoch 4/10, Loss: 0.7336
4
Epoch 5/10, Loss: 0.6012
5
Epoch 6/10, Loss: 0.5827
6
Epoch 7/10, Loss: 0.5493
7
Epoch 8/10, Loss: 0.5346
8
Epoch 9/10, Loss: 0.5265
9
Epoch 10/10, Loss: 0.5300


In [None]:
import torch
from sklearn.metrics import precision_recall_fscore_support

def evaluate_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    predictions, actuals = [], []
    
    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(torch.float).to(device)
            
            # Forward pass
            outputs = model(inputs)
            outputs = torch.nn.functional.sigmoid(outputs)
            
            # Convert outputs to binary predictions
            predicted_labels = (outputs > 0.5).float()  # Threshold predictions
            print(labels)
            print(predicted_labels.flatten())
            print('')
            
            # Store predictions and actual labels
            predictions.append(predicted_labels.flatten())
            actuals.append(labels.flatten())
    
    # Concatenate all the batches
    predictions = torch.cat(predictions)
    actuals = torch.cat(actuals)
    
    # Calculate precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(actuals.cpu().numpy(), predictions.cpu().numpy(), average='binary')
    
    return precision, recall, f1


In [None]:
precision, recall, f1 = evaluate_model(rnn_model, test_dataloader)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

tensor([0., 0., 0., 1., 0., 1.])
tensor([0., 0., 0., 0., 0., 0.])

Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
