In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

from model.rnn import GRUDecoder
from model.autoencoder import AutoEncoder
from data_processing.pipeline import encoding_pipeline

In [2]:
def get_device():
    # Check if CUDA is available
    if torch.cuda.is_available():
        # If CUDA is available, select the first CUDA device
        device = torch.device("cuda:0")
        print("Using CUDA device:", torch.cuda.get_device_name(0))
    # Check for MPS availability on supported macOS devices (requires PyTorch 1.12 or newer)
    elif torch.backends.mps.is_available():
        # If MPS is available, use MPS device
        device = torch.device("mps")
        print("Using MPS (Metal Performance Shaders) device")
    else:
        # Fallback to CPU if neither CUDA nor MPS is available
        device = torch.device("cpu")
        print("Using CPU")
    return device
device = get_device()

Using MPS (Metal Performance Shaders) device


# Read the data

Right now the notebook is set to work with fake data. This can be changed once the pipeline works.

The data is stored as a Dict[person_id, Sequences] where Sequences is a Dict[year, survery_wave_response]

In [3]:
# read in data and prepare transformations
data = pd.read_csv('data/other_data/PreFer_fake_data.csv')
targets = pd.read_csv('data/other_data/PreFer_fake_outcome.csv')
codebook = pd.read_csv('data/codebooks/PreFer_codebook.csv')

In [4]:
sequences = encoding_pipeline(data, codebook)

# Train the autoencoder

In [5]:
from model.dataset import PretrainingDataset

pretrain_dataset = PretrainingDataset(sequences)

In [6]:
### Initialization of the Autoencoder 
HIDDEN_DIM = 256
ENCODING_SIZE = 128
BATCH_SIZE = 16
num_epochs_autoencoder = 100
learning_rate_autoencoder = 1e-3

SEQ_LEN = pretrain_dataset.get_seq_len()
vocab_size = pretrain_dataset.get_vocab_size()

train_dataloader = DataLoader(pretrain_dataset, batch_size=BATCH_SIZE, shuffle=True)
autoencoder = AutoEncoder(vocab_size=vocab_size, embedding_size=HIDDEN_DIM, encoding_size=ENCODING_SIZE, sequence_len=SEQ_LEN).to(device)

loss_f1 = nn.HuberLoss(delta=1.0)
loss_cls = nn.CrossEntropyLoss(label_smoothing=0.1)
loss_cos = nn.CosineEmbeddingLoss()
optimizer = optim.RAdam( autoencoder.parameters(), lr = learning_rate_autoencoder, weight_decay=1e-3, decoupled_weight_decay=True)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = num_epochs_autoencoder, eta_min = 1e-6, last_epoch = -1)

  from .autonotebook import tqdm as notebook_tqdm


### Load the pre-trained Autoencoder

In [None]:
### INSTEAD YOU CAN SALDO LOAD A MODEL
model_path = "weights/autoencoder_10_05.pt"
autoencoder = AutoEncoder(vocab_size=vocab_size, embedding_size=HIDDEN_DIM, encoding_size=ENCODING_SIZE, sequence_len=SEQ_LEN)
autoencoder.load_state_dict(torch.load(model_path, map_location=device))


### (or) Train the autoencoder

In [7]:
autoencoder.train()
# device = torch.device("cpu") # mps is not working for Me (Mikkel)
autoencoder.to(device)
loss_metric = []
for epoch in range(num_epochs_autoencoder):
    loss_epoch_metric = []
    for year, seq in train_dataloader:
        optimizer.zero_grad()
        year = year.to(device)
        seq = seq.to(device)

        x1, x2 = autoencoder(year, seq)
        loss = loss_cls(x2.permute(0,2,1), seq.long()) + loss_cos(x1.reshape(x1.size(1) * x1.size(0), -1 ), 
                                                                autoencoder.embedding(year, seq).view(x1.size(1) * x1.size(0), -1), 
                                                                torch.ones(seq.size(0) * seq.size(1)).to(device))
         #+ 0.7 * loss_f1(x1, autoencoder.embedding(year, seq)) +  
        loss_epoch_metric.append(loss.detach().cpu().numpy())
        loss.backward()
        optimizer.step()
    ## After epoch end
    scheduler.step()
    loss_metric.append(np.mean(loss_epoch_metric))
    print(f'epoch {epoch} \t Loss: {loss_metric[-1]:.4g} and LR: {scheduler.get_last_lr()[0]:.5g}')

epoch 0 	 Loss: 11.86 and LR: 0.00099975
epoch 1 	 Loss: 5.785 and LR: 0.00099901
epoch 2 	 Loss: 4.483 and LR: 0.00099778
epoch 3 	 Loss: 3.981 and LR: 0.00099606
epoch 4 	 Loss: 3.687 and LR: 0.00099385
epoch 5 	 Loss: 3.54 and LR: 0.00099115
epoch 6 	 Loss: 3.438 and LR: 0.00098797
epoch 7 	 Loss: 3.363 and LR: 0.00098431
epoch 8 	 Loss: 3.302 and LR: 0.00098017
epoch 9 	 Loss: 3.252 and LR: 0.00097555
epoch 10 	 Loss: 3.211 and LR: 0.00097047
epoch 11 	 Loss: 3.165 and LR: 0.00096492
epoch 12 	 Loss: 3.108 and LR: 0.00095892
epoch 13 	 Loss: 3.081 and LR: 0.00095246
epoch 14 	 Loss: 3.038 and LR: 0.00094556
epoch 15 	 Loss: 3.009 and LR: 0.00093822
epoch 16 	 Loss: 2.983 and LR: 0.00093044
epoch 17 	 Loss: 2.954 and LR: 0.00092224


#### Save the weights of the pretrained autoencoder

In [7]:
torch.save(autoencoder.state_dict(), 'weights/autoencoder_10_05.pt')

### Explroe the embedding space

In [8]:
import umap
import matplotlib.pyplot as plt
w = autoencoder.embedding.answer_embedding.weight.detach().cpu().numpy()
projector = umap.UMAP(n_components=2)
wp = projector.fit_transform(w)
plt.scatter(wp[:,0], wp[:,1])


# Train the RNN

First we need to create Dataset class that can hold both the target (stored in a pd.DataFrame) and the sequences.

The sequences will be of dimension 14 x encoding_dimension, because we have 14 years of surveys.

I have created some code for getting the data into the right format, but it might not be useful.

## Regarding masks
Right now the masking is done already in the encoding. I haven't found exactly where Mikkel implemented this.
So for now, assume that nothing is padded, and then we'll figure it out with Mikkel.

In [7]:
# its not everyone we have a target for, so we do restrict the data to 
# the ones with known outcomes

train_person_ids, test_person_ids = train_test_split(targets['nomem_encr'], test_size=0.2, random_state=42)

In [8]:
# structure the data as a Dict[person_id, survey_embedding_sequence] 
# where survey_embedding_sequence is a tensor of size 14 x embedding_dimension
#rnn_data = {person_id:
#                    autoencoder(
#                        torch.tensor(
#                            [year-2007 for year, _ in wave_responses.items()]
#                        ).to(device),
##                        torch.tensor(
 #                           [ wave_response for _, wave_response in wave_responses.items()]
##                        ).to(device),
# #                       encode_only=True,
#                    )
#                for person_id, wave_responses in sequences.items()
#            }

In [13]:
rnn_data = {person_id: (
        torch.tensor([year-2007 for year, _ in wave_responses.items()]).to(device),
        torch.tensor([ wave_response for _, wave_response in wave_responses.items()]).to(device)
        )
        for person_id, wave_responses in sequences.items()
}

In [14]:
# split data based on the splits made for the target
train_data = {person_id: rnn_data[person_id] for person_id in train_person_ids}
test_data = {person_id: rnn_data[person_id] for person_id in test_person_ids}


In [16]:
from model.dataset import FinetuningDataset
train_dataset = FinetuningDataset(train_data, targets = targets)
test_dataset = FinetuningDataset(test_data, targets = targets)

rnn_batch_size = 16

train_dataloader = DataLoader(train_dataset, batch_size=rnn_batch_size, shuffle=True)
test_dataloader  = DataLoader(test_dataset,  batch_size=rnn_batch_size)

In [83]:
# ft - fine-tuning

HIDDEN_SIZE = 24

num_epochs_ft = 40
learning_rate_ft = 5e-4

rnn_model = GRUDecoder(
    input_size=ENCODING_SIZE,
    hidden_size=HIDDEN_SIZE,
    max_seq_len=14
).to(device)

# Define loss function and optimizer for RNN
ft_loss = torch.nn.BCELoss()
ft_optimizer = torch.optim.RAdam(list(rnn_model.parameters()) +list(autoencoder.parameters()) , lr=learning_rate_ft, weight_decay=1e-3, decoupled_weight_decay=True)
ft_scheduler = optim.lr_scheduler.CosineAnnealingLR(ft_optimizer, T_max = num_epochs_ft, eta_min = 1e-6, last_epoch = -1)

# Training loop
rnn_model.train()
autoencoder.train()
print("Ready!")

The model is going to set all input MASK to None
Ready!


In [84]:
loss_per_epoch = []
for epoch in range(num_epochs_ft):
    print(epoch)
    loss_per_step = []
    for batch in train_dataloader:
        ft_optimizer.zero_grad() 
        inputs, labels = batch
        labels = labels.to(torch.float).to(device)

        input_year, input_seq = inputs
        bs, ss = labels.size(0), 14
        input_year = input_year.reshape(-1).to(device)
        input_seq = input_seq.reshape(bs * ss, -1).to(device)

        survey_embeddings = autoencoder(input_year, input_seq, encode_only = True).view(bs,ss, -1)



        # Forward pass
        xx = rnn_model(survey_embeddings)
        outputs = torch.nn.functional.sigmoid(xx)

        loss = ft_loss(torch.flatten(outputs), labels)  
        loss_per_step.append(loss.detach().cpu().numpy())

        #loss.backward(retain_graph=True)
        loss.backward()
        ft_optimizer.step()
    # On epoch end
    loss_per_epoch.append(np.mean(loss_per_step))
    ft_scheduler.step()

    print(f"Epoch {epoch+1}/{num_epochs_ft}, Loss: {loss_per_epoch[-1]:.4f}")
    

0
Epoch 1/40, Loss: 0.7528
1
Epoch 2/40, Loss: 1.7066
2
Epoch 3/40, Loss: 1.3687
3
Epoch 4/40, Loss: 1.3773
4
Epoch 5/40, Loss: 1.4083
5


KeyboardInterrupt: 

In [29]:
import torch
from sklearn.metrics import precision_recall_fscore_support

def evaluate_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    predictions, actuals = [], []
    
    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(torch.float).to(device)
            
            # Forward pass
            outputs = model(inputs)
            outputs = torch.nn.functional.sigmoid(outputs)
            
            # Convert outputs to binary predictions
            predicted_labels = (outputs > 0.5).float()  # Threshold predictions
            print(labels)
            print(predicted_labels.flatten())
            print('')
            
            # Store predictions and actual labels
            predictions.append(predicted_labels.flatten())
            actuals.append(labels.flatten())
    
    # Concatenate all the batches
    predictions = torch.cat(predictions)
    actuals = torch.cat(actuals)
    
    # Calculate precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(actuals.cpu().numpy(), predictions.cpu().numpy(), average='binary')
    
    return precision, recall, f1


In [30]:
precision, recall, f1 = evaluate_model(rnn_model, test_dataloader)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

tensor([0., 0., 0., 1., 0., 1.], device='mps:0')
tensor([0., 0., 0., 0., 0., 0.], device='mps:0')

Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
