In [1]:
# Data packages
import pandas as pd 
import os

import pandas as pd
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

from model.rnn import GRUDecoder
from model.autoencoder import AutoEncoder, SimpleAutoEncoder
from model.layers import ConvEncoderLayer, ConvDecoderLayer, Norm
from data_processing.pipeline import encoding_pipeline, get_generic_name

import torch.nn as nn
import torch.nn.functional as F
from torch_frame.nn.conv import TabTransformerConv, ExcelFormerConv
from torch_frame.nn.decoder import ExcelFormerDecoder
from model.embeddings import SurveyEmbeddings

In [2]:
def get_device():
    # Check if CUDA is available
    if torch.cuda.is_available():
        # If CUDA is available, select the first CUDA device
        device = torch.device("cuda:0")
        print("Using CUDA device:", torch.cuda.get_device_name(0))
    # Check for MPS availability on supported macOS devices (requires PyTorch 1.12 or newer)
    elif torch.backends.mps.is_available():
        # If MPS is available, use MPS device
        device = torch.device("mps")
        print("Using MPS (Metal Performance Shaders) device")
    else:
        # Fallback to CPU if neither CUDA nor MPS is available
        device = torch.device("cpu")
        print("Using CPU")
    return device
device = get_device()

Using MPS (Metal Performance Shaders) device


# Read the data

Right now the notebook is set to work with fake data. This can be changed once the pipeline works.

The data is stored as a Dict[person_id, Sequences] where Sequences is a Dict[year, survery_wave_response]

In [3]:
# read in data and prepare transformations
data = pd.read_csv("data/training_data/PreFer_train_data.csv")
targets = pd.read_csv('data/training_data/PreFer_train_outcome.csv')
codebook = pd.read_csv('data/codebooks/PreFer_codebook.csv')

  data = pd.read_csv("data/training_data/PreFer_train_data.csv")


In [4]:
importance = pd.read_csv('features_importance_1000.csv')
custom_pairs = importance.iloc[:50].feature.map(lambda x: get_generic_name(x))

In [5]:
# check if sequences have been preprocessed (saves time)
if False:# os.path.exists('data/processed_data/sequences.pt'):
    sequences = torch.load('data/processed_data/sequences.pt')
else:
    sequences = encoding_pipeline(data, codebook, custom_pairs=custom_pairs)
    #torch.save(sequences, 'data/processed_data/sequences.pt')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  codebook["pairs"] = codebook['var_name'].apply(get_generic_name)


In [6]:
from model.dataset import PretrainingDataset
pretrain_dataset = PretrainingDataset(sequences)

# Experimental Encoder (Only)

In [8]:


class ExpEncoder(nn.Module):
    def __init__(self, vocab_size, sequence_len: int, embedding_size: int, output_size: int, num_cols: int, num_layers: int = 3) -> None:
        super().__init__()

        self.embedding = SurveyEmbeddings(
            vocab_size, sequence_len, n_years=14, embedding_dim=embedding_size)

        self.encoders = nn.ModuleList([
            nn.Sequential(
                ExcelFormerConv(channels=embedding_size, num_cols=num_cols, num_heads=4),
                nn.LayerNorm(embedding_size))
                 for _ in range(num_layers)])
        
        self.flatten = ExcelFormerDecoder(in_channels = embedding_size, 
                                          out_channels=output_size, 
                                          num_cols= num_cols)
        

    def forward(self, year, seq):
        """
        Method that returns full encoding-decoding
        """
        x = self.embedding(year, seq)
        for encoder in self.encoders:
            x = encoder(x)
        x = self.flatten(x)
        return x


# Train the RNN

First we need to create Dataset class that can hold both the target (stored in a pd.DataFrame) and the sequences.

The sequences will be of dimension 14 x encoding_dimension, because we have 14 years of surveys.

I have created some code for getting the data into the right format, but it might not be useful.

## Regarding masks
Right now the masking is done already in the encoding. I haven't found exactly where Mikkel implemented this.
So for now, assume that nothing is padded, and then we'll figure it out with Mikkel.

In [9]:
# its not everyone we have a target for, so we do restrict the data to 
# the ones with known outcomes
targets = targets[targets.new_child.notna()]
train_person_ids, test_person_ids = train_test_split(targets['nomem_encr'], test_size=0.2, random_state=42)

In [10]:
rnn_data = {person_id: (
        torch.tensor([year-2007 for year, _ in wave_responses.items()]).to(device),
        torch.tensor([ wave_response for _, wave_response in wave_responses.items()]).to(device)
        )
        for person_id, wave_responses in sequences.items()
}

In [11]:
# split data based on the splits made for the target
train_data = {person_id: rnn_data[person_id] for person_id in train_person_ids}
test_data = {person_id: rnn_data[person_id] for person_id in test_person_ids}

In [12]:
from model.dataset import FinetuningDataset
train_dataset = FinetuningDataset(train_data, targets = targets)
test_dataset = FinetuningDataset(test_data, targets = targets)

rnn_batch_size = 4

train_dataloader = DataLoader(train_dataset, batch_size=rnn_batch_size, shuffle=True)
test_dataloader  = DataLoader(test_dataset,  batch_size=rnn_batch_size)

In [20]:
# ft - fine-tuning

HIDDEN_SIZE = 128
ENCODING_SIZE = 128
NUM_COLS = 44
#ENCODING_SIZE = 64
learning_rate_autoencoder = 1e-3

SEQ_LEN = pretrain_dataset.get_seq_len()
VOCAB_SIZE = pretrain_dataset.get_vocab_size()

num_epochs_ft = 40
learning_rate_ft = 5e-3

encoder = ExpEncoder(vocab_size=VOCAB_SIZE, sequence_len=SEQ_LEN, embedding_size=HIDDEN_SIZE, output_size=ENCODING_SIZE, num_cols=NUM_COLS).to(device)

rnn_model = GRUDecoder(
    input_size=ENCODING_SIZE,
    hidden_size=HIDDEN_SIZE,
    max_seq_len=14
).to(device)

# Define loss function and optimizer for RNN
ft_loss = nn.BCEWithLogitsLoss()
ft_optimizer = torch.optim.NAdam(list(rnn_model.parameters()) + list(encoder.parameters()) , lr=learning_rate_ft, weight_decay=1e-2, decoupled_weight_decay=True)
ft_scheduler = optim.lr_scheduler.CosineAnnealingLR(ft_optimizer, T_max = num_epochs_ft, eta_min = 1e-6, last_epoch = -1)

# Training loop
rnn_model.train()
encoder.train()
print("Ready!")

The model is going to set all input MASK to None
Ready!


In [21]:
loss_per_epoch = []
for epoch in range(num_epochs_ft):
    # print(epoch)
    loss_per_step = []
    loop_object  = tqdm(enumerate(train_dataloader), desc=f"Epochs {epoch}")
    for i, batch in loop_object :        
        ft_optimizer.zero_grad() 
        inputs, labels = batch
        labels = labels.to(torch.float).to(device)

        input_year, input_seq = inputs
        bs, ss = labels.size(0), input_year.size(1)
        input_year = input_year.reshape(-1).to(device)
        input_seq = input_seq.reshape(bs * ss, -1).to(device)

        encodings = encoder(input_year, input_seq).view(bs,ss, -1)
        mask = ((input_seq == 101).sum(-1) == NUM_COLS).view(bs,ss).detach()


        # Forward pass
        outputs = rnn_model(encodings, mask=~mask)

        loss = ft_loss(torch.flatten(outputs), labels)  
        loss_per_step.append(loss.detach().cpu().numpy())
        loop_object.set_postfix_str("mean loss: %.3f"%np.mean(loss_per_step[-100:]))

        #loss.backward(retain_graph=True)
        loss.backward()
        ft_optimizer.step()
    # On epoch end
    loss_per_epoch.append(np.mean(loss_per_step))
    ft_scheduler.step()

    print(f"Epoch {epoch+1}/{num_epochs_ft}, Loss: {loss_per_epoch[-1]:.4f}")
    

Epochs 0: 198it [00:23,  8.61it/s, mean loss: 0.500]


Epoch 1/40, Loss: 0.6215


Epochs 1: 198it [00:20,  9.74it/s, mean loss: 0.507]


Epoch 2/40, Loss: 0.5193


Epochs 2: 198it [00:20,  9.83it/s, mean loss: 0.529]


Epoch 3/40, Loss: 0.5266


Epochs 3: 198it [00:20,  9.82it/s, mean loss: 0.524]


Epoch 4/40, Loss: 0.5193


Epochs 4: 198it [00:20,  9.78it/s, mean loss: 0.504]


Epoch 5/40, Loss: 0.5161


Epochs 5: 198it [00:20,  9.87it/s, mean loss: 0.529]


Epoch 6/40, Loss: 0.5127


Epochs 6: 198it [00:20,  9.78it/s, mean loss: 0.506]


Epoch 7/40, Loss: 0.5152


Epochs 7: 198it [00:20,  9.82it/s, mean loss: 0.471]


Epoch 8/40, Loss: 0.5101


Epochs 8: 198it [00:20,  9.72it/s, mean loss: 0.486]


Epoch 9/40, Loss: 0.5124


Epochs 9: 198it [00:21,  9.41it/s, mean loss: 0.512]


Epoch 10/40, Loss: 0.5097


Epochs 10: 198it [00:20,  9.55it/s, mean loss: 0.503]


Epoch 11/40, Loss: 0.5125


Epochs 11: 198it [00:21,  9.35it/s, mean loss: 0.515]


Epoch 12/40, Loss: 0.5158


Epochs 12: 198it [00:21,  9.33it/s, mean loss: 0.505]


Epoch 13/40, Loss: 0.5107


Epochs 13: 198it [00:21,  9.36it/s, mean loss: 0.499]


Epoch 14/40, Loss: 0.5113


Epochs 14: 198it [00:20,  9.64it/s, mean loss: 0.514]


Epoch 15/40, Loss: 0.5095


Epochs 15: 132it [00:13,  9.35it/s, mean loss: 0.548]

In [17]:
mask[0], input_seq[0:14], encodings

(tensor([False, False, False, False, False, False, False, False, False, False,
         False, False, False, False], device='mps:0'),
 tensor([[101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
          101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
          101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
          101, 101],
         [101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
          101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
          101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
          101, 101],
         [101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
          101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
          101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
          101, 101],
         [101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
   

In [None]:
from sklearn.metrics import precision_recall_fscore_support, average_precision_score

val_loss = []
preds = []
targets = []

## Set both models into the eval mode.=
rnn_model.eval()
autoencoder.eval()
for batch in test_dataloader:
    inputs, labels = batch
    labels = labels.to(torch.float).to(device)

    input_year, input_seq = inputs
    bs, ss = labels.size(0), 14
    input_year = input_year.reshape(-1).to(device)
    input_seq = input_seq.reshape(bs * ss, -1).to(device)

    encodings = autoencoder.get_encoding(input_year, input_seq).view(bs,ss, -1)
    survey_emb = aggregator(encodings)


    # Forward pass
    xx = rnn_model(survey_emb)
    outputs = torch.nn.functional.sigmoid(xx).flatten()
    loss = ft_loss(outputs, labels)  
    val_loss.append(loss.detach().cpu().numpy())
    preds.extend(outputs.detach().cpu().numpy().tolist())
    targets.extend(labels.cpu().numpy().tolist())

In [None]:
survey_emb

tensor([[[  79.6868,  529.0562,  122.7735,  ...,   74.3379, -556.9098,
          -675.4937],
         [  86.1495,  570.1829,  132.3874,  ...,   80.2999, -599.9509,
          -728.2538],
         [  85.2354,  566.8043,  131.5119,  ...,   79.5485, -596.7887,
          -723.5583],
         ...,
         [  74.5646,  494.1296,  114.7229,  ...,   69.4697, -520.0018,
          -631.0103],
         [  71.2082,  481.3701,  111.2558,  ...,   66.8464, -507.5799,
          -613.6431],
         [  73.0651,  484.0807,  112.4068,  ...,   68.0308, -509.4069,
          -618.1808]],

        [[  79.6868,  529.0562,  122.7735,  ...,   74.3379, -556.9098,
          -675.4937],
         [  86.1495,  570.1829,  132.3874,  ...,   80.2999, -599.9509,
          -728.2538],
         [  85.2354,  566.8043,  131.5119,  ...,   79.5485, -596.7887,
          -723.5583],
         ...,
         [  70.9823,  480.2695,  110.9016,  ...,   66.5088, -506.5378,
          -612.0607],
         [  70.6130,  477.2684,  110.305

In [None]:
# Concatenate all the batches
predictions = (torch.tensor(preds) > 0.5).float()
actuals = torch.tensor(targets).flatten()

# Calculate precision, recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(actuals.cpu().numpy(), predictions.cpu().numpy(), average='binary')
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.1364
Recall: 0.0600
F1 Score: 0.0833


In [None]:
batch

[[tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0],
          [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0],
          [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0],
          [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0],
          [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0],
          [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0]],
         device='mps:0'),
  tensor([[[101, 101, 101,  ..., 101, 101, 101],
           [101, 101, 101,  ..., 101, 101, 101],
           [101, 101, 101,  ..., 101, 101, 101],
           ...,
           [101, 101, 101,  ..., 101, 101, 101],
           [  0,  27,  34,  ..., 171,  76,  73],
           [101, 101, 101,  ..., 101, 101, 101]],
  
          [[101, 101, 101,  ..., 101, 101, 101],
           [101, 101, 101,  ..., 101, 101, 101],
           [101, 101, 101,  ..., 101, 101, 101],
           ...,
           [  0,  72, 101,  ..., 177,  30,  32],
           [  0, 101, 101,  ..., 177,  