In [6]:
# Data packages
import pandas as pd 
import matplotlib.pyplot as plt

import os
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, average_precision_score

from model.rnn import GRUDecoder
from model.autoencoder import TabularEncoder
from data_processing.pipeline import encoding_pipeline, get_generic_name

import torch.nn as nn
import torch.nn.functional as F

In [9]:
def get_device():
    # Check if CUDA is available
    if torch.cuda.is_available():
        # If CUDA is available, select the first CUDA device
        device = torch.device("cuda:0")
        print("Using CUDA device:", torch.cuda.get_device_name(0))
    # Check for MPS availability on supported macOS devices (requires PyTorch 1.12 or newer)
    elif torch.backends.mps.is_available():
        # If MPS is available, use MPS device
        device = torch.device("mps")
        print("Using MPS (Metal Performance Shaders) device")
    else:
        # Fallback to CPU if neither CUDA nor MPS is available
        device = torch.device("cpu")
        print("Using CPU")
    return device
device = get_device()

Using MPS (Metal Performance Shaders) device


# Read the data

Right now the notebook is set to work with fake data. This can be changed once the pipeline works.

The data is stored as a Dict[person_id, Sequences] where Sequences is a Dict[year, survery_wave_response]

In [10]:
# read in data and prepare transformations
data = pd.read_csv("data/training_data/PreFer_train_data.csv", low_memory=False)
targets = pd.read_csv('data/training_data/PreFer_train_outcome.csv')
codebook = pd.read_csv('data/codebooks/PreFer_codebook.csv')

In [11]:
importance = pd.read_csv('features_importance_1000.csv')
custom_pairs = importance.iloc[:50].feature.map(lambda x: get_generic_name(x))

In [12]:
# check if sequences have been preprocessed (saves time)
if False: #os.path.exists('data/processed_data/sequences.pt'):
    sequences = torch.load('data/processed_data/sequences.pt')
else:
    sequences = encoding_pipeline(data, codebook, custom_pairs=custom_pairs)
    #torch.save(sequences, 'data/processed_data/sequences.pt')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  codebook["pairs"] = codebook['var_name'].apply(get_generic_name)


In [13]:
from model.dataset import PretrainingDataset
pretrain_dataset = PretrainingDataset(sequences)

# Experimental Encoder (Only)

# Train the RNN

First we need to create Dataset class that can hold both the target (stored in a pd.DataFrame) and the sequences.

The sequences will be of dimension 14 x encoding_dimension, because we have 14 years of surveys.

I have created some code for getting the data into the right format, but it might not be useful.

## Regarding masks
Right now the masking is done already in the encoding. I haven't found exactly where Mikkel implemented this.
So for now, assume that nothing is padded, and then we'll figure it out with Mikkel.

In [69]:
# its not everyone we have a target for, so we do restrict the data to 
# the ones with known outcomes
targets = targets[targets.new_child.notna()]
train_person_ids, test_person_ids = train_test_split(targets['nomem_encr'], test_size=0.2, random_state=42)

In [59]:
rnn_data = {person_id: (
        torch.tensor([year-2007 for year, _ in wave_responses.items()]).to(device),
        torch.tensor([ wave_response for _, wave_response in wave_responses.items()]).to(device)
        )
        for person_id, wave_responses in sequences.items()
}

In [60]:
# split data based on the splits made for the target
train_data = {person_id: rnn_data[person_id] for person_id in train_person_ids}
test_data = {person_id: rnn_data[person_id] for person_id in test_person_ids}

In [61]:
from model.dataset import FinetuningDataset
train_dataset = FinetuningDataset(train_data, targets = targets)
test_dataset = FinetuningDataset(test_data, targets = targets)

rnn_batch_size = 10

train_dataloader = DataLoader(train_dataset, batch_size=rnn_batch_size, shuffle=True)
test_dataloader  = DataLoader(test_dataset,  batch_size=rnn_batch_size)

In [63]:
# ft - fine-tuning

HIDDEN_SIZE = 64
ENCODING_SIZE = 64
NUM_COLS = 44

SEQ_LEN = pretrain_dataset.get_seq_len()
VOCAB_SIZE = pretrain_dataset.get_vocab_size()

num_epochs_ft = 1
learning_rate_ft = 1e-3

encoder = TabularEncoder(vocab_size=VOCAB_SIZE, 
                         embedding_size=HIDDEN_SIZE, 
                         output_size=ENCODING_SIZE, 
                         num_layers=2, 
                         sequence_len=SEQ_LEN, 
                         layer_type = "excel",
                         num_cols=NUM_COLS).to(device)

decoder = GRUDecoder(
    input_size=ENCODING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=2,
    max_seq_len=14,
    dropout=0.15,
    bidirectional=False,
    with_attention = True
).to(device)

# Define loss function and optimizer for RNN
ft_loss = nn.BCELoss()
ft_optimizer = torch.optim.NAdam(list(decoder.parameters()) + list(encoder.parameters()) , lr=learning_rate_ft, weight_decay=1e-2, decoupled_weight_decay=True)
ft_scheduler = optim.lr_scheduler.CosineAnnealingLR(ft_optimizer, T_max = num_epochs_ft, eta_min = 1e-6, last_epoch = -1)

# Training loop
decoder.train()
encoder.train()
print("Ready!")

Ready!


In [64]:
loss_per_epoch = []
for epoch in range(num_epochs_ft):
    # print(epoch)
    loss_per_step = []
    loop_object  = tqdm(enumerate(train_dataloader), desc=f"Epochs {epoch}")
    for i, batch in loop_object :        
        ft_optimizer.zero_grad() 
        inputs, labels = batch
        labels = labels.to(torch.float).to(device)

        input_year, input_seq = inputs
        bs, ss = labels.size(0), 14
        input_year = input_year.reshape(-1).to(device)
        input_seq = input_seq.reshape(bs * ss, -1).to(device)

        encodings = encoder(input_year, input_seq).view(bs,ss, -1)
        mask = ~((input_seq == 101).sum(-1) == NUM_COLS).view(bs,ss).detach()

        # Forward pass
        outputs = nn.functional.sigmoid(decoder(encodings, mask=mask))

        loss = ft_loss(torch.flatten(outputs), labels)  
        loss_per_step.append(loss.detach().cpu().numpy())
        loop_object.set_postfix_str("mean loss: %.3f"%np.mean(loss_per_step[-100:]))

        #loss.backward(retain_graph=True)
        loss.backward()
        ft_optimizer.step()
    # On epoch end
    loss_per_epoch.append(np.mean(loss_per_step))
    ft_scheduler.step()

    print(f"Epoch {epoch+1}/{num_epochs_ft}, Loss: {loss_per_epoch[-1]:.4f}")
    

Epochs 0: 514it [01:23,  6.18it/s, mean loss: nan]

Epoch 1/1, Loss: nan





In [65]:

val_loss = []
preds = []
targets = []

## Set both models into the eval mode.=
decoder.eval()
encoder.eval()
for batch in test_dataloader:
    inputs, labels = batch
    labels = labels.to(torch.float).to(device)

    input_year, input_seq = inputs
    bs, ss = labels.size(0), 14
    input_year = input_year.reshape(-1).to(device)
    input_seq = input_seq.reshape(bs * ss, -1).to(device)

    encodings = encoder(input_year, input_seq).view(bs,ss, -1)
    mask = ~((input_seq == 101).sum(-1) == NUM_COLS).view(bs,ss).detach()


    # Forward pass
    xx = decoder(encodings, mask)
    outputs = torch.nn.functional.sigmoid(xx).flatten()
    loss = ft_loss(outputs, labels)  
    val_loss.append(loss.detach().cpu().numpy())
    preds.extend(outputs.detach().cpu().numpy().tolist())
    targets.extend(labels.cpu().numpy().tolist())

In [66]:
# Concatenate all the batches
predictions = (torch.tensor(preds) > 0.5).float()
probs = F.sigmoid(predictions)
actuals = torch.tensor(targets).flatten()

# Calculate precision, recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(actuals.cpu().numpy(), predictions.cpu().numpy(), average='binary')
map_roc = average_precision_score(actuals.numpy(), probs.numpy())
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"-- mAP Score: {map_roc:.4f} --")

Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
-- mAP Score: 0.2525 --


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Cross validation

In [14]:
from model.dataset import FinetuningDataset
from model.dataset import PretrainingDataset
from sklearn.model_selection import KFold
# read in data and prepare transformations
data = pd.read_csv("data/training_data/PreFer_train_data.csv", low_memory=False)
targets = pd.read_csv('data/training_data/PreFer_train_outcome.csv')
targets = targets[targets.new_child.notna()].reset_index(drop=True)
codebook = pd.read_csv('data/codebooks/PreFer_codebook.csv')

In [15]:
n_features = 50

importance = pd.read_csv('features_importance_1000.csv')
custom_pairs = importance.iloc[:n_features].feature.map(lambda x: get_generic_name(x))
sequences = encoding_pipeline(data, codebook, custom_pairs=custom_pairs)

rnn_data = {person_id: (
        torch.tensor([year-2007 for year, _ in wave_responses.items()]).to(device),
        torch.tensor([ wave_response for _, wave_response in wave_responses.items()]).to(device)
        )
        for person_id, wave_responses in sequences.items()
}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  codebook["pairs"] = codebook['var_name'].apply(get_generic_name)


In [16]:
def initialize(HIDDEN_SIZE=64,
               ENCODING_SIZE=64,
               NUM_COLS=44,
               num_epochs_ft=5,
               learning_rate_ft=1e-3,
               sequences = []
               ):

    pretrain_dataset = PretrainingDataset(sequences)
    SEQ_LEN = pretrain_dataset.get_seq_len()
    VOCAB_SIZE = pretrain_dataset.get_vocab_size()

    encoder = TabularEncoder(vocab_size=VOCAB_SIZE, 
                             embedding_size=HIDDEN_SIZE, 
                             output_size=ENCODING_SIZE, 
                             num_layers=1, 
                             sequence_len=SEQ_LEN, 
                             layer_type = "excel",
                             num_cols=NUM_COLS,
                             dropout=0.1
                             ).to(device)

    decoder = GRUDecoder(
        input_size=ENCODING_SIZE,
        hidden_size=HIDDEN_SIZE,
        num_layers=1,
        max_seq_len=14,
        dropout=0.15,
        bidirectional=False,
        with_attention = True
    ).to(device)

    # Define loss function and optimizer for RNN
    ft_loss = nn.BCELoss()
    ft_optimizer = torch.optim.NAdam(list(decoder.parameters()) + list(encoder.parameters()) , lr=learning_rate_ft, weight_decay=1e-2, decoupled_weight_decay=True)
    ft_scheduler = optim.lr_scheduler.CosineAnnealingLR(ft_optimizer, T_max = num_epochs_ft, eta_min = 1e-6, last_epoch = -1)

    # Training loop
    decoder.train()
    encoder.train()

    return encoder, decoder, ft_optimizer, ft_loss, ft_scheduler

In [21]:

def evaluate_and_step(loop_object, encoder, decoder, ft_scheduler, ft_loss, ft_optimizer):
    for i, batch in loop_object :        

        ft_optimizer.zero_grad() 
        inputs, labels = batch
        labels = labels.to(torch.float).to(device)

        input_year, input_seq = inputs
        bs, ss = labels.size(0), 14
        input_year = input_year.reshape(-1).to(device)
        input_seq = input_seq.reshape(bs * ss, -1).to(device)

        encodings = encoder(input_year, input_seq).view(bs,ss, -1)
        mask = ~((input_seq == 101).sum(-1) == NUM_COLS).view(bs,ss).detach()

        # Forward pass
        outputs = nn.functional.sigmoid(decoder(encodings, mask=mask))

        loss = ft_loss(torch.flatten(outputs), labels)  

        loss.backward()
        ft_optimizer.step()

    # On epoch end
    ft_scheduler.step()
    

In [18]:
def evaluate(test_dataloader, encoder, decoder):
    val_loss = []
    preds = []
    targets = []

    ## Set both models into the eval mode.=
    decoder.eval()
    encoder.eval()
    for batch in test_dataloader:
        inputs, labels = batch
        labels = labels.to(torch.float).to(device)

        input_year, input_seq = inputs
        bs, ss = labels.size(0), 14
        input_year = input_year.reshape(-1).to(device)
        input_seq = input_seq.reshape(bs * ss, -1).to(device)

        encodings = encoder(input_year, input_seq).view(bs,ss, -1)
        mask = ~((input_seq == 101).sum(-1) == NUM_COLS).view(bs,ss).detach()

        # Forward pass
        xx = decoder(encodings, mask)
        outputs = torch.nn.functional.sigmoid(xx).flatten()
        loss = ft_loss(outputs, labels)  
        val_loss.append(loss.detach().cpu().numpy())
        preds.extend(outputs.detach().cpu().numpy().tolist())
        targets.extend(labels.cpu().numpy().tolist())


    # Concatenate all the batches
    predictions = (torch.tensor(preds) > 0.5).float()
    probs = F.sigmoid(predictions)
    actuals = torch.tensor(targets).flatten()

    # Calculate precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(actuals.cpu().numpy(), predictions.cpu().numpy(), average='binary')
    map_roc = average_precision_score(actuals.numpy(), probs.numpy())
    
    return precision, recall, f1, map_roc
     


In [20]:
HIDDEN_SIZE=64
ENCODING_SIZE=64
NUM_COLS=44
num_epochs_ft=10
learning_rate_ft=1e-3
rnn_batch_size = 10

n_splits = 4

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Prepare for cross-validation
prec_per_fold = []
rec_per_fold = []
f1_per_fold = []
map_roc_per_fold = []

train_prec_per_fold = []
train_rec_per_fold = []
train_f1_per_fold = []
train_map_roc_per_fold = []

for fold, (train_index, val_index) in enumerate(kf.split(targets['nomem_encr'])):
    print(f'Fold: {fold}')
    train_person_ids = targets.loc[train_index, 'nomem_encr']
    test_person_ids = targets.loc[val_index, 'nomem_encr']
    
    encoder, decoder, ft_optimizer, ft_loss, ft_scheduler = initialize(
        HIDDEN_SIZE=64,
        ENCODING_SIZE=64,
        NUM_COLS=44,
        num_epochs_ft=1,
        learning_rate_ft=1e-3,
        sequences=sequences,
        )

    train_data = {person_id: rnn_data[person_id] for person_id in train_person_ids}
    test_data = {person_id: rnn_data[person_id] for person_id in test_person_ids}
    
    train_dataset = FinetuningDataset(train_data, targets = targets)
    test_dataset = FinetuningDataset(test_data, targets = targets)

    train_dataloader = DataLoader(train_dataset, batch_size=rnn_batch_size, shuffle=True)
    test_dataloader  = DataLoader(test_dataset,  batch_size=rnn_batch_size)
    

    for epoch in range(num_epochs_ft):
        loop_object  = tqdm(enumerate(train_dataloader), desc=f"Epochs {epoch}")
    
        evaluate_and_step(loop_object, encoder, decoder, ft_scheduler, ft_loss, ft_optimizer)

    precision, recall, f1, map_roc = evaluate(test_dataloader, encoder, decoder)
    precision_train, recall_train, f1_train, map_roc_train = evaluate(train_dataloader, encoder, decoder)
    
    prec_per_fold.append(precision)
    rec_per_fold.append(recall)
    f1_per_fold.append(f1)
    map_roc_per_fold.append(map_roc)

    train_prec_per_fold.append(precision_train)
    train_rec_per_fold.append(recall_train)
    train_f1_per_fold.append(f1_train)
    train_map_roc_per_fold.append(map_roc_train)


Fold: 0


Epochs 0: 0it [00:00, ?it/s]


NameError: name 'loss_per_step' is not defined

In [None]:
print("Results on test set")
print("Prec:", ' '.join(f"{x:.3f}" for x in prec_per_fold))
print("Recall:", ' '.join(f"{x:.3f}" for x in rec_per_fold))
print("f1:", ' '.join(f"{x:.3f}" for x in f1_per_fold))
print("map roc:", ' '.join(f"{x:.3f}" for x in map_roc_per_fold))

Results on test set
Prec: 0.857 0.667 0.629 0.938
Recall: 0.517 0.588 0.750 0.294
f1: 0.645 0.625 0.684 0.448
map roc: 0.557 0.477 0.524 0.422


In [None]:
print('Results on training set')
print("Prec:", ' '.join(f"{x:.3f}" for x in train_prec_per_fold))
print("Recall:", ' '.join(f"{x:.3f}" for x in train_rec_per_fold))
print("f1:", ' '.join(f"{x:.3f}" for x in train_f1_per_fold))
print("map roc:", ' '.join(f"{x:.3f}" for x in train_map_roc_per_fold))

Results on training set
Prec: 0.933 0.948 0.777 0.955
Recall: 0.721 0.907 0.956 0.261
f1: 0.813 0.927 0.857 0.410
map roc: 0.730 0.880 0.752 0.410
