# Read the data

Right now the notebook is set to work with fake data. This can be changed once the pipeline works.

The data is stored as a Dict[person_id, Sequences] where Sequences is a Dict[year, survery_wave_response]

Tokenizing takes a bit of time

In [141]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import TensorDataset

from sklearn.model_selection import train_test_split

from model.rnn import GRUDecoder
from model.autoencoder import AutoEncoder

from data_processing.encoding.categorical import CategoricalTransformer
from data_processing.encoding.numeric_and_date import ToQuantileTransformer
from data_processing.encoding.text2vec import TextTransform
from data_processing.sequences.sequencing import to_sequences

In [2]:
# read in data and prepare transformations
data = pd.read_csv('data/other_data/PreFer_fake_data.csv')
targets = pd.read_csv('data/other_data/PreFer_fake_outcome.csv')
codebook = pd.read_csv('data/codebooks/PreFer_codebook.csv')
summary = pd.read_csv('data/codebooks/PreFer_codebook_summary.csv')

categorical_columns = codebook[(codebook.var_name.str.startswith('c')) & (codebook.type_var == 'categorical')].var_name.tolist()
quantile_columns = codebook[(codebook.var_name.str.startswith('c')) & ((codebook.type_var == 'numeric') | (codebook.type_var == 'date or time'))].var_name.tolist()

cat_transform = CategoricalTransformer()
cat_transform.fit(codebook)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  core_cat_df['values_cat'] = core_cat_df['values_cat'].str.split("; ").apply(lambda x: [e.strip() for e in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  core_cat_df['labels_cat'] = core_cat_df['labels_cat'].str.split("; ").apply(lambda x: [e.strip() for e in x])


In [3]:
# this cell takes a bit of time
for col in categorical_columns:
    data[col] = cat_transform.transform(data[col])
    
quantile_transform = ToQuantileTransformer(quantile_columns)
quantile_transform.fit(data)
data = quantile_transform.transform(data)

data.fillna(101, inplace=True)
data[quantile_columns] = data[quantile_columns].astype(int)

data2 = data[data.columns[data.columns.str.startswith('c')]]
data2['nomem_encr'] = data['nomem_encr']

sequences = to_sequences(data2, summary)

  data2['nomem_encr'] = data['nomem_encr']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['nomem_encr'] = data['nomem_encr']


In [4]:
def get_device():
    # Check if CUDA is available
    if torch.cuda.is_available():
        # If CUDA is available, select the first CUDA device
        device = torch.device("cuda:0")
        print("Using CUDA device:", torch.cuda.get_device_name(0))
    # Check for MPS availability on supported macOS devices (requires PyTorch 1.12 or newer)
    elif torch.backends.mps.is_available():
        # If MPS is available, use MPS device
        device = torch.device("mps")
        print("Using MPS (Metal Performance Shaders) device")
    else:
        # Fallback to CPU if neither CUDA nor MPS is available
        device = torch.device("cpu")
        print("Using CPU")
    return device
device = get_device()

In [5]:
# There are some strings that have still not been properly been filtered out
# this cell gets rid of them. 
# The real solution is to change the categorical encoding class
for _, wave_responses in sequences.items():
    for year, wave_response in wave_responses.items():
        
        not_int = np.array([not isinstance(x, int) for x in wave_response], dtype = bool)
        wave_responses[year] = [
            item if not _bool else 101 for (item, _bool) in zip(wave_response, not_int)
        ]

# Train the autoencoder

In [7]:
# We dont need targets or year information for the autoencoder
# so we merge everthing together in one tensor
autoencoder_data = torch.tensor([
                                wave_response
                                for _, wave_responses in sequences.items()
                                for _, wave_response in wave_responses.items()
                        ]).to(torch.int64)

In [127]:
# this is my attempt at getting the autoencoder to work

# the existing autoencoder does not collapse peoples survery
# responses to one-dimensional representations.
# However, I can't get it to work.


class AutoEncoder(torch.nn.Module):
    def __init__(self, num_embeddings, n_questions, embedding_dim=512, encoding_dim=16) -> None:
        super().__init__()
        self.encoding_dim = encoding_dim

        self.embed = torch.nn.Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embedding_dim
        )

        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(embedding_dim, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, encoding_dim),
        )

        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(encoding_dim, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, embedding_dim),
        )

    def forward(self, x):
        x_embed = self.embed(x) # shape n_years x n_questions x embedding_dim

        # reduce to 2-dimensional tensor
        x_flat = x_embed.view(-1, x_embed.size(-1)) # shape n_years x (n_questions x embedding_dim)
        
        x_encoded = self.encoder(x_flat) 

        x_decoded = self.decoder(x_encoded) 

        # Reshape the decoded tensor back to its original 3-dimensional shape
        x_reconstructed = x_decoded.view(x.size(0), x.size(1), -1)

        return x_reconstructed

    def get_loss(self, x):
        x_flat = x.view(-1, x.size(1) * x.size(2))
        x_hat = self.forward(x)
        return torch.nn.functional.mse_loss(x_hat, x_flat)

    def embed_and_encode(self, x):
        x_flat = x.view(-1, x.size(1) * x.size(2))
        x_emb = self.embed(x_flat)
        return self.encoder(x_emb)

    def get_encoding_dim(self):
        return self.encoding_dim


In [139]:
# attempt at training the autoencoder.
# However, I get the error that I'm trying to look up too many
# embeddings. I haven't solved why.
batch_size = 128 

train_dataloader = DataLoader(autoencoder_data, batch_size=batch_size, shuffle=True)

num_epochs_autoencoder = 10
learning_rate_autoencoder = 0.001
embedding_dim = 2
vocab_size = 15000#len(set([ elem for  _, sequence in sequences.items() for _, item in sequence.items() for elem in item]))

n_questions = autoencoder_data.shape[1]

error = nn.MSELoss()

autoencoder = AutoEncoder(num_embeddings=vocab_size, n_questions=n_questions).to(device)

optimizer = optim.Adam( autoencoder.parameters())

In [142]:
autoencoder.train()
for epoch in range(num_epochs_autoencoder):
    for batch in train_dataloader:
        optimizer.zero_grad()
        loss = autoencoder.get_loss(batch[0].to(device))
        
        loss.backward()
        
        optimizer.step()
        if epoch % int(0.1*num_epochs_autoencoder) == 0:
            print(f'epoch {epoch} \t Loss: {loss.item():.4g}')

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

# Train the RNN

First we need to create Dataset class that can hold both the target (stored in a pd.DataFrame) and the sequences.

The sequences will be of dimension 14 x encoding_dimension, because we have 14 years of surveys.

I have created some code for getting the data into the right format, but it might not be useful.

## Regarding masks
Right now the masking is done already in the encoding. I haven't found exactly where Mikkel implemented this.
So for now, assume that nothing is padded, and then we'll figure it out with Mikkel.

In [None]:
class TensorSequencesWithTarget(Dataset):
    def __init__(self, sequences:dict, target: pd.DataFrame):
        self.sequences = sequences 
        self.target = targets.set_index(keys = 'nomem_encr').squeeze().to_dict()
        self.keys = list(sequences.keys())

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, index):
        person_id = self.keys[index]
        
        target = self.target[person_id]
        sequence = self.sequences[person_id]
        

        return target, sequence

In [None]:
# its not everyone we have a target for, so we do restrict the data to 
# the ones with known outcomes

train_person_ids, test_person_ids = train_test_split(targets['nomem_encr'], test_size=0.2, random_state=42)
train_person_ids, val_person_ids = train_test_split(train_person_ids, test_size=0.1, random_state=42)

In [None]:
# structure the data as a Dict[person_id, survey_embedding_sequence] 
# where survey_embedding_sequence is a tensor of size 14 x embedding_dimension
rnn_data = {person_id:
                autoencoder.embed_and_encode(
                    torch.tensor(
                        [ wave_response for _, wave_response in wave_responses.items()]
                    )
                ).to(torch.float)
            for person_id, wave_responses in sequences.items()
           }

In [None]:
# split data based on the splits made for the target
train_data = {person_id: rnn_data[person_id] for person_id in train_person_ids}
val_data = {person_id: rnn_data[person_id] for person_id in val_person_ids}
test_data = {person_id: rnn_data[person_id] for person_id in test_person_ids}


In [None]:
train_dataset = SequencesWithTarget(train_data, target = targets)
val_dataset = SequencesWithTarget(val_data, target = targets)
test_dataset = SequencesWithTarget(test_data, target = targets)

rnn_batch_size = 50

train_dataloader = DataLoader(train_dataset, batch_size=rnn_batch_size, shuffle=True)
val_dataloader   = DataLoader(val_dataset,   batch_size=rnn_batch_size)
test_dataloader  = DataLoader(test_dataset,  batch_size=rnn_batch_size)

### My attempt at training the model, but probably not correct

In [None]:
hidden_size = 10
num_epochs_rnn = 10
learning_rate_rnn = 0.001

rnn_model = GRUDecoder(
    input_size=autoencoder.get_encoding_dim(),
    hidden_size=hidden_size,
    max_seq_len=14
).to(device)

# assume that all 14 years are observed for everyone
single_mask = torch.BoolTensor([True]*14).to(device) 

# Define loss function and optimizer for RNN
rnn_criterion = torch.nn.CrossEntropyLoss()
rnn_optimizer = torch.optim.Adam(rnn_model.parameters(), lr=learning_rate_rnn)

In [None]:
# Training loop
rnn_model.train()

for epoch in range(num_epochs_rnn):
    
    for batch in train_dataloader:
        labels, inputs = batch
        print(labels)
        
        rnn_optimizer.zero_grad() 

        # Forward pass
        mask = torch.stack([single_mask]*len(labels) ) #  not correct masking
                           
        xx = rnn_model(inputs, mask)
        outputs = torch.nn.functional.sigmoid(xx)
        loss = rnn_criterion(outputs.squeeze(), labels)  
        
        loss.backward()
        rnn_optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    # Calculate average loss for the epoch
    epoch_loss = running_loss / len(train_dataloader.dataset)
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

