In [5]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
from torch import Tensor
from typing import List, Tuple
from torch.utils.data import TensorDataset, DataLoader, random_split
import sys
sys.path.append('/Users/germa/thesis/bachelor_project2023/src/models/lstm')
from xlm_roberta import XLMRoberta
sys.path.append('/Users/germa/thesis/bachelor_project2023/src/utils')
from chunker import Chunker

In [12]:
class LSTMOnXLMRoberta(nn.Module):
    """Regress a similarity score for text pairs with an LSTM over XLM-RoBERTa embeddings.

    Each example is a pair of texts (columns ``text1`` / ``text2``) with a
    target in the ``overall`` column. Both texts are chunked by the project
    ``Chunker``, embedded by the project ``XLMRoberta`` wrapper, and run
    through one shared LSTM. The last-step LSTM outputs of the two texts are
    concatenated and mapped to a single value by a linear layer, which is
    trained with mean-squared error.

    NOTE(review): ``XLMRoberta.run`` is assumed to yield one tensor of shape
    ``(2, n_chunks, input_size)`` per text pair (that is what the recorded
    notebook output shows) -- confirm against the wrapper.
    """

    def __init__(self, input_size, lstm_hidden_size, num_lstm_layers, model_name = 'xlm-roberta-base', train_path = '../../../data/train/train.csv', train_size = 0.8, batch_size = 32, shuffle = True):
        """Build the encoder, LSTM, regression head and optimizer, then load the data.

        Args:
            input_size: Width of the embeddings fed to the LSTM.
            lstm_hidden_size: Hidden size of the LSTM.
            num_lstm_layers: Number of stacked LSTM layers.
            model_name: Name passed to the XLM-RoBERTa wrapper and tokenizer.
            train_path: CSV file with ``text1``, ``text2`` and ``overall`` columns.
            train_size: Fraction of the data used for training in ``batch_data``.
            batch_size: Batch size of the loaders built by ``batch_data``.
            shuffle: Whether the training loader built by ``batch_data`` shuffles.
        """
        super().__init__()
        self.xlmroberta_model = XLMRoberta(model_name)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        self.chunker = Chunker(self.tokenizer, 512)
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=lstm_hidden_size,
                            num_layers=num_lstm_layers,
                            batch_first=True,
                            bidirectional=False)
        # The factor 2 is for the two texts of a pair (their last-step outputs
        # are concatenated), not for bidirectionality.
        self.fc = nn.Linear(lstm_hidden_size * 2, 1)

        # These settings used to be accepted and then ignored; keep them so
        # batch_data honours what the caller asked for.
        self.train_size = train_size
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Optimise the LSTM together with the head; with only the head in the
        # optimizer the LSTM would stay at its random initialisation.
        trainable_parameters = list(self.lstm.parameters()) + list(self.fc.parameters())
        self.optimizer = torch.optim.AdamW(trainable_parameters, lr=1e-5)
        self.loss_function = nn.MSELoss()

        self.train_df, self.train_label = self.get_data(train_path)

    def get_data(self, path):
        """Read the training CSV and return ``(frame, labels)``.

        Only the first two rows are kept (a small debugging subset). The
        labels are taken from the same subset so that features and targets
        always have the same length.
        """
        train_df = pd.read_csv(path)
        train_short_df = train_df.head(2)
        return train_short_df, train_short_df["overall"]

    def chunk_data(self, df):
        """Chunk both text columns of ``df``.

        Returns a list with one ``[chunks_of_text1, chunks_of_text2]`` entry
        per row, in row order. Rows are paired positionally, so the frame may
        carry any index (e.g. after filtering or shuffling).
        """
        return [
            [self.chunker.chunk(text1), self.chunker.chunk(text2)]
            for text1, text2 in zip(df["text1"], df["text2"])
        ]

    def batch_data(self, input_ids, labels):
        """Split ``(input_ids, labels)`` into training and validation loaders.

        The split fraction, batch size and training shuffle flag come from the
        constructor arguments. Labels are converted to ``float32`` (the
        regression target dtype) through NumPy, so lists, arrays and pandas
        Series with any index are accepted.

        NOTE(review): ``torch.tensor(input_ids)`` needs a rectangular nested
        sequence; ragged chunk lists must be padded by the caller first.
        """
        input_ids = torch.tensor(input_ids)
        labels = torch.as_tensor(np.asarray(labels, dtype=np.float32))
        dataset = TensorDataset(input_ids, labels)
        train_size = int(self.train_size * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        train_dataloader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=self.shuffle)
        # Validation order does not influence the metrics, so it is not shuffled.
        val_dataloader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)
        return train_dataloader, val_dataloader

    def _manage_device(self) -> None:
        """
            Manage the device to run the model on
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.xlmroberta_model.to(self.device)
        self.lstm.to(self.device)
        # The head must live on the same device as the LSTM output it consumes.
        self.fc.to(self.device)

    def get_embeddings(self, input_ids):
        """
            Get the embeddings from the XLM-Roberta model
        """
        # The inputs are passed on untouched; presumably the wrapper handles
        # device placement and tensor conversion itself -- verify against it.
        outputs = self.xlmroberta_model.run(input_ids)
        return outputs

    def test_batch(self):
        """Smoke-test ``batch_data`` by printing every training batch."""
        # batch_data expects chunked ids, not the raw DataFrame.
        input_ids = self.chunk_data(self.train_df)
        train_dataloader, _ = self.batch_data(input_ids, self.train_label)
        for batch in train_dataloader:
            print(batch)
            print("Done")

    def fit(self):
        """Run one optimisation step over all loaded training pairs.

        The per-pair MSE losses are summed, back-propagated once, and followed
        by a single optimizer step. Nothing is updated when there are no
        training pairs.
        """
        self._manage_device()
        train_data = self.chunk_data(self.train_df)
        # Positional float32 view of the targets, independent of the pandas index.
        labels = np.asarray(self.train_label, dtype=np.float32)
        input_embeddings = self.get_embeddings(train_data)

        self.optimizer.zero_grad()
        batch_loss = None
        for i, row in enumerate(input_embeddings):
            row = row.to(self.device)
            print(row.shape)
            lstm_out, _ = self.lstm(row)
            lstm_out_last = lstm_out[:, -1, :]  # Extract output of the last time step
            pair_features = lstm_out_last.reshape(1, -1)  # concatenate the two outputs
            output = self.fc(pair_features)
            print(output)
            label = torch.tensor(labels[i], dtype=torch.float32, device=self.device)
            print(label)
            # Match the target shape to the prediction so MSELoss does not
            # have to broadcast (and warn about) mismatched sizes.
            loss = self.loss_function(output, label.reshape(output.shape))
            batch_loss = loss if batch_loss is None else batch_loss + loss

        if batch_loss is None:
            return
        # Back-propagate the accumulated loss (not just the last pair's) and
        # actually apply the update.
        batch_loss.backward()
        self.optimizer.step()
        print(batch_loss)

    def train(self, mode=None):
        """Run the training step, or switch train/eval mode when ``mode`` is given.

        Called without arguments this keeps its historical meaning and runs
        ``fit()``. Because the name shadows ``nn.Module.train``, a boolean
        ``mode`` is forwarded to the base class so that ``eval()`` (which
        calls ``train(False)``) and parent modules keep working; in that case
        the module itself is returned, as ``nn.Module.train`` does.
        """
        if mode is not None:
            return super().train(mode)
        self.fit()

In [13]:
# Instantiate with 768-wide inputs, a 384-unit single-layer LSTM and the
# default model/data settings, then run the training step.
lstmonxlmroberta = LSTMOnXLMRoberta(
    input_size=768,
    lstm_hidden_size=384,
    num_lstm_layers=1,
)
lstmonxlmroberta.train()


Token indices sequence length is longer than the specified maximum sequence length for this model (748 > 512). Running this sequence through the model will result in indexing errors


torch.Size([2, 1, 768])
tensor([[-0.0748]], grad_fn=<AddmmBackward0>)
tensor(4.)
torch.Size([2, 2, 768])
tensor([[-0.0967]], grad_fn=<AddmmBackward0>)
tensor(3.6667)


  return F.mse_loss(input, target, reduction=self.reduction)


tensor(30.7673, grad_fn=<AddBackward0>)
