In [1]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
from torch import Tensor
from typing import List, Tuple
from torch.utils.data import TensorDataset, DataLoader, random_split
import sys
sys.path.append('/Users/germa/thesis/bachelor_project2023/src/models/lstm')
from xlm_roberta import XLMRoberta
sys.path.append('/Users/germa/thesis/bachelor_project2023/src/utils')
from chunker import Chunker

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [38]:
class LSTMOnXLMRoberta(nn.Module):
    def __init__(self, input_size, lstm_hidden_size, num_lstm_layers, model_name = 'xlm-roberta-base', train_path = '../../../data/train/train.csv', train_size = 0.8, batch_size = 32, shuffle = True):
        super(LSTMOnXLMRoberta, self).__init__()
        self.xlmroberta_model = XLMRoberta(model_name)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        self.chunker = Chunker(self.tokenizer, 512)
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=lstm_hidden_size,
                            num_layers=num_lstm_layers,
                            batch_first=True,
                            bidirectional=False)
        self.fc = nn.Linear(lstm_hidden_size * 2, 1)

        parameters_to_optimize = list(self.fc.parameters()) + list(self.lstm.parameters()) + list(self.xlmroberta_model.parameters())

        self.optimizer = torch.optim.AdamW(parameters_to_optimize, lr=1e-5)
        self.loss_function = nn.MSELoss()
        self.train_df, self.train_label = self.get_data(train_path)
        torch.manual_seed(42)

    def parameter_to_optimize(self):
        for param in self.xlmroberta_model.parameters():
            param.requires_grad = True
        for param in self.lstm.parameters():
            param.requires_grad = True
        for param in self.fc.parameters():
            param.requires_grad = True

    def get_data(self, path):
        train_df = pd.read_csv(path)
        train_short_df = train_df.head(1)
        return train_short_df, train_df["overall"]

    def chunk_data(self, df):
        texts1, texts2 = df["text1"], df["text2"]
        input_ids = []
        for i in range(len(texts1)):
            input_id_1 = self.chunker.chunk(texts1[i])
            input_id_2 = self.chunker.chunk(texts2[i])
            input_ids.append([input_id_1, input_id_2])           
        return input_ids
    
        
    def _manage_device(self) -> None:
        """
            Manage the device to run the model on
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.xlmroberta_model.to(self.device)
        self.lstm.to(self.device)
        self.fc.to(self.device)
    
    def get_embeddings(self, input_ids):
        """
            Get the embeddings from the XLM-Roberta model
        """
        outputs = self.xlmroberta_model.run(input_ids)
        return outputs


    def pad_to_same_size(self, tensors):
        max_size = max(tensor.size(0) for tensor in tensors)
        padded_tensors = []
        for tensor in tensors:
            if tensor.size(0) < max_size:
                padding = torch.zeros(max_size - tensor.size(0), tensor.size(1))
                padded_tensor = torch.cat((tensor, padding), dim=0)
                padded_tensors.append(padded_tensor)
            else:
                padded_tensors.append(tensor)
        return torch.stack(padded_tensors)
    
    def train(self, input_embeddings, labels, train_data):
        #self.train()
        batch_loss = 0
        for i, row in enumerate(input_embeddings):
            index1, index2 = len(row[0]), len(row[1])
            row_padded= self.pad_to_same_size(row)
            lstm_out, _ = self.lstm(row_padded)
            lstm_out_last1 = lstm_out[0, index1 -1, :]
            lstm_out_last2 = lstm_out[1, index2 -1, :]
            nn = torch.cat((lstm_out_last1, lstm_out_last2), 0)
            output = self.fc(nn)
            label = torch.tensor(labels[i], dtype=torch.float32).view(1)
            loss = self.loss_function(output, label)
            batch_loss += loss
        self.optimizer.zero_grad()
        #loss.backward()
        self.optimizer.step()

    def run(self):
        self._manage_device()
        train_data = self.chunk_data(self.train_df)
        labels = self.train_label
        #split data into batches
        self.parameter_to_optimize()
        input_embeddings = self.get_embeddings(train_data)
        self.train(input_embeddings, labels)

In [39]:
lstmonxlmroberta = LSTMOnXLMRoberta(768, 384, 1)
lstmonxlmroberta.run()


[[tensor([[ 4.0905e-02,  6.2396e-02, -1.7472e-02,  5.7991e-02, -3.3865e-03,
         -5.5634e-02,  6.5424e-03,  5.6553e-02, -1.0675e-03, -1.2234e-01,
         -2.0923e-02,  6.2287e-02,  5.5673e-02,  6.2484e-02, -3.1000e-02,
          6.5937e-02,  4.8447e-02,  2.6553e-02,  1.3085e-01, -1.9854e-02,
          6.3706e-02, -1.4030e-03,  1.9903e-02,  8.1925e-02,  8.3939e-03,
         -3.6644e-02,  8.0267e-02,  7.7792e-02,  1.6930e-01, -4.0484e-02,
          4.7077e-02,  2.5932e-02, -3.2395e-02,  6.3032e-02,  8.7296e-02,
          1.4874e-01, -3.7790e-02, -2.3070e-02,  6.1640e-02,  7.6694e-02,
          8.8929e-02,  3.5382e-02,  9.3020e-02,  7.7223e-02,  3.6105e-02,
         -1.4366e-02,  2.7386e-02,  3.9184e-03, -7.8590e-03, -2.0560e-02,
          1.9638e-02, -3.8450e-02,  9.8752e-02,  5.1279e-02, -2.1623e-01,
         -3.1953e-02,  5.3428e-02,  3.9249e-02,  3.6214e-02,  8.3689e-02,
         -3.6675e-02,  9.2704e-02,  5.4579e-02,  5.8259e-02,  1.7206e-02,
          1.4652e-02, -1.2182e-01,  

In [24]:
class XLMRob(nn.Module):
    def __init__(self, model_name: str):
        super(XLMRob, self).__init__()
        self.model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        torch.manual_seed(42)
        
    #run function that receives a string, it tokenizes it and returns the embeddings
    def test(self, text: str) -> Tensor:
        # input_ids = self.tokenizer.encode(text, return_tensors='pt')
        # print(input_ids)
        outputs = self.model(text)
        return outputs.last_hidden_state.mean(dim=1)
    
    def run(self, input_ids):
                    chunk_ids = torch.tensor(input_ids).unsqueeze(0)
                    attention_mask = [1 if i != 0 else 0 for i in chunk_ids.tolist()]
                    attention_mask = torch.tensor(attention_mask).unsqueeze(0)
                    outputs= self.model(chunk_ids, attention_mask)
                    last_hidden_state = outputs.last_hidden_state.mean(dim=1).squeeze(0)
                    return outputs.last_hidden_state.mean(dim=1)

In [26]:
test= "my life is amazing and great and happy"
t = torch.tensor([0,759,6897,83,44613,136,6782,136, 17723,2])

xlm = XLMRob('xlm-roberta-base')
print(xlm.run(t))
print(xlm.run(t))

tensor([[-1.8432e-02,  5.2022e-02, -4.2706e-03,  2.6372e-02,  6.1900e-02,
         -6.6941e-02, -2.3860e-02, -1.2533e-02,  1.2653e-02, -9.2673e-02,
          2.2528e-02,  5.2544e-03,  2.4869e-01,  5.5534e-02,  2.5687e-02,
         -1.1155e-02,  3.5093e-02, -1.2294e-02,  1.1389e-02,  3.9266e-02,
         -9.9249e-03, -2.4965e-02,  2.2498e-02,  2.6500e-02,  4.9724e-03,
          6.7137e-02, -2.3471e-04,  7.9836e-02,  3.9751e-02,  2.9158e-02,
          5.4116e-03, -9.7006e-03,  1.5934e-02, -6.1360e-03,  9.3478e-03,
          8.6105e-02, -4.4822e-02, -3.7905e-02, -1.8973e-02,  1.5970e-02,
          1.8462e-02,  6.0275e-02, -6.4021e-02,  1.3625e-02,  1.3653e-02,
         -8.7474e-03,  6.4311e-02,  3.3904e-02,  8.8496e-04, -5.2797e-02,
          1.3059e-02, -2.5622e-02,  2.3061e-02,  7.4072e-02, -5.7084e-02,
         -3.4691e-02,  2.5629e-02, -2.3299e-02, -6.6048e-02,  1.9622e-01,
          2.6276e-02,  8.0608e-02,  1.4119e-02,  1.0655e-01,  3.5507e-03,
         -3.1255e-02, -3.0850e-02, -1.

  chunk_ids = torch.tensor(input_ids).unsqueeze(0)


In [15]:

xlm2 = XLMRob('xlm-roberta-base')
print(xlm2.run(t))

  chunk_ids = torch.tensor(input_ids).unsqueeze(0)


ValueError: too many values to unpack (expected 2)