In [105]:
import pandas as pd
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig
import torch
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
config = XLMRobertaConfig(
    num_labels=1,
    output_hidden_states=False,
    output_attentions=False,
)
config.vocab_size = tokenizer.vocab_size

model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=1)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
import warnings
warnings.filterwarnings("ignore")

In [87]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [88]:
train_data = pd.read_csv("../../data/train/train.csv")
test_data = pd.read_csv("../../data/test/final_test_pairs.csv")

print(train_data.columns)

Index(['Unnamed: 0', 'pair_id', 'id1', 'id2', 'text1', 'text2', 'overall',
       'lang1', 'lang2'],
      dtype='object')


In [89]:
def tokenize_and_shorten_sentence(text: str) -> Tensor:
    """
    Tokenize the input text and shorten it to 256 tokens.

    Args:
        text (str): The input text.

    Returns:
        Tensor: The tokenized and shortened text tensor.
    """
    tokenized_text = tokenizer(text, return_tensors="pt", padding=False, truncation=False, add_special_tokens=False, max_length = None)

    if tokenized_text["input_ids"].shape[1] > 256:
        shorten_ids =  tokenized_text["input_ids"][:, :200].tolist()[0] + tokenized_text["input_ids"][:, -54:].tolist()[0]
    else:
        shorten_ids = tokenized_text["input_ids"].tolist()[0] + [tokenizer.pad_token_id] * (254 - tokenized_text["input_ids"].shape[1])


    return tokenizer.decode(shorten_ids)

In [90]:
def tokenize_texts(text1:str, text2:str):
    tokenized_text = tokenizer(text1, text2, return_tensors="pt", padding="max_length", 
                               truncation=True, add_special_tokens=True, max_length=512)
    return tokenized_text["input_ids"], tokenized_text["attention_mask"]

In [91]:
input_ids = []
attention_mask = []


for i in range(len(train_data)):
    text1 = train_data["text1"][i]
    text2 = train_data["text2"][i]
    text1_truncated = tokenize_and_shorten_sentence(text1)
    text2_truncated = tokenize_and_shorten_sentence(text2)

    text_input_ids, text_attention_mask = tokenize_texts(text1_truncated, text2_truncated)
    input_ids.append(text_input_ids.tolist()[0])
    attention_mask.append(text_attention_mask.tolist()[0])

    

Token indices sequence length is longer than the specified maximum sequence length for this model (746 > 512). Running this sequence through the model will result in indexing errors
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strat

In [97]:
score = torch.tensor(train_data["overall"]).float()
data = TensorDataset(torch.tensor(input_ids).long(), torch.tensor(attention_mask).long(), score.view(-1, 1))

In [103]:
train_size = int(0.1 * len(data))
test_size = len(data) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(data, [train_size, test_size])

loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [106]:
EPOCHS = 1
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
total_loss = 0
model.train()

for epoch in range(EPOCHS):
   for idx, (ids, att, val) in enumerate(loader):
      ids, att, val = ids.to(device), att.to(device), val.to(device)

      outputs = model(input_ids=ids, attention_mask=att, labels=val)
      loss, logits = outputs[:2]

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      #scheduler.step()
      total_loss += loss.item()

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      if idx % 10 == 0:
         print("average training loss: {0:.2f}".format(total_loss / (idx+1)))
         print("current loss:", loss.item())

         print(f"logits: {logits}")

   # Store the loss value for plotting the learning curve.
   loss_values.append(total_loss)
   print("average training loss: {0:.2f}".format(total_loss))

average training loss: 10.52
current loss: 10.515270233154297
logits: tensor([[ 0.0944],
        [-0.0041],
        [ 0.0993],
        [ 0.0851]], grad_fn=<AddmmBackward0>)
average training loss: 6.66
current loss: 5.921501159667969
logits: tensor([[0.2391],
        [0.2604],
        [1.0051],
        [1.1593]], grad_fn=<AddmmBackward0>)
average training loss: 5.75
current loss: 4.26238489151001
logits: tensor([[0.4642],
        [0.9886],
        [2.1218],
        [0.5776]], grad_fn=<AddmmBackward0>)
average training loss: 4.72
current loss: 1.5303306579589844
logits: tensor([[1.3579],
        [1.7493],
        [0.5731],
        [1.2010]], grad_fn=<AddmmBackward0>)
average training loss: 4.11
current loss: 0.7820334434509277
logits: tensor([[3.1095],
        [3.4597],
        [3.0207],
        [3.3843]], grad_fn=<AddmmBackward0>)
average training loss: 3.62
current loss: 1.4791193008422852
logits: tensor([[2.0832],
        [3.1919],
        [1.6894],
        [2.0780]], grad_fn=<AddmmBa

NameError: name 'loss_values' is not defined

Predictions: tensor([[[-0.0474],
         [ 0.2472],
         [ 0.2336],
         [ 0.2687],
         [ 0.3569],
         [ 0.2603],
         [ 0.3908],
         [-0.0327]]], grad_fn=<ViewBackward0>)
