In [1]:
import pandas as pd
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig
import torch
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
config = XLMRobertaConfig(
    num_labels=1,
    output_hidden_states=False,
    output_attentions=False,
)
config.vocab_size = tokenizer.vocab_size

model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=1)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [4]:
train_data = pd.read_csv("../../data/train/train.csv")
test_data = pd.read_csv("../../data/test/final_test_pairs.csv")

print(train_data.columns)

Index(['Unnamed: 0', 'pair_id', 'id1', 'id2', 'text1', 'text2', 'overall',
       'lang1', 'lang2'],
      dtype='object')


In [5]:
def tokenize_and_shorten_sentence(text: str) -> Tensor:
    """
    Tokenize the input text and shorten it to 256 tokens.

    Args:
        text (str): The input text.

    Returns:
        Tensor: The tokenized and shortened text tensor.
    """
    tokenized_text = tokenizer(text, return_tensors="pt", padding=False, truncation=False, add_special_tokens=False, max_length = None)

    if tokenized_text["input_ids"].shape[1] > 256:
        shorten_ids =  tokenized_text["input_ids"][:, :200].tolist()[0] + tokenized_text["input_ids"][:, -54:].tolist()[0]
    else:
        shorten_ids = tokenized_text["input_ids"].tolist()[0] + [tokenizer.pad_token_id] * (254 - tokenized_text["input_ids"].shape[1])


    return tokenizer.decode(shorten_ids)

In [6]:
def tokenize_texts(text1:str, text2:str):
    tokenized_text = tokenizer(text1, text2, return_tensors="pt", padding="max_length", 
                               truncation=True, add_special_tokens=True, max_length=512)
    return tokenized_text["input_ids"], tokenized_text["attention_mask"]

In [14]:
input_ids = []
attention_mask = []


for i in range(len(train_data[:25])):
    text1 = train_data["text1"][i]
    text2 = train_data["text2"][i]
    text1_truncated = tokenize_and_shorten_sentence(text1)
    text2_truncated = tokenize_and_shorten_sentence(text2)

    text_input_ids, text_attention_mask = tokenize_texts(text1_truncated, text2_truncated)
    input_ids.append(text_input_ids.tolist()[0])
    attention_mask.append(text_attention_mask.tolist()[0])

    

In [16]:
print(attention_mask[22])
print(input_ids[22])
print(len(attention_mask[22]))

score = torch.tensor(train_data["overall"]).float()
data = TensorDataset(torch.tensor(input_ids).long(), torch.tensor(attention_mask).long(), score.view(-1, 1))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

AssertionError: Size mismatch between tensors

In [None]:
train_size = int(0.8 * len(data))
test_size = len(data) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(data, [train_size, test_size])

loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)

In [None]:
def eval_pearson_corr(model, val_loader):
    """
    Evaluate the Pearson correlation of the model.

    Args:
        model (XLMRobertaForSequenceClassification): The model to evaluate.
        val_loader (DataLoader): The validation data loader.

    Returns:
        float: The Pearson correlation of the model.
    """
    model.eval()
    preds = []
    labels = []
    for idx, batch in enumerate(val_loader):
        input_ids, attention_mask, label = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=None)
            preds.append(outputs.logits.cpu().numpy())
            labels.append(label.cpu().numpy())
            print(f"The correlation for batch number {len(idx)} is :")
            print(np.corrcoef(np.concatenate(preds).flatten(), np.concatenate(labels).flatten())[0, 1])
    preds = np.concatenate(preds)
    labels = np.concatenate(labels)
    return np.corrcoef(preds.flatten(), labels.flatten())[0, 1]



In [None]:
EPOCHS = 1
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
total_loss = 0
loss_values = []

def train(model, loader, val_loader, optimizer, EPOCHS):

   model.train()
   for epoch in range(EPOCHS):
      for idx, (ids, att, val) in enumerate(loader):
         ids, att, val = ids.to(device), att.to(device), val.to(device)
         outputs = model(input_ids=ids, attention_mask=att, labels=val)
         loss, logits = outputs[:2]
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()
         total_loss += loss.item()
         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

         if idx % 10 == 0:
            print("average training loss: {0:.2f}".format(total_loss / (idx+1)))
            print("current loss:", loss.item())
            print(f"logits: {logits}")
            
      loss_values.append(total_loss)
      print("average training loss: {0:.2f}".format(total_loss))

train(model, loader, val_loader, optimizer, EPOCHS)




average training loss: 6.33
current loss: 6.329257965087891
logits: tensor([[ 0.0171],
        [-0.0023],
        [-0.0311],
        [-0.1140],
        [ 0.2167],
        [ 0.0257],
        [-0.0162],
        [ 0.2933]], grad_fn=<AddmmBackward0>)
average training loss: 7.90
current loss: 6.7416276931762695
logits: tensor([[0.8825],
        [0.2642],
        [1.2874],
        [1.0074],
        [0.8416],
        [1.1070],
        [0.5024],
        [0.5320]], grad_fn=<AddmmBackward0>)
average training loss: 6.00
current loss: 2.5527827739715576
logits: tensor([[1.6781],
        [1.9889],
        [2.2216],
        [2.1277],
        [1.5780],
        [1.8669],
        [1.8939],
        [1.8594]], grad_fn=<AddmmBackward0>)
average training loss: 4.71
current loss: 2.4748387336730957
logits: tensor([[2.7295],
        [2.9217],
        [3.2007],
        [2.4448],
        [0.5208],
        [3.1640],
        [2.2395],
        [2.8676]], grad_fn=<AddmmBackward0>)
average training loss: 3.99
curre

TypeError: evaluation() takes 2 positional arguments but 3 were given

In [None]:

#perfom compute_accuracy on validation set
print(eval_pearson_corr(model, val_loader))

The correlation for batch number3 is :
0.7290684776166821
The correlation for batch number3 is :
0.7096006334463993
The correlation for batch number3 is :
0.7001433295226278
The correlation for batch number3 is :
0.6739208806995669
The correlation for batch number3 is :
0.7028208290554693
The correlation for batch number3 is :
0.7109311977826535
The correlation for batch number3 is :
0.7030675850692871
The correlation for batch number3 is :
0.6305507729832571
The correlation for batch number3 is :
0.6157131781406932
The correlation for batch number3 is :
0.6283542232713288
The correlation for batch number3 is :
0.5507159373583025
The correlation for batch number3 is :
0.5405234747681081
The correlation for batch number3 is :
0.5304123385033863
The correlation for batch number3 is :
0.5376511797971465
The correlation for batch number3 is :
0.5405870374715676
The correlation for batch number3 is :
0.5477203393546923
The correlation for batch number3 is :
0.5585409272663694
The correlatio