In [412]:
import pandas as pd
from transformers import XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig
import torch
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", pad_token="<pad>")
config = XLMRobertaConfig()
config.vocab_size = tokenizer.vocab_size
model = XLMRobertaModel.from_pretrained("xlm-roberta-base")


In [413]:
import warnings
warnings.filterwarnings("ignore")

In [414]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tru

In [415]:
train_data = pd.read_csv("../../data/train/train.csv")
test_data = pd.read_csv("../../data/test/final_test_pairs.csv")

print(train_data.columns)

Index(['Unnamed: 0', 'pair_id', 'id1', 'id2', 'text1', 'text2', 'overall',
       'lang1', 'lang2'],
      dtype='object')


In [416]:
def tokenize_and_shorten_sentence(text: str) -> Tensor:
    """
    Tokenize the input text and shorten it to 256 tokens.

    Args:
        text (str): The input text.

    Returns:
        Tensor: The tokenized and shortened text tensor.
    """
    tokenized_text = tokenizer(text, return_tensors="pt", padding=False, truncation=True, add_special_tokens=False)

    if tokenized_text["input_ids"].shape[1] > 256:
        #concatenate the first 200 tokens and the last 56 tokens into one list
        shorten_ids =  tokenized_text["input_ids"][:, :200].tolist()[0] + tokenized_text["input_ids"][:, -56:].tolist()[0]
        

    else:
        shorten_ids = tokenized_text["input_ids"].tolist()[0] + [tokenizer.pad_token_id] * (256 - tokenized_text["input_ids"].shape[1])


    return tokenizer.decode(shorten_ids)

In [417]:
def tokenize_texts(text1:str, text2:str):
    tokenized_text = tokenizer(text1, text2, return_tensors="pt", padding="max_length", 
                               truncation=True, add_special_tokens=True, max_length=512)
    return tokenized_text["input_ids"], tokenized_text["attention_mask"]

In [418]:
input_ids = []
attention_mask = []


for i in range(len(train_data)):
    text1 = train_data["text1"][i]
    text2 = train_data["text2"][i]
    text1_truncated = tokenize_and_shorten_sentence(text1)
    text2_truncated = tokenize_and_shorten_sentence(text2)
    
    text_input_ids, text_attention_mask = tokenize_texts(text1_truncated, text2_truncated)
    input_ids.append(text_input_ids.tolist()[0])
    attention_mask.append(text_attention_mask.tolist()[0])

    

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [419]:
score = torch.tensor(train_data["overall"]).long()
data = TensorDataset(torch.tensor(input_ids).long(), torch.tensor(attention_mask).long(), score.view(-1, 1))
loader = DataLoader(data, batch_size=4, shuffle=True)

In [441]:
with torch.no_grad():
   for idx, (ids, att, val) in enumerate(loader):
      ids = ids.to(device)
      att = att.to(device)
      val = val.to(device)

      print(ids.shape, att.shape, val.shape)

      y_pred = model(input_ids=ids, attention_mask=att)
      last_hidden_states = y_pred.last_hidden_state

# Assuming you want to perform regression, you'll add a linear layer on top
# Let's assume regression_target_size is the size of your regression target
      regression_target_size = 1  # Example: single-dimensional regression
      regressor = torch.nn.Linear(model.config.hidden_size, regression_target_size)
# Forward pass through the regressor layer
      predictions = regressor(last_hidden_states).squeeze()
   
# Now predictions contains the regression outputs
      print("Predictions:", predictions)
      break
   

torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4, 1])
Predictions: tensor([[ 0.0029,  0.1713,  0.2096,  ...,  0.1565,  0.0955,  0.0737],
        [-0.0071,  0.1684,  0.1290,  ...,  0.0853,  0.0794,  0.0926],
        [-0.0495,  0.1623,  0.1591,  ...,  0.2384,  0.1284, -0.0032],
        [-0.0032,  0.1742,  0.1598,  ...,  0.0927,  0.0927,  0.0520]])


In [439]:

# Initialize XLM-RoBERTa model and tokenizer
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaModel.from_pretrained(model_name)

# Example input text
input_text = "This is an example sentence."

# Tokenize input text
inputs = tokenizer(input_text, return_tensors="pt")

# Forward pass through the model
with torch.no_grad():
    outputs = model(**inputs)

# Get the final hidden states (last layer's hidden states)
last_hidden_states = outputs.last_hidden_state

# Assuming you want to perform regression, you'll add a linear layer on top
# Let's assume regression_target_size is the size of your regression target
regression_target_size = 1  # Example: single-dimensional regression
regressor = torch.nn.Linear(model.config.hidden_size, regression_target_size)

# Forward pass through the regressor layer
predictions = regressor(last_hidden_states)

# Now predictions contains the regression outputs
print("Predictions:", predictions)


Predictions: tensor([[[0.1291],
         [0.4641],
         [0.4428],
         [0.4613],
         [0.3763],
         [0.4047],
         [0.4604],
         [0.0469]]], grad_fn=<ViewBackward0>)
