In [23]:
import pandas as pd
from transformers import XLMRobertaModel, XLMRobertaTokenizer
import torch

# Single source of truth for the checkpoint, so the tokenizer and the model
# are guaranteed to come from the same pretrained weights/vocabulary.
MODEL_NAME = "xlm-roberta-base"

tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
# The encoder is only used as a frozen feature extractor in this notebook, so
# evaluation mode (dropout disabled) is requested explicitly instead of relying
# on the loader's default. `.eval()` returns the module itself.
model = XLMRobertaModel.from_pretrained(MODEL_NAME).eval()

In [2]:
# Relative locations of the training pairs and the final test pairs.
train_csv_path = "../../data/train/train.csv"
test_csv_path = "../../data/test/final_test_pairs.csv"

# Read both splits into DataFrames.
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# Show which columns the training frame exposes.
print(train_data.columns)

Index(['Unnamed: 0', 'pair_id', 'id1', 'id2', 'text1', 'text2', 'overall',
       'lang1', 'lang2'],
      dtype='object')


In [3]:
def select_tokens(n_head: int, n_tail: int, text: str) -> str:
    """
        Keep only the first n_head and the last n_tail whitespace-separated
        tokens of a text.

        Parameters:
            n_head: number of tokens to keep from the head of the text (>= 0)
            n_tail: number of tokens to keep from the tail of the text (>= 0)
            text: text to select tokens from

        Returns:
            The original text, untouched, when it has at most n_head + n_tail
            tokens; otherwise the selected head and tail tokens joined by
            single spaces.

        Raises:
            ValueError: if n_head or n_tail is negative.
    """
    if n_head < 0 or n_tail < 0:
        raise ValueError("n_head and n_tail must be non-negative")

    tokens = text.split()
    n_tokens = len(tokens)

    # Short texts are returned as-is (original whitespace preserved).
    if n_tokens <= n_head + n_tail:
        return text

    head = tokens[:n_head]
    # Slice from an explicit start index: `tokens[-n_tail:]` would return the
    # WHOLE list when n_tail == 0, because -0 == 0.
    tail = tokens[n_tokens - n_tail:]

    return " ".join(head + tail)

N_HEAD = 200
N_TAIL = 56

# Truncate each text of the pair by keeping N_HEAD tokens from its head and
# N_TAIL tokens from its tail (whitespace tokens, see select_tokens).
for column in ("text1", "text2"):
    train_data[f"{column}_truncated"] = train_data[column].apply(
        lambda text: select_tokens(N_HEAD, N_TAIL, text)
    )

# Concatenate the two truncated texts with the tokenizer's own separator token.
# A literal "<sep>" is not a special token in the XLM-R vocabulary, so it would
# be split into ordinary sub-word pieces instead of acting as a separator.
separator = f" {tokenizer.sep_token} "
train_data["text_truncated"] = train_data["text1_truncated"] + separator + train_data["text2_truncated"]

# Get the embeddings for the train set

In [21]:
def pad_embedding(embedding, target_length):
    """
    Pads the embedding tensor with zeros along the sequence dimension up to
    the target length.

    Parameters:
        embedding: The embedding tensor of shape
            [batch_size, sequence_length, embedding_size].
        target_length: The target sequence length.

    Returns:
        A tensor of shape [batch_size, target_length, embedding_size] with the
        same dtype and device as the input when the input is shorter than
        target_length; otherwise the input tensor itself (it is never
        truncated).
    """
    padding_length = target_length - embedding.size(1)
    if padding_length <= 0:
        return embedding

    # new_zeros inherits dtype and device from `embedding`, so half-precision
    # or GPU tensors are padded without an upcast or a device mismatch, and the
    # batch dimension is taken from the input instead of being fixed to 1.
    padding = embedding.new_zeros(
        (embedding.size(0), padding_length, embedding.size(2))
    )
    return torch.cat([embedding, padding], dim=1)

In [None]:
# Maximum number of sub-word tokens fed to the model for each text pair.
MAX_SEQ_LENGTH = 512
# Width of one token embedding, read from the loaded model instead of being
# hard-coded (the original literal was 768).
HIDDEN_SIZE = model.config.hidden_size

# Pre-allocate one [MAX_SEQ_LENGTH, HIDDEN_SIZE] slot per training row. Every
# slot is overwritten in the loop below, so the uninitialised memory returned
# by torch.empty is never read.
# NOTE: at float32 this is 512 * 768 * 4 bytes = 1.5 MiB per row, all held in RAM.
train_embeddings = torch.empty((len(train_data), MAX_SEQ_LENGTH, HIDDEN_SIZE))

# Pure inference: gradients are disabled once for the whole loop.
with torch.no_grad():
    for i, text in enumerate(train_data["text_truncated"]):
        # One text at a time, truncated to MAX_SEQ_LENGTH sub-word tokens.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LENGTH)
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state
        # Zero-pad shorter sequences so every row has the same length.
        embeddings = pad_embedding(embeddings, MAX_SEQ_LENGTH)

        # Drop the leading batch dimension of size 1 when storing the row.
        train_embeddings[i] = embeddings[0]

print(train_embeddings.shape)