In [1]:
import pandas as pd
from collections import defaultdict
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig, pipeline
import torch
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader, random_split
import numpy as np
from typing import List, Tuple
import time
import os

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", TOKENIZERS_PARALLELISM=True)
config = XLMRobertaConfig(
    num_labels=1,
    output_hidden_states=False,
    output_attentions=False,
);
config.vocab_size = tokenizer.vocab_size

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", TOKENIZERS_PARALLELISM=True)
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=1);

os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:

%run ../../utils/article_dataset.py
%run ../../utils/dev_utils.py
%run ../../utils/chunker.py
%run ../../utils/ml_utils.py

train_data = pd.read_csv("../../../data/train/train.csv")
test_data = pd.read_csv("../../../data/test/merged_test_data.csv")

DEV_NUM = 8
train_data = train_data[:DEV_NUM]
test_data = test_data[:DEV_NUM]

chunker = Chunker(tokenizer, 255)

In [3]:
def create_chunks(df: pd.DataFrame) -> pd.DataFrame:
    df["chunks1"] = df["text1"].apply(chunker.chunk)
    df["chunks2"] = df["text2"].apply(chunker.chunk)
    return df

def create_combinations(df: pd.DataFrame) -> pd.DataFrame:
    sep_token_id = tokenizer.sep_token_id
    sep_token_array = np.array([sep_token_id])

    combined = []
    for row_idx, row in df.iterrows():
        combinations = defaultdict(np.ndarray)
        chunks1 = row["chunks1"] # e.g. (2, 256)
        chunks2 = row["chunks2"] # e.g. (3, 256)
        
        for i, chunk1 in enumerate(chunks1):
            for j, chunk2 in enumerate(chunks2):
                if chunk1[-1] != sep_token_id:
                    chunk1 = np.concatenate((chunk1, sep_token_array))

                combinations[(row_idx, i, j)] = np.concatenate((chunk1, chunk2))
                
        combined.append(combinations)

    df["combinations"] = combined
    return df

train_data = create_chunks(train_data)
train_data = create_combinations(train_data)

df = train_data[["combinations", "overall"]]

batch_size = 2
batched_df = np.array_split(df, np.ceil(len(df) / batch_size))

Token indices sequence length is longer than the specified maximum sequence length for this model (716 > 512). Running this sequence through the model will result in indexing errors


In [4]:
def train(batched_df):
    model.train()
    for idx, batch in enumerate(batched_df):
        combinations_list = []
        labels_list = []

        labels = batch["overall"].values
        combinations = batch["combinations"].values[1]

        row, i = -1, -1
        for key, combination in combinations.items():
            curr_row = key[0]
            if curr_row != row:
                row = curr_row
                i += 1
            
            label = labels[i]
            tensor_combination = torch.tensor(combination, dtype=torch.float)

            labels_list.append(float(label))
            combinations_list.append(tensor_combination)

        ids = torch.stack(combinations_list).long()
        labels = torch.tensor(labels_list, dtype=torch.float).float()

        ids = ids.to(device)
        labels = labels.to(device)

        outputs = model(ids, labels=labels)
        loss, logits = outputs[:2]    
        print(loss)

        loss.backward()

In [5]:
train(batched_df)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor(14.5752, grad_fn=<MseLossBackward0>)
tensor(4.6184, grad_fn=<MseLossBackward0>)
tensor(1.1300, grad_fn=<MseLossBackward0>)
tensor(12.3055, grad_fn=<MseLossBackward0>)
