In [16]:
import pandas as pd
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig, pipeline
import torch
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader, random_split
import numpy as np
from typing import List, Tuple
import time
import os

# TOKENIZERS_PARALLELISM is an environment variable, not a `from_pretrained`
# argument, so it is set here, before any tokenizer is created. The value
# "false" matches what this cell previously ended up with.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the pretrained tokenizer once; it is reused by the chunking helpers below.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Stand-alone configuration object with a single output label and the
# tokenizer's vocabulary size.
# NOTE(review): this config is not passed to the model in this cell; it is kept
# in case later cells rely on it.
config = XLMRobertaConfig(
    num_labels=1,
    output_hidden_states=False,
    output_attentions=False,
)
config.vocab_size = tokenizer.vocab_size

# Pretrained encoder with a single-output sequence-classification head.
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=1)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:

# Execute the project's helper scripts inside this kernel so the names they
# define are available to the cells below.
# NOTE(review): ArticleDataset and MlUtils (used further down) are presumably
# defined by these scripts, and the scripts may depend on each other's order —
# confirm before reordering.
%run ../../utils/article_dataset.py
%run ../../utils/dev_utils.py
%run ../../utils/chunker.py
%run ../../utils/ml_utils.py

# Read the training and test tables from CSV (paths are relative to the
# notebook's working directory).
train_data = pd.read_csv("../../../data/train/train.csv")
test_data = pd.read_csv("../../../data/test/merged_test_data.csv")

In [39]:

def chunk(text: str, max_length: int = 256) -> List[List[int]]:
    """Tokenize ``text`` and split its token ids into consecutive windows.

    The text is encoded once with the module-level ``tokenizer`` (special
    tokens requested, truncation disabled) and the resulting id sequence is
    sliced into non-overlapping pieces. Because encoding happens before
    slicing, special tokens are added once for the whole text rather than
    once per chunk.

    Args:
        text: The raw text to encode.
        max_length: Maximum number of token ids per chunk. Defaults to 256.

    Returns:
        The chunks in their original order. Every chunk holds ``max_length``
        ids except possibly the last one, which holds the remainder. An empty
        id sequence yields an empty list.

    Raises:
        ValueError: If ``max_length`` is not a positive number.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")

    # For a single string the tokenizer already returns a flat list of ids, so
    # there is no need to build a tensor (or pad) only to convert it back.
    input_ids = tokenizer(text, add_special_tokens=True, truncation=False)["input_ids"]

    return [
        input_ids[start:start + max_length]
        for start in range(0, len(input_ids), max_length)
    ]

def create_chunks(df: pd.DataFrame) -> pd.DataFrame:
    """Add token-id chunk columns for both text columns of ``df``.

    ``chunk`` is applied to every value of ``text1`` and ``text2``; the
    results are stored in ``chunks1`` and ``chunks2`` respectively.

    Args:
        df: Frame containing the columns ``text1`` and ``text2``.

    Returns:
        The same frame object that was passed in, modified in place.
    """
    column_pairs = (("text1", "chunks1"), ("text2", "chunks2"))
    for text_column, chunk_column in column_pairs:
        df[chunk_column] = df[text_column].apply(chunk)
    return df

def create_combinations(df: pd.DataFrame) -> pd.DataFrame:
    df["combinations"] = df.apply(lambda x: [(i, j) for i in x["chunks1"] for j in x["chunks2"]], axis=1)
    return df

# Chunk both text columns, then build the per-row chunk pairings.
train_data = train_data.pipe(create_chunks).pipe(create_combinations)

In [40]:
# Wrap the prepared training frame in the project's dataset class and serve it
# one item at a time, in order, using the project's collate function.
train_dataset = ArticleDataset(train_data)
train_loader = DataLoader(
    train_dataset,
    collate_fn=MlUtils.collate_fn,
    shuffle=False,
    batch_size=1,
)