In [9]:
import pandas as pd
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig, pipeline
import torch
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader, random_split
import numpy as np
from typing import List, Tuple
import time

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", TOKENIZERS_PARALLELISM=True)
config = XLMRobertaConfig(
    num_labels=1,
    output_hidden_states=False,
    output_attentions=False,
)
config.vocab_size = tokenizer.vocab_size

model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=1)

summarizer = pipeline("summarization", model="Falconsai/text_summarization")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [11]:
train_data = pd.read_csv("../../../data/train/train.csv")
test_data = pd.read_csv("../../../data/test/final_test_pairs.csv")

print(train_data.columns)

Index(['Unnamed: 0', 'pair_id', 'id1', 'id2', 'text1', 'text2', 'overall',
       'lang1', 'lang2'],
      dtype='object')


In [12]:
def _add_pad_token_to_text(text:str, max_length:int=254):
    return text + " <pad>" * (max_length - len(text.split()))

def summarize(df):
    df['summary1'] = df['text1'].apply(
        lambda x: summarizer(x, max_length=254, min_length=150, do_sample=False)[0]['summary_text'] if len(x.split()) > 254 else x).apply(lambda x: _add_pad_token_to_text(x, 254))
    df['summary2'] = df['text2'].apply(
        lambda x: summarizer(x, max_length=254, min_length=150, do_sample=False)[0]['summary_text'] if len(x.split()) > 254 else x).apply(lambda x: _add_pad_token_to_text(x, 254))

    return df

In [13]:
def tokenize_texts(df: pd.DataFrame, col1: str = "summary1", col2: str = "summary2") -> Tuple[Tensor, Tensor]:
        """
            Tokenize the input texts and return the input_ids and attention_mask
        """

        texts1, texts2 = df[col1], df[col2]
        input_ids, attention_mask = [], []

        for idx, (t1, t2) in enumerate(zip(texts1, texts2)):
            tokenized_text = tokenizer(t1, t2, return_tensors="pt", padding="max_length", 
                                    truncation=True, add_special_tokens=True, max_length=512)
            input_ids.append(tokenized_text["input_ids"].tolist()[0])
            att = [1 if i != 1 else 0 for i in tokenized_text["input_ids"].tolist()[0]]
            attention_mask.append(att)

        return torch.tensor(input_ids).long(), torch.tensor(attention_mask).long()

In [18]:
train_data_parts = np.array_split(train_data, 20)

In [19]:
# divide the train_data df into 10 parts

# for each part, summmarize the text1 and text2 and tokenize them
for i, part in enumerate(train_data_parts):
    print(f"Processing part {i+1} of 20")
    part = summarize(part)
    part.to_csv(f"../../../data/train/train_part_{i+1}.csv", index=False)

Processing part 1 of 20


In [None]:
train_data_parts = np.array_split(train_data, 10)

In [8]:
import numpy as np
import pandas as pd
from multiprocessing import Pool

def process_part(part):
    # Summarize the text1 and text2 columns and tokenize them
    part = summarize(part)
    print(f"Processed part with {len(part)} rows.")
    return part

# Assuming train_data is already defined

# Define the number of processes to use (you can adjust this based on your system)
num_processes = 4

with Pool(num_processes) as pool:
    processed_parts = pool.map(process_part, train_data_parts)

# Save each processed part to a separate CSV file
for i, part in enumerate(processed_parts):
    part.to_csv(f"../../../data/train/train_part_{i+1}.csv", index=False)


Process SpawnPoolWorker-15:
Traceback (most recent call last):
  File "/Users/hubert/opt/anaconda3/envs/thesis_env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/hubert/opt/anaconda3/envs/thesis_env/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/hubert/opt/anaconda3/envs/thesis_env/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/hubert/opt/anaconda3/envs/thesis_env/lib/python3.9/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'process_part' on <module '__main__' (built-in)>
Process SpawnPoolWorker-17:
Traceback (most recent call last):
  File "/Users/hubert/opt/anaconda3/envs/thesis_env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/hubert/opt/anaconda3/envs/thesis_env/lib/python3.9/multiprocessing/pro

KeyboardInterrupt: 

In [None]:
batch_size = 8
shuffle = True

score = torch.tensor(train_data["overall"]).float().view(-1, 1)
tensor_dataset = TensorDataset(input_ids, attention_mask, score)

train_size = int(0.8 * len(tensor_dataset))
val_size = len(tensor_dataset) - train_size
train_dataset, val_dataset = random_split(tensor_dataset, [train_size, val_size])

# Define data loaders with appropriate batch size and shuffle
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=4)

In [86]:
def train(loader: DataLoader, epochs:int=3):
        """
            Train the model
        """
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

        #logging.info("Training the model...")
        print(f"batch size: {batch_size}, data size: {len(loader)}")

        best_pearson = -1.0
        total_loss = 0
        losses = []
        model.train()

        for epoch in range(epochs):
            start_time = time.time()
            #logging.info(f"Epoch {epoch+1} of {self.epochs}")
            print(f"{'-'*10} Epoch {epoch+1} of {epochs} {'-'*10}")

            for idx, (ids, att, val) in enumerate(loader):
                ids, att, val = ids.to(device), att.to(device), val.to(device)

                outputs = model(input_ids=ids, attention_mask=att, labels=val)
                loss, logits = outputs[:2]

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                #scheduler.step()
                total_loss += loss.item()

                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                if idx % 10 == 0:
                    print("average training loss: {0:.2f}".format(total_loss / (idx+1)))
                    print("current loss:", loss.item())

                    #print(f"logits: {logits}")

            print("starting validation...")
            #dev_true, dev_pred = self.predict(loader)
            #cur_pearson = np.corrcoef(dev_true, dev_pred)[0][1]

            #if cur_pearson > best_pearson:
            #    best_pearson = cur_pearson
            #    torch.save(self.model.state_dict(), self.model_save_path)

            #print("Current dev pearson is {:.4f}, best pearson is {:.4f}".format(cur_pearson, best_pearson))
            print("Time costed : {}s \n".format(round(time.time() - start_time, 3)))
            
            # store the loss value for plotting the learning curve.
            avg_train_loss = total_loss / len(loader)
            print("average training loss: {0:.2f}".format(avg_train_loss))
            losses.append(avg_train_loss)

In [87]:
train(train_loader, 3)

batch size: 8, data size: 1
---------- Epoch 1 of 3 ----------
average training loss: 8.51
current loss: 8.514425277709961
starting validation...
Time costed : 22.282s 

average training loss: 8.51
---------- Epoch 2 of 3 ----------
average training loss: 16.68
current loss: 8.166736602783203
starting validation...
Time costed : 21.501s 

average training loss: 16.68
---------- Epoch 3 of 3 ----------
average training loss: 24.35
current loss: 7.6696062088012695
starting validation...
Time costed : 20.828s 

average training loss: 24.35
