In [1]:
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Import STOPWORDS from NLTK
from nltk.corpus import stopwords

import string, re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import torch
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data_raw = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/train_ml_only_2024_languages.tsv",sep='\t')
#dev_data = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/dev_ml_only_2024_languages.tsv", sep='\t')


In [3]:
train_data = train_data_raw.sample(n=1500)
dev_data = train_data_raw.sample(n=300)

In [4]:
# Mapping label strings to integers
label_map = {"OBJ": 0, "SUBJ": 1}
train_data['label'] = train_data['label'].map(label_map)
print(train_data['label'])

dev_data['label'] = dev_data['label'].map(label_map)
print(dev_data['label'])

2053    0
589     0
3941    0
179     0
3604    0
       ..
3331    1
232     0
1305    1
3427    1
3518    0
Name: label, Length: 1500, dtype: int64
1337    0
3183    0
1168    0
4337    0
4350    1
       ..
3114    0
3276    0
189     0
161     1
1361    1
Name: label, Length: 300, dtype: int64


In [5]:
#train_data = train_data.drop('solved_conflict', axis=1)

In [6]:
train_data

Unnamed: 0,sentence_id,sentence,label
2053,183b50f29309276f125e605f0c83fc1e6a7bb67f,"Der andere Angeklagte bekundete, er könne sich...",0
589,AFP_29-eurl_02_003,بالتزامن مع ذلك تواجه فرنسا اتهامات قضائية بال...,0
3941,d5773929-3805-44a6-bd17-299c1a1c36ad,"Ieri i test positivi erano stati 7.925, i mort...",0
179,MIS_2852-curl_04_003,"وتابع: ""يمكننا القيام بذلك لأن جهازنا الصحي هو...",0
3604,1c3faba9-0fa3-4c13-bdd1-9f3cc1e333d8,Venerdì — mentre veniva «processata» nei forum...,0
...,...,...,...
3331,a37add47-58f6-4da6-9d79-93a783eca9d3,Come ripeto ormai da anni le soluzioni ci sono...,1
232,MIS_561-curl_02_022,وتقدم السودان بعد الثورة على البشير إلى المرتب...,0
1305,9ebd51e3-8f5a-4174-938f-1efdf3bfe85d,The reason for the lack of any actual ‘levelli...,1
3427,5c144c6e-2295-4cc6-86f4-82b60b7327d6,Anche dal consulente più impensabile: Walter R...,1


In [7]:
# Load tokenizer
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

In [8]:
train_encodings = tokenizer(train_data['sentence'].tolist(), truncation=True, padding=True)
eval_encodings = tokenizer(dev_data['sentence'].tolist(), truncation=True, padding=True)

In [9]:
#train_encodings["input_ids"]
#train_encodings["attention_mask"]


In [10]:
# Create PyTorch datasets
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_data['label'].to_numpy())
)

val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(eval_encodings['input_ids']),
    torch.tensor(eval_encodings['attention_mask']),
    torch.tensor(dev_data['label'].to_numpy())
)

In [11]:
print(train_dataset[0][0].shape) # input_ids shape
print(train_dataset[0][1].shape) # attention_mask shape
print(train_dataset[0][2].shape) # train_labels_onehot shape

torch.Size([197])
torch.Size([197])
torch.Size([])


In [12]:
# Define model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
device

#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#print(device)


device(type='cuda', index=0)

In [14]:
# trainer parameters
epochs = 4
learning_rate=5e-5 # default 5e-5
warmup_steps = 500

In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=25,
    learning_rate=learning_rate,
)

In [16]:
# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([item[0] for item in data]),
                                'attention_mask': torch.stack([item[1] for item in data]),
                                'labels': torch.stack([item[2] for item in data])},
)

In [17]:
# Train the model
trainer.train()

# epoch 4 / 50 = 200	0.538800	0.644553

Step,Training Loss,Validation Loss
25,0.6927,0.644363
50,0.6402,0.589586
75,0.6069,0.584591
100,0.6255,0.59156
125,0.6034,0.575733
150,0.6581,0.568187
175,0.5204,0.52749
200,0.5024,0.460873
225,0.5014,0.49905
250,0.6301,0.584781


TrainOutput(global_step=376, training_loss=0.5701390454109679, metrics={'train_runtime': 2976.9405, 'train_samples_per_second': 2.015, 'train_steps_per_second': 0.126, 'total_flos': 2151450920808000.0, 'train_loss': 0.5701390454109679, 'epoch': 4.0})

In [18]:
#model.save_pretrained("./model")

In [19]:
#model = XLMRobertaForSequenceClassification.from_pretrained("./model")

In [20]:
# test
test_data = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/dev_test_ml_only_2024_languages.tsv", sep='\t')  # Update with your dev data file
test_data['label'] = test_data['label'].map(label_map)
test_encodings = tokenizer(test_data['sentence'].tolist(), truncation=True, padding=True)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_data['label'])
)

In [21]:
preds = trainer.predict(test_dataset)
pred_labels = preds.predictions.argmax(-1)



In [22]:
print(pred_labels)

[0 0 0 1 0 1 0 1 1 1 0 0 1 0 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 0 0
 1 1 1 0 0 1 1 1 1 1 1 1 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 1 1 1
 0 1 0 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 0 1 1 1 1 0 0 1 1 1 1 1 0
 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0
 0 0 0 0 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1 1 1
 1 1 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 1
 0 0 1 0 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 1 1 1 1 0
 1 0 1 1 1 1 0 1 0 0 1 1 0 1 0 1 0 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1 1 1 1 1 1
 0 1 1 1 1 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1
 0 0 1 0 0 1 1 1 1 1 0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 0 0 0 1
 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 0 1 1]


In [23]:
# Convert the true labels to integers

# Compute the accuracy and classification report
accuracy = accuracy_score(test_data['label'], pred_labels)
class_report = classification_report(test_data['label'], pred_labels, target_names=['OBJ', 'SUBJ'])

print(f"Accuracy for RoBERTa: {accuracy}")
print(f"Classification Report:\n{class_report}")

Accuracy for RoBERTa: 0.765
Classification Report:
              precision    recall  f1-score   support

         OBJ       0.79      0.72      0.75       200
        SUBJ       0.74      0.81      0.78       200

    accuracy                           0.77       400
   macro avg       0.77      0.77      0.76       400
weighted avg       0.77      0.77      0.76       400



In [24]:
# learning rate 5e-5
# epochs 3 - acc 0.61 - OBJ 0.56, SUBJ 0.71 - f1 0.66 - 0.54