In [1]:
import re
import string

import nltk
import pandas as pd
import torch

# Import STOPWORDS from NLTK
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the tab-separated training and development splits.
train_path = "data/subtask-2-multilingual/ml_only_2024_languages/train_ml_only_2024_languages.tsv"
dev_path = "data/subtask-2-multilingual/ml_only_2024_languages/dev_ml_only_2024_languages.tsv"

# Read both splits into DataFrames.
train_data = pd.read_csv(train_path, sep="\t")
dev_data = pd.read_csv(dev_path, sep="\t")


In [3]:
# Mapping label strings to integers
label_map = {"OBJ": 0, "SUBJ": 1}


def encode_labels(labels):
    """Return ``labels`` encoded with ``label_map``.

    The cell overwrites the ``label`` column in place, so it must be safe to
    run more than once: a column that already contains only the integer codes
    is returned unchanged instead of being mapped again (mapping the integers
    through ``label_map`` would turn every value into NaN).

    Args:
        labels: pandas Series holding either the label strings or their codes.

    Returns:
        An int64 Series of codes (or ``labels`` itself when already encoded).

    Raises:
        ValueError: if a value is neither a key nor a code of ``label_map``.
    """
    if labels.isin(list(label_map.values())).all():
        return labels
    encoded = labels.map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"Unmapped label values: {unknown}")
    return encoded.astype("int64")


train_data['label'] = encode_labels(train_data['label'])
print(train_data['label'])

dev_data['label'] = encode_labels(dev_data['label'])
print(dev_data['label'])

0       0
1       0
2       0
3       0
4       1
       ..
4423    0
4424    1
4425    1
4426    0
4427    0
Name: label, Length: 4428, dtype: int64
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
Name: label, dtype: int64


In [4]:
#train_data = train_data.drop('solved_conflict', axis=1)

In [5]:
# Preview the training frame after label encoding (the notebook display elides the middle rows).
train_data

Unnamed: 0,sentence_id,sentence,label
0,MIS_702-curl_03_010,"وحتى الآن هذا العام، كانت الشمس ""فارغة"" مع عدم...",0
1,MIS_560-curl_04_005,وكان أفيخاي قد نشر تدوينة بشأن مسلسل النهاية ق...,0
2,MIS_2265-eurl_03_022,وجاء في خطاب ممثل بوليساريو أحمد بخاري يوم الا...,0
3,FAT_1139-eurl_01_023,4 أسئلة ينبغي طرحها قبل تصديق المعلومة,0
4,MIS_427-curl_06_004,وذكر مجاهد أن الوزيرة محملة خلال تلك الزيارة ب...,1
...,...,...,...
4423,0e93f441-9faa-4b64-ad38-690e86b1d0f7,"Presi due della banda, il più piccolo ha 15 anni",0
4424,bdad5c4d-d160-4d8c-9e16-8f26512b8479,Il congresso è previsto tra due anni ma appena...,1
4425,3e6f5ca0-7cfe-4a3b-96b0-b00af2f9186f,Tutto ciò che fanno nasce da un amore sincero ...,1
4426,ddae1feb-9fb4-4574-828c-67fe8c3bb063,ha detto il segretario di Stato Usa Antony Bli...,0


In [6]:
# Load tokenizer
# XLMRobertaTokenizer comes from the notebook's top import cell, so every
# dependency is declared in one place.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

In [7]:
# Pull the raw sentences out of each split as plain Python lists.
train_sentences = train_data['sentence'].tolist()
dev_sentences = dev_data['sentence'].tolist()

# Tokenize each split on its own, with truncation and padding switched on.
train_encodings = tokenizer(train_sentences, truncation=True, padding=True)
eval_encodings = tokenizer(dev_sentences, truncation=True, padding=True)

In [8]:
#train_encodings["input_ids"]
#train_encodings["attention_mask"]


In [9]:
# Create PyTorch datasets
def build_tensor_dataset(encodings, labels):
    """Pack tokenizer output and labels into a ``TensorDataset``.

    Args:
        encodings: tokenizer output exposing ``'input_ids'`` and
            ``'attention_mask'``.
        labels: pandas Series of integer class ids, one per encoded sentence.

    Returns:
        A ``TensorDataset`` whose items are the tuple
        ``(input_ids, attention_mask, label)``.
    """
    return torch.utils.data.TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        # Convert through NumPy so the values are taken in positional order.
        # NOTE(review): handing the Series itself to torch.tensor is believed to
        # index it label-wise, which breaks on a non-default index — confirm.
        torch.tensor(labels.to_numpy()),
    )


train_dataset = build_tensor_dataset(train_encodings, train_data['label'])

val_dataset = build_tensor_dataset(eval_encodings, dev_data['label'])

In [10]:
# Sanity-check the first training item: (input_ids, attention_mask, label).
print(train_dataset[0][0].shape) # input_ids shape
print(train_dataset[0][1].shape) # attention_mask shape
print(train_dataset[0][2].shape) # label shape (a scalar class id, not one-hot)

torch.Size([269])
torch.Size([269])
torch.Size([])


In [11]:
# Define model
# Load the pretrained checkpoint with a two-way classification head
# (one output per entry of label_map: OBJ / SUBJ).
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-large',
    num_labels=2,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# trainer parameters
# Optimizer step size handed to TrainingArguments.
learning_rate = 5e-5  # default 5e-5
# Number of passes over the training set, and length of the warmup phase.
# NOTE(review): 500 warmup steps may cover a large share of a 3-epoch run on
# this dataset — confirm this is intended.
epochs, warmup_steps = 3, 500

In [13]:
# Gather the run configuration in one mapping, then hand it to TrainingArguments.
training_config = {
    # Where outputs and logs are written.
    'output_dir': './results',
    'logging_dir': './logs',
    # Schedule and optimizer settings (values come from the parameters cell).
    'num_train_epochs': epochs,
    'learning_rate': learning_rate,
    'warmup_steps': warmup_steps,
    'weight_decay': 0.01,
    # Batch sizes per device.
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    # Log every 10 steps; evaluate every 50 steps.
    'logging_steps': 10,
    'evaluation_strategy': 'steps',
    'eval_steps': 50,
}
training_args = TrainingArguments(**training_config)

In [14]:
# Define the trainer
def collate_batch(data):
    """Stack dataset items into the keyword batch passed to the model.

    Args:
        data: list of ``(input_ids, attention_mask, label)`` tensor tuples.

    Returns:
        Dict with stacked ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
    """
    input_ids = torch.stack([item[0] for item in data])
    attention_mask = torch.stack([item[1] for item in data])
    labels = torch.stack([item[2] for item in data])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_batch,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
# Train the model
# Runs fine-tuning with the trainer defined above; training_args requests an
# evaluation pass on val_dataset every eval_steps (50) steps.
trainer.train()

# epoch 4 / 50 = 200	0.538800	0.644553

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
#model.save_pretrained("./model")

In [None]:
#model = XLMRobertaForSequenceClassification.from_pretrained("./model")

In [None]:
# test
# Load the held-out dev-test split and prepare it exactly like the train/dev splits.
test_data = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/dev_test_ml_only_2024_languages.tsv", sep='\t')
test_data['label'] = test_data['label'].map(label_map)

# Fail fast: any label missing from label_map becomes NaN after .map() and
# would otherwise only surface later as a confusing error at predict/metric time.
if test_data['label'].isna().any():
    raise ValueError("test_data contains labels that are not in label_map")

test_encodings = tokenizer(test_data['sentence'].tolist(), truncation=True, padding=True)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    # Convert through NumPy so the values are taken in positional order
    # regardless of the frame's index.
    torch.tensor(test_data['label'].to_numpy())
)

In [None]:
# Run the trainer over the test dataset, then pick the index of the largest
# value along the last axis of the returned predictions.
# NOTE(review): assumes predictions is laid out as (rows, classes) — confirm.
preds = trainer.predict(test_dataset)
scores = preds.predictions
pred_labels = scores.argmax(axis=-1)



In [None]:
# Inspect the predicted class ids (0 = OBJ, 1 = SUBJ per label_map).
print(pred_labels)

In [None]:
# Gold labels for the test split (already integer-encoded via label_map).
true_labels = test_data['label']

# Score the predictions: overall accuracy plus a per-class report.
accuracy = accuracy_score(true_labels, pred_labels)
class_report = classification_report(
    true_labels,
    pred_labels,
    target_names=['OBJ', 'SUBJ'],
)

print(f"Accuracy for RoBERTa: {accuracy}")
print(f"Classification Report:\n{class_report}")

In [None]:
# learning rate 5e-5
# epochs 3 - acc 0.61 - OBJ 0.56, SUBJ 0.71 - f1 0.66 - 0.54