In [1]:
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Import STOPWORDS from NLTK
from nltk.corpus import stopwords

import string, re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import torch
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data_raw = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/train_ml_only_2024_languages.tsv",sep='\t')
#dev_data = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/dev_ml_only_2024_languages.tsv", sep='\t')


In [3]:
train_data = train_data_raw.sample(n=1500)
dev_data = train_data_raw.sample(n=300)

In [4]:
# Mapping label strings to integers
label_map = {"OBJ": 0, "SUBJ": 1}
train_data['label'] = train_data['label'].map(label_map)
print(train_data['label'])

dev_data['label'] = dev_data['label'].map(label_map)
print(dev_data['label'])

755     0
3044    0
317     0
1875    1
1874    0
       ..
2643    0
1338    0
1649    1
3819    0
1804    1
Name: label, Length: 1500, dtype: int64
3957    0
3973    0
1537    0
2916    0
2607    1
       ..
566     0
3805    0
3007    0
2192    0
2259    1
Name: label, Length: 300, dtype: int64


In [5]:
#train_data = train_data.drop('solved_conflict', axis=1)

In [6]:
train_data

Unnamed: 0,sentence_id,sentence,label
755,AFP_614-eurl_08_021,وخرجت تظاهرات يوم الجمعة الماضي مطالبة برحيل ا...,0
3044,8314f85e-fd31-46cc-b67d-0b7b244e793a,Solitamente all'abdicazione la politica arriva...,0
317,AFP_637-eurl_01_003,"يضيف والد الشهيد لـ""الوطن"": ""قبل الوصول إلى مو...",0
1875,e9bc0c11-ffff-4676-a54b-840e6379503b,What it means is extension of government—not b...,1
1874,9c63a53a-7f66-4944-a4c0-abdd2dabc4cc,iSteve commenter Sparkling Wiggle writes: Hig...,0
...,...,...,...
2643,d2ed190a149a4cb0aa5d56894791b49691e81778,Eine davon startete letzte Woche in Stanford M...,0
1338,729c8fb9-aa88-4cfd-b92b-80a005ed0e32,"When the money was denied, Feeding Our Future ...",0
1649,edc1fa54-bed0-4aeb-9527-d43860d2757e,What the Levelling Up Bill is really is a Plan...,1
3819,6ab8ae65-8d66-4121-b22d-10a1fa5be60d,"""I vescovi italiani esprimono vicinanza e ammi...",0


In [7]:
# Load tokenizer
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

In [8]:
train_encodings = tokenizer(train_data['sentence'].tolist(), truncation=True, padding=True)
eval_encodings = tokenizer(dev_data['sentence'].tolist(), truncation=True, padding=True)

In [9]:
#train_encodings["input_ids"]
#train_encodings["attention_mask"]


In [10]:
# Create PyTorch datasets
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_data['label'].to_numpy())
)

val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(eval_encodings['input_ids']),
    torch.tensor(eval_encodings['attention_mask']),
    torch.tensor(dev_data['label'].to_numpy())
)

ValueError: could not determine the shape of object type 'Series'

In [None]:
print(train_dataset[0][0].shape) # input_ids shape
print(train_dataset[0][1].shape) # attention_mask shape
print(train_dataset[0][2].shape) # train_labels_onehot shape

In [None]:
# Define model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=2)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
device

#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#print(device)


In [None]:
# trainer parameters
epochs = 3
learning_rate=5e-5 # default 5e-5
warmup_steps = 500

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=50,
    learning_rate=learning_rate,
)

In [None]:
# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([item[0] for item in data]),
                                'attention_mask': torch.stack([item[1] for item in data]),
                                'labels': torch.stack([item[2] for item in data])},
)

In [None]:
# Train the model
trainer.train()

# epoch 4 / 50 = 200	0.538800	0.644553

In [None]:
#model.save_pretrained("./model")

In [None]:
#model = XLMRobertaForSequenceClassification.from_pretrained("./model")

In [None]:
# test
test_data = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/dev_test_ml_only_2024_languages.tsv", sep='\t')  # Update with your dev data file
test_data['label'] = test_data['label'].map(label_map)
test_encodings = tokenizer(test_data['sentence'].tolist(), truncation=True, padding=True)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_data['label'])
)

In [None]:
preds = trainer.predict(test_dataset)
pred_labels = preds.predictions.argmax(-1)



In [None]:
print(pred_labels)

In [None]:
# Convert the true labels to integers

# Compute the accuracy and classification report
accuracy = accuracy_score(test_data['label'], pred_labels)
class_report = classification_report(test_data['label'], pred_labels, target_names=['OBJ', 'SUBJ'])

print(f"Accuracy for RoBERTa: {accuracy}")
print(f"Classification Report:\n{class_report}")

In [None]:
# learning rate 5e-5
# epochs 3 - acc 0.61 - OBJ 0.56, SUBJ 0.71 - f1 0.66 - 0.54