In [1]:
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Import STOPWORDS from NLTK
from nltk.corpus import stopwords

import string, re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import torch
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data_raw = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/train_ml_only_2024_languages.tsv",sep='\t')
#dev_data = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/dev_ml_only_2024_languages.tsv", sep='\t')


In [None]:
train_data_raw.shape

In [3]:
train_data = train_data_raw.sample(n=1500)
dev_data = train_data_raw.sample(n=300)

In [4]:
# Mapping label strings to integers
label_map = {"OBJ": 0, "SUBJ": 1}
train_data['label'] = train_data['label'].map(label_map)
print(train_data['label'])

dev_data['label'] = dev_data['label'].map(label_map)
print(dev_data['label'])

3973    0
2457    0
1422    1
1601    0
1817    1
       ..
3766    1
2828    0
2638    1
3974    1
237     0
Name: label, Length: 1500, dtype: int64
910     0
1591    0
3211    0
756     1
1205    0
       ..
1258    0
2309    1
2188    0
2143    0
1317    0
Name: label, Length: 300, dtype: int64


In [5]:
#train_data = train_data.drop('solved_conflict', axis=1)

In [6]:
train_data

Unnamed: 0,sentence_id,sentence,label
3973,c3b6ccdb-a1c3-4e94-b8b1-a92c6c2b2987,spiega la prof di matematica,0
2457,2283e7e30f82a6ee90882c5a6f057b0a4bd6d0b8,Die syrischen Soldaten haben bei ihrem Vormars...,0
1422,5eb8425e-45f5-4e9e-a360-aaf086b58fa0,Is monkeypox going to be the “cause” of anothe...,1
1601,9fe9e827-93b0-4222-8f42-91b16b5cc715,The Feds staked out various Feeding Our Future...,0
1817,b0391982-3045-4447-96d3-e60024cdd491,The Left aims to make the hordes of illegal al...,1
...,...,...,...
3766,27167802-98f4-41b3-b5c5-433165b48881,"Una manifestazione finita tragicamente, con la...",1
2828,b1f15f15-cbbd-4675-a42a-d9172f5f9529,E sull’attacco dello scorso agosto:,0
2638,6dae6a67091b03b7d9d7c51f95fe33d3a02c7377,Allesamt unannehmbare Bedingungen für „Grüne“ ...,1
3974,4bef47c4-0534-4893-b0bd-30eb3fe6c11e,"Chissà cosa ne ha pensato Stephanie Williams, ...",1


In [7]:
# Load tokenizer
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

In [8]:
train_encodings = tokenizer(train_data['sentence'].tolist(), truncation=True, padding=True)
eval_encodings = tokenizer(dev_data['sentence'].tolist(), truncation=True, padding=True)

In [9]:
#train_encodings["input_ids"]
#train_encodings["attention_mask"]


In [10]:
# Create PyTorch datasets
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_data['label'].to_numpy())
)

val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(eval_encodings['input_ids']),
    torch.tensor(eval_encodings['attention_mask']),
    torch.tensor(dev_data['label'].to_numpy())
)

In [11]:
print(train_dataset[0][0].shape) # input_ids shape
print(train_dataset[0][1].shape) # attention_mask shape
print(train_dataset[0][2].shape) # train_labels_onehot shape

torch.Size([189])
torch.Size([189])
torch.Size([])


In [12]:
# Define model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
device

#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#print(device)


device(type='cpu')

In [14]:
# trainer parameters
epochs = 3
learning_rate=5e-5 # default 5e-5
warmup_steps = 500

In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=50,
    learning_rate=learning_rate,
)

In [16]:
# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([item[0] for item in data]),
                                'attention_mask': torch.stack([item[1] for item in data]),
                                'labels': torch.stack([item[2] for item in data])},
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [17]:
# Train the model
trainer.train()

# epoch 4 / 50 = 200	0.538800	0.644553

KeyboardInterrupt: 

In [None]:
#model.save_pretrained("./model")

In [None]:
#model = XLMRobertaForSequenceClassification.from_pretrained("./model")

In [None]:
# test
test_data = pd.read_csv("data/subtask-2-multilingual/ml_only_2024_languages/dev_test_ml_only_2024_languages.tsv", sep='\t')  # Update with your dev data file
test_data['label'] = test_data['label'].map(label_map)
test_encodings = tokenizer(test_data['sentence'].tolist(), truncation=True, padding=True)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_data['label'])
)

In [None]:
preds = trainer.predict(test_dataset)
pred_labels = preds.predictions.argmax(-1)



In [None]:
print(pred_labels)

In [None]:
# Convert the true labels to integers

# Compute the accuracy and classification report
accuracy = accuracy_score(test_data['label'], pred_labels)
class_report = classification_report(test_data['label'], pred_labels, target_names=['OBJ', 'SUBJ'])

print(f"Accuracy for RoBERTa: {accuracy}")
print(f"Classification Report:\n{class_report}")

In [None]:
# learning rate 5e-5
# epochs 3 - acc 0.61 - OBJ 0.56, SUBJ 0.71 - f1 0.66 - 0.54