In [1]:
import re
import string

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the Bulgarian subtask-2 train and dev splits; both files are tab-separated.
train_data, dev_data = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        "data/subtask-2-bulgarian/train_bg.tsv",
        "data/subtask-2-bulgarian/dev_bg.tsv",
    )
)


In [3]:
# Mapping label strings to integers
label_map = {"OBJ": 0, "SUBJ": 1}


def encode_labels(labels):
    """Return `labels` with "OBJ"/"SUBJ" replaced by 0/1.

    Values that are already encoded pass through unchanged, so this cell can be
    re-run safely. A plain `.map(label_map)` turns every already-encoded label
    into NaN on its second execution.

    Raises:
        ValueError: if a value is neither a key of `label_map` nor one of its
            class ids (this includes missing values).
    """
    encoded = labels.map(lambda value: label_map.get(value, value))
    unexpected = encoded[~encoded.isin(list(label_map.values()))]
    if not unexpected.empty:
        raise ValueError(
            f"Unexpected label values: {sorted(unexpected.astype(str).unique())}"
        )
    return encoded.astype("int64")


train_data['label'] = encode_labels(train_data['label'])
print(train_data['label'])

dev_data['label'] = encode_labels(dev_data['label'])
print(dev_data['label'])

0      0
1      1
2      0
3      0
4      1
      ..
689    1
690    0
691    0
692    1
693    1
Name: label, Length: 694, dtype: int64
0      0
1      1
2      0
3      1
4      1
      ..
101    0
102    0
103    0
104    1
105    1
Name: label, Length: 106, dtype: int64


In [4]:
#train_data = train_data.drop('solved_conflict', axis=1)

In [5]:
# Inspect the training frame after label encoding (`label` now holds 0/1).
train_data

Unnamed: 0,sentence_id,sentence,label
0,b678f74b-3981-4ad9-93b3-4c549605a02c,"Учителите, за които цяла България разбра, са С...",0
1,ea65624a-da34-4bc4-8085-edb93d2e30e1,А ако намерите каска е още по-добре.,1
2,fd709ed2-8c81-4130-b2a3-857083eb4821,"През октомври 1994г. учените от ,,Air Force"" у...",0
3,1cfc5cf2-ebd2-4db8-86ed-38673b2858fc,"Аз обаче вече бях обещала, че ще летя до Алжир...",0
4,a56a980c-2bdf-4da5-8b1f-e8cef1e4305c,Ама знаете ли защо те си купиха тировете обрат...,1
...,...,...,...
689,51ed7539-8d43-42d8-9e67-80c55f89195d,Чувствата силно ще се обезценят и само лъжлива...,1
690,1458a621-510e-421b-ad33-a4f7b8991f32,"Как да помогна на тези хора, които все повече ...",0
691,750ece90-13b4-4bae-b41f-0867772b8a35,"Ще има обаче опит за покушение срещу Тръмп, до...",0
692,7f1a9eff-a320-42ae-946b-356b1f011583,Събудената мощ на духа на българина е голяма.,1


In [6]:
# Load tokenizer
# The XLM-RoBERTa classes are imported in the imports cell at the top of the
# notebook, so every dependency is visible in one place and this cell no longer
# re-imports Trainer / TrainingArguments.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

In [7]:
# Tokenize each split on its own, truncating over-long sentences and padding the
# rest so every example in a split has the same length.
train_sentences = train_data['sentence'].tolist()
dev_sentences = dev_data['sentence'].tolist()

train_encodings = tokenizer(train_sentences, truncation=True, padding=True)
eval_encodings = tokenizer(dev_sentences, truncation=True, padding=True)

In [8]:
#train_encodings["input_ids"]
#train_encodings["attention_mask"]


In [9]:
# Create PyTorch datasets
def make_tensor_dataset(encodings, labels):
    """Bundle tokenizer output and a label column into a TensorDataset.

    Args:
        encodings: tokenizer output exposing 'input_ids' and 'attention_mask'.
        labels: pandas Series of integer class ids, one per encoded sentence.

    Returns:
        A TensorDataset whose items are `(input_ids, attention_mask, label)`
        tuples. The Trainer's data collator reads them by position, so the
        order matters.
    """
    return torch.utils.data.TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        # Hand torch the underlying NumPy array instead of the Series itself, so
        # the conversion does not depend on how torch iterates a pandas object
        # (or on the Series index being 0..n-1).
        torch.tensor(labels.to_numpy()),
    )


train_dataset = make_tensor_dataset(train_encodings, train_data['label'])
val_dataset = make_tensor_dataset(eval_encodings, dev_data['label'])

In [10]:
# Sanity-check the first training example. Items are
# (input_ids, attention_mask, label); the label is a scalar class id, not a
# one-hot vector, so its shape is empty.
first_input_ids, first_attention_mask, first_label = train_dataset[0]
for tensor in (first_input_ids, first_attention_mask, first_label):
    print(tensor.shape)

torch.Size([512])
torch.Size([512])
torch.Size([])


In [11]:
# Define model: pretrained XLM-RoBERTa (large) with a 2-class sequence
# classification head on top.
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-large',
    num_labels=2,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the first CUDA device when one is available, otherwise the CPU, and move
# the model there. The trailing expression displays the chosen device.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")
model.to(device)
device

In [12]:
# Hyper-parameters consumed by the TrainingArguments cell:
# number of passes over the training set, and the optimizer learning rate.
epochs, learning_rate = 3, 5e-5

In [13]:
# Fine-tuning configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Warm up over the first 10% of the optimisation steps. A fixed 500-step
    # warmup is longer than the whole run here (694 training rows / batch 16 is
    # ~44 steps per epoch, ~132 steps for 3 epochs on a single device), so the
    # schedule would never leave warmup and `learning_rate` would never be reached.
    # NOTE(review): the model now actually trains at the configured
    # `learning_rate`; if that proves unstable for xlm-roberta-large, lower it
    # in the parameters cell.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate on the dev set every `eval_steps` optimisation steps.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy='steps',
    eval_steps=50,
    learning_rate=learning_rate,
)

In [14]:
# Define the trainer
def collate_batch(data):
    """Stack a list of (input_ids, attention_mask, label) items into model inputs."""
    positions = {'input_ids': 0, 'attention_mask': 1, 'labels': 2}
    return {
        key: torch.stack([item[position] for item in data])
        for key, position in positions.items()
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_batch,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Train the model (fine-tunes `model` on `train_dataset`, evaluating on
# `val_dataset` as configured in `training_args`).
trainer.train()

# NOTE(review): the line below looks like a row copied from an earlier run's
# log table (columns: Step, Training Loss, Validation Loss) - presumably step
# 200 = epoch 4 with evaluation every 50 steps; confirm before relying on it.
# epoch 4 / 50 = 200	0.538800	0.644553

Step,Training Loss,Validation Loss


In [None]:
#model.save_pretrained("./model")

In [None]:
#model = XLMRobertaForSequenceClassification.from_pretrained("./model")

In [None]:
# Held-out test split: load, encode labels, tokenize and wrap as a dataset.
test_data = pd.read_csv("data/subtask-2-bulgarian/dev_test_bg.tsv", sep='\t')
test_data['label'] = test_data['label'].map(label_map)
# `.map` yields NaN for any value missing from `label_map`; stop here with a
# clear message instead of passing float/NaN labels on to prediction and scoring.
if test_data['label'].isna().any():
    raise ValueError("dev_test_bg.tsv contains labels that are not in label_map")
test_encodings = tokenizer(test_data['sentence'].tolist(), truncation=True, padding=True)

# Item layout matches the train/dev datasets: (input_ids, attention_mask, label).
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    # Build the label tensor from the underlying NumPy array rather than from
    # the pandas Series object.
    torch.tensor(test_data['label'].to_numpy())
)

In [None]:
# Run inference on the test set and keep, for each example, the index of the
# largest value along the last axis of the predictions.
preds = trainer.predict(test_dataset)
pred_labels = preds.predictions.argmax(axis=-1)



In [None]:
# Show the predicted class ids (0 = OBJ, 1 = SUBJ, per `label_map`).
print(pred_labels)

In [None]:
# Score the predictions against the gold test labels, which were already
# integer-encoded when the test set was loaded.
gold_labels = test_data['label']

# Overall accuracy plus a per-class precision/recall/F1 report.
accuracy = accuracy_score(gold_labels, pred_labels)
class_report = classification_report(
    gold_labels,
    pred_labels,
    target_names=['OBJ', 'SUBJ'],
)

print(
    f"Accuracy for RoBERTa: {accuracy}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)

In [None]:
# epochs 2 eval steps 50 - acc 0.61 - OBJ 0.56, SUBJ 0.71 - f1 0.66 - 0.54