In [1]:
import pandas as pd

import torch
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

In [2]:
train_data = pd.read_csv("data/subtask-2-english/train_en.tsv",sep='\t')
dev_data = pd.read_csv("data/subtask-2-english/dev_en.tsv", sep='\t')


In [3]:
# Mapping label strings to integers
label_map = {"OBJ": 0, "SUBJ": 1}
train_data['label'] = train_data['label'].map(label_map)
print(train_data['label'])

dev_data['label'] = dev_data['label'].map(label_map)
print(dev_data['label'])

0      0
1      0
2      0
3      0
4      0
      ..
795    0
796    0
797    0
798    0
799    0
Name: label, Length: 800, dtype: int64
0      0
1      1
2      0
3      0
4      0
      ..
195    0
196    1
197    1
198    1
199    0
Name: label, Length: 200, dtype: int64


In [26]:
train_data = train_data.drop('solved_conflict', axis=1)

KeyError: "['solved_conflict'] not found in axis"

In [4]:
train_data

Unnamed: 0,sentence_id,sentence,label
0,4f9c8bcd60318b0d1257f35ebc7c4ede9f7930e1,Die Ausbreitung des Virus sei beschränkter als...,0
1,0531b165e42997e8eecbb84d1e774c728041db8c,Zwar sei ein Anstieg der Zahlen zu verzeichnen.,0
2,c34bac7a38d7959b0d3c340810f5d2ac3187a3ec,Der Arbeitsmarkt hat sich so zugunsten der Lei...,0
3,3c6b7daf4e0cce25f45c8497da669c8936a66113,Es wird das PCR-Pooling mit Antigen-Schnelltes...,0
4,9d241c839c8b801bdbc526cd6e20f70ecd11f341,"Als er bemerkte, dass Internetbetreiber die Ge...",0
...,...,...,...
795,26742376756a241814a3e089349d1c08dd267012,Der lettische Regierungschef Krisjans Karins h...,0
796,6c68576e3227cd0ae611abe6527e441c196a4e6d,Das Angiotensin-konvertierende Enzym 2 wird ha...,0
797,5c2da5d7dc995638e388fa7c0a8b643bdd266a18,„In enger Abstimmung mit dem Ministerium für A...,0
798,7f2b3c069add8f1db904077337748b6c79d659be,Unter anderem sollen die Patrouillen an Land u...,0


In [5]:
# Load tokenizer
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

In [6]:
train_encodings = tokenizer(train_data['sentence'].tolist(), truncation=True, padding=True)
eval_encodings = tokenizer(dev_data['sentence'].tolist(), truncation=True, padding=True)

In [8]:
#train_encodings["input_ids"]
#train_encodings["attention_mask"]


In [7]:
# Create PyTorch datasets
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_data['label'])
)

val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(eval_encodings['input_ids']),
    torch.tensor(eval_encodings['attention_mask']),
    torch.tensor(dev_data['label'])
)

In [8]:
print(train_dataset[0][0].shape) # input_ids shape
print(train_dataset[0][1].shape) # attention_mask shape
print(train_dataset[0][2].shape) # train_labels_onehot shape

torch.Size([157])
torch.Size([157])
torch.Size([])


In [9]:
# Define model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# trainer parameters
epochs = 5

In [12]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # Train for more epochs
    per_device_train_batch_size=8,  # Decrease batch size for better generalization
    per_device_eval_batch_size=8,  # Keep evaluation batch size consistent with training
    warmup_steps=1000,  # Increase warmup steps for longer training
    weight_decay=0.001,  # Slightly decrease weight decay for regularization
    logging_dir='./logs',
    logging_steps=50,  # Log less frequently for faster training
    evaluation_strategy='steps',
    eval_steps=100,  # Evaluate less frequently for faster training
    learning_rate=2e-5,  # Slightly decrease learning rate for smoother training
)

In [13]:
# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([item[0] for item in data]),
                                'attention_mask': torch.stack([item[1] for item in data]),
                                'labels': torch.stack([item[2] for item in data])},
)

In [14]:
# Train the model
trainer.train()

# epoch 4 / 50 = 200	0.538800	0.644553

  0%|          | 0/150 [00:00<?, ?it/s]

{'loss': 0.6898, 'grad_norm': 8.063957214355469, 'learning_rate': 1e-05, 'epoch': 2.0}
{'train_runtime': 4488.6922, 'train_samples_per_second': 0.535, 'train_steps_per_second': 0.033, 'train_loss': 0.6680028279622395, 'epoch': 3.0}


TrainOutput(global_step=150, training_loss=0.6680028279622395, metrics={'train_runtime': 4488.6922, 'train_samples_per_second': 0.535, 'train_steps_per_second': 0.033, 'total_flos': 685843237699200.0, 'train_loss': 0.6680028279622395, 'epoch': 3.0})

In [15]:
model.save_pretrained("./model")

In [17]:
#model = XLMRobertaForSequenceClassification.from_pretrained("./model")

In [16]:
# test
test_data = pd.read_csv("data/subtask-2-english/dev_test_en.tsv", sep='\t')  # Update with your dev data file
test_data['label'] = test_data['label'].map(label_map)
test_encodings = tokenizer(test_data['sentence'].tolist(), truncation=True, padding=True)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_data['label'])
)

In [17]:
preds = trainer.predict(test_dataset)
pred_labels = preds.predictions.argmax(-1)



  0%|          | 0/16 [00:00<?, ?it/s]

In [18]:
print(pred_labels)

[1 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 1 1 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [19]:
# Convert the true labels to integers

# Compute the accuracy and classification report
accuracy = accuracy_score(test_data['label'], pred_labels)
class_report = classification_report(test_data['label'], pred_labels, target_names=['OBJ', 'SUBJ'])

print(f"Accuracy for RoBERTa: {accuracy}")
print(f"Classification Report:\n{class_report}")


# Now, let's use ChatGPT-4 for evaluation
from transformers import pipeline
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Load the ChatGPT-4 text classification pipeline
chatgpt4_classifier = pipeline("text-classification", model="EleutherAI/gpt-neo-2.7B")

# Load the test data
test_data = pd.read_csv("data/subtask-2-english/dev_test_en.tsv", sep='\t')

# Mapping label strings to integers
label_map = {"OBJ": 0, "SUBJ": 1}
test_data['label'] = test_data['label'].map(label_map)

# Make predictions using ChatGPT-4
chatgpt4_predictions = chatgpt4_classifier(test_data['sentence'].tolist())

# Extract predicted labels from ChatGPT-4 predictions
chatgpt4_pred_labels = [1 if pred["label"] == "SUBJ" else 0 for pred in chatgpt4_predictions]

# Compute accuracy and classification report for ChatGPT-4
chatgpt4_accuracy = accuracy_score(test_data['label'], chatgpt4_pred_labels)
chatgpt4_class_report = classification_report(test_data['label'], chatgpt4_pred_labels, target_names=['OBJ', 'SUBJ'], zero_division=1)

print(f"Accuracy for ChatGPT-4: {chatgpt4_accuracy}")
print(f"Classification Report for ChatGPT-4:\n{chatgpt4_class_report}")

Accuracy for RoBERTa: 0.5967078189300411
Classification Report:
              precision    recall  f1-score   support

         OBJ       0.55      0.89      0.68       116
        SUBJ       0.76      0.33      0.46       127

    accuracy                           0.60       243
   macro avg       0.66      0.61      0.57       243
weighted avg       0.66      0.60      0.56       243



Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-2.7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for ChatGPT-4: 0.4773662551440329
Classification Report for ChatGPT-4:
              precision    recall  f1-score   support

         OBJ       0.48      1.00      0.65       116
        SUBJ       1.00      0.00      0.00       127

    accuracy                           0.48       243
   macro avg       0.74      0.50      0.32       243
weighted avg       0.75      0.48      0.31       243



In [22]:
# epochs 3 eval steps 50 - acc 0.58 - OBJ 0.54, SUBJ 0.81
# epochs 10 eval steps 60 - BROKE
# epochs 2 eval steps 50 - acc 0.61 - OBJ 0.56, SUBJ 0.71 - f1 0.66 - 0.54
# epochs 4 eval steps 50 - acc 0.70 - OBJ 0.67, SUBJ 0.74 - f1 0.70 - 0.70
# epochs 5 eval steps 50 - acc 0.79 - OBJ 0.85, SUBJ 0.75 - f1 0.76 - 0.82