<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/RoBERTa_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://chatgpt.com/c/1fc42dba-30ac-4f9f-b652-a7cf53e9c07a

In [1]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [7]:
# Spreadsheet holding the posts and their labels, resolved relative to the
# notebook's working directory. (The earlier './saved_file' assignment was
# overwritten before use, so it has been dropped.)
file_path = 'Posts.xlsx'

data = pd.read_excel(file_path)

# Split dataset

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across runs.
(train_texts, test_texts,
 train_labels, test_labels) = train_test_split(
    data['Question_body'].tolist(),
    data['Label'].tolist(),
    random_state=42,
    test_size=0.2,
)

# Load the RoBERTa tokenizer and tokenize the data

In [9]:
# Load the pretrained tokenizer, then encode both splits with identical
# settings: pad within each call and truncate anything beyond 512 tokens.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(
    train_texts,
    max_length=512,
    padding=True,
    truncation=True,
)
test_encodings = tokenizer(
    test_texts,
    max_length=512,
    padding=True,
    truncation=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Create dataset class

In [10]:
# Dataset wrapper around tokenizer encodings and their labels
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    `encodings` is a mapping from field name to a per-sample sequence and
    `labels` is an indexable sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a Dataset (train first, then test).
train_dataset, test_dataset = (
    Dataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

# Load Model

In [11]:
# Pretrained roberta-base weights with a two-label sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define training arguments

In [12]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and deprecated the old keyword. Pick whichever name the installed version
# declares so this cell runs on both older and newer releases.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',            # where checkpoints are written
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,                  # log the training loss every 10 steps
    **{_eval_strategy_key: "epoch"},   # evaluate at the end of every epoch
)



# Define evaluation metric

In [13]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    `pred` must expose `label_ids` (reference labels) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.
    Precision, recall and F1 use binary averaging.
    """
    predicted = pred.predictions.argmax(-1)
    expected = pred.label_ids
    scores = precision_recall_fscore_support(expected, predicted, average='binary')
    return {
        'accuracy': accuracy_score(expected, predicted),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1]
    }


# Create Trainer instance

In [14]:
# Bundle the model, training configuration, both datasets and the metric
# callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model; the returned training summary is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1019,0.112645,0.974222,0.974696,0.950641,1.0
2,0.0593,0.115457,0.977235,0.977558,0.957337,0.998651
3,0.0382,0.114707,0.9769,0.977205,0.957902,0.997303


TrainOutput(global_step=4482, training_loss=0.1155890189835387, metrics={'train_runtime': 3872.001, 'train_samples_per_second': 9.255, 'train_steps_per_second': 1.158, 'total_flos': 9428584668825600.0, 'train_loss': 0.1155890189835387, 'epoch': 3.0})

# Evaluate the model

In [15]:
# Run evaluation with the trainer and print the resulting metrics dictionary.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.11470744013786316, 'eval_accuracy': 0.9768998995647807, 'eval_f1': 0.9772051536174429, 'eval_precision': 0.957901554404145, 'eval_recall': 0.9973027646662171, 'eval_runtime': 93.8579, 'eval_samples_per_second': 31.825, 'eval_steps_per_second': 3.985, 'epoch': 3.0}


# Make predictions on the evaluation dataset

In [17]:
# Make predictions
predictions = trainer.predict(test_dataset)
y_true = test_labels
y_pred = predictions.predictions.argmax(-1)

# Print metrics

In [24]:
# Overall scores: accuracy plus binary-averaged precision, recall and F1.
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true,
    y_pred,
    average='binary',
)

print(f'Overall Accuracy: {accuracy}')
print(f'Overall Precision: {precision}')
print(f'Overall Recall: {recall}')
print(f'Overall F1-score: {f1}')

# Per-class scores: average=None returns one value per entry in `labels`.
precision_class, recall_class, f1_class, _ = precision_recall_fscore_support(
    y_true,
    y_pred,
    labels=[0, 1],
    average=None,
)

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, F1-score: {f1_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, F1-score: {f1_class[1]}')


Overall Accuracy: 0.9768998995647807
Overall Precision: 0.957901554404145
Overall Recall: 0.9973027646662171
Overall F1-score: 0.9772051536174429
Class 0 - Precision: 0.9972279972279973, Recall: 0.956781914893617, F1-score: 0.9765863590091618
Class 1 - Precision: 0.957901554404145, Recall: 0.9973027646662171, F1-score: 0.9772051536174429
