<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/RoBERTa_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://chatgpt.com/share/4650a1f9-a8a4-4b31-b01f-aee216098c6e


In [1]:
import dataclasses

import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

In [3]:
# Excel workbook with the labelled posts, resolved relative to the notebook's
# working directory. (A previous placeholder path that was immediately
# overwritten has been removed.)
file_path = 'posts.xlsx'

# Load the sheet into a DataFrame; the split cell below reads its
# 'Question_body' and 'Label' columns.
data = pd.read_excel(file_path)

# Split dataset

In [4]:
# Hold out 20% of the posts for evaluation. The fixed random_state keeps the
# partition identical from run to run.
all_texts = data['Question_body'].tolist()
all_labels = data['Label'].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

# Load the RoBERTa tokenizer and Tokenize Data

In [5]:
# Load the pretrained RoBERTa tokenizer and encode both splits with the same
# settings: truncation on, padding on, and a 512-token cap.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenize_options = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(train_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

# Create dataset class

In [6]:
# Create dataset class
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence (here, the tokenizer output).
        labels: Indexable sequence of labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` entry."""
        example = {}
        for feature_name, feature_values in self.encodings.items():
            example[feature_name] = torch.tensor(feature_values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each tokenized split with its labels so the Trainer can index them.
train_dataset = Dataset(encodings=train_encodings, labels=train_labels)
test_dataset = Dataset(encodings=test_encodings, labels=test_labels)

# Load Model

In [7]:
# Pretrained RoBERTa base checkpoint configured for two output labels.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define training arguments

In [8]:
# Newer transformers releases call the evaluation-schedule option
# `eval_strategy`; older ones only declare `evaluation_strategy`. Pick the name
# this installation actually defines so the cell runs on either.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# Fine-tuning hyperparameters: 3 epochs, batch size 8 for both training and
# evaluation, 500 warmup steps, a log line every 10 steps, and an evaluation
# pass at the end of every epoch.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{_eval_strategy_key: "epoch"},
)



# Define evaluation metric

In [9]:
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, reduced with
            argmax over the last axis) and ``label_ids`` (reference labels).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'; the
        last three use the 'binary' average.
    """
    reference = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    prec, rec, f_score, _ = precision_recall_fscore_support(
        reference, predicted, average='binary'
    )
    return {
        'accuracy': accuracy_score(reference, predicted),
        'f1': f_score,
        'precision': prec,
        'recall': rec,
    }


In [11]:
# Build the Adam optimizer (learning rate 2e-5) over all model parameters.
# NOTE(review): when an explicit optimizer is supplied to the Trainer, the
# `weight_decay` configured in TrainingArguments is presumably not applied to
# it — confirm against the Trainer documentation.
from torch import optim
adam_optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

# Create Trainer instance

In [12]:
# Wire model, arguments, both datasets and the metric function into a Trainer.
# `optimizers` is an (optimizer, scheduler) pair; the scheduler slot is None.
trainer_optimizers = (adam_optimizer, None)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=trainer_optimizers,
)

In [13]:
# Fine-tune the model. As the cell's last expression, the returned training
# summary is displayed beneath the per-epoch metrics table.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0065,0.126078,0.973887,0.974274,0.953518,0.995954


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0065,0.126078,0.973887,0.974274,0.953518,0.995954
2,0.0644,0.111682,0.974891,0.975157,0.958333,0.992583
3,0.0169,0.132199,0.975896,0.97608,0.962017,0.99056


TrainOutput(global_step=4482, training_loss=0.11534490778631648, metrics={'train_runtime': 3942.2623, 'train_samples_per_second': 9.09, 'train_steps_per_second': 1.137, 'total_flos': 9428584668825600.0, 'train_loss': 0.11534490778631648, 'epoch': 3.0})

# Evaluate the model

In [14]:
# Evaluate on the dataset passed to the Trainer as `eval_dataset` and print
# the returned metrics dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.13219864666461945, 'eval_accuracy': 0.9758955473719451, 'eval_f1': 0.9760797342192691, 'eval_precision': 0.9620170268500328, 'eval_recall': 0.9905596763317599, 'eval_runtime': 98.7052, 'eval_samples_per_second': 30.262, 'eval_steps_per_second': 3.789, 'epoch': 3.0}


# Make predictions on Evaluation dataset

In [18]:
# Run inference on the test split; the predicted class is the index of the
# largest score along the last axis.
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(axis=-1)
y_true = test_labels

In [19]:
# --- Compute everything first --------------------------------------------
# Overall scores: accuracy plus 'binary'-averaged precision / recall / F1.
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

# Per-class scores: one array entry per label, ordered as [0, 1].
precision_class, recall_class, f1_class, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=[0, 1], average=None
)

# --- Then report ----------------------------------------------------------
print(f'Overall Accuracy: {accuracy}')
print(f'Overall Precision: {precision}')
print(f'Overall Recall: {recall}')
print(f'Overall F1-score: {f1}')

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, F1-score: {f1_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, F1-score: {f1_class[1]}')


Overall Accuracy: 0.9758955473719451
Overall Precision: 0.9620170268500328
Overall Recall: 0.9905596763317599
Overall F1-score: 0.9760797342192691
Class 0 - Precision: 0.9904109589041096, Recall: 0.961436170212766, F1-score: 0.9757085020242915
Class 1 - Precision: 0.9620170268500328, Recall: 0.9905596763317599, F1-score: 0.9760797342192691


# Print metrics
https://chatgpt.com/share/b9c91705-7de3-4e91-bc71-10245e77211e

To calculate accuracy for each class label individually, you can consider each class separately as the "positive" class and compute accuracy accordingly. Here is how you can do it:

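1. Calculate True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN) for each class.
2. Compute the accuracy for each class using the formula:

$$\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}$$

Note: with only two classes, the one-vs-rest counts of class 0 and class 1 mirror each other (the TP of one class is the TN of the other, and FP/FN swap), so both per-class accuracies equal the overall accuracy.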

In [22]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Per-class precision / recall / F1 / support, ordered as labels [0, 1].
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(
    y_true, y_pred, labels=[0, 1], average=None
)

# 2x2 confusion matrix for labels [0, 1].
conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

# With class 0 as the "positive" class, the cells in row-major order
# ([0,0], [0,1], [1,0], [1,1]) are TP, FN, FP, TN.
TP_0, FN_0, FP_0, TN_0 = conf_matrix.ravel()

# With class 1 as the "positive" class, the same four cells swap roles.
TP_1, FN_1, FP_1, TN_1 = TN_0, FP_0, FN_0, TP_0

# One-vs-rest accuracy per class. Both expressions reduce to
# (conf_matrix[0, 0] + conf_matrix[1, 1]) / conf_matrix.sum(), so in this
# two-class setting they are equal to each other and to the overall accuracy.
accuracy_class_0 = (TP_0 + TN_0) / (TP_0 + TN_0 + FP_0 + FN_0)
accuracy_class_1 = (TP_1 + TN_1) / (TP_1 + TN_1 + FP_1 + FN_1)

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')


Class 0 - Precision: 0.9904109589041096, Recall: 0.961436170212766, Accuracy: 0.9758955473719451, F1-score: 0.9757085020242915, Support: 1504
Class 1 - Precision: 0.9620170268500328, Recall: 0.9905596763317599, Accuracy: 0.9758955473719451, F1-score: 0.9760797342192691, Support: 1483
