<a href="https://colab.research.google.com/github/MartijnPuts/DSS_thesis/blob/main/RobBERT_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -U accelerate
! pip install -U transformers
!pip3 install datasets

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from datasets import load_from_disk
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, set_seed

In [None]:
# Show the kernel's current working directory as the cell output.
os.getcwd()

## Load data

In [None]:
# Location of the saved dataset (tokenised, head-truncated and shuffled,
# going by its file name).
dataset_path = "/kaggle/input/hr-head-truncated-tokenised/HR_tokenized_dataset_head_truncation_shuffled.hf"
data = load_from_disk(dataset_path)


In [None]:
# Display the loaded dataset object as the cell output.
data

In [None]:
# Peek at the first ten entries of the training split's 'label' column.
data['train']['label'][:10]

## Model


In [None]:
# Define checkpoint
checkpoint = "DTAI-KULeuven/robbert-2023-dutch-base"
#checkpoint = "mputs1234/RobBERT-legal"

# Seed every RNG *before* the model is built: the classification head added by
# AutoModelForSequenceClassification is randomly initialised, and the Trainer
# only applies its own seed when it is constructed later, so without this call
# each run would start from different head weights.
# 42 is the documented default of TrainingArguments.seed.
set_seed(42)

#Initialize model and tokenizer
# NOTE(review): the tokenizer is always loaded from the base RobBERT checkpoint,
# even when `checkpoint` above is switched - presumably because the dataset was
# tokenised with it; confirm.
tokenizer = AutoTokenizer.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-base")

# Optional dropout overrides that can be added to the call when experimenting:
#   hidden_dropout_prob=0.15, attention_probs_dropout_prob=0.15, classifier_dropout=0
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
#Define training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/',

    # Evaluate, log and checkpoint once per epoch.
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; the old name is rejected by recent transformers releases, which
    # is what the unpinned `-U` install at the top of the notebook provides.
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",

    # Step-based alternative to the per-epoch schedule above (disabled):
    #logging_steps=300,
    #eval_steps=300,
    #save_steps=600,

    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optional knobs, currently disabled:
    #gradient_accumulation_steps=8,
    #gradient_checkpointing=True,

    num_train_epochs=5,

    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.1,

    optim="adamw_torch",

    # Keep at most two checkpoints on disk and reload the best one after
    # training. No `metric_for_best_model` is given, so the Trainer uses its
    # default criterion (evaluation loss, per the transformers docs).
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
    report_to='none',
)

In [None]:
# Define metrics function with sklearn
def compute_metrics(p):
    """Turn one evaluation pass into a dictionary of classification scores.

    `p` must unpack into two items: the per-class model outputs and the
    reference labels. The predicted class is the argmax along axis 1.

    Returns a dict with the keys "accuracy", "precision", "recall", "f1"
    and "mcc", each computed by the matching sklearn.metrics function.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    # Metric name -> sklearn scorer; insertion order fixes the key order of
    # the returned dictionary.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "mcc": matthews_corrcoef,
    }
    return {name: scorer(y_true=labels, y_pred=predicted) for name, scorer in scorers.items()}

In [None]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["val"],
    # `processing_class` supersedes the deprecated `tokenizer` argument of
    # Trainer; it receives the same tokenizer object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Test that the eval metric is being calculated.
# Runs one evaluation pass before any training and prints the resulting
# metrics dictionary, framed by blank lines.
eval_results_dict = trainer.evaluate()
print('\n Check that eval is working:', eval_results_dict, '\n', sep='\n')

In [None]:
# Marker in the cell output showing that the training cell comes next.
print('start training')

In [None]:
# Train
# Runs training with the model, arguments and datasets given to the Trainer.
trainer.train()

In [None]:
#get metric history
# Build a DataFrame from the Trainer's log history (presumably a list of
# per-log dictionaries, giving one row per logged entry - confirm).
history = pd.DataFrame(trainer.state.log_history)

In [None]:
# Display the raw log history table.
history

In [None]:
#clean history
# Forward-fill missing values so each row carries the most recent value of
# every logged metric. `DataFrame.ffill()` replaces `fillna(method='ffill')`,
# which pandas has deprecated; rebinding instead of `inplace=True` keeps the
# cell safe to re-run.
history = history.ffill()

In [None]:
# Keep a single row per training step: the last one logged for that step.
history = history.drop_duplicates(subset=['step'], keep='last')

In [None]:
# Display the history table after forward-filling and de-duplicating by step.
history

In [None]:
def loss_graph(history):
    """Plot training and validation loss against the training step.

    Parameters
    ----------
    history : pandas.DataFrame or mapping of column name -> sequence
        Must provide the columns 'step', 'loss' and 'eval_loss'.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    steps = np.asarray(history['step'])
    val_loss = np.asarray(history['eval_loss'], dtype=float)

    # One axes filling the whole figure; a 2x1 grid with only its first panel
    # drawn would leave the lower half of the figure blank.
    fig, ax = plt.subplots(figsize=(8, 6))

    # Plot loss
    ax.plot(steps, history['loss'], color='red', linestyle='-', label='Training Loss')
    ax.plot(steps, val_loss, color='green', linestyle='--', label='Validation Loss')

    # Mark the step with the lowest validation loss. NaN entries are ignored,
    # and the marker is skipped when no finite validation loss is available.
    if np.isfinite(val_loss).any():
        best = np.nanargmin(val_loss)
        ax.scatter(steps[best], val_loss[best], color='blue', zorder=3,
                   label='Lowest Validation Loss')

    ax.set_title('Training and Validation Loss')
    ax.set_xlabel('Step')
    ax.set_ylabel('Loss')
    ax.legend()

    fig.tight_layout()
    # To keep a copy on disk, call fig.savefig(<path>, bbox_inches='tight') here.
    plt.show()

In [None]:
#plot loss graph
# Draws the training/validation loss curves from the `history` DataFrame.

loss_graph(history)

In [None]:
# Predict on evaluation set
# The result exposes `.predictions` and `.label_ids`, which later cells read.
eval_pred = trainer.predict(data['val'])

In [None]:
# Sanity check: print the shapes of the predictions array and the label array.
print(eval_pred.predictions.shape, eval_pred.label_ids.shape)


In [None]:
# Predicted class for each example: index of the largest value along the last axis.
preds = eval_pred.predictions.argmax(axis=-1)

In [None]:
# Display the array of predicted class indices.
preds

In [None]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, classes=(0, 1)):
    """Draw the confusion matrix of `y_pred` against `y_true` as a heatmap.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels.
    classes : sequence, default (0, 1)
        Tick labels for both axes of the heatmap, in matrix order. An
        immutable default is used so it cannot be mutated between calls.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    tick_labels = list(classes)

    # Create confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Scale fonts only while this figure is built; a bare sns.set(...) would
    # permanently change the theme of every later plot in the notebook.
    with sns.plotting_context("notebook", font_scale=1.2):
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', cbar=False,
                    xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)

        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title('Confusion Matrix')
        plt.show()




In [None]:
# Confusion matrix of the validation split's labels against the argmax predictions.
plot_confusion_matrix(data['val']['label'], preds)