In [None]:
# Mount Google Drive at /content/drive so the project folder is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the project folder on the mounted drive, so that
# relative paths used later (e.g. the training output_dir) resolve inside it.
%cd '/content/drive/MyDrive/Thesis'

In [None]:
! pip install -U accelerate
! pip install -U transformers
!pip3 install datasets

In [None]:
import numpy as np
import pandas as pd
import math

from transformers import Trainer, TrainingArguments, AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset, load_from_disk, concatenate_datasets, DatasetDict



In [None]:
# Define the checkpoint identifier and initialize its tokenizer.
# `checkpoint` is reused further down to load the model, so tokenizer and model
# come from the same pretrained checkpoint.
checkpoint= "DTAI-KULeuven/robbert-2023-dutch-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


## Data

In [None]:
# Import dataset: load the 'nl' configuration of `multi_eurlex`
dataset = load_dataset('multi_eurlex', 'nl')

In [None]:
# Check dataset: display the loaded object as the cell output
dataset

In [None]:
# Check the text of one training example.
# Index the row first and then pick the "text" field: this reads a single example,
# whereas `dataset["train"]["text"][2]` first accesses the complete text column
# (which, depending on the `datasets` version, loads every document into memory)
# just to read one item.
dataset["train"][2]["text"]

In [None]:
# Remove unneeded columns.
# Only the columns that are still present are dropped, so re-running this cell
# (after `dataset` has already been reassigned) does not raise on missing columns.
# As with the plain `remove_columns` call, every split is expected to share the
# same columns; the "train" split is used to look them up.
unneeded_columns = ["labels", "celex_id"]
columns_to_drop = [name for name in unneeded_columns if name in dataset["train"].column_names]
if columns_to_drop:
    dataset = dataset.remove_columns(columns_to_drop)


In [None]:
# Display the dataset again to verify the remaining columns
dataset

In [None]:
# Batch-wise tokenization helper
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    When the tokenizer reports `is_fast`, a "word_ids" entry is added that holds
    `word_ids(i)` for every sequence `i` in the batch.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(row_count)))
    return encoded

# Run the helper over the dataset in batches and drop the raw "text" column
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


In [None]:
# Display the tokenized dataset
tokenized_datasets

In [None]:
# Create chunked data

# Number of items (tokens) in every chunk produced by `group_texts`.
chunk_size = 128

def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists.
            Must contain the key "input_ids".

    Returns:
        A dict with the same keys, where each value is a list of chunks holding
        exactly `chunk_size` items, plus a "labels" entry copied from
        "input_ids". A trailing remainder shorter than `chunk_size` is dropped.
    """
    # Flatten every column in a single pass. The previous `sum(lists, [])`
    # re-copied the accumulated list on each addition (quadratic in the batch).
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    # Length of the concatenated batch, measured on the first column
    total_length = len(concatenated_examples[next(iter(examples.keys()))])
    # Round down to a multiple of chunk_size, i.e. drop the incomplete last chunk
    total_length = (total_length // chunk_size) * chunk_size
    # Slice every column into consecutive chunks of chunk_size items
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # New "labels" column: a (shallow) copy of the list of input_ids chunks
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Chunk every split: with batched=True, `group_texts` receives a batch of examples
# at a time, concatenates it and re-splits it into chunks of `chunk_size`.
lm_dataset = tokenized_datasets.map(group_texts, batched=True)
# Display the chunked dataset
lm_dataset

In [None]:
# Persist the chunked dataset; the path is relative, so it is written below the
# current working directory.
lm_dataset.save_to_disk("EURLEX_tokenized_dataset_full.hf")

In [None]:
# Save an input_ids-only copy of the chunked dataset to Drive.
# The previous version of this cell referenced `tokenized_dataset`, a name that is
# never defined in this notebook (NameError on a fresh kernel), and wrote to
# ".../MyDrive/thesis/..." although every other cell uses ".../MyDrive/Thesis/...".
# NOTE(review): assumes the intended source is the chunked `lm_dataset` reduced to
# its "input_ids" column, as the file name suggests — confirm.
input_ids_dataset = lm_dataset.remove_columns(
    [name for name in lm_dataset["train"].column_names if name != "input_ids"]
)
input_ids_dataset.save_to_disk("/content/drive/MyDrive/Thesis/EURLEX_tokenized_dataset_input_ids.hf")

## Model

In [None]:
# Reload the saved, not-yet-collapsed dataset. It still contains the "validation"
# split that the collapse step below merges into "train".
# Loading the "_collapsed" file here cannot work on a fresh run: that file is only
# written further down in this notebook, and once written it no longer has a
# "validation" split for the collapse step to read.
reloaded_encoded_dataset = load_from_disk("/content/drive/MyDrive/Thesis/EURLEX_tokenized_dataset_input_ids.hf")


In [None]:
# Display the reloaded dataset
reloaded_encoded_dataset

In [None]:
# Collapse train and validation into a single training split.
# A fresh DatasetDict is built instead of assigning into
# `reloaded_encoded_dataset["train"]`: the in-place assignment appended the
# validation split once more on every re-run of this cell.
data = DatasetDict({
    "train": concatenate_datasets(
        [reloaded_encoded_dataset["train"], reloaded_encoded_dataset["validation"]]
    ),
    # Carry over every other split unchanged; "validation" is now part of "train".
    **{
        split_name: split
        for split_name, split in reloaded_encoded_dataset.items()
        if split_name not in ("train", "validation")
    },
})


In [None]:
# Display the collapsed dataset
data

In [None]:
# Persist the collapsed dataset (validation merged into train) to Drive
data.save_to_disk("/content/drive/MyDrive/Thesis/EURLEX_tokenized_dataset_input_ids_collapsed.hf")

In [None]:
# Initialize the data collator for language modeling, using the tokenizer loaded
# above and a masking probability (mlm_probability) of 0.15
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Log in to the Hugging Face Hub from within the notebook; the training arguments
# below set push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Initialize the masked-language model from the same checkpoint as the tokenizer
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

In [None]:
# Define training arguments
batch_size = 64        # used for both the per-device train and eval batch size
logging_steps = 5000   # passed to TrainingArguments as `logging_steps`

training_args = TrainingArguments(
    output_dir='RobBERT-legal',
    # `eval_strategy` is the current name of this option. The older
    # `evaluation_strategy` keyword is deprecated and rejected by recent
    # transformers releases, which is what the unpinned `-U` install provides.
    eval_strategy="epoch",

    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,

    learning_rate=1e-5,
    lr_scheduler_type="polynomial",
    warmup_ratio=0.1,
    weight_decay=0.1,

    optim='adamw_torch',

    push_to_hub=True,
    fp16=False,
    logging_steps=logging_steps,
    remove_unused_columns=True,
)

In [None]:
# Build the Trainer from the model, arguments, data splits and collator defined above
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=data["train"],
    eval_dataset=data["test"],
)

In [None]:
# Get perplexity before further pre-training.
# Perplexity is reported as exp(eval_loss), taken from the evaluation results.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Run the further pre-training with the arguments configured above
trainer.train()

In [None]:
# Get perplexity after further pre-training.
# Same computation as before training, exp(eval_loss), so both numbers are comparable.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")