In [None]:
# Mount Google Drive (Colab-only helper) so the files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Colab setup: install/upgrade the packages this notebook relies on.
# %pip (instead of `! pip` / `!pip3`) installs into the environment of the running kernel,
# so the freshly installed versions are the ones the notebook imports.
%pip install -U accelerate
%pip install -U transformers
%pip install evaluate
%pip install datasets

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import time

import re

import matplotlib.pyplot as plt
import seaborn as sns

from datasets import Dataset, DatasetDict



In [None]:
# Make the thesis folder the working directory; the relative save_to_disk paths used later resolve against it.
%cd '/content/drive/MyDrive/Thesis'

## Data

In [None]:
# Load the pre-split feature (X) and target (y) tables from Drive and time the whole load
start = time.time()
print('Loading dataframes...')

# Feature tables, read in train / val / test order
X_train, X_val, X_test = (
    pd.read_csv(f"/content/drive/MyDrive/Thesis/data/HR_X_{split}.csv")
    for split in ("train", "val", "test")
)

# Target tables, same order
y_train, y_val, y_test = (
    pd.read_csv(f"/content/drive/MyDrive/Thesis/data/HR_y_{split}.csv")
    for split in ("train", "val", "test")
)

elapsed = time.time() - start
print(f'Done loading dataframe in {elapsed} seconds.')

In [None]:
# Sanity check: print the X and y shape of every split, with a gap between splits
shape_pairs = [(X_train, y_train), (X_val, y_val), (X_test, y_test)]
for pair_number, (features, targets) in enumerate(shape_pairs):
    if pair_number > 0:
        print('\n')
    print(features.shape)
    print(targets.shape)


In [None]:
# Check the date-based split: print the latest decision date found in each split
for split_frame in (X_train, X_val, X_test):
    print(split_frame["date_decision"].max())


In [None]:
# Show every column name available in the training features
X_train.columns.tolist()

In [None]:
# Keep only the raw text column of each split
X_train_text, X_val_text, X_test_text = (
    split_frame['full_text'] for split_frame in (X_train, X_val, X_test)
)

In [None]:
# Put text and target side by side (column-wise) for each split, ready for converting to a Hugging Face Dataset
Xy_train, Xy_val, Xy_test = (
    pd.concat([text, target], axis=1)
    for text, target in (
        (X_train_text, y_train),
        (X_val_text, y_val),
        (X_test_text, y_test),
    )
)

In [None]:
# Shapes of the combined text + target frames, one line per split
for combined in (Xy_train, Xy_val, Xy_test):
    print(combined.shape)

In [None]:
# Turn each combined pandas frame into a Hugging Face Dataset, leaving the pandas index out
train = Dataset.from_pandas(
    Xy_train,
    split="train",
    preserve_index=False,
)
val = Dataset.from_pandas(
    Xy_val,
    split="val",
    preserve_index=False,
)
test = Dataset.from_pandas(
    Xy_test,
    split="test",
    preserve_index=False,
)

In [None]:
# Bundle the three splits under their split names
raw_data = DatasetDict(dict(train=train, val=val, test=test))

In [None]:
# Display the assembled DatasetDict as the cell output
raw_data

In [None]:
# Character length of the first training document.
# Index the row first so only that one record is read, instead of materialising the whole text column.
len(raw_data['train'][0]['full_text'])

## Tokenize

In [None]:
#Initialize tokenizer
# NOTE(review): AutoModelForSequenceClassification is imported here but not used in the cells visible in this notebook section.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Checkpoint identifier; the tokenizer loaded from it is used by every tokenisation cell below.
checkpoint = "DTAI-KULeuven/robbert-2023-dutch-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Build the head-truncated dataset
# NOTE(review): the name tokenize_function is defined again further down with different settings.
def tokenize_function(example):
    """Tokenize the 'full_text' entry of a mapped batch with padding and truncation enabled."""
    texts = example['full_text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded


tokenized_dataset_head_truncation = raw_data.map(tokenize_function, batched=True)



In [None]:
# Display the head-truncated tokenized dataset as the cell output
tokenized_dataset_head_truncation

In [None]:
# Persist the head-truncated dataset; the relative path resolves against the current working directory.
tokenized_dataset_head_truncation.save_to_disk("HR_tokenized_dataset_head_truncated.hf")

In [None]:
# Build the dataset without any truncation (used below to measure document lengths in tokens)
# NOTE(review): this redefines tokenize_function from the head-truncation cell above.
def tokenize_function(example):
    """Tokenize the 'full_text' entry of a mapped batch with padding and truncation disabled."""
    texts = example['full_text']
    encoded = tokenizer(texts, truncation=False, padding=False)
    return encoded


tokenized_dataset_no_truncation = raw_data.map(tokenize_function, batched=True)




In [None]:
# Persist the untruncated dataset; the relative path resolves against the current working directory.
tokenized_dataset_no_truncation.save_to_disk("HR_tokenized_dataset_no_truncation.hf")

In [None]:
# Token count of every training document, plus a flag for documents longer than 512 tokens
examples = tokenized_dataset_no_truncation["train"]["input_ids"]
n_tokens_list = list(map(len, examples))
larger_512_list = [n_tokens > 512 for n_tokens in n_tokens_list]

In [None]:
# Share of training documents that exceed 512 tokens
larger_512_list.count(True) / len(larger_512_list)

In [None]:
# Smallest and largest token count, printed in that order
for extreme in (min, max):
    print(extreme(n_tokens_list))

In [None]:
# Create plot n_tokens distribution

# Bucket the per-document token counts.
# pd.cut uses right-closed intervals by default, so the buckets are
# (0, 512], (512, 1024], (1024, 2048] and (2048, inf). The labels spell out
# exactly those boundaries (a document of exactly 512 tokens lands in the first bucket).
n_tokens_df = pd.DataFrame(n_tokens_list, columns=['Values'])
bins = [0, 512, 1024, 2048, float('inf')]  # float('inf') leaves the last bucket open-ended
labels = ['<= 512', '513 - 1024', '1025 - 2048', '> 2048']

n_tokens_df['Category'] = pd.cut(n_tokens_df['Values'], bins=bins, labels=labels)


# Count the documents per bucket, keeping the buckets in their natural order
category_counts = n_tokens_df['Category'].value_counts().sort_index().reset_index()
category_counts.columns = ['Category', 'Count']  # Rename columns for clarity

# Bar plot of the bucket counts
sns.barplot(x='Category', y='Count', data=category_counts, palette='viridis')
plt.xlabel('Tokens')
plt.ylabel('Count')
plt.title('Distribution of n tokens')
plt.xticks(rotation=45)  # Rotate x labels for better visibility
plt.show()

In [None]:
# Tokenisation with content-aware truncation for use with HuggingFace `map`:
# over-long documents are cut so that they start at the first "beoordeling ..." heading.

# Module-level bookkeeping filled in by truncate_text on every call
n_tokens_list2 = []
larger_512_count2 = 0


def truncate_text(example):
    """Tokenize one example, re-anchoring documents longer than 512 tokens.

    The full text is tokenized once and its token count is appended to
    ``n_tokens_list2``. Documents of at most 512 tokens are returned as
    tokenized. For longer ones ``larger_512_count2`` is incremented and the
    text is cut to begin at the earliest occurrence (case-insensitive) of one
    of the known "beoordeling van ..." phrases; if none occurs, at the first
    plain "beoordeling"; if that is missing too, at the very beginning. The
    cut text is then tokenized again with truncation enabled.

    Args:
        example: mapping with the document under the 'full_text' key.

    Returns:
        The tokenizer output for the (possibly re-anchored) text.
    """
    global n_tokens_list2, larger_512_count2

    full_text = example['full_text']

    tokens = tokenizer(full_text, add_special_tokens=True)
    n_tokens2 = len(tokens['input_ids'])
    n_tokens_list2.append(n_tokens2)

    # Short enough already: hand back the untruncated encoding
    if n_tokens2 <= 512:
        return tokens

    larger_512_count2 += 1

    phrases = (
        "beoordeling van het middel",
        "beoordeling van het eerste middel",
        "beoordeling van het tweede middel",
        "beoordeling van het derde middel",
        "beoordeling van de middelen",
        "beoordeling van de ontvankelijkheid",
    )
    searchable = str(full_text)

    # First-occurrence offset of every phrase that appears in the text
    found_at = [
        hit.start()
        for hit in (
            re.search(re.escape(phrase), searchable, flags=re.IGNORECASE)
            for phrase in phrases
        )
        if hit
    ]

    if found_at:
        start_pos = min(found_at)
    else:
        # No specific heading found: fall back to the bare word, else keep the whole text
        generic_hit = re.search(re.escape("beoordeling"), searchable, flags=re.IGNORECASE)
        start_pos = generic_hit.start() if generic_hit else 0

    return tokenizer(full_text[start_pos:], add_special_tokens=True, truncation=True)




In [None]:
# Create middle truncated dataset
# truncate_text is applied one example at a time (batched=False).
# NOTE(review): truncate_text also appends to n_tokens_list2 / larger_512_count2, and raw_data holds
# train, val and test — so those counters presumably mix all splits; confirm before reporting them.
tokenized_dataset = raw_data.map(truncate_text, batched=False)


In [None]:
# Display the tokenized dataset as the cell output
tokenized_dataset

In [None]:
# Rename column y to label
# Guarded so the cell can be re-run safely: after the first run "cit_in_binary" no longer
# exists and an unconditional rename_column would raise.
if any("cit_in_binary" in columns for columns in tokenized_dataset.column_names.values()):
    tokenized_dataset = tokenized_dataset.rename_column("cit_in_binary", "label")

In [None]:
# Display the dataset again to check the column rename
tokenized_dataset

In [None]:
# Persist the final tokenized dataset; the relative path resolves against the current working directory.
tokenized_dataset.save_to_disk("HR_tokenized_dataset.hf")

In [None]:
 from datasets import load_from_disk

In [None]:
# Read the saved dataset back from its absolute Drive path.
# NOTE(review): this matches the relative save_to_disk("HR_tokenized_dataset.hf") only when the
# working directory at save time was /content/drive/MyDrive/Thesis.
reloaded_encoded_dataset = load_from_disk("/content/drive/MyDrive/Thesis/HR_tokenized_dataset.hf")


In [None]:
# Display the reloaded dataset as the cell output
reloaded_encoded_dataset