In [None]:
# Mount Google Drive at /content/drive; the tokenizer, model, dataset and result
# paths used later in this notebook all live under that mount point.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Basics

In [None]:

!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m41.9 MB/s[0m eta [36m0:00:00

In [None]:
import os
import re
import copy
import json
import torch
from pathlib import Path
from transformers import AutoTokenizer, RobertaTokenizerFast, RobertaForMaskedLM, RobertaModel, BertModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, RobertaForTokenClassification
from transformers import pipeline, DataCollatorWithPadding, EarlyStoppingCallback, DataCollatorForTokenClassification
from collections import defaultdict
from tqdm import tqdm
import torch
import torch.nn.functional as F
from scipy.stats import spearmanr, linregress
import matplotlib.pyplot as plt
import pandas as pd
import time
from datasets import Dataset, DatasetDict, ClassLabel
from sklearn.model_selection import train_test_split
import random
import pandas as pd
from datasets import load_dataset, concatenate_datasets
import logging
from transformers import RobertaTokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Load tokenizers

In [None]:
def load_json(path):
    """Read the JSON document stored at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's default locale encoding (the dataset reader in this notebook
    also reads its text files as UTF-8).

    Args:
        path: Location of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def store_json(path, object):
    """Serialise ``object`` as JSON and write it to ``path``, replacing any existing file.

    Args:
        path: Destination file path.
        object: Any value accepted by ``json.dumps``.

    Raises:
        TypeError: If ``object`` contains values that are not JSON serialisable.
            In that case the destination file is left untouched.
        OSError: If the file cannot be opened for writing.
    """
    # Serialise before opening the file: opening with 'w' truncates immediately,
    # so a serialisation error afterwards would leave a half-written (or empty)
    # file where a valid result file used to be.
    payload = json.dumps(object)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(payload)
In [None]:

# Load the tokenizer saved under Tokenizers/WP_snellius on Drive
# (presumably the WordPiece variant, judging by the "WP" prefix — confirm).
tokenizer_wp = AutoTokenizer.from_pretrained('/content/drive/MyDrive/Thesis/Code/Tokenizers/WP_snellius')


# Create basic dataset

In [None]:
# Root folder of the DBRD data on Drive; the code below reads its
# train/{pos,neg} and test/{pos,neg} sub-directories.
base_path = "/content/drive/MyDrive/Thesis/Code/Datasets/DBRD"



def load_files_from_directory(directory, label):
    """Read every regular file in ``directory`` into a labelled example.

    Each file is read as UTF-8, stripped of surrounding whitespace and
    lower-cased. Sub-directories and other non-file entries are skipped.

    Args:
        directory: Folder whose files are read (not recursive).
        label: Value stored under ``'label'`` for every example from this folder.

    Returns:
        A list of ``{'text': str, 'label': label}`` dicts, ordered by filename.
    """
    data = []
    # os.listdir yields entries in arbitrary, filesystem-dependent order. Sorting
    # makes the example order stable, so the seeded shuffle / split applied
    # downstream selects the same rows on every machine and run.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().strip().lower()
        data.append({'text': text, 'label': label})
    return data


# One directory per (split, sentiment) combination underneath base_path.
train_pos_dir, train_neg_dir = (
    os.path.join(base_path, 'train', sentiment) for sentiment in ('pos', 'neg')
)
test_pos_dir, test_neg_dir = (
    os.path.join(base_path, 'test', sentiment) for sentiment in ('pos', 'neg')
)

# Echo the resolved locations so a wrong Drive path is spotted immediately.
for review_dir in (train_pos_dir, train_neg_dir, test_pos_dir, test_neg_dir):
    print(review_dir)

# Positive reviews are labelled 1, negative reviews 0.
train_pos_data = load_files_from_directory(train_pos_dir, 1)
train_neg_data = load_files_from_directory(train_neg_dir, 0)
test_pos_data = load_files_from_directory(test_pos_dir, 1)
test_neg_data = load_files_from_directory(test_neg_dir, 0)

# Positive examples first, negative examples after, for each split.
train_data = [*train_pos_data, *train_neg_data]
test_data = [*test_pos_data, *test_neg_data]

# Shuffle each split once with a fixed seed so the two classes are interleaved.
train_dataset = Dataset.from_list(train_data).shuffle(seed=42)
test_dataset = Dataset.from_list(test_data).shuffle(seed=42)

dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

def check_duplicates(dataset):
    """Count examples whose ``'text'`` repeats one that appeared earlier in ``dataset``.

    Every occurrence after the first is counted once, so three identical
    texts contribute two duplicates.

    Args:
        dataset: Iterable of mappings that each provide a ``'text'`` entry.

    Returns:
        The number of repeated texts as an int.
    """
    all_texts = [row['text'] for row in dataset]
    # Total rows minus distinct texts equals the number of repeat occurrences.
    return len(all_texts) - len(set(all_texts))

# Duplicates are counted within each split separately (not across splits).
train_duplicates, test_duplicates = (
    check_duplicates(dataset_dict[split_name]) for split_name in ('train', 'test')
)

print(f"Number of duplicates in the train dataset: {train_duplicates}")
print(f"Number of duplicates in the test dataset: {test_duplicates}")



/content/drive/MyDrive/Thesis/Code/Datasets/DBRD/train/pos
/content/drive/MyDrive/Thesis/Code/Datasets/DBRD/train/neg
/content/drive/MyDrive/Thesis/Code/Datasets/DBRD/test/pos
/content/drive/MyDrive/Thesis/Code/Datasets/DBRD/test/neg
Number of duplicates in the train dataset: 0
Number of duplicates in the test dataset: 0


In [None]:
# Display the splits, their columns and row counts before the eval split is carved out.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20028
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2224
    })
})

In [None]:
# Hold out 10% of the training data as an evaluation split (fixed seed).
# The result is bound to `train_eval_split` instead of `train_test_split`:
# the latter name is the scikit-learn function imported at the top of the
# notebook, and rebinding it to a DatasetDict would make that function
# unusable for every cell that runs afterwards.
train_eval_split = dataset_dict['train'].train_test_split(test_size=0.1, seed=42)

# 'eval' is used for model selection during training; 'test' is the untouched DBRD test set.
dataset_dict_ = DatasetDict({
    'train': train_eval_split['train'],
    'eval': train_eval_split['test'],
    'test': dataset_dict['test']
})

print(dataset_dict_)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 18025
    })
    eval: Dataset({
        features: ['text', 'label'],
        num_rows: 2003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2224
    })
})


# WP

In [None]:
# Module-level alias: the collate function further down reads `tokenizer.pad_token_id`.
tokenizer = tokenizer_wp

def tokenize_function(example):
    """Tokenize the ``'text'`` field with the module-level ``tokenizer``.

    No truncation or padding is requested here; over-long sequences are
    shortened afterwards by ``truncate_from_beginning``. Works for a single
    example (``str``) as well as for a batch (``list[str]``).
    """
    return tokenizer(example['text'])

# batched=True hands the tokenizer lists of texts instead of one text per call,
# which yields the same columns while avoiding per-example call overhead.
tokenized_datasets = dataset_dict_.map(tokenize_function, batched=True)


print(tokenized_datasets)

Map:   0%|          | 0/18025 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/2003 [00:00<?, ? examples/s]

Map:   0%|          | 0/2224 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 18025
    })
    eval: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2003
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2224
    })
})


In [None]:
def count_and_percentage_long_sequences(dataset, max_length=512):
    """Count examples whose ``'input_ids'`` are longer than ``max_length``.

    Args:
        dataset: Sized iterable of mappings that each provide ``'input_ids'``.
        max_length: Length threshold; sequences strictly longer are counted.

    Returns:
        ``(count, percentage)`` where ``percentage`` is ``count`` relative to
        ``len(dataset)`` on a 0-100 scale. An empty dataset yields ``(0, 0.0)``
        instead of raising ``ZeroDivisionError``.
    """
    total = len(dataset)
    if total == 0:
        return 0, 0.0
    count = sum(1 for example in dataset if len(example['input_ids']) > max_length)
    percentage = (count / total) * 100
    return count, percentage


# Measure how many tokenized reviews exceed the 512-token limit before truncation
# (only the train and test splits are inspected here).
train_long_stats = count_and_percentage_long_sequences(tokenized_datasets['train'])
test_long_stats = count_and_percentage_long_sequences(tokenized_datasets['test'])

num_long_sequences_train, perc_long_sequences_train = train_long_stats
num_long_sequences_test, perc_long_sequences_test = test_long_stats

print(f"Number of sequences in the train dataset longer than 512 tokens: {num_long_sequences_train} ({perc_long_sequences_train:.2f}%)")
print(f"Number of sequences in the test dataset longer than 512 tokens: {num_long_sequences_test} ({perc_long_sequences_test:.2f}%)")

Number of sequences in the train dataset longer than 512 tokens: 3722 (20.65%)
Number of sequences in the test dataset longer than 512 tokens: 461 (20.73%)


In [None]:
def truncate_from_beginning(example, max_length, num_leading_special_tokens=1):
    """Shorten an over-long example by dropping tokens from its start, keeping the tail.

    The first ``num_leading_special_tokens`` positions are preserved and the
    tokens right after them are removed, so the result is that prefix followed
    by the final tokens of the sequence. Plain tail slicing would also discard
    the leading special token, and a sequence-classification head that pools
    from position 0 would then read an arbitrary word piece for every
    truncated example.

    NOTE(review): the default of 1 assumes the tokenizer prepends exactly one
    special token (e.g. ``[CLS]`` / ``<s>``) — confirm for the tokenizer in
    use; pass 0 to get pure tail truncation.

    Args:
        example: Mapping with ``'input_ids'``, ``'attention_mask'`` (list-like
            sequences that support slicing and ``+`` concatenation) and ``'label'``.
        max_length: Maximum number of tokens to keep.
        num_leading_special_tokens: How many leading positions to preserve.

    Returns:
        Dict with ``'input_ids'``, ``'attention_mask'`` and ``'label'``; the two
        sequences have at most ``max_length`` entries. The input is not modified.
    """
    input_ids = example['input_ids']
    attention_mask = example['attention_mask']
    label = example['label']

    overflow = len(input_ids) - max_length
    if overflow > 0:
        # Clamp so the preserved prefix can never exceed the budget itself.
        head = min(max(num_leading_special_tokens, 0), max_length)
        # Drop exactly `overflow` tokens located directly after the prefix.
        # (Slicing with an explicit start index also handles max_length == 0,
        # where `seq[-0:]` would have returned the whole sequence.)
        resume = head + overflow
        input_ids = input_ids[:head] + input_ids[resume:]
        attention_mask = attention_mask[:head] + attention_mask[resume:]

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}


def apply_custom_truncation(tokenized_datasets, max_length=512):
    """Run ``truncate_from_beginning`` over every example of ``tokenized_datasets``.

    Args:
        tokenized_datasets: Object exposing ``map(function)``.
        max_length: Token budget forwarded to ``truncate_from_beginning``.

    Returns:
        Whatever ``tokenized_datasets.map`` returns for the truncation function.
    """
    def _truncate(example):
        return truncate_from_beginning(example, max_length)

    return tokenized_datasets.map(_truncate)

# Cap every split at 512 tokens via truncate_from_beginning.
truncated_datasets = apply_custom_truncation(tokenized_datasets, max_length=512)

# Expose only the three columns the collate function reads, returned as torch tensors.
# This mutates truncated_datasets in place.
truncated_datasets.set_format(type='torch', columns=['label', 'input_ids', 'attention_mask'])

Map:   0%|          | 0/18025 [00:00<?, ? examples/s]

Map:   0%|          | 0/2003 [00:00<?, ? examples/s]

Map:   0%|          | 0/2224 [00:00<?, ? examples/s]

In [None]:
# Sanity check after truncation: no train/test sequence should exceed 512 tokens any more.
train_long_stats = count_and_percentage_long_sequences(truncated_datasets['train'])
test_long_stats = count_and_percentage_long_sequences(truncated_datasets['test'])

num_long_sequences_train, perc_long_sequences_train = train_long_stats
num_long_sequences_test, perc_long_sequences_test = test_long_stats

print(f"Number of sequences in the train dataset longer than 512 tokens: {num_long_sequences_train} ({perc_long_sequences_train:.2f}%)")
print(f"Number of sequences in the test dataset longer than 512 tokens: {num_long_sequences_test} ({perc_long_sequences_test:.2f}%)")

Number of sequences in the train dataset longer than 512 tokens: 0 (0.00%)
Number of sequences in the test dataset longer than 512 tokens: 0 (0.00%)


In [None]:
def custom_collate_fn(batch):
    """Collate variable-length examples into one right-padded batch.

    Args:
        batch: Sequence of examples, each providing tensor-valued
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` entries.

    Returns:
        Dict with ``'input_ids'`` (padded with the module-level tokenizer's
        ``pad_token_id``), ``'attention_mask'`` (padded with 0) and
        ``'labels'`` (the per-example labels stacked into one tensor).
    """
    def _pad_column(key, fill_value):
        # Pad every sequence of this column to the longest one in the batch.
        column = [sample[key] for sample in batch]
        return pad_sequence(column, batch_first=True, padding_value=fill_value)

    return {
        'input_ids': _pad_column('input_ids', tokenizer.pad_token_id),
        'attention_mask': _pad_column('attention_mask', 0),
        # Note the key change: examples carry 'label', the batch exposes 'labels'.
        'labels': torch.stack([sample['label'] for sample in batch]),
    }


# Stand-alone loaders over the three splits (batch size 16, DataLoader's default ordering).
# NOTE(review): these are not referenced anywhere else in the visible notebook —
# CustomTrainer below builds its own loaders; confirm whether they are still needed.
train_loader = DataLoader(truncated_datasets['train'], batch_size=16, collate_fn=custom_collate_fn)
validation_loader = DataLoader(truncated_datasets['eval'], batch_size=16, collate_fn=custom_collate_fn)
test_loader = DataLoader(truncated_datasets['test'], batch_size=16, collate_fn=custom_collate_fn)


class CustomTrainer(Trainer):
    """Trainer whose dataloaders batch examples with ``custom_collate_fn``.

    The train, eval and test loaders are all built the same way, so every code
    path (``train``, ``evaluate``, ``predict``) pads batches identically.
    """

    def _collated_loader(self, dataset, batch_size, shuffle=False):
        """Build a DataLoader over ``dataset`` that collates with ``custom_collate_fn``."""
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=custom_collate_fn
        )

    def get_train_dataloader(self):
        """Return the training loader; examples are reshuffled on every epoch."""
        # DataLoader defaults to shuffle=False, which would replay identical
        # batches in identical order in every epoch.
        return self._collated_loader(
            self.train_dataset,
            self.args.per_device_train_batch_size,
            shuffle=True
        )

    def get_eval_dataloader(self, eval_dataset=None):
        """Return the evaluation loader for ``eval_dataset`` (default: ``self.eval_dataset``)."""
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        return self._collated_loader(eval_dataset, self.args.per_device_eval_batch_size)

    def get_test_dataloader(self, test_dataset):
        """Return the prediction loader so ``predict()`` uses the same padding collate."""
        return self._collated_loader(test_dataset, self.args.per_device_eval_batch_size)

In [None]:
# Load the checkpoint stored under Models/WP/x4/x4 into a sequence-classification model.
# The classifier head weights are not in that checkpoint and are freshly initialised
# (see the warning printed below), so the model has to be fine-tuned before use.
model = RobertaForSequenceClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/WP/x4/x4')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Thesis/Code/Models/WP/x4/x4 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:


# Fine-tuning configuration: evaluate, checkpoint and log once per epoch, and reload
# the checkpoint with the lowest eval_loss when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,  # upper bound; the early-stopping callback below may end training sooner
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # a lower eval_loss counts as better
    save_total_limit=1,
    logging_steps=200  # NOTE(review): presumably has no effect while logging_strategy="epoch" — confirm
)

# No compute_metrics is passed here, so the only validation signal during
# training is the loss that metric_for_best_model refers to.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=truncated_datasets['train'],
    eval_dataset=truncated_datasets['eval'],
    # Early stopping with a patience of 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)



In [None]:
# Run fine-tuning; the per-epoch training/validation losses are shown in the output below.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2999,0.227913
2,0.1814,0.284739
3,0.1332,0.351876
4,0.0939,0.331692


TrainOutput(global_step=4508, training_loss=0.17708866556027134, metrics={'train_runtime': 1158.2845, 'train_samples_per_second': 311.236, 'train_steps_per_second': 19.46, 'total_flos': 3772561970616576.0, 'train_loss': 0.17708866556027134, 'epoch': 4.0})

# Evaluation

### WP

In [None]:
import numpy as np
# scikit-learn replaces `datasets.load_metric`, which is deprecated and asks an
# interactive "run the custom code? [y/N]" question for every metric it loads,
# so the notebook could not be re-run unattended.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Evaluation-only arguments: no training, batches of 16.
eval_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=200,
    disable_tqdm=False,
    do_train=False,
    do_eval=True
)


def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    Args:
        p: Pair ``(predictions, labels)`` as handed over by the Trainer, where
            ``predictions`` holds one row of class scores per example
            (argmax over axis 1 selects the predicted class).

    Returns:
        Dict with ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'`` as
        plain Python floats (JSON serialisable). Precision/recall/F1 use
        ``average='binary'``, i.e. they are reported for the positive class (label 1).
    """
    scores, labels = p
    predictions = np.argmax(scores, axis=1)

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 reports 0.0 (without a warning) when a score is undefined,
    # e.g. when the positive class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )

    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }


# Note: this rebinds `trainer`; from here on it wraps the fine-tuned model
# for evaluation on the held-out test split.
trainer = CustomTrainer(
    model=model,
    args=eval_args,
    eval_dataset=truncated_datasets['test'],
    compute_metrics=compute_metrics
)


# Evaluate the model on the test set
test_results_wp = trainer.evaluate()

print("Test set evaluation results:", test_results_wp)

  accuracy_metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The repository for accuracy contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/accuracy.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

The repository for precision contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/precision.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

The repository for recall contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/recall.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

The repository for f1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/f1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Test set evaluation results: {'eval_loss': 0.2183023989200592, 'eval_accuracy': 0.9181654676258992, 'eval_precision': 0.9297597042513863, 'eval_recall': 0.9046762589928058, 'eval_f1': 0.9170464904284412, 'eval_runtime': 11.5141, 'eval_samples_per_second': 193.155, 'eval_steps_per_second': 12.072}


In [None]:
# Display the test-set metrics returned by trainer.evaluate().
test_results_wp

{'eval_loss': 0.2183023989200592,
 'eval_accuracy': 0.9181654676258992,
 'eval_precision': 0.9297597042513863,
 'eval_recall': 0.9046762589928058,
 'eval_f1': 0.9170464904284412,
 'eval_runtime': 11.5141,
 'eval_samples_per_second': 193.155,
 'eval_steps_per_second': 12.072}

In [None]:
# Persist the test-set metrics to Drive so they outlive this Colab runtime.
store_json('/content/drive/MyDrive/Thesis/Code/SA_WP_results.json', test_results_wp)