In [1]:
# prompt: connect to drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Basics

In [2]:

!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [3]:
import os
import re
import copy
import json
import torch
from pathlib import Path
from transformers import AutoTokenizer, RobertaTokenizerFast, RobertaForMaskedLM, RobertaModel, BertModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, RobertaForTokenClassification
from transformers import pipeline, DataCollatorWithPadding, EarlyStoppingCallback, DataCollatorForTokenClassification
from collections import defaultdict
from tqdm import tqdm
import torch
import torch.nn.functional as F
from scipy.stats import spearmanr, linregress
import matplotlib.pyplot as plt
import pandas as pd
import time
from datasets import Dataset, DatasetDict, ClassLabel
from sklearn.model_selection import train_test_split
import random
import pandas as pd
from datasets import load_dataset, concatenate_datasets
import logging
from transformers import RobertaTokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Load tokenizers

In [4]:
class CustomTokenizer:

    def __init__(self, segmentation_dictionary, wp_tokenizer, max_length=None, pad_to_multiple_of=None, model_max_length=None):



        self.wp_tokenizer = wp_tokenizer
        self.wp_vocab = self.wp_tokenizer.get_vocab()

        self.segmentations = {word: seg for word, seg in segmentation_dictionary.items() if len(seg) > 0}
        self.seg_dict = {}


        for word, segs in self.segmentations.items():
            out = []
            for i, seg in enumerate(segs):
                if i == 0:
                    out.append(seg)
                else:
                    out.append('##' + seg)
            self.seg_dict[word] = out


        self.segments = {seg for segs in self.seg_dict.values() for seg in segs}

        self.vocab = self.wp_vocab.copy()



        next_index = len(self.vocab)

        for element in self.segments:
            if element not in self.vocab:
                self.vocab[element] = next_index
                next_index += 1



        self.vocab_size = len(self.vocab)

        self.seg_dict_numbered = {}

        for word, segs in self.seg_dict.items():
            s = []
            for seg in segs:
                s.append(self.vocab[seg])
            self.seg_dict_numbered[word] = s


        self.inverted_vocab = {value: key for key, value in self.vocab.items()}
        self.max_length = max_length
        self.pad_to_multiple_of = pad_to_multiple_of
        self.padding_side = 'right'
        self.model_max_length = model_max_length

        ### special tokens

        special_tokens = ['[UNK]', '[MASK]', '[CLS]', '[SEP]', '[PAD]']
        special_token_ids = {}

        for token in special_tokens:
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab)
            special_token_ids[token] = self.vocab[token]

        self.unk_token = '[UNK]'
        self.unk_token_id = special_token_ids['[UNK]']

        self.mask_token = '[MASK]'
        self.mask_token_id = special_token_ids['[MASK]']


        self.pad_token = '[PAD]'
        self.pad_token_id = special_token_ids['[PAD]']

        self.bos_token = '[CLS]'
        self.bos_token_id = special_token_ids['[CLS]']

        self.eos_token = '[SEP]'
        self.eos_token_id = special_token_ids['[SEP]']


        self.special_tokens = [self.vocab['[PAD]'], self.vocab['[UNK]'], self.vocab['[CLS]'], self.vocab['[SEP]'], self.vocab['[MASK]']]

        self.special_tokens_map = wp_tokenizer.special_tokens_map



    def __len__(self):
        return len(self.vocab)


    def check_tokens_in_dict(self, ids, tokens, dic_a):

        combined_tokens = []
        current_word = ''
        current_ids = []

        for i, token in enumerate(tokens):
            if token.startswith('##'):
                current_word += token[2:]
                current_ids.append(ids[i])
            else:
                if current_word:
                    combined_tokens.append((current_word, current_ids))
                current_word = token
                current_ids = [ids[i]]

        if current_word:
            combined_tokens.append((current_word, current_ids))

        result = []
        for word, ids_list in combined_tokens:
            if word in dic_a:
                result.extend(dic_a[word])
            else:
                result.extend(ids_list)
        return result


    def check_tokens_in_dict_v2(self, ids, tokens, dic_a):

        combined_tokens = []
        current_word = ''
        current_tokens = []

        for i, token in enumerate(tokens):
            if token.startswith('##'):
                current_word += token[2:]
                current_tokens.append(token)
            else:
                if current_word:
                    combined_tokens.append((current_word, current_tokens))

                current_word = token
                current_tokens = [token]

        if current_word:
            combined_tokens.append((current_word, current_tokens))

        result = []
        for word, tokens_list in combined_tokens:
            if word in dic_a:
                result.extend(dic_a[word])
            else:
                result.extend(tokens_list)

        return result



    def get_vocab(self):
        return self.vocab







    def _convert_token_to_id(self, token):
        if token in self.vocab:
            return self.vocab[token]
        else:
            return self.unk_token_id


    def convert_tokens_to_ids(self, tokens):
        if isinstance(tokens, list):
            return [self._convert_token_to_id(token) for token in tokens]
        return self._convert_token_to_id(tokens)



    def _convert_id_to_token(self, id):
        return self.inverted_vocab[id]


    def convert_ids_to_tokens(self, ids):
        if isinstance(ids, list):
            return [self._convert_id_to_token(id) for id in ids]
        return self._convert_id_to_token(ids)


    def get_special_tokens_mask(self, token_ids, already_has_special_tokens=False):
        return [1 if self._is_special_token(token_id) else 0 for token_id in token_ids]


    def _is_special_token(self, token_id):

        if token_id in self.special_tokens:
            return True
        else:
            return False




    def pad(self, batch, return_tensors="pt", pad_to_multiple_of=None, padding=True, max_length=None):
        if pad_to_multiple_of is None:
            pad_to_multiple_of = self.pad_to_multiple_of

        input_ids_list = []
        for dictionary in batch:
            for key, value in dictionary.items():
                if key == "input_ids":

                    if isinstance(value, torch.Tensor):
                        input_ids_list.append(value.tolist())
                    else:
                        input_ids_list.append(value)


        max_length = max(len(x) for x in input_ids_list)

        if pad_to_multiple_of is not None:
            max_length = (max_length + pad_to_multiple_of - 1) // pad_to_multiple_of * pad_to_multiple_of

        padded_batch = []
        for seq in input_ids_list:
            if len(seq) < max_length:
                seq.extend([self.pad_token_id] * (max_length - len(seq)))
            padded_batch.append(seq)

        attention_list = []
        for inner_list in padded_batch:
            p_list = [1 if value != self.pad_token_id else 0 for value in inner_list]
            attention_list.append(p_list)

        if return_tensors == "pt":
            return {'input_ids': torch.tensor(padded_batch, dtype=torch.long), 'attention_mask': torch.tensor(attention_list, dtype=torch.long)}

        return {'input_ids': padded_batch, 'attention_mask': attention_list}


    def tokenize(self, text):
        if isinstance(text, list):
            tokens_list = []
            for t in text:
                encoded = self.wp_tokenizer(t)
                tokens = encoded.tokens()
                tokens = self.check_tokens_in_dict_v2(encoded['input_ids'], tokens, self.seg_dict)[1:-1]
                tokens_list.append(tokens)
            return tokens_list
        else:
            encoded = self.wp_tokenizer(text)
            tokens = encoded.tokens()
            tokens = self.check_tokens_in_dict_v2(encoded['input_ids'], tokens, self.seg_dict)[1:-1]
            return tokens


    def encode(self, text, text_pair=None, add_special_tokens=True, return_tensors=None, max_length=None, pad_to_max_length=False, truncation=False):
        if text_pair:
            text = f"{text} {self.eos_token} {text_pair}"

        tokens = self.tokenize(text)

        if add_special_tokens:
            tokens = [self.bos_token] + tokens + [self.eos_token]

        if truncation and max_length and len(tokens) > max_length:
            tokens = tokens[:max_length]

        if pad_to_max_length and max_length and len(tokens) < max_length:
            tokens += [self.pad_token] * (max_length - len(tokens))

        input_ids = self.convert_tokens_to_ids(tokens)
        attention_mask = [1 if token != self.pad_token else 0 for token in tokens]

        if return_tensors == "pt":
            input_ids = torch.tensor([input_ids], dtype=torch.long)
            attention_mask = torch.tensor([attention_mask], dtype=torch.long)

        return {'input_ids': input_ids, 'attention_mask': attention_mask}

    def batch_encode_plus(self, texts, text_pairs=None, add_special_tokens=True, return_tensors=None, max_length=None, pad_to_max_length=False, truncation=False, pad_to_multiple_of=None):
        batch = []

        if text_pairs:
            for text, text_pair in zip(texts, text_pairs):
                batch.append(self.encode(
                    text,
                    text_pair=text_pair,
                    add_special_tokens=add_special_tokens,
                    return_tensors=None,
                    max_length=max_length,
                    pad_to_max_length=pad_to_max_length,
                    truncation=truncation
                ))
        else:
            for text in texts:
                batch.append(self.encode(
                    text,
                    add_special_tokens=add_special_tokens,
                    return_tensors=None,
                    max_length=max_length,
                    pad_to_max_length=pad_to_max_length,
                    truncation=truncation
                ))

        padded_batch = self.pad(batch, return_tensors=return_tensors, pad_to_multiple_of=pad_to_multiple_of)

        return padded_batch


    def decode(self, ids, skip_special_tokens=False):
        out = ''
        for id in ids:
            token = self._convert_id_to_token(id)
            if skip_special_tokens and self._is_special_token(id):
                continue
            if token[:2] == '##':
                out += token[2:]
            else:
                out += ' ' + token


        out = re.sub(r'\s+([?.!,\'"])', r'\1', out)
        return out.strip()


    def __call__(self, text, text_pair=None, add_special_tokens=True, return_tensors=None, max_length=None, pad_to_max_length=False, truncation=False, pad_to_multiple_of=None):
        if isinstance(text, str):

            return self.encode(
                text,
                text_pair=text_pair,
                add_special_tokens=add_special_tokens,
                return_tensors=return_tensors,
                max_length=max_length,
                pad_to_max_length=pad_to_max_length,
                truncation=truncation
            )
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):

            return self.batch_encode_plus(
                text,
                text_pairs=text_pair,
                add_special_tokens=add_special_tokens,
                return_tensors=return_tensors,
                max_length=max_length,
                pad_to_max_length=pad_to_max_length,
                truncation=truncation,
                pad_to_multiple_of=pad_to_multiple_of
            )
        else:
            raise ValueError("Input text should be either a single string or a list of strings.")





    def __len__(self):
        return self.vocab_size

In [5]:
def load_json(path):
    with open(path, 'r') as f:
        my_dict = json.load(f)
    return my_dict

def store_json(path, object):
    with open(path, 'w') as f:
        json.dump(object, f)

In [6]:


path_to_dict = '/content/drive/MyDrive/Thesis/Code/segmentation_dictionary_final.json'
path_to_tokenizer = "/content/drive/MyDrive/Thesis/Code/HELP/MORF/help_2815"

segmentation_dictionary = load_json(path_to_dict)
help_tokenizer = AutoTokenizer.from_pretrained(path_to_tokenizer)

tokenizer_morf = CustomTokenizer(segmentation_dictionary, help_tokenizer)

# Create basic dataset

In [7]:
base_path = "/content/drive/MyDrive/Thesis/Code/Datasets/DBRD"



def load_files_from_directory(directory, label):
    data = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read().strip().lower()
                data.append({'text': text, 'label': label})
    return data


train_pos_dir = os.path.join(base_path, 'train', 'pos')
train_neg_dir = os.path.join(base_path, 'train', 'neg')
test_pos_dir = os.path.join(base_path, 'test', 'pos')
test_neg_dir = os.path.join(base_path, 'test', 'neg')

print(train_pos_dir)
print(train_neg_dir)
print(test_pos_dir)
print(test_neg_dir)

train_pos_data = load_files_from_directory(train_pos_dir, 1)
train_neg_data = load_files_from_directory(train_neg_dir, 0)
test_pos_data = load_files_from_directory(test_pos_dir, 1)
test_neg_data = load_files_from_directory(test_neg_dir, 0)

train_data = train_pos_data + train_neg_data
test_data = test_pos_data + test_neg_data

train_dataset = Dataset.from_list(train_data).shuffle(seed=42)
test_dataset = Dataset.from_list(test_data).shuffle(seed=42)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

def check_duplicates(dataset):
    texts = set()
    duplicates = 0
    for example in dataset:
        text = example['text']
        if text in texts:
            duplicates += 1
        else:
            texts.add(text)
    return duplicates

train_duplicates = check_duplicates(dataset_dict['train'])
test_duplicates = check_duplicates(dataset_dict['test'])

print(f"Number of duplicates in the train dataset: {train_duplicates}")
print(f"Number of duplicates in the test dataset: {test_duplicates}")

/content/drive/MyDrive/Thesis/Code/Datasets/DBRD/train/pos
/content/drive/MyDrive/Thesis/Code/Datasets/DBRD/train/neg
/content/drive/MyDrive/Thesis/Code/Datasets/DBRD/test/pos
/content/drive/MyDrive/Thesis/Code/Datasets/DBRD/test/neg
Number of duplicates in the train dataset: 0
Number of duplicates in the test dataset: 0


In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20028
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2224
    })
})

In [9]:
train_test_split = dataset_dict['train'].train_test_split(test_size=0.1, seed=42)


dataset_dict_ = DatasetDict({
    'train': train_test_split['train'],
    'eval': train_test_split['test'],
    'test': dataset_dict['test']
})

print(dataset_dict_)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 18025
    })
    eval: Dataset({
        features: ['text', 'label'],
        num_rows: 2003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2224
    })
})


# MORF

In [10]:
tokenizer = tokenizer_morf

def tokenize_function(example):
    return tokenizer(example['text'])

tokenized_datasets = dataset_dict_.map(tokenize_function)


print(tokenized_datasets)

Map:   0%|          | 0/18025 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (825 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/2003 [00:00<?, ? examples/s]

Map:   0%|          | 0/2224 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18025
    })
    eval: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2003
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2224
    })
})


In [11]:
def count_and_percentage_long_sequences(dataset, max_length=512):
    count = 0
    total = len(dataset)
    for example in dataset:
        if len(example['input_ids']) > max_length:
            count += 1
    percentage = (count / total) * 100
    return count, percentage

# Apply the function to the tokenized train and test datasets
num_long_sequences_train, perc_long_sequences_train = count_and_percentage_long_sequences(tokenized_datasets['train'])
num_long_sequences_test, perc_long_sequences_test = count_and_percentage_long_sequences(tokenized_datasets['test'])

print(f"Number of sequences in the train dataset longer than 512 tokens: {num_long_sequences_train} ({perc_long_sequences_train:.2f}%)")
print(f"Number of sequences in the test dataset longer than 512 tokens: {num_long_sequences_test} ({perc_long_sequences_test:.2f}%)")

Number of sequences in the train dataset longer than 512 tokens: 4751 (26.36%)
Number of sequences in the test dataset longer than 512 tokens: 583 (26.21%)


In [12]:
def truncate_from_beginning(example, max_length):
    input_ids = example['input_ids']
    attention_mask = example['attention_mask']
    label = example['label']

    if len(input_ids) > max_length:
        input_ids = input_ids[-max_length:]  # Keep the last max_length tokens
        attention_mask = attention_mask[-max_length:]  # Keep the corresponding attention mask

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

# Apply custom truncation to the tokenized dataset
def apply_custom_truncation(tokenized_datasets, max_length=512):
    return tokenized_datasets.map(lambda x: truncate_from_beginning(x, max_length))

# Apply the truncation
truncated_datasets = apply_custom_truncation(tokenized_datasets, max_length=512)

# Ensure the dataset is in the correct format for PyTorch
truncated_datasets.set_format(type='torch', columns=['label', 'input_ids', 'attention_mask'])

Map:   0%|          | 0/18025 [00:00<?, ? examples/s]

Map:   0%|          | 0/2003 [00:00<?, ? examples/s]

Map:   0%|          | 0/2224 [00:00<?, ? examples/s]

In [13]:
num_long_sequences_train, perc_long_sequences_train = count_and_percentage_long_sequences(truncated_datasets['train'])
num_long_sequences_test, perc_long_sequences_test = count_and_percentage_long_sequences(truncated_datasets['test'])

print(f"Number of sequences in the train dataset longer than 512 tokens: {num_long_sequences_train} ({perc_long_sequences_train:.2f}%)")
print(f"Number of sequences in the test dataset longer than 512 tokens: {num_long_sequences_test} ({perc_long_sequences_test:.2f}%)")

Number of sequences in the train dataset longer than 512 tokens: 0 (0.00%)
Number of sequences in the test dataset longer than 512 tokens: 0 (0.00%)


In [14]:
def custom_collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    labels = [item['label'] for item in batch]

    # Pad sequences
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)
    labels = torch.stack(labels)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }

# Initialize DataLoaders with the custom collate function
train_loader = DataLoader(truncated_datasets['train'], batch_size=16, collate_fn=custom_collate_fn)
validation_loader = DataLoader(truncated_datasets['eval'], batch_size=16, collate_fn=custom_collate_fn)
test_loader = DataLoader(truncated_datasets['test'], batch_size=16, collate_fn=custom_collate_fn)


class CustomTrainer(Trainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            collate_fn=custom_collate_fn
        )

    def get_eval_dataloader(self, eval_dataset=None):
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        return DataLoader(
            eval_dataset,
            batch_size=self.args.per_device_eval_batch_size,
            collate_fn=custom_collate_fn
        )

In [None]:
model = RobertaForSequenceClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/MORF/x4/x4')



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Thesis/Code/Models/MORF/x4/x4 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:


training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=1,
    logging_steps=200
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=truncated_datasets['train'],
    eval_dataset=truncated_datasets['eval'],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)



In [None]:
trainer.train()

# Save

In [None]:
model.save_pretrained('/content/drive/MyDrive/Thesis/Code/Models/MORF/trained_model_SA')

In [16]:
model = RobertaForSequenceClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/MORF/trained_model_SA')

# Evaluation

### MORF

In [17]:
from datasets import load_metric
import numpy as np


# Define new training arguments for evaluation
eval_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=200,
    disable_tqdm=False,
    do_train=False,
    do_eval=True
)

# Load evaluation metric
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")



def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='binary')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='binary')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='binary')

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }


trainer = CustomTrainer(
    model=model,
    args=eval_args,
    eval_dataset=truncated_datasets['test'],
    compute_metrics=compute_metrics
)

# Evaluate the model on the test set
test_results_morf = trainer.evaluate()

print("Test set evaluation results:", test_results_morf)

  accuracy_metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The repository for accuracy contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/accuracy.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

The repository for precision contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/precision.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

The repository for recall contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/recall.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

The repository for f1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/f1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Test set evaluation results: {'eval_loss': 0.3369973599910736, 'eval_accuracy': 0.8669064748201439, 'eval_precision': 0.8695652173913043, 'eval_recall': 0.8633093525179856, 'eval_f1': 0.8664259927797834, 'eval_runtime': 5.9898, 'eval_samples_per_second': 371.297, 'eval_steps_per_second': 23.206}


In [18]:
test_results_morf

{'eval_loss': 0.3369973599910736,
 'eval_accuracy': 0.8669064748201439,
 'eval_precision': 0.8695652173913043,
 'eval_recall': 0.8633093525179856,
 'eval_f1': 0.8664259927797834,
 'eval_runtime': 5.9898,
 'eval_samples_per_second': 371.297,
 'eval_steps_per_second': 23.206}

In [19]:
store_json('/content/drive/MyDrive/Thesis/Code/SA_MORF_results.json', test_results_morf)