In [None]:
# Mount Google Drive (Colab only) so the tokenizers, model checkpoints and
# result files referenced below under /content/drive/MyDrive are reachable.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m43.9 MB/s[0m eta [36m0:00:00

# Basics

In [None]:
import os
import re
import copy
import json
import torch
from pathlib import Path
from transformers import AutoTokenizer, RobertaTokenizerFast, RobertaForMaskedLM, RobertaModel, BertModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, RobertaForTokenClassification
from transformers import pipeline, DataCollatorWithPadding, EarlyStoppingCallback, DataCollatorForTokenClassification
from collections import defaultdict
from tqdm import tqdm
import torch
import torch.nn.functional as F
from scipy.stats import spearmanr, linregress
import matplotlib.pyplot as plt
import pandas as pd
import time
from datasets import Dataset, DatasetDict, ClassLabel
from sklearn.model_selection import train_test_split
import random
import pandas as pd
from datasets import load_dataset, concatenate_datasets

In [None]:
!pip install conllu

Collecting conllu
  Downloading conllu-5.0.1-py3-none-any.whl.metadata (21 kB)
Downloading conllu-5.0.1-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-5.0.1


Create dataset

In [None]:
# Dutch LassySmall treebank from Universal Dependencies (train/validation/test splits).
# The dataset ships a loading script; trust_remote_code=True accepts it up front so
# the cell does not block on an interactive y/N prompt during "Run All".
dataset_pos = load_dataset("universal_dependencies", "nl_lassysmall", trust_remote_code=True)

The repository for universal_dependencies contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/universal_dependencies.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/969k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/152k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5787 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/676 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/875 [00:00<?, ? examples/s]

In [None]:
def load_json(path):
    """Read a JSON document from ``path`` and return the decoded Python object.

    The file is always decoded as UTF-8 (the encoding JSON files are exchanged
    in) instead of the platform's default encoding, so non-ASCII keys such as
    Dutch words with diacritics load identically on every machine.

    Args:
        path: Location of the JSON file (anything accepted by ``open``).

    Returns:
        The deserialized object (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def store_json(path, object):
    """Serialize ``object`` as JSON and write it to ``path``.

    The object is serialized *before* the target file is opened. Opening a file
    in 'w' mode truncates it immediately, so serializing first guarantees that
    an object which is not JSON-serializable cannot wipe out an existing file.

    Args:
        path: Destination file (anything accepted by ``open``); overwritten if present.
        object: JSON-serializable value to store. (The name shadows the builtin
            but is kept so existing keyword callers keep working.)

    Raises:
        TypeError: If ``object`` contains values that cannot be JSON-encoded;
            in that case ``path`` is left untouched.
        OSError: If the file cannot be written.
    """
    serialized = json.dumps(object)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(serialized)

In [None]:
def tokenize_and_align_labels(examples, tokenizer, max_length=512):
    """Tokenize a batch of pre-split sentences and align UPOS labels to sub-tokens.

    Words are lower-cased before tokenization. Every sub-token of a word gets
    that word's label (first and continuation pieces alike); positions that
    belong to no word, i.e. special tokens, get -100.

    Args:
        examples: batch mapping with 'tokens' (list of word lists) and 'upos'
            (list of per-word label-id lists).
        tokenizer: callable tokenizer whose result supports
            ``word_ids(batch_index=...)`` and item access/assignment.
        max_length: truncation length forwarded to the tokenizer.

    Returns:
        The tokenizer output with an added "labels" entry (one list per sentence).
    """
    lowered = [[word.lower() for word in sentence] for sentence in examples['tokens']]

    tokenized_inputs = tokenizer(
        lowered,
        is_split_into_words=True,
        truncation=True,
        max_length=max_length,
        add_special_tokens=True
    )

    word_labels = examples['upos']
    aligned_labels = []

    for row, sentence_labels in zip(range(len(lowered)), word_labels):
        n_words = len(sentence_labels)
        # A sub-token is labelled when it maps to a word that has a label;
        # anything else (special token, word index out of range) is ignored.
        label_ids = [
            sentence_labels[word_idx] if word_idx is not None and word_idx < n_words else -100
            for word_idx in tokenized_inputs.word_ids(batch_index=row)
        ]

        # Safety net: top up with -100 should the label row be shorter than the ids.
        shortfall = len(tokenized_inputs['input_ids'][row]) - len(label_ids)
        if shortfall > 0:
            label_ids.extend([-100] * shortfall)

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

def tokenize_and_align_labels_custom(example, tokenizer, max_length=512):
    """Tokenize one sentence word by word and label every sub-token with its word's UPOS id.

    Each lower-cased word is encoded separately, so the mapping from sub-tokens
    to words is known without needing ``word_ids``. BOS/EOS ids are added
    around the sentence when the tokenizer defines them and are labelled -100.

    Args:
        example: mapping with 'tokens' (list of words) and 'upos' (one label id per word).
        tokenizer: object whose ``encode(word, add_special_tokens=False,
            pad_to_max_length=False)`` returns a mapping with an 'input_ids'
            list, and which exposes ``bos_token_id`` / ``eos_token_id``.
        max_length: maximum number of ids kept, special tokens included.

    Returns:
        dict with equally long 'input_ids', 'attention_mask' (all ones) and 'labels'.
    """
    tokens = [token.lower() for token in example['tokens']]

    input_ids = []
    labels = []

    for idx, token in enumerate(tokens):
        encoding = tokenizer.encode(
            token,
            add_special_tokens=False,
            pad_to_max_length=False
        )
        token_ids = encoding['input_ids']

        input_ids.extend(token_ids)
        # Every piece of the word inherits the word-level label.
        labels.extend([example['upos'][idx]] * len(token_ids))

    if tokenizer.bos_token_id is not None:
        input_ids = [tokenizer.bos_token_id] + input_ids
        labels = [-100] + labels

    if tokenizer.eos_token_id is not None:
        input_ids = input_ids + [tokenizer.eos_token_id]
        labels = labels + [-100]

    if len(input_ids) > max_length:
        if tokenizer.eos_token_id is not None:
            # Keep the sequence terminated by EOS, and make sure the label at
            # that position is -100: the EOS id replaces a word piece, so that
            # piece's label must not survive at the final position.
            input_ids = input_ids[:max_length - 1] + [tokenizer.eos_token_id]
            labels = labels[:max_length - 1] + [-100]
        else:
            # No EOS defined: plain cut, never append None as an id.
            input_ids = input_ids[:max_length]
            labels = labels[:max_length]

    # Nothing is padded here, so every kept position is attended to.
    attention_mask = [1] * len(input_ids)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }


# BPE

## Load tokenizers

In [None]:

# BPE tokenizer stored on Drive. add_prefix_space=True is needed because
# tokenize_and_align_labels feeds it pre-split words (is_split_into_words=True).
tokenizer_bpe = RobertaTokenizerFast.from_pretrained('/content/drive/MyDrive/Thesis/Code/Tokenizers/BPE_snellius', add_prefix_space=True)

## Tokenize dataset

In [None]:
# Batched map over all splits: adds input_ids, attention_mask and sub-token-aligned labels.
tokenized_dataset_pos_bpe = dataset_pos.map(lambda examples: tokenize_and_align_labels(examples, tokenizer_bpe), batched=True)

Map:   0%|          | 0/5787 [00:00<?, ? examples/s]

Map:   0%|          | 0/676 [00:00<?, ? examples/s]

Map:   0%|          | 0/875 [00:00<?, ? examples/s]

In [None]:
# Eyeball one training sentence: ids, attention mask and aligned labels.
sample_bpe = tokenized_dataset_pos_bpe['train'][9]
for column in ('input_ids', 'attention_mask', 'labels'):
    print(sample_bpe[column])

[0, 16624, 3992, 3429, 6303, 1406, 272, 2167, 396, 20265, 1230, 26781, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[-100, 10, 10, 10, 10, 1, 8, 11, 16, 16, 0, 0, -100]


In [None]:
# UPOS tag names, in label-id order, read from the dataset's feature definition.
label_list = dataset_pos['train'].features['upos'].feature.names
num_labels = len(label_list)
print('labels list:', label_list)

# The pretrained checkpoint has no token-classification head, so the head is
# randomly initialised on load. Seed torch first so that initialisation (and
# therefore the fine-tuning result) is repeatable across kernel restarts.
torch.manual_seed(42)
model_bpe_pos = RobertaForTokenClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/BPE/x4/x4', num_labels=num_labels)

labels list: ['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX']


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Thesis/Code/Models/BPE/x4/x4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train

In [None]:
# Point the generic names used by the training cells below at the BPE variant.
tokenizer = tokenizer_bpe
model = model_bpe_pos
tokenized_dataset = tokenized_dataset_pos_bpe

In [None]:
# Collator that pads inputs and per-token labels to the longest sequence in each batch.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Checkpoints and logs go to run-specific folders. Sharing one output_dir across
# the BPE / WP / MORF runs leaves the previous model's checkpoint-* folders in
# the directory that save_total_limit rotates over.
training_args = TrainingArguments(
    output_dir='./results/bpe_pos',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs/bpe_pos',
    logging_steps=10,
    save_total_limit=3,
    # Keep the checkpoint with the lowest validation loss and restore it after training.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Saving and evaluating must follow the same schedule for best-model tracking.
    save_strategy="epoch",
    eval_strategy="epoch",
    disable_tqdm=False,
    # 16 sequences x 2 accumulation steps = 32 sequences per optimizer step (per device).
    gradient_accumulation_steps=2
)

# Stop when validation loss has not improved for 3 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    callbacks=[early_stopping_callback],
    data_collator=data_collator,
)

In [None]:
# Fine-tune; runs at most num_train_epochs epochs, fewer if early stopping triggers.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3124,0.274319
2,0.1949,0.209089
3,0.1481,0.192732
4,0.1231,0.181123
5,0.1086,0.18042
6,0.0892,0.178443
7,0.0904,0.183053
8,0.0776,0.183625
9,0.0621,0.182388


TrainOutput(global_step=1629, training_loss=0.20933760145531613, metrics={'train_runtime': 102.4258, 'train_samples_per_second': 564.995, 'train_steps_per_second': 17.671, 'total_flos': 241846996840020.0, 'train_loss': 0.20933760145531613, 'epoch': 9.0})

## Save model

In [None]:
# Persist the fine-tuned BPE tagger to Drive. With load_best_model_at_end=True the
# Trainer has restored the best (lowest eval_loss) checkpoint into `model` by now.
model.save_pretrained('/content/drive/MyDrive/Thesis/Code/Models/BPE/trained_model_POS')

In [None]:
# Reload the saved BPE tagger from Drive (from_pretrained is a classmethod, so call it on the class).
model_bpe_trained = RobertaForTokenClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/BPE/trained_model_POS')

# WP

## Tokenize

In [None]:
# WordPiece tokenizer stored on Drive; AutoTokenizer selects the tokenizer class
# from the files saved in that directory.
tokenizer_wp = AutoTokenizer.from_pretrained('/content/drive/MyDrive/Thesis/Code/Tokenizers/WP_snellius')

In [None]:
# Batched map over all splits: adds input_ids, attention_mask and sub-token-aligned labels.
tokenized_dataset_pos_wp = dataset_pos.map(lambda examples: tokenize_and_align_labels(examples, tokenizer_wp), batched=True)

Map:   0%|          | 0/5787 [00:00<?, ? examples/s]

Map:   0%|          | 0/676 [00:00<?, ? examples/s]

Map:   0%|          | 0/875 [00:00<?, ? examples/s]

## Model

In [None]:
# UPOS tag names, in label-id order, read from the dataset's feature definition.
label_list = dataset_pos['train'].features['upos'].feature.names
num_labels = len(label_list)
print('labels list:', label_list)

# The pretrained checkpoint has no token-classification head, so the head is
# randomly initialised on load. Seed torch first so that initialisation (and
# therefore the fine-tuning result) is repeatable across kernel restarts.
torch.manual_seed(42)
model_wp_pos = RobertaForTokenClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/WP/x4/x4', num_labels=num_labels)

labels list: ['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX']


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Thesis/Code/Models/WP/x4/x4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train

In [None]:
# Point the generic names used by the training cells below at the WordPiece variant.
tokenizer = tokenizer_wp
model = model_wp_pos
tokenized_dataset = tokenized_dataset_pos_wp

In [None]:
# Collator that pads inputs and per-token labels to the longest sequence in each batch.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Checkpoints and logs go to run-specific folders. Sharing one output_dir across
# the BPE / WP / MORF runs leaves the previous model's checkpoint-* folders in
# the directory that save_total_limit rotates over.
training_args = TrainingArguments(
    output_dir='./results/wp_pos',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs/wp_pos',
    logging_steps=10,
    save_total_limit=3,
    # Keep the checkpoint with the lowest validation loss and restore it after training.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Saving and evaluating must follow the same schedule for best-model tracking.
    save_strategy="epoch",
    eval_strategy="epoch",
    disable_tqdm=False,
    # 16 sequences x 2 accumulation steps = 32 sequences per optimizer step (per device).
    gradient_accumulation_steps=2
)

# Stop when validation loss has not improved for 3 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    callbacks=[early_stopping_callback],
    data_collator=data_collator,
)


In [None]:
# Fine-tune; runs at most num_train_epochs epochs, fewer if early stopping triggers.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3366,0.301115
2,0.2066,0.220022
3,0.1453,0.203358
4,0.1417,0.191431
5,0.1161,0.189166
6,0.0947,0.188981
7,0.0919,0.18933
8,0.0725,0.195356
9,0.0693,0.194221


TrainOutput(global_step=1629, training_loss=0.21914886601356146, metrics={'train_runtime': 106.1459, 'train_samples_per_second': 545.193, 'train_steps_per_second': 17.052, 'total_flos': 247310735082840.0, 'train_loss': 0.21914886601356146, 'epoch': 9.0})

## Save model

In [None]:
# Persist the fine-tuned WordPiece tagger to Drive. With load_best_model_at_end=True the
# Trainer has restored the best (lowest eval_loss) checkpoint into `model` by now.
model.save_pretrained('/content/drive/MyDrive/Thesis/Code/Models/WP/trained_model_POS')

In [None]:
# Reload the saved WordPiece tagger from Drive (from_pretrained is a classmethod, so call it on the class).
model_wp_trained = RobertaForTokenClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/WP/trained_model_POS')

# MORF

## Tokenizer

In [None]:
class CustomTokenizer:
    """Dictionary-driven morphological tokenizer layered on a WordPiece tokenizer.

    Text is first tokenized by ``wp_tokenizer``. Its WordPiece pieces are glued
    back into whole words, and every word that has an entry in the segmentation
    dictionary is replaced by that entry's segments (continuation segments carry
    the ``##`` prefix). Words without an entry keep their WordPiece pieces.

    The class exposes the subset of the Hugging Face tokenizer interface that
    this notebook uses: ``__call__``, ``encode``, ``batch_encode_plus``,
    ``pad``, ``decode``, id/token conversion and the special-token attributes.
    """

    def __init__(self, segmentation_dictionary, wp_tokenizer, max_length=None, pad_to_multiple_of=None, model_max_length=None):
        """Build the combined vocabulary and the word -> segments lookup tables.

        Args:
            segmentation_dictionary: mapping ``word -> list of segment strings``;
                entries with an empty list are ignored.
            wp_tokenizer: underlying tokenizer; must provide ``get_vocab()``,
                ``special_tokens_map`` and be callable on a string, returning an
                encoding with ``tokens()`` and an ``'input_ids'`` entry.
            max_length: stored on the instance; not enforced by this class.
            pad_to_multiple_of: default rounding used by ``pad``.
            model_max_length: stored on the instance; not enforced by this class.
        """
        self.wp_tokenizer = wp_tokenizer
        self.wp_vocab = self.wp_tokenizer.get_vocab()

        # Only words that actually have a segmentation are kept.
        self.segmentations = {word: seg for word, seg in segmentation_dictionary.items() if len(seg) > 0}

        # WordPiece convention: the first segment is bare, later ones get '##'.
        self.seg_dict = {
            word: [seg if i == 0 else '##' + seg for i, seg in enumerate(segs)]
            for word, segs in self.segmentations.items()
        }

        self.segments = {seg for segs in self.seg_dict.values() for seg in segs}

        self.vocab = self.wp_vocab.copy()

        # Append segments missing from the WordPiece vocabulary. Iterate in
        # sorted order: plain set iteration order for strings changes from one
        # Python process to the next, which would give the same segment a
        # different id in every session.
        # NOTE(review): assumes the base vocabulary uses ids 0..len-1.
        next_index = len(self.vocab)
        for element in sorted(self.segments):
            if element not in self.vocab:
                self.vocab[element] = next_index
                next_index += 1

        # Special tokens are registered BEFORE vocab_size / inverted_vocab are
        # derived, so tokens added here are counted and can be decoded.
        special_tokens = ['[UNK]', '[MASK]', '[CLS]', '[SEP]', '[PAD]']
        special_token_ids = {}
        for token in special_tokens:
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab)
            special_token_ids[token] = self.vocab[token]

        self.vocab_size = len(self.vocab)

        # word -> list of segment ids
        self.seg_dict_numbered = {
            word: [self.vocab[seg] for seg in segs]
            for word, segs in self.seg_dict.items()
        }

        self.inverted_vocab = {value: key for key, value in self.vocab.items()}
        self.max_length = max_length
        self.pad_to_multiple_of = pad_to_multiple_of
        self.padding_side = 'right'
        self.model_max_length = model_max_length

        self.unk_token = '[UNK]'
        self.unk_token_id = special_token_ids['[UNK]']

        self.mask_token = '[MASK]'
        self.mask_token_id = special_token_ids['[MASK]']

        self.pad_token = '[PAD]'
        self.pad_token_id = special_token_ids['[PAD]']

        self.bos_token = '[CLS]'
        self.bos_token_id = special_token_ids['[CLS]']

        self.eos_token = '[SEP]'
        self.eos_token_id = special_token_ids['[SEP]']

        # Ids (not strings) of all special tokens; used by _is_special_token.
        self.special_tokens = [self.vocab['[PAD]'], self.vocab['[UNK]'], self.vocab['[CLS]'], self.vocab['[SEP]'], self.vocab['[MASK]']]

        self.special_tokens_map = wp_tokenizer.special_tokens_map

    def __len__(self):
        """Return the size of the combined vocabulary, special tokens included."""
        return len(self.vocab)

    def check_tokens_in_dict(self, ids, tokens, dic_a):
        """Regroup WordPiece tokens into words and substitute dictionary entries (id variant).

        Args:
            ids: ids parallel to ``tokens``.
            tokens: WordPiece tokens; a leading ``##`` marks a continuation piece.
            dic_a: mapping ``word -> replacement list``.

        Returns:
            Flat list: for each reassembled word, ``dic_a[word]`` when present,
            otherwise the original ids of that word's pieces.
        """
        combined_tokens = []
        current_word = ''
        current_ids = []

        for i, token in enumerate(tokens):
            if token.startswith('##'):
                current_word += token[2:]
                current_ids.append(ids[i])
            else:
                # A new word starts: flush the one collected so far.
                if current_word:
                    combined_tokens.append((current_word, current_ids))
                current_word = token
                current_ids = [ids[i]]

        if current_word:
            combined_tokens.append((current_word, current_ids))

        result = []
        for word, ids_list in combined_tokens:
            if word in dic_a:
                result.extend(dic_a[word])
            else:
                result.extend(ids_list)
        return result

    def check_tokens_in_dict_v2(self, ids, tokens, dic_a):
        """Regroup WordPiece tokens into words and substitute dictionary entries (token variant).

        Same grouping as ``check_tokens_in_dict``, but words without a
        dictionary entry keep their original token strings. ``ids`` is accepted
        for signature parity and is not used.

        Returns:
            Flat list of token strings.
        """
        combined_tokens = []
        current_word = ''
        current_tokens = []

        for i, token in enumerate(tokens):
            if token.startswith('##'):
                current_word += token[2:]
                current_tokens.append(token)
            else:
                if current_word:
                    combined_tokens.append((current_word, current_tokens))
                current_word = token
                current_tokens = [token]

        if current_word:
            combined_tokens.append((current_word, current_tokens))

        result = []
        for word, tokens_list in combined_tokens:
            if word in dic_a:
                result.extend(dic_a[word])
            else:
                result.extend(tokens_list)

        return result

    def get_vocab(self):
        """Return the combined token -> id mapping (the live dict, not a copy)."""
        return self.vocab

    def _convert_token_to_id(self, token):
        """Map one token string to its id, falling back to the [UNK] id."""
        if token in self.vocab:
            return self.vocab[token]
        else:
            return self.unk_token_id

    def convert_tokens_to_ids(self, tokens):
        """Convert a token string or a list of token strings to id(s)."""
        if isinstance(tokens, list):
            return [self._convert_token_to_id(token) for token in tokens]
        return self._convert_token_to_id(tokens)

    def _convert_id_to_token(self, id):
        """Map one id to its token string; raises KeyError for ids outside the vocabulary."""
        return self.inverted_vocab[id]

    def convert_ids_to_tokens(self, ids):
        """Convert an id or a list of ids to token string(s)."""
        if isinstance(ids, list):
            return [self._convert_id_to_token(id) for id in ids]
        return self._convert_id_to_token(ids)

    def get_special_tokens_mask(self, token_ids, already_has_special_tokens=False):
        """Return a 0/1 list marking which ids are special tokens.

        ``already_has_special_tokens`` is accepted for interface compatibility
        and is not used.
        """
        return [1 if self._is_special_token(token_id) else 0 for token_id in token_ids]

    def _is_special_token(self, token_id):
        """Return True when ``token_id`` is the id of a special token."""
        return token_id in self.special_tokens

    def pad(self, batch, return_tensors="pt", pad_to_multiple_of=None, padding=True, max_length=None):
        """Right-pad the ``input_ids`` of a batch to a common length.

        Args:
            batch: iterable of mappings; only their ``"input_ids"`` entries
                (list or ``torch.Tensor``) are read.
            return_tensors: ``"pt"`` returns ``torch.long`` tensors, anything
                else returns nested lists.
            pad_to_multiple_of: round the target length up to this multiple;
                defaults to the value given at construction.
            padding, max_length: accepted for interface compatibility; the
                target length is always derived from the longest sequence.

        Returns:
            dict with 'input_ids' and 'attention_mask'. The mask is 0 exactly
            where the id equals ``pad_token_id``.

        Raises:
            ValueError: if no element of ``batch`` has an ``"input_ids"`` entry.
        """
        if pad_to_multiple_of is None:
            pad_to_multiple_of = self.pad_to_multiple_of

        input_ids_list = []
        for dictionary in batch:
            for key, value in dictionary.items():
                if key == "input_ids":
                    if isinstance(value, torch.Tensor):
                        input_ids_list.append(value.tolist())
                    else:
                        # Copy: padding below must not grow the caller's own lists.
                        input_ids_list.append(list(value))

        target_length = max(len(x) for x in input_ids_list)

        if pad_to_multiple_of is not None:
            target_length = (target_length + pad_to_multiple_of - 1) // pad_to_multiple_of * pad_to_multiple_of

        padded_batch = []
        for seq in input_ids_list:
            if len(seq) < target_length:
                seq.extend([self.pad_token_id] * (target_length - len(seq)))
            padded_batch.append(seq)

        attention_list = [
            [1 if value != self.pad_token_id else 0 for value in inner_list]
            for inner_list in padded_batch
        ]

        if return_tensors == "pt":
            return {'input_ids': torch.tensor(padded_batch, dtype=torch.long), 'attention_mask': torch.tensor(attention_list, dtype=torch.long)}

        return {'input_ids': padded_batch, 'attention_mask': attention_list}

    def tokenize(self, text):
        """Split text into tokens, applying dictionary segmentations where available.

        Args:
            text: a string, or a list of strings (tokenized one by one).

        Returns:
            List of token strings (list of lists for list input). The first and
            last token of the regrouped output are dropped.
            NOTE(review): this assumes the underlying tokenizer wraps its output
            in one leading and one trailing special token - confirm for the
            tokenizer in use.
        """
        if isinstance(text, list):
            tokens_list = []
            for t in text:
                encoded = self.wp_tokenizer(t)
                tokens = encoded.tokens()
                tokens = self.check_tokens_in_dict_v2(encoded['input_ids'], tokens, self.seg_dict)[1:-1]
                tokens_list.append(tokens)
            return tokens_list
        else:
            encoded = self.wp_tokenizer(text)
            tokens = encoded.tokens()
            tokens = self.check_tokens_in_dict_v2(encoded['input_ids'], tokens, self.seg_dict)[1:-1]
            return tokens

    def encode(self, text, text_pair=None, add_special_tokens=True, return_tensors=None, max_length=None, pad_to_max_length=False, truncation=False):
        """Encode a single text (optionally a text pair) to ids and an attention mask.

        Unlike the Hugging Face ``encode``, this returns a dict with
        'input_ids' and 'attention_mask' rather than a bare id list.

        Args:
            text: input string.
            text_pair: optional second string, joined to ``text`` with the EOS token.
            add_special_tokens: wrap the tokens in BOS ... EOS.
            return_tensors: ``"pt"`` returns tensors of shape (1, length).
            max_length: length used by ``truncation`` / ``pad_to_max_length``.
            pad_to_max_length: pad with the PAD token up to ``max_length``.
            truncation: cut the token list to ``max_length`` (a plain cut; the
                trailing EOS is not re-appended).
        """
        if text_pair:
            text = f"{text} {self.eos_token} {text_pair}"

        tokens = self.tokenize(text)

        if add_special_tokens:
            tokens = [self.bos_token] + tokens + [self.eos_token]

        if truncation and max_length and len(tokens) > max_length:
            tokens = tokens[:max_length]

        if pad_to_max_length and max_length and len(tokens) < max_length:
            tokens += [self.pad_token] * (max_length - len(tokens))

        input_ids = self.convert_tokens_to_ids(tokens)
        attention_mask = [1 if token != self.pad_token else 0 for token in tokens]

        if return_tensors == "pt":
            input_ids = torch.tensor([input_ids], dtype=torch.long)
            attention_mask = torch.tensor([attention_mask], dtype=torch.long)

        return {'input_ids': input_ids, 'attention_mask': attention_mask}

    def batch_encode_plus(self, texts, text_pairs=None, add_special_tokens=True, return_tensors=None, max_length=None, pad_to_max_length=False, truncation=False, pad_to_multiple_of=None):
        """Encode several texts (optionally paired) and pad them to a common length.

        Each text is encoded with ``encode`` (as plain lists); the batch is then
        passed through ``pad``, which also builds the attention masks.
        """
        batch = []

        if text_pairs:
            for text, text_pair in zip(texts, text_pairs):
                batch.append(self.encode(
                    text,
                    text_pair=text_pair,
                    add_special_tokens=add_special_tokens,
                    return_tensors=None,
                    max_length=max_length,
                    pad_to_max_length=pad_to_max_length,
                    truncation=truncation
                ))
        else:
            for text in texts:
                batch.append(self.encode(
                    text,
                    add_special_tokens=add_special_tokens,
                    return_tensors=None,
                    max_length=max_length,
                    pad_to_max_length=pad_to_max_length,
                    truncation=truncation
                ))

        padded_batch = self.pad(batch, return_tensors=return_tensors, pad_to_multiple_of=pad_to_multiple_of)

        return padded_batch

    def decode(self, ids, skip_special_tokens=False):
        """Turn a sequence of ids back into text.

        Continuation pieces (``##``) are attached to the preceding token, other
        tokens are separated by a space, and whitespace before ``? . ! , ' "``
        is removed.
        """
        out = ''
        for id in ids:
            token = self._convert_id_to_token(id)
            if skip_special_tokens and self._is_special_token(id):
                continue
            if token[:2] == '##':
                out += token[2:]
            else:
                out += ' ' + token

        out = re.sub(r'\s+([?.!,\'"])', r'\1', out)
        return out.strip()

    def __call__(self, text, text_pair=None, add_special_tokens=True, return_tensors=None, max_length=None, pad_to_max_length=False, truncation=False, pad_to_multiple_of=None):
        """Encode a string (via ``encode``) or a list of strings (via ``batch_encode_plus``).

        Raises:
            ValueError: if ``text`` is neither a string nor a list of strings.
        """
        if isinstance(text, str):
            return self.encode(
                text,
                text_pair=text_pair,
                add_special_tokens=add_special_tokens,
                return_tensors=return_tensors,
                max_length=max_length,
                pad_to_max_length=pad_to_max_length,
                truncation=truncation
            )
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            return self.batch_encode_plus(
                text,
                text_pairs=text_pair,
                add_special_tokens=add_special_tokens,
                return_tensors=return_tensors,
                max_length=max_length,
                pad_to_max_length=pad_to_max_length,
                truncation=truncation,
                pad_to_multiple_of=pad_to_multiple_of
            )
        else:
            raise ValueError("Input text should be either a single string or a list of strings.")

In [None]:
# Inputs for the morphological tokenizer: the word -> segments dictionary (JSON)
# and the WordPiece tokenizer that CustomTokenizer wraps.
path_to_dict = '/content/drive/MyDrive/Thesis/Code/segmentation_dictionary_final.json'
path_to_tokenizer = "/content/drive/MyDrive/Thesis/Code/HELP/MORF/help_2815"

segmentation_dictionary = load_json(path_to_dict)
help_tokenizer = AutoTokenizer.from_pretrained(path_to_tokenizer)

# Words found in the dictionary are split into its segments; all other words
# keep the WordPiece pieces produced by help_tokenizer.
tokenizer_morf = CustomTokenizer(segmentation_dictionary, help_tokenizer)

## Dataset

In [None]:
# Non-batched map: tokenize_and_align_labels_custom processes one sentence per call.
tokenized_dataset_pos_morf = dataset_pos.map(lambda examples: tokenize_and_align_labels_custom(examples, tokenizer_morf))

Map:   0%|          | 0/5787 [00:00<?, ? examples/s]

Map:   0%|          | 0/676 [00:00<?, ? examples/s]

Map:   0%|          | 0/875 [00:00<?, ? examples/s]

In [None]:
# Eyeball one training sentence: ids, attention mask and aligned labels.
sample_morf = tokenized_dataset_pos_morf['train'][9]
for column in ('input_ids', 'attention_mask', 'labels'):
    print(sample_morf[column])

[2, 36767, 18467, 39915, 17587, 24628, 30, 16867, 20385, 12781, 17040, 27097, 21202, 12786, 24628, 3]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[-100, 10, 10, 10, 10, 10, 1, 8, 11, 11, 16, 16, 0, 0, 0, -100]


## Model

In [None]:
# UPOS tag names, in label-id order, read from the dataset's feature definition.
label_list = dataset_pos['train'].features['upos'].feature.names
num_labels = len(label_list)
print('labels list:', label_list)

# The pretrained checkpoint has no token-classification head, so the head is
# randomly initialised on load. Seed torch first so that initialisation (and
# therefore the fine-tuning result) is repeatable across kernel restarts.
torch.manual_seed(42)
model_morf_pos = RobertaForTokenClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/MORF/x4/x4', num_labels=num_labels)

labels list: ['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX']


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Thesis/Code/Models/MORF/x4/x4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train

In [None]:
# Point the generic names used by the training cells below at the morphological variant.
tokenizer = tokenizer_morf
model = model_morf_pos
tokenized_dataset = tokenized_dataset_pos_morf

In [None]:
# Collator that pads inputs and per-token labels to the longest sequence in each
# batch; input padding is delegated to CustomTokenizer.pad.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Checkpoints and logs go to run-specific folders. Sharing one output_dir across
# the BPE / WP / MORF runs leaves the previous model's checkpoint-* folders in
# the directory that save_total_limit rotates over.
training_args = TrainingArguments(
    output_dir='./results/morf_pos',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs/morf_pos',
    logging_steps=10,
    save_total_limit=3,
    # Keep the checkpoint with the lowest validation loss and restore it after training.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Saving and evaluating must follow the same schedule for best-model tracking.
    save_strategy="epoch",
    eval_strategy="epoch",
    disable_tqdm=False,
    # 16 sequences x 2 accumulation steps = 32 sequences per optimizer step (per device).
    gradient_accumulation_steps=2
)

# Stop when validation loss has not improved for 3 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    callbacks=[early_stopping_callback],
    data_collator=data_collator,
)

In [None]:
# Fine-tune; runs at most num_train_epochs epochs, fewer if early stopping triggers.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5178,0.496169
2,0.3487,0.387307
3,0.2805,0.346936
4,0.2402,0.32903
5,0.233,0.32044
6,0.1935,0.311904
7,0.1942,0.310728
8,0.1667,0.314536
9,0.154,0.313988
10,0.1554,0.314669


TrainOutput(global_step=1810, training_loss=0.31626506510360464, metrics={'train_runtime': 121.8404, 'train_samples_per_second': 474.965, 'train_steps_per_second': 14.855, 'total_flos': 323315667743640.0, 'train_loss': 0.31626506510360464, 'epoch': 10.0})

## Save

In [None]:
# Persist the fine-tuned morphological tagger to Drive. With load_best_model_at_end=True the
# Trainer has restored the best (lowest eval_loss) checkpoint into `model` by now.
model.save_pretrained('/content/drive/MyDrive/Thesis/Code/Models/MORF/trained_model_POS')

In [None]:
# Reload the saved morphological tagger from Drive (from_pretrained is a classmethod, so call it on the class).
model_morf_trained = RobertaForTokenClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/MORF/trained_model_POS')

# Evaluation

### BPE

In [None]:
from transformers import TrainingArguments, Trainer, RobertaForTokenClassification
from datasets import load_metric
import numpy as np

# Evaluate the fine-tuned BPE tagger (reloaded from Drive) on the test split.
model = RobertaForTokenClassification.from_pretrained('/content/drive/MyDrive/Thesis/Code/Models/BPE/trained_model_POS')
dataset = tokenized_dataset_pos_bpe
tokenizer = tokenizer_bpe

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluation-only arguments: no training, batches of 16.
eval_args = TrainingArguments(
    do_train=False,
    do_eval=True,
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    per_device_eval_batch_size=16,
    disable_tqdm=False
)

# Token-level metrics
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")


def compute_metrics(p):
    """Token-level accuracy and weighted precision/recall/F1 over labelled positions.

    ``p`` unpacks into (logits, label ids). Positions whose label is -100
    (special tokens and padding) are excluded before scoring.
    """
    logits, gold = p
    predicted = np.argmax(logits, axis=2)

    # Boolean mask of scored positions; masking flattens row by row, which is
    # the same order as concatenating the per-sentence lists.
    gold = np.asarray(gold)
    scored = gold != -100
    true_predictions = predicted[scored].tolist()
    true_labels = gold[scored].tolist()

    accuracy = accuracy_metric.compute(predictions=true_predictions, references=true_labels)
    precision = precision_metric.compute(predictions=true_predictions, references=true_labels, average="weighted")
    recall = recall_metric.compute(predictions=true_predictions, references=true_labels, average="weighted")
    f1 = f1_metric.compute(predictions=true_predictions, references=true_labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }


# Trainer used purely as an evaluation harness on the test split.
trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

test_results_bpe = trainer.evaluate()

print("Test set evaluation results:", test_results_bpe)

Test set evaluation results: {'eval_loss': 0.21009138226509094, 'eval_accuracy': 0.946883230904302, 'eval_precision': 0.946751788498569, 'eval_recall': 0.946883230904302, 'eval_f1': 0.9462901313276587, 'eval_runtime': 1.6624, 'eval_samples_per_second': 526.333, 'eval_steps_per_second': 33.084}


In [None]:
# Display the BPE test metrics as the cell's rich output.
test_results_bpe

{'eval_loss': 0.21009138226509094,
 'eval_accuracy': 0.946883230904302,
 'eval_precision': 0.946751788498569,
 'eval_recall': 0.946883230904302,
 'eval_f1': 0.9462901313276587,
 'eval_runtime': 1.5755,
 'eval_samples_per_second': 555.379,
 'eval_steps_per_second': 34.91}

In [None]:
# Persist the BPE test metrics to Drive as JSON.
store_json('/content/drive/MyDrive/Thesis/Code/POS_BPE_results.json', test_results_bpe)

### WP

In [None]:
model = model_wp_trained
dataset = tokenized_dataset_pos_wp
tokenizer = tokenizer_wp

from transformers import TrainingArguments, Trainer, RobertaForTokenClassification
from datasets import load_metric
import numpy as np

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


# Define new training arguments for evaluation
eval_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    disable_tqdm=False,
    do_train=False,
    do_eval=True
)

# Load evaluation metric
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

# Define compute metrics function
def compute_metrics(p):
    """Compute token-level accuracy and weighted precision/recall/F1.

    Args:
        p: An object that unpacks into ``(predictions, labels)``.
            ``predictions`` holds per-class scores and is reduced to class
            ids with an arg-max over axis 2. ``labels`` holds the reference
            class ids, with ``-100`` marking positions to ignore.
            Assumes ``labels`` has the same shape as the arg-maxed
            predictions — TODO confirm against the caller.

    Returns:
        dict: ``{"accuracy", "precision", "recall", "f1"}`` mapped to the
        values returned by the module-level ``accuracy_metric``,
        ``precision_metric``, ``recall_metric`` and ``f1_metric`` objects
        (precision/recall/F1 use ``average="weighted"``).
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Drop ignored positions (label == -100) with one boolean mask. Boolean
    # indexing flattens in row-major order, i.e. sequence by sequence, so the
    # predictions and references stay aligned element for element.
    keep = labels != -100
    true_predictions = predictions[keep]
    true_labels = labels[keep]

    # Calculate metrics
    accuracy = accuracy_metric.compute(predictions=true_predictions, references=true_labels)
    precision = precision_metric.compute(predictions=true_predictions, references=true_labels, average="weighted")
    recall = recall_metric.compute(predictions=true_predictions, references=true_labels, average="weighted")
    f1 = f1_metric.compute(predictions=true_predictions, references=true_labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }

# Evaluation-only Trainer: wraps the currently selected model, the evaluation
# arguments, the collator and the metric callback around the test split.
trainer = Trainer(
    model=model,
    args=eval_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=dataset["test"],
)

# Run evaluation on the test split and keep the returned metrics dict so the
# following cells can display and persist it.
test_results_wp = trainer.evaluate()
print("Test set evaluation results:", test_results_wp)

Test set evaluation results: {'eval_loss': 0.22823336720466614, 'eval_accuracy': 0.942211414676012, 'eval_precision': 0.9421391145098517, 'eval_recall': 0.942211414676012, 'eval_f1': 0.9416481857713931, 'eval_runtime': 1.6446, 'eval_samples_per_second': 532.044, 'eval_steps_per_second': 33.443}


In [None]:
# Display the WP test-set metrics dict as this cell's output.
test_results_wp

{'eval_loss': 0.22823336720466614,
 'eval_accuracy': 0.942211414676012,
 'eval_precision': 0.9421391145098517,
 'eval_recall': 0.942211414676012,
 'eval_f1': 0.9416481857713931,
 'eval_runtime': 1.6446,
 'eval_samples_per_second': 532.044,
 'eval_steps_per_second': 33.443}

In [None]:
# Hand the WP test metrics to store_json (helper defined elsewhere in this
# notebook; presumably serialises the dict to the given Drive path — confirm).
store_json('/content/drive/MyDrive/Thesis/Code/POS_WP_results.json', test_results_wp)

### MORF

In [None]:
# Imports needed by this evaluation cell.
import numpy as np
from datasets import load_metric
from transformers import RobertaForTokenClassification, Trainer, TrainingArguments

# Point the generic names used by the evaluation code at the MORF artefacts.
model, dataset, tokenizer = model_morf_trained, tokenized_dataset_pos_morf, tokenizer_morf

# Token-classification collator built from the MORF tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Arguments for an evaluation-only run (training disabled, evaluation enabled).
eval_args = TrainingArguments(
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=16,
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    disable_tqdm=False,
)

# Metric objects read as globals by compute_metrics.
# NOTE(review): load_metric is reported as deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm before
# upgrading the pinned version.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    load_metric(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

# Define compute metrics function
def compute_metrics(p):
    """Compute token-level accuracy and weighted precision/recall/F1.

    Args:
        p: An object that unpacks into ``(predictions, labels)``.
            ``predictions`` holds per-class scores and is reduced to class
            ids with an arg-max over axis 2. ``labels`` holds the reference
            class ids, with ``-100`` marking positions to ignore.
            Assumes ``labels`` has the same shape as the arg-maxed
            predictions — TODO confirm against the caller.

    Returns:
        dict: ``{"accuracy", "precision", "recall", "f1"}`` mapped to the
        values returned by the module-level ``accuracy_metric``,
        ``precision_metric``, ``recall_metric`` and ``f1_metric`` objects
        (precision/recall/F1 use ``average="weighted"``).
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Drop ignored positions (label == -100) with one boolean mask. Boolean
    # indexing flattens in row-major order, i.e. sequence by sequence, so the
    # predictions and references stay aligned element for element.
    keep = labels != -100
    true_predictions = predictions[keep]
    true_labels = labels[keep]

    # Calculate metrics
    accuracy = accuracy_metric.compute(predictions=true_predictions, references=true_labels)
    precision = precision_metric.compute(predictions=true_predictions, references=true_labels, average="weighted")
    recall = recall_metric.compute(predictions=true_predictions, references=true_labels, average="weighted")
    f1 = f1_metric.compute(predictions=true_predictions, references=true_labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }

# Evaluation-only Trainer: wraps the currently selected model, the evaluation
# arguments, the collator and the metric callback around the test split.
trainer = Trainer(
    model=model,
    args=eval_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=dataset["test"],
)

# Run evaluation on the test split and keep the returned metrics dict so the
# following cells can display and persist it.
test_results_morf = trainer.evaluate()
print("Test set evaluation results:", test_results_morf)

Test set evaluation results: {'eval_loss': 0.3402920365333557, 'eval_accuracy': 0.9068885861517503, 'eval_precision': 0.9066191535219507, 'eval_recall': 0.9068885861517503, 'eval_f1': 0.9062189907625517, 'eval_runtime': 1.8257, 'eval_samples_per_second': 479.27, 'eval_steps_per_second': 30.126}


In [None]:
# Display the MORF test-set metrics dict as this cell's output.
test_results_morf

{'eval_loss': 0.3402920365333557,
 'eval_accuracy': 0.9068885861517503,
 'eval_precision': 0.9066191535219507,
 'eval_recall': 0.9068885861517503,
 'eval_f1': 0.9062189907625517,
 'eval_runtime': 1.8257,
 'eval_samples_per_second': 479.27,
 'eval_steps_per_second': 30.126}

In [None]:
# Hand the MORF test metrics to store_json (helper defined elsewhere in this
# notebook; presumably serialises the dict to the given Drive path — confirm).
store_json('/content/drive/MyDrive/Thesis/Code/POS_MORF_results.json', test_results_morf)