In [None]:
%pip install accelerate -U
%pip install transformers
%pip install datasets
%pip install seqeval

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, Trainer
from datasets import Dataset
import numpy as np
import pandas as pd
import pickle
from huggingface_hub import login
import os

## Testing

! IMPORTANT 
To run the testing you need to have a trained model that is already saved as well as a pickle of the label_indices from training.

Specify the path of the testing set and the path for saving the file with predictions

In [32]:
# dataset path
test_path = "data\sk_snk-ud-test.iob2"

# specify the name of the iob2 file that will have the predictions from testing
filename = "polish_on_slovak_predictions"

# where should we pull the model from
hub_folder = "annamariagnat/trained_polish"

# specify a path to your label_indices
labels = "01_label_indices/labels_polish.pkl"

label_all_tokens = True # dw about it

In [25]:
# repeated the functions needed to load in and format the test dataset

#reading in data as a dataframe
def read_iob2_file(path):
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()

        if line:
            if line[0] == '#':
                continue
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    if current_tags != []:
        data.append((current_words, current_tags))

    df = pd.DataFrame(data, columns=['words', 'tags'])
    df['id'] = df.index
    df = df[['id', 'words', 'tags']]
    
    return df

# tokenizing the labels
def tokenize_and_align_labels(dataset, word_column, tag_column, tokenizer):
    tokenized_inputs = tokenizer(dataset[word_column].tolist(), truncation=True , is_split_into_words=True, padding = True)

    labels = []
    for i, label in enumerate(dataset[tag_column]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs.data


class Vocab():
    def __init__(self, pad_unk='<PAD>'):
        self.pad_unk = pad_unk
        self.word2idx = {}
        self.idx2word = []

    def getIdx(self, word, add=False):
        if word is None or word == self.pad_unk:
            return None
        if word not in self.word2idx:
            if add:
                idx = len(self.idx2word)
                self.word2idx[word] = idx
                self.idx2word.append(word)
                return idx
            else:
                return None
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word[idx]

In [4]:

# Password for the hub (Anna)
# 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

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
loaded_model = AutoModelForTokenClassification.from_pretrained(hub_folder)
trainer = Trainer(model = loaded_model)
tokenizer = tokenizer = AutoTokenizer.from_pretrained(hub_folder)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [33]:
# Load label_indices dictionary from the pickle file
with open(labels, 'rb') as f:
    label_indices = pickle.load(f)

label_list = label_indices.idx2word

In [34]:
test_data = read_iob2_file(test_path)

test_data['tag_idx'] = test_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])

tokenized_test_data = tokenize_and_align_labels(test_data, "words", "tag_idx", tokenizer)

test_dataset = Dataset.from_dict({
    'id': range(len(tokenized_test_data['input_ids'])),
    'input_ids': tokenized_test_data['input_ids'],
    'attention_mask': tokenized_test_data['attention_mask'],
    'labels': tokenized_test_data['labels']
})

test_dataset_new = Dataset.from_dict({
    'input_ids': test_dataset['input_ids'],
    'attention_mask': test_dataset['attention_mask'],
    'labels': test_dataset['labels']
})

In [35]:
# functions needed for the testing phase

def un_tok_labs(list_of_labels, list_of_words):
    tokenized_inputs = tokenizer(list_of_words, is_split_into_words=True)
    labels = []
    for i, label in enumerate(list_of_labels):
        print(label)
        label_copy = label.copy()  # Create a copy of the label list
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:  # Only label the first token of a given word.
                continue
            elif word_idx == previous_word_idx:
                label_copy.pop(word_idx)
                continue
            else:
                label_ids.append(label_copy[word_idx])
            previous_word_idx = word_idx 
        labels.append(label_ids)
    return labels



def read_list_of_words(path):
    """
    read in iob2 file
    
    :param path: path to read from
    :returns: list with sequences of words for each sentence
    """
    data = []
    current_words = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()

        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
        else:
            if current_words:  # skip empty lines
                data.append(current_words)
            current_words = []

    # Check for the last sentence
    if current_words:
        data.append(current_words)
    
    return data



def save_preds(filename, tok, untok_labs):
    file_path = "data/" + filename
    if os.path.exists(file_path):
        raise FileExistsError(f"The file {filename} already exists. It won't be overwritten.")
    
    with open(file_path, "w", encoding="utf-8") as f: 
        for t, l in zip(tok, untok_labs): 
            for i in range(len(t)): 
                f.write(f"{i+1}\t{t[i]}\t{l[i]}\n")

            f.write("\n")
    return "File has been saved"

In [36]:
trainer.predict(test_dataset_new)

predictions, labels, _ = trainer.predict(test_dataset_new)
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]


test_words = read_list_of_words(test_path)
untok_labs = un_tok_labs(true_predictions, test_words)

  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'O']
['B-ORG', 'B-ORG', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG']
['B-PER', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'B-PER', 'B-PER', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['B-ORG', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']
['B-PER', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'O', 'O', 'O']
['B-ORG', 'B-ORG', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O

In [37]:
save_preds(filename, test_words, untok_labs)

'File has been saved'