## Step 1. Checking all installs, importing libraries and loading all the functions we'll use


In [1]:
# Install the third-party packages the notebook depends on into the kernel's
# environment (%pip targets the running kernel, unlike !pip).
# -U upgrades accelerate to its newest release.
# NOTE(review): versions are not pinned, so results may differ between runs
# on different dates -- consider pinning once a working set is known.
%pip install accelerate -U
%pip install transformers
%pip install datasets
%pip install seqeval





[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
from datasets import Dataset
import numpy as np
import pandas as pd
from huggingface_hub import notebook_login

All the definitions can be found in the cell below

In [3]:
# definitions of all functions


# reading in data as a dataframe
def read_iob2_file(path):
    """
    Read a tab-separated IOB2 file into a DataFrame with one row per sentence.

    Lines starting with '#' are treated as comments and skipped. A blank line
    ends the current sentence. For every other line, column 1 is taken as the
    word and column 2 as its tag (column 0 is ignored).

    :param path: path of the IOB2 file to read (opened as UTF-8)
    :returns: DataFrame with columns 'id' (row index), 'words' (list of
              words of the sentence) and 'tags' (list of tags, same length)
    :raises IndexError: if a non-comment line has fewer than three columns
    """
    data = []
    current_words = []
    current_tags = []

    # The context manager guarantees the file handle is closed, even when a
    # malformed line raises part-way through the file.
    with open(path, encoding='utf-8') as iob2_file:
        for line in iob2_file:
            line = line.strip()

            if not line:
                # Blank line: sentence boundary (ignore repeated blank lines).
                if current_words:
                    data.append((current_words, current_tags))
                current_words = []
                current_tags = []
                continue

            if line[0] == '#':
                continue  # comment / metadata line

            tok = line.split('\t')
            current_words.append(tok[1])
            current_tags.append(tok[2])

    # The last sentence is not necessarily followed by a blank line.
    if current_words:
        data.append((current_words, current_tags))

    df = pd.DataFrame(data, columns=['words', 'tags'])
    df['id'] = df.index
    df = df[['id', 'words', 'tags']]

    return df


# two-way lookup between labels and their integer indices
class Vocab():
    """
    Maps words (in this notebook: tag labels) to integer indices and back.

    Indices are assigned in insertion order, starting at 0. ``None`` and the
    ``pad_unk`` symbol are never stored; looking them up yields ``None``.
    """

    def __init__(self, pad_unk='<PAD>'):
        self.pad_unk = pad_unk
        self.word2idx = {}  # word -> index
        self.idx2word = []  # index -> word

    def getIdx(self, word, add=False):
        """
        Return the index of ``word``.

        :param word: the word to look up
        :param add: when True, an unknown word is registered and its new
                    index returned; when False, unknown words give None
        :returns: the index, or None for None / pad_unk / unknown words
        """
        if word is None or word == self.pad_unk:
            return None
        if word in self.word2idx:
            return self.word2idx[word]
        if not add:
            return None

        # Unknown word and we are allowed to register it: the next free
        # index is the current size of the index -> word list.
        new_idx = len(self.idx2word)
        self.idx2word.append(word)
        self.word2idx[word] = new_idx
        return new_idx

    def getWord(self, idx):
        """Return the word stored at index ``idx``."""
        return self.idx2word[idx]
    

# tokenizing the words and aligning the labels with the resulting sub-tokens
def tokenize_and_align_labels(dataset, word_column, tag_column, tokenizer, label_all_tokens=None):
    """
    Tokenize pre-split sentences and produce one label id per sub-token.

    :param dataset: DataFrame holding the sentences
    :param word_column: name of the column with the list of words per sentence
    :param tag_column: name of the column with the list of numeric label ids
                       per sentence (one id per word)
    :param tokenizer: tokenizer callable; its result must offer
                      ``word_ids(batch_index=...)``, item assignment and ``.data``
    :param label_all_tokens: True  -> every sub-token of a word gets the
                             word's label; False -> only the first sub-token
                             gets it and the remaining ones get -100.
                             None (default) keeps the previous behaviour of
                             reading the notebook-level ``label_all_tokens``
                             variable, falling back to True if it is not defined.
    :returns: dict with the tokenizer outputs plus a "labels" entry
    """
    if label_all_tokens is None:
        # Backward compatibility: earlier versions read this setting from a
        # notebook-level global instead of a parameter.
        label_all_tokens = globals().get("label_all_tokens", True)

    tokenized_inputs = tokenizer(dataset[word_column].tolist(), truncation=True, is_split_into_words=True, padding=True)

    labels = []
    for i, label in enumerate(dataset[tag_column]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Token that belongs to no word (special token / padding):
                # -100 is the marker filtered out again in compute_metrics
                # (and the default ignore_index of PyTorch's cross-entropy).
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word, or any sub-token when all
                # sub-tokens are to be labelled.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs.data


# turning the raw model outputs into overall seqeval scores
def compute_metrics(p):
    """
    Compute overall precision, recall, F1 and accuracy for one evaluation pass.

    Relies on the notebook-level ``label_list`` (index -> tag name) and
    ``metric`` objects.

    :param p: pair ``(predictions, labels)``; the predictions are reduced
              with an argmax over axis 2, the labels are label ids with -100
              marking positions to ignore
    :returns: dict with the keys "precision", "recall", "f1" and "accuracy"
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _gold in kept_pairs])
        true_labels.append([label_list[gold] for _predicted, gold in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

## Step 2. Log in to the Hugging Face Hub

In [4]:
# Authenticate with the Hugging Face Hub. Paste your personal access token
# into the prompt that notebook_login() opens -- never store a token in the
# notebook itself. Any token that was ever saved in this file must be treated
# as leaked: revoke it in the Hub account settings and create a new one.

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Step 3. Set up all the necessary paths and variables

In [21]:
# specify the datasets you want to use
# (forward slashes work on Windows, Linux and macOS alike, and avoid the
# invalid "\e" escape sequence that the backslash form contains)
train_path = "data/en_ewt-ud-train.iob2"
dev_path = "data/en_ewt-ud-dev.iob2"
test_path = "data/en_ewt-ud-test-masked.iob2"



# specify the model and the tokenizer you want to use
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True)


# This will be used to create a repository on the Hugging Face Hub
# for saving our model and the logs (checkpoints)
hub_folder = "annamariagnat/final_test"


# specify the name of the iob2 file that will have the predictions from testing
filename = "test2_preds"

# setting up variables for the model
batch_size = 16
evaluation_strategy = "epoch"
learning_rate = 2e-5
num_train_epochs = 3
weight_decay = 0.01
push_to_hub=True


data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")


# Controls how word-level tags are spread over sub-tokens in
# tokenize_and_align_labels: True gives every sub-token of a word that word's
# tag; False tags only the first sub-token and marks the rest with -100.
# un_tok_labs expects one prediction per sub-token, so keep this True unless
# that function is changed as well.
label_all_tokens = True

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


## Step 4. Loading and formatting the data for the model

In [16]:
# Number of sentences kept from each split. The notebook currently works on a
# small subset to keep runs fast; set to None to use the complete files.
max_sentences = 50

# read in the data as a dataframe (paths come from the configuration cell).
# .copy() makes each subset an independent frame, so adding the 'tag_idx'
# column below does not trigger pandas' SettingWithCopyWarning.
train_data = read_iob2_file(train_path).iloc[:max_sentences].copy()
dev_data = read_iob2_file(dev_path).iloc[:max_sentences].copy()
test_data = read_iob2_file(test_path).iloc[:max_sentences].copy()


# get the tag dictionary (built from the training tags only)
label_indices = Vocab()
tags_column = train_data["tags"]

for tags in tags_column:
    for tag in tags:
        label_indices.getIdx(tag, add=True)

label_list = label_indices.idx2word


# translate tags into numerical
# (a tag in dev/test that never occurs in the training subset raises KeyError)
train_data['tag_idx'] = train_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])
dev_data['tag_idx'] = dev_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])
test_data['tag_idx'] = test_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])


# tokenize the data
tokenized_data = tokenize_and_align_labels(train_data, "words", "tag_idx", tokenizer)
tokenized_dev_data = tokenize_and_align_labels(dev_data, "words", "tag_idx", tokenizer)
tokenized_test_data = tokenize_and_align_labels(test_data, "words", "tag_idx", tokenizer)


def _to_dataset(tokenized):
    """Wrap one tokenized split (dict of lists) in a Dataset with a running 'id' column."""
    return Dataset.from_dict({
        'id': range(len(tokenized['input_ids'])),
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': tokenized['labels']
    })


# turning the data into Dataset objects, to make them compatible with the trainer (otherwise they can't be indexed)
train_dataset = _to_dataset(tokenized_data)
dev_dataset = _to_dataset(tokenized_dev_data)
test_dataset = _to_dataset(tokenized_test_data)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Step 5. Training the model

In [24]:
 # verify the tokenizers compatibility with hugging face
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

#Specify waht model you want to deploy
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))



args = TrainingArguments(
    output_dir = hub_folder,
    evaluation_strategy = evaluation_strategy,
    learning_rate = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_train_epochs,
    weight_decay = weight_decay,
    push_to_hub = push_to_hub,
)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training the model

In [25]:
# Fine-tune the model on train_dataset; with evaluation_strategy = "epoch"
# the dev set metrics are reported after every epoch.
trainer.train()

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.2422748804092407, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.891271056661562, 'eval_runtime': 0.4578, 'eval_samples_per_second': 109.21, 'eval_steps_per_second': 8.737, 'epoch': 1.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.9826931953430176, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9157733537519143, 'eval_runtime': 0.4485, 'eval_samples_per_second': 111.474, 'eval_steps_per_second': 8.918, 'epoch': 2.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8890056610107422, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9188361408882083, 'eval_runtime': 0.4566, 'eval_samples_per_second': 109.508, 'eval_steps_per_second': 8.761, 'epoch': 3.0}
{'train_runtime': 3.4585, 'train_samples_per_second': 43.372, 'train_steps_per_second': 3.47, 'train_loss': 1.1677655378977458, 'epoch': 3.0}


TrainOutput(global_step=12, training_loss=1.1677655378977458, metrics={'train_runtime': 3.4585, 'train_samples_per_second': 43.372, 'train_steps_per_second': 3.47, 'train_loss': 1.1677655378977458, 'epoch': 3.0})

Saving the trained model on the hub

In [26]:
# Upload the trained model files to the Hub repository (requires the login
# from Step 2).
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/annamariagnat/final_test/commit/5aa778a60978a21c72bc38b3868f0cfd348182bd', commit_message='End of training', commit_description='', oid='5aa778a60978a21c72bc38b3868f0cfd348182bd', pr_url=None, pr_revision=None, pr_num=None)

## Step 6. Model evaluation

In [12]:
# Evaluate on the eval_dataset the Trainer was created with (dev_dataset),
# using compute_metrics for precision / recall / F1 / accuracy.
trainer.evaluate()

  0%|          | 0/4 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.9266831874847412,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.9188361408882083,
 'eval_runtime': 0.5229,
 'eval_samples_per_second': 95.618,
 'eval_steps_per_second': 7.649,
 'epoch': 3.0}

In [13]:
# Per-entity evaluation on the dev data.
# Rebuild the dev set with only the columns the model consumes (no 'id').
dev_dataset_new = Dataset.from_dict(
    {column: dev_dataset[column] for column in ('input_ids', 'attention_mask', 'labels')}
)

# predict() returns (raw scores, label ids, metrics); argmax over the last
# axis turns the scores into one predicted label id per token.
predictions, labels, _ = trainer.predict(dev_dataset_new)
predictions = np.argmax(predictions, axis=2)

# Translate ids back to tag names, dropping every position labelled -100.
true_predictions = []
true_labels = []
for predicted_row, gold_row in zip(predictions, labels):
    kept_pairs = [
        (predicted, gold)
        for predicted, gold in zip(predicted_row, gold_row)
        if gold != -100
    ]
    true_predictions.append([label_list[predicted] for predicted, _gold in kept_pairs])
    true_labels.append([label_list[gold] for _predicted, gold in kept_pairs])

results = metric.compute(predictions=true_predictions, references=true_labels)
results

  0%|          | 0/4 [00:00<?, ?it/s]

{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 43},
 'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.9188361408882083}

## Step 7. Testing

In [27]:
# functions needed for the testing phase

def un_tok_labs(list_of_labels, list_of_words, tokenizer=None):
    """
    Collapse sub-token level labels back to one label per word.

    Each sentence is tokenized again to learn which word every sub-token
    belongs to; the label of the first sub-token of each word is kept.

    :param list_of_labels: per sentence, one label for every sub-token that
                           belongs to a word (special tokens excluded)
    :param list_of_words: per sentence, the list of words; sentence i must
                          correspond to list_of_labels[i]
    :param tokenizer: tokenizer callable whose result offers
                      ``word_ids(batch_index=...)``; defaults to the
                      notebook-level ``tokenizer``
    :returns: per sentence, the list of word-level labels
    :raises ValueError: if a sentence does not have exactly one label per
                        sub-token (e.g. labels are already word-level)
    """
    if tokenizer is None:
        # Backward compatibility: earlier versions always used the global.
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError("name 'tokenizer' is not defined") from None

    tokenized_inputs = tokenizer(list_of_words, truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(list_of_labels):
        # Word index of every real sub-token; special tokens map to None.
        sub_token_word_ids = [
            word_idx
            for word_idx in tokenized_inputs.word_ids(batch_index=i)
            if word_idx is not None
        ]
        if len(sub_token_word_ids) != len(label):
            raise ValueError(
                f"sentence {i}: expected {len(sub_token_word_ids)} sub-token labels, got {len(label)}"
            )

        previous_word_idx = None
        label_ids = []
        for word_idx, sub_token_label in zip(sub_token_word_ids, label):
            # Only the first sub-token of a word contributes its label.
            if word_idx != previous_word_idx:
                label_ids.append(sub_token_label)
            previous_word_idx = word_idx
        labels.append(label_ids)
    return labels



def read_list_of_words(path):
    """
    Read the words of every sentence from an IOB2 file.

    Lines starting with '#' are skipped, a blank line ends the current
    sentence, and column 1 of every other (tab-separated) line is the word.

    :param path: path to read from (opened as UTF-8)
    :returns: list with sequences of words for each sentence
    :raises IndexError: if a non-comment line has fewer than two columns
    """
    data = []
    current_words = []

    # The context manager closes the file handle even if parsing fails.
    with open(path, encoding='utf-8') as iob2_file:
        for line in iob2_file:
            line = line.strip()

            if not line:
                if current_words:  # skip repeated empty lines
                    data.append(current_words)
                current_words = []
                continue

            if line[0] == '#':
                continue  # skip comments

            tok = line.split('\t')
            current_words.append(tok[1])

    # The last sentence is not necessarily followed by a blank line.
    if current_words:
        data.append(current_words)

    return data



def save_preds(tok, untok_labs, filename="group2_preds"):
    """
    Write words and their labels to a file in tab-separated IOB2 layout.

    Every word becomes a line ``<position>\\t<word>\\t<label>`` (positions
    start at 1 in each sentence); sentences are separated by a blank line.

    :param tok: list of sentences, each a list of words
    :param untok_labs: list of label lists, one label per word
    :param filename: path of the output file; defaults to the previously
                     hard-coded "group2_preds"
    :returns: a confirmation message
    :raises IndexError: if a sentence has fewer labels than words
    """
    with open(filename, "w", encoding="utf-8") as f:
        for words, word_labels in zip(tok, untok_labs):
            for position in range(len(words)):
                f.write(f"{position+1}\t{words[position]}\t{word_labels[position]}\n")

            f.write("\n")
    return ("File has been saved")

In [29]:
# Log in to the Hugging Face Hub again before the saved model is loaded.
# NOTE(review): notebook_login is already imported in the import cell and was
# used in Step 2, so this second import/login looks redundant -- confirm
# whether it is still needed.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
# Load the fine-tuned checkpoint identified by hub_folder.
loaded_model = AutoModelForTokenClassification.from_pretrained(hub_folder)
# NOTE(review): this rebinds `trainer`, replacing the Trainer used for
# training with one that only has the model (no data_collator, tokenizer or
# compute_metrics); it is used below purely for prediction.
trainer = Trainer(model = loaded_model)

OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like annamariagnat/test2 is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

Configuring the test data to match the format the model requires

In [14]:
# Rebuild the test set with only the columns the model consumes (no 'id').
test_dataset_new = Dataset.from_dict(
    {column: test_dataset[column] for column in ('input_ids', 'attention_mask', 'labels')}
)

NameError: name 'test_dataset' is not defined

In [13]:
# Run inference a single time: predict() is the expensive step and its result
# is only needed once.
predictions, labels, _ = trainer.predict(test_dataset_new)
predictions = np.argmax(predictions, axis=2)

# Translate ids back to tag names, dropping every position labelled -100
# (special tokens and padding).
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
# NOTE(review): the test file is the "masked" one, so these reference labels
# are presumably placeholders -- confirm before using them for scoring.
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]


# Map the sub-token level predictions back to one label per original word.
test_words = read_list_of_words(test_path)
untok_labs = un_tok_labs(true_predictions, test_words)

NameError: name 'test_dataset_new' is not defined

In [None]:
# save_preds expects the words first and the labels second; passing a file
# name as an extra leading argument raises a TypeError. The function chooses
# the output file name itself.
save_preds(test_words, untok_labs)