# Text mining assignment 2
## Emma Vonk, Julius Ruijgrok

### Convert the IOB data to the correct data structure for token classification in Huggingface (words and labels like the conll2023 data in the tutorial) and align the labels with the tokens. Note that since you are working with a custom dataset, the data conversion is a necessary step for using the Huggingface training function. 

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [2]:
# Function to make the dataset to the correct huggingface structure explained in: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
#
def read_bio_file(filepath):
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": []}
    
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "ner_tags": []}
            else:
                token, pos, label = line.split()  # Each line is a token POS-label
                current_sentence["tokens"].append(token)
                # Map labels to an integer ID
                current_sentence["ner_tags"].append(label_to_id(label)) 

        # Add the last sentence if file doesn't end with a blank line
        if current_sentence["tokens"]:
            sentences.append(current_sentence)
    
    return sentences

def label_to_id(label):
    # This function should map each label to a unique integer (e.g., B-PER -> 0, I-PER -> 1, O -> 2).
    label_mapping = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4}
    return label_mapping.get(label, -100)  # Return -100 for unknown labels

# Read datasets
train_data = read_bio_file("train.txt")
val_data = read_bio_file("val.txt")
test_data = read_bio_file("test.txt")

# Load into HuggingFace dataset structure
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_dict({"tokens": [d["tokens"] for d in train_data], "ner_tags": [d["ner_tags"] for d in train_data]}),
    "validation": datasets.Dataset.from_dict({"tokens": [d["tokens"] for d in val_data], "ner_tags": [d["ner_tags"] for d in val_data]}),
    "test": datasets.Dataset.from_dict({"tokens": [d["tokens"] for d in test_data], "ner_tags": [d["ner_tags"] for d in test_data]}),
})

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1992
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 850
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 864
    })
})

In [4]:
ner_feature = dataset["train"].features["ner_tags"]
print(ner_feature)
label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
print(label_names)

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [5]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Figure 18 . 
O      O  O 


In [6]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

### Fine-tune a model with the default hyperparameter settings on the train set and evaluate the model on the test set. These are your baseline results.

In [7]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)

In [8]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [9]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [10]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import create_optimizer
import tensorflow as tf

# Set training parameters
num_train_steps = len(tf_train_dataset) * 3  # 3 epochs
num_warmup_steps = num_train_steps // 10

model.compile()

# Fine-tuning the model on the train set
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=3,
)

# Now, let's evaluate the model on the test set
tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

# Evaluate on test set
test_results = model.evaluate(tf_test_dataset)

# Print the evaluation results
print("Test set evaluation results:", test_results)

Epoch 1/3
  2/125 [..............................] - ETA: 12:25 - loss: 1.2374  

KeyboardInterrupt: 

In [12]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
import evaluate

metric = evaluate.load("seqeval")

labels = dataset["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

import numpy as np

all_predictions = []
all_labels = []
for batch in tf_eval_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'ORG': {'precision': 0.6467065868263473,
  'recall': 0.7248322147651006,
  'f1': 0.6835443037974682,
  'number': 149},
 'PER': {'precision': 0.6915422885572139,
  'recall': 0.8249258160237388,
  'f1': 0.7523680649526386,
  'number': 337},
 'overall_precision': 0.6783831282952548,
 'overall_recall': 0.7942386831275721,
 'overall_f1': 0.7317535545023697,
 'overall_accuracy': 0.9832358852548876}

### Set up hyperparameter optimization with the AdamW optimizer as explained in the tutorial. During optimization, use the val set as validation. After the model has been optimized, evaluate the result on the test set.

In [13]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [14]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
from transformers.keras_callbacks import PushToHubCallback
# TODO check if this can be ignored, im doing it now lol and it still works
#callback = PushToHubCallback(output_dir="bert-finetuned-ner", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    #callbacks=[callback],
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x23682939d50>

In [15]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
import evaluate

metric = evaluate.load("seqeval")

labels = dataset["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 1.0}

In [16]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
import numpy as np

all_predictions = []
all_labels = []
for batch in tf_eval_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

{'ORG': {'precision': 0.6484848484848484,
  'recall': 0.7181208053691275,
  'f1': 0.6815286624203821,
  'number': 149},
 'PER': {'precision': 0.7012987012987013,
  'recall': 0.8011869436201781,
  'f1': 0.7479224376731302,
  'number': 337},
 'overall_precision': 0.6854545454545454,
 'overall_recall': 0.7757201646090535,
 'overall_f1': 0.7277992277992279,
 'overall_accuracy': 0.9831445276813447}

Extend the evaluation function so that it shows the Precision, Recall and F-score for each of the entity types (location, artefact, etc.) on the test set. Include the metrics for the B-label of the entity type, the I-label, and the full entities.

Look up the definitions of macro- and micro-average scores and compute the macro- and micro average F1 scores over all entities.

In [22]:
import numpy as np
import evaluate
from seqeval.metrics import classification_report as seqeval_classification_report

# Load seqeval metric
metric = evaluate.load("seqeval")

all_predictions = []
all_labels = []

# Evaluate on the test set
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    batch_labels = batch["labels"].numpy()  # Avoid overwriting variable 'labels'
    batch_predictions = np.argmax(logits, axis=-1)  # Avoid overwriting variable 'predictions'

    for pred, true_label in zip(batch_predictions, batch_labels):
        pred_sequence = []
        label_sequence = []
        for predicted_idx, label_idx in zip(pred, true_label):
            if label_idx == -100:  # Skip padding
                continue
            pred_label = label_names[predicted_idx]
            true_label_str = label_names[label_idx]
            pred_sequence.append(pred_label)
            label_sequence.append(true_label_str)
        all_predictions.append(pred_sequence)
        all_labels.append(label_sequence)

# Compute overall metrics
results = metric.compute(predictions=all_predictions, references=all_labels)

print("Overall Metrics:")
print(f"Precision: {results['overall_precision']:.4f}")
print(f"Recall: {results['overall_recall']:.4f}")
print(f"F1-Score: {results['overall_f1']:.4f}")
print(f"Accuracy: {results['overall_accuracy']:.4f}")

# Use seqeval's classification_report to get detailed per-entity metrics
print("\nPer-Entity Metrics:")
print(seqeval_classification_report(all_labels, all_predictions))

Overall Metrics:
Precision: 0.7354
Recall: 0.8525
F1-Score: 0.7896
Accuracy: 0.9856

Per-Entity Metrics:
              precision    recall  f1-score   support

         ORG       0.66      0.78      0.72       144
         PER       0.77      0.89      0.83       283

   micro avg       0.74      0.85      0.79       427
   macro avg       0.72      0.83      0.77       427
weighted avg       0.74      0.85      0.79       427



In [25]:
from collections import Counter

# Count label occurrences in the training dataset
label_counter = Counter()
for batch in tf_train_dataset:
    labels = batch["labels"].numpy()
    for label_seq in labels:
        for label in label_seq:
            if label != -100:  # Exclude padding labels
                label_counter[label_names[label]] += 1

print("Label distribution in the training set:")
for label, count in label_counter.items():
    print(f"{label}: {count}")

# TODO er is ergens een grote imbalance zoals te zien is en die worden dus niet meegenomen in de training? 
# goed kijken naar hoe labels gedistribute worden en eventueel trainen met meer weight op de andere labels dan O?

Label distribution in the training set:
O: 57424
B-PER: 1213
I-PER: 1631
B-ORG: 342
I-ORG: 679
