# Text mining assignment 2
## Emma Vonk, Julius Ruijgrok

### Convert the IOB data to the correct data structure for token classification in Huggingface (words and labels like the conll2023 data in the tutorial) and align the labels with the tokens. Note that since you are working with a custom dataset, the data conversion is a necessary step for using the Huggingface training function. 

In [15]:
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [16]:
# Function to make the dataset to the correct huggingface structure explained in: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
#
def read_bio_file(filepath):
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": []}
    
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "ner_tags": []}
            else:
                token, pos, label = line.split()  # Each line is a token POS-label
                current_sentence["tokens"].append(token)
                # Map labels to an integer ID
                current_sentence["ner_tags"].append(label_to_id(label)) 

        # Add the last sentence if file doesn't end with a blank line
        if current_sentence["tokens"]:
            sentences.append(current_sentence)
    
    return sentences

def label_to_id(label):
    # This function should map each label to a unique integer (e.g., B-PER -> 0, I-PER -> 1, O -> 2).
    label_mapping = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4}
    return label_mapping.get(label, -100)  # Return -100 for unknown labels

# Read datasets
train_data = read_bio_file("train.txt")
val_data = read_bio_file("val.txt")
test_data = read_bio_file("test.txt")

# Load into HuggingFace dataset structure
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_dict({"tokens": [d["tokens"] for d in train_data], "ner_tags": [d["ner_tags"] for d in train_data]}),
    "validation": datasets.Dataset.from_dict({"tokens": [d["tokens"] for d in val_data], "ner_tags": [d["ner_tags"] for d in val_data]}),
    "test": datasets.Dataset.from_dict({"tokens": [d["tokens"] for d in test_data], "ner_tags": [d["ner_tags"] for d in test_data]}),
})

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1992
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 850
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 864
    })
})

In [18]:
ner_feature = dataset["train"].features["ner_tags"]
print(ner_feature)
label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
print(label_names)

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [19]:
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Figure 18 . 
O      O  O 


In [20]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

### Fine-tune a model with the default hyperparameter settings on the train set and evaluate the model on the test set. These are your baseline results.

In [21]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)

In [22]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [23]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [24]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# TODO check how het zit met model trainen want vgm moet je die stap hierna hier al doen? als je niet compiled kan je niet trainen namelijk

### Set up hyperparameter optimization with the AdamW optimizer as explained in the tutorial. During optimization, use the val set as validation. After the model has been optimized, evaluate the result on the test set.

In [26]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [27]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
from transformers.keras_callbacks import PushToHubCallback
# TODO check if this can be ignored, im doing it now lol and it still works
#callback = PushToHubCallback(output_dir="bert-finetuned-ner", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    #callbacks=[callback],
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x20b8762af10>

In [28]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
import evaluate

metric = evaluate.load("seqeval")

labels = dataset["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 1.0}

In [29]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
import numpy as np

all_predictions = []
all_labels = []
for batch in tf_eval_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

{'ORG': {'precision': 0.5975609756097561,
  'recall': 0.6577181208053692,
  'f1': 0.6261980830670927,
  'number': 149},
 'PER': {'precision': 0.6814814814814815,
  'recall': 0.8189910979228486,
  'f1': 0.7439353099730458,
  'number': 337},
 'overall_precision': 0.6572934973637962,
 'overall_recall': 0.7695473251028807,
 'overall_f1': 0.709004739336493,
 'overall_accuracy': 0.9830988488945733}

Extend the evaluation function so that it shows the Precision, Recall and F-score for each of the entity types (location, artefact, etc.) on the test set. Include the metrics for the B-label of the entity type, the I-label, and the full entities.


In [None]:
# TODO insert code

Look up the definitions of macro- and micro-average scores and compute the macro- and micro average F1 scores over all entities.

In [None]:
# TODO insert code