# Text mining assignment 2 (Emma Vonk and Julius Ruijgrok)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
from transformers import DataCollatorForTokenClassification
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
import tensorflow as tf
import evaluate
import numpy as np
from seqeval.metrics import classification_report as seqeval_classification_report
import pandas as pd
from collections import Counter
import random

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, num_labels=5)



# Question 2
Convert the IOB data to the correct data structure for token classification in Huggingface
(words and labels like the conll2023 data in the tutorial) and align the labels with the tokens.

In [2]:
# Function to make the dataset to the correct huggingface structure explained in: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt


def read_bio_file(filepath):
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": [], "pos_tags": []}

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line == "":
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "ner_tags": [], "pos_tags": []}
            else:
                token, pos, label = line.split()  # Each line is a token POS-label
                current_sentence["tokens"].append(token)
                # Map labels to an integer ID
                current_sentence["ner_tags"].append(label_to_id(label))
                current_sentence["pos_tags"].append(pos)

        # Add the last sentence if file doesn't end with a blank line
        if current_sentence["tokens"]:
            sentences.append(current_sentence)

    return sentences

def label_to_id(label):
    # This function should map each label to a unique integer (e.g., B-PER -> 0, I-PER -> 1, O -> 2).
    label_mapping = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4, 'B-ART': 5, 'I-ART': 6, 'I-MAT': 7, 'B-MAT': 8, 'I-CON': 9, 'B-CON': 10, 'I-SPE': 11, 'B-SPE': 12}
    return label_mapping.get(label, -100)  # Return -100 for unknown labels

# Read datasets
train_data = read_bio_file("train.txt")
val_data = read_bio_file("val.txt")
test_data = read_bio_file("test.txt")

# Load into HuggingFace dataset structure
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_dict({"tokens": [d["tokens"] for d in train_data], "ner_tags": [d["ner_tags"] for d in train_data], "pos_tags": [d["pos_tags"] for d in train_data]}),
    "validation": datasets.Dataset.from_dict({"tokens": [d["tokens"] for d in val_data], "ner_tags": [d["ner_tags"] for d in val_data], "pos_tags": [d["pos_tags"] for d in val_data]}),
    "test": datasets.Dataset.from_dict({"tokens": [d["tokens"] for d in test_data], "ner_tags": [d["ner_tags"] for d in test_data], "pos_tags": [d["pos_tags"] for d in test_data
                                                                                                                                                ]})
})

# Define label mapping
label_names = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ART', 'I-ART', 'I-MAT', 'B-MAT', 'I-CON', 'B-CON', 'I-SPE', 'B-SPE']

# Create the ClassLabel feature with only the names (otherwise the number does not overlap)
ner_feature = datasets.ClassLabel(names=label_names)

dataset = dataset.cast_column("ner_tags", datasets.Sequence(ner_feature))

Casting the dataset:   0%|          | 0/1992 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/850 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/864 [00:00<?, ? examples/s]

In [3]:
# print the structure of the data
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'pos_tags'],
        num_rows: 1992
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'pos_tags'],
        num_rows: 850
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'pos_tags'],
        num_rows: 864
    })
})

In [4]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Pre-processing the data and tokenize

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)


Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

Now, the pre-processing of the data is finished

# Question 3
Fine-tune a model with the default hyperparameter settings on the train set and evaluate the model on the test set. These are your baseline results.

In [None]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="tf"
)

In [6]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [7]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [None]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)


The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [None]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1e2a74c2890>

In [None]:
# Load seqeval metric
metric = evaluate.load("seqeval")

all_predictions = []
all_labels = []

# Evaluate on the test set
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    batch_labels = batch["labels"].numpy()  # Avoid overwriting variable 'labels'
    batch_predictions = np.argmax(logits, axis=-1)  # Avoid overwriting variable 'predictions'

    for pred, true_label in zip(batch_predictions, batch_labels):
        pred_sequence = []
        label_sequence = []
        for predicted_idx, label_idx in zip(pred, true_label):
            if label_idx == -100:  # Skip padding
                continue
            pred_label = label_names[predicted_idx]
            true_label_str = label_names[label_idx]
            pred_sequence.append(pred_label)
            label_sequence.append(true_label_str)
        all_predictions.append(pred_sequence)
        all_labels.append(label_sequence)

# Compute overall metrics
results = metric.compute(predictions=all_predictions, references=all_labels)

print("Overall Metrics:")
print(f"Precision: {results['overall_precision']:.4f}")
print(f"Recall: {results['overall_recall']:.4f}")
print(f"F1-Score: {results['overall_f1']:.4f}")
print(f"Accuracy: {results['overall_accuracy']:.4f}")

# Use seqeval's classification_report to get detailed per-entity metrics
print("\nPer-Entity Metrics:")
print(seqeval_classification_report(all_labels, all_predictions))

# Calculate metrics for each label individually
print("\nMetrics Per Label:")
label_metrics = {}
for label in ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ART', 'I-ART', 'I-MAT', 'B-MAT', 'I-CON', 'B-CON', 'I-SPE', 'B-SPE']:
    # Filter predictions and references for the current label
    label_predictions = [
        [tag if tag == label else "O" for tag in pred_seq]
        for pred_seq in all_predictions
    ]
    label_references = [
        [tag if tag == label else "O" for tag in true_seq]
        for true_seq in all_labels
    ]

    # Compute precision, recall, and F1 score for the specific label
    label_result = metric.compute(predictions=label_predictions, references=label_references)
    label_metrics[label] = {
        "precision": label_result['overall_precision'],
        "recall": label_result['overall_recall'],
        "f1": label_result['overall_f1']
    }

# Print metrics for each label
for label, metrics in label_metrics.items():
    print(f"Label: {label}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1-Score: {metrics['f1']:.4f}")

Overall Metrics:
Precision: 0.4983
Recall: 0.6293
F1-Score: 0.5562
Accuracy: 0.9535

Per-Entity Metrics:
              precision    recall  f1-score   support

         ART       0.35      0.59      0.44       168
         CON       0.38      0.58      0.46       216
         LOC       0.54      0.71      0.61       144
         MAT       0.00      0.00      0.00       107
         PER       0.72      0.89      0.80       283
         SPE       0.00      0.00      0.00         2

   micro avg       0.50      0.63      0.56       920
   macro avg       0.33      0.46      0.38       920
weighted avg       0.46      0.63      0.53       920


Metrics Per Label:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Label: O
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-PER
  Precision: 0.7681
  Recall: 0.9364
  F1-Score: 0.8439
Label: I-PER
  Precision: 0.5455
  Recall: 0.7156
  F1-Score: 0.6190
Label: B-LOC
  Precision: 0.7202
  Recall: 0.8403
  F1-Score: 0.7756
Label: I-LOC
  Precision: 0.5290
  Recall: 0.6518
  F1-Score: 0.5840
Label: B-ART
  Precision: 0.4901
  Recall: 0.7381
  F1-Score: 0.5891
Label: I-ART
  Precision: 0.2832
  Recall: 0.4667
  F1-Score: 0.3525
Label: I-MAT
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-MAT
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: I-CON
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-CON
  Precision: 0.4464
  Recall: 0.6944
  F1-Score: 0.5435
Label: I-SPE
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-SPE
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000


# Question 4
Set up hyperparameter optimization (HPO), use the val set as validation. Optimize at least three hyperparameters (learning rate, batch size and weight decay). You can choose your own way to implement this and select your own grid. After the model has been optimized, evaluate the result on the test set.

In [None]:
params = {
    "learning_rates": [1e-5, 2e-5, 3e-5],
    "batch_sizes": [8, 16, 32],
    "weight_decays": [0.01, 0.001, 0.0001]
}

def random_search(params, results):
    results = pd.DataFrame(columns=["learning_rate", "batch_size", "weight_decay", "val_loss"])
    # Perform a random search on 10 combinations
    for i in range(10):
        lr = random.choice(params["learning_rates"])
        batch_size = random.choice(params["batch_sizes"])
        weight_decay = random.choice(params["weight_decays"])
        print(f"Training with learning rate: {lr}, Batch size: {batch_size}, and weight decay: {weight_decay}")

        # Create tf datasets with the current batch size
        tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
            columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
            collate_fn=data_collator,
            shuffle=True,
            batch_size=batch_size,
        )
        tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
            columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
            collate_fn=data_collator,
            shuffle=False,
            batch_size=batch_size,
        )

        # Set up optimizer with the current learning rate and weight decay
        optimizer, schedule = create_optimizer(
            init_lr=lr,
            num_warmup_steps=0,
            num_train_steps=len(tf_train_dataset) * num_epochs,
            weight_decay_rate=weight_decay,
        )
        model.compile(optimizer=optimizer)

        # Train the model
        history = model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=num_epochs,
        )

        # Get the validation loss for the final epoch
        val_loss = history.history['val_loss'][-1]
        # Append the results as a new DataFrame and concatenate
        new_row = pd.DataFrame({
            "learning_rate": [lr],
            "batch_size": [batch_size],
            "weight_decay": [weight_decay],
            "val_loss": [val_loss]
        })
        results = pd.concat([results, new_row], ignore_index=True)
    return results

results = pd.DataFrame(columns=["learning_rate", "batch_size", "weight_decay", "val_loss"])
results = random_search(params, results)

# Display all results
print(results)
results.to_csv("Results_question_4.csv")

# After optimization, evaluate on test set with best hyperparameters
best_params = results.loc[results['val_loss'].idxmin()]
print("Best hyperparameters:", best_params)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test {n} out of 6 has concluded


  results = pd.concat([results, new_row], ignore_index=True)


Epoch 1/3
Epoch 2/3
Epoch 3/3
Test {n} out of 6 has concluded
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test {n} out of 6 has concluded
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test {n} out of 6 has concluded
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test {n} out of 6 has concluded
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test {n} out of 6 has concluded
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test {n} out of 6 has concluded
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test {n} out of 6 has concluded
   learning_rate batch_size  weight_decay  val_loss
0        0.00001         16         0.010  0.265483
1        0.00001         16         0.001  0.249348
2        0.00001         32         0.010  0.246995
3        0.00001         32         0.001  0.272050
4        0.00002         16         0.010  0.308213
5        0.00002         16         0.001  0.335678
6        0.00002         32         0.010  0.343800
7        0.00002         32         0.001  0.369150
Best hyperparameters: learning_rate     0.00001
batch_size             32
weight_decay         0.0

In [None]:
# Re-train with best hyperparameters and evaluate on test set
best_lr = float(best_params["learning_rate"])
best_batch_size = int(best_params["batch_size"])
best_weight_decay = float(best_params["weight_decay"])

print(best_lr, best_batch_size, best_weight_decay)

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=best_batch_size,
)
tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=best_batch_size,
)

# Set up optimizer with the current learning rate and weight decay
optimizer, schedule = create_optimizer(
    init_lr=best_lr,
    num_warmup_steps=0,
    num_train_steps=len(tf_train_dataset) * num_epochs,
    weight_decay_rate=best_weight_decay,
)

model.compile(optimizer=optimizer)

history = model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

# Evaluate on test set
all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])

test_metric = metric.compute(predictions=[all_predictions], references=[all_labels])
print("Test set performance:", test_metric)

1e-05 32 0.01
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test set performance: {'ART': {'precision': 0.375, 'recall': 0.6071428571428571, 'f1': 0.46363636363636357, 'number': 168}, 'CON': {'precision': 0.39263803680981596, 'recall': 0.5925925925925926, 'f1': 0.47232472324723246, 'number': 216}, 'LOC': {'precision': 0.5614973262032086, 'recall': 0.7291666666666666, 'f1': 0.6344410876132931, 'number': 144}, 'MAT': {'precision': 0.5, 'recall': 0.04672897196261682, 'f1': 0.08547008547008547, 'number': 107}, 'PER': {'precision': 0.7175792507204611, 'recall': 0.8798586572438163, 'f1': 0.7904761904761906, 'number': 283}, 'SPE': {'precision': 0.1111111111111111, 'recall': 0.5, 'f1': 0.1818181818181818, 'number': 2}, 'overall_precision': 0.5125977410947002, 'overall_recall': 0.6413043478260869, 'overall_f1': 0.5697730564944471, 'overall_accuracy': 0.9551446356645917}


# Question 5
Extend the evaluation function so that it shows the Precision, Recall and F-score for each of the entity types (location, artefact, etc.) on the test set. Include the metrics for the B-label of the entity type, the I-label, and the full entities.

# Question 6
Look up the definitions of macro- and micro-average scores and compute the macro- and micro average F1 scores over all entities.

In [None]:
# Load seqeval metric
metric = evaluate.load("seqeval")

all_predictions = []
all_labels = []

# Evaluate on the test set
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    batch_labels = batch["labels"].numpy()  # Avoid overwriting variable 'labels'
    batch_predictions = np.argmax(logits, axis=-1)  # Avoid overwriting variable 'predictions'

    for pred, true_label in zip(batch_predictions, batch_labels):
        pred_sequence = []
        label_sequence = []
        for predicted_idx, label_idx in zip(pred, true_label):
            if label_idx == -100:  # Skip padding
                continue
            pred_label = label_names[predicted_idx]
            true_label_str = label_names[label_idx]
            pred_sequence.append(pred_label)
            label_sequence.append(true_label_str)
        all_predictions.append(pred_sequence)
        all_labels.append(label_sequence)

# Compute overall metrics
results = metric.compute(predictions=all_predictions, references=all_labels)

print("Overall Metrics:")
print(f"Precision: {results['overall_precision']:.4f}")
print(f"Recall: {results['overall_recall']:.4f}")
print(f"F1-Score: {results['overall_f1']:.4f}")
print(f"Accuracy: {results['overall_accuracy']:.4f}")

# Use seqeval's classification_report to get detailed per-entity metrics
print("\nPer-Entity Metrics:")
print(seqeval_classification_report(all_labels, all_predictions))

# Calculate metrics for each label individually
print("\nMetrics Per Label:")
label_metrics = {}
for label in ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ART', 'I-ART', 'I-MAT', 'B-MAT', 'I-CON', 'B-CON', 'I-SPE', 'B-SPE']:
    # Filter predictions and references for the current label
    label_predictions = [
        [tag if tag == label else "O" for tag in pred_seq]
        for pred_seq in all_predictions
    ]
    label_references = [
        [tag if tag == label else "O" for tag in true_seq]
        for true_seq in all_labels
    ]

    # Compute precision, recall, and F1 score for the specific label
    label_result = metric.compute(predictions=label_predictions, references=label_references)
    label_metrics[label] = {
        "precision": label_result['overall_precision'],
        "recall": label_result['overall_recall'],
        "f1": label_result['overall_f1']
    }

# Print metrics for each label
for label, metrics in label_metrics.items():
    print(f"Label: {label}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1-Score: {metrics['f1']:.4f}")

Overall Metrics:
Precision: 0.5126
Recall: 0.6413
F1-Score: 0.5698
Accuracy: 0.9551

Per-Entity Metrics:
              precision    recall  f1-score   support

         ART       0.38      0.61      0.46       168
         CON       0.39      0.59      0.47       216
         LOC       0.56      0.73      0.63       144
         MAT       0.50      0.05      0.09       107
         PER       0.72      0.88      0.79       283
         SPE       0.11      0.50      0.18         2

   micro avg       0.51      0.64      0.57       920
   macro avg       0.44      0.56      0.44       920
weighted avg       0.53      0.64      0.55       920


Metrics Per Label:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Label: O
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-PER
  Precision: 0.7729
  Recall: 0.9258
  F1-Score: 0.8424
Label: I-PER
  Precision: 0.5166
  Recall: 0.7156
  F1-Score: 0.6000
Label: B-LOC
  Precision: 0.7326
  Recall: 0.8750
  F1-Score: 0.7975
Label: I-LOC
  Precision: 0.5814
  Recall: 0.6696
  F1-Score: 0.6224
Label: B-ART
  Precision: 0.5191
  Recall: 0.7262
  F1-Score: 0.6055
Label: I-ART
  Precision: 0.2994
  Recall: 0.4762
  F1-Score: 0.3676
Label: I-MAT
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-MAT
  Precision: 0.5000
  Recall: 0.0467
  F1-Score: 0.0855
Label: I-CON
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-CON
  Precision: 0.4663
  Recall: 0.7037
  F1-Score: 0.5609
Label: I-SPE
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-SPE
  Precision: 0.1111
  Recall: 0.5000
  F1-Score: 0.1818


In [None]:
# Count label occurrences in the training dataset
label_counter = Counter()
for batch in tf_train_dataset:
    labels = batch["labels"].numpy()
    for label_seq in labels:
        for label in label_seq:
            if label != -100:  # Exclude padding labels
                label_counter[label_names[label]] += 1

print("Label distribution in the training set:")
for label, count in label_counter.items():
    print(f"{label}: {count}")

Label distribution in the training set:
O: 57424
B-ART: 1000
I-ART: 965
B-PER: 1213
I-PER: 1631
B-CON: 1423
B-MAT: 185
I-MAT: 26
I-CON: 153
B-LOC: 342
I-LOC: 679
B-SPE: 184
I-SPE: 9
