Libraries

In [None]:
from datasets import load_dataset, ClassLabel, Sequence, concatenate_datasets

import copy
import os
import sys

# Make the shared helper folder importable. The path is relative, so
# os.path.abspath resolves it against the kernel's current working directory.
# NOTE(review): presumably datasetProcessing / datasetBalancedSplit (imported
# below) live in this folder — confirm.
sys.path.append(os.path.abspath("../../0. Helpers"))

from datasetProcessing import tokens_to_entities
from datasetBalancedSplit import balanced_multilabel_sample, entity_map

Load and filter dataset

In [None]:
# Load the dataset, then keep only the rows whose "lang" field is "en"
dataset = load_dataset(...).filter(lambda row: row["lang"] == "en")

print(dataset)

In [None]:
# Show the first example of the filtered train split
print(dataset["train"][0])

Confirm all entities

In [None]:
# Tag -> integer id. "O" is 0; each entity type then contributes a consecutive
# B-/I- pair, in the order the types are listed below.
entities = {
    tag: idx
    for idx, tag in enumerate(
        ["O"]
        + [
            f"{prefix}-{kind}"
            for kind in (
                "PER", "ORG", "LOC", "ANIM", "BIO", "CEL", "DIS", "EVE",
                "FOOD", "INST", "MEDIA", "MYTH", "PLANT", "TIME", "VEHI",
            )
            for prefix in ("B", "I")
        ]
    )
}

print("Entities:", entities)

Process into default index labelling

In [None]:
# Entities
# Tag vocabulary in class-id order: "O" first, then a B-/I- pair per entity type.
entity_names = ["O"] + [
    f"{prefix}-{kind}"
    for kind in (
        "PER", "ORG", "LOC", "ANIM", "BIO", "CEL", "DIS", "EVE",
        "FOOD", "INST", "MEDIA", "MYTH", "PLANT", "TIME", "VEHI",
    )
    for prefix in ("B", "I")
]

# Short tag suffix -> human-readable class name.
entity_names_parsed = {
    "PER": "Person",
    "LOC": "Location",
    "ORG": "Organization",
    "ANIM": "Animal",
    "BIO": "Biological entity",
    "CEL": "Celestial Body",
    "DIS": "Disease",
    "EVE": "Event",
    "FOOD": "Food",
    "INST": "Instrument",
    "MEDIA": "Media",
    "PLANT": "Plant",
    "MYTH": "Mythological entity",
    "TIME": "Time",
    "VEHI": "Vehicle",
}

In [None]:
# Declare ner_tags as a sequence of class labels whose names come from entity_names
ner_tag_feature = Sequence(feature=ClassLabel(names=entity_names))
dataset = dataset.cast_column("ner_tags", ner_tag_feature)

Confirmation

In [None]:
print(dataset)

# First training example after the cast
print()
print("example")
print(dataset["train"][0])

# Label names attached to the ner_tags feature
print()
print("entities")
print(dataset["train"].features["ner_tags"].feature.names)

Save dataset!

In [None]:
# Write the filtered, label-cast dataset to disk; the subsampling further down
# operates on dataset_low, not on this object
dataset.save_to_disk('multinerd_en_hf_original')

Split into train and test (validation is merged into test)

In [None]:
# Shallow-copy the container so that re-binding splits below happens on dataset_low
dataset_low = copy.copy(dataset)

# Fold validation into test, then drop the validation key
dataset_low["test"] = concatenate_datasets(
    [dataset_low["test"], dataset_low["validation"]]
)
del dataset_low["validation"]

print(dataset_low)

Balanced version

In [None]:
# NOTE(review): this import rebinds entity_names / entity_names_parsed, which an
# earlier cell already defines inline — confirm both definitions agree.
from entities_multinerd_en import entity_names, entity_names_parsed

# Indices of the tags that open an entity span (prefix "B-" or "U-")
start_of_entity_indices = [
    idx for idx, tag in enumerate(entity_names) if tag.startswith(("B-", "U-"))
]

# Tag index -> entity type taken from the tag (second "-"-separated field);
# the "O" tag is skipped here and index 0 is mapped to "O" explicitly afterwards
entity_index_to_name = {
    idx: tag.split("-")[1] for idx, tag in enumerate(entity_names) if tag != "O"
}
entity_index_to_name[0] = "O"

In [None]:
# Entity maps of both splits before any subsampling
train_entity_map = entity_map(
    dataset_low["train"],
    entity_names_parsed,
    start_of_entity_indices,
    entity_index_to_name,
)
print()
test_entity_map = entity_map(
    dataset_low["test"],
    entity_names_parsed,
    start_of_entity_indices,
    entity_index_to_name,
)

In [None]:
# Compute, from each split's entity map, the row indices that the next cell
# passes to .select().
# NOTE(review): 200 and 600 are passed as the second argument — presumably a
# target sample size; confirm against balanced_multilabel_sample.
# NOTE(review): no random seed is set in this notebook; if the sampler is
# stochastic the selection will differ between runs — confirm.
new_train_idx = balanced_multilabel_sample(train_entity_map, 200)
new_test_idx = balanced_multilabel_sample(test_entity_map, 600)

In [None]:
# Replace each split with the rows picked by the sampler.
# Note: the splits are overwritten in place, so re-running this cell would apply
# the indices to the already-subsampled data.
for split_name, kept_idx in (("train", new_train_idx), ("test", new_test_idx)):
    dataset_low[split_name] = dataset_low[split_name].select(kept_idx)

print(dataset_low)

Confirm old vs new distribution

In [None]:
# Entity maps of both splits after subsampling
new_train_entity_map = entity_map(
    dataset_low["train"],
    entity_names_parsed,
    start_of_entity_indices,
    entity_index_to_name,
)
print()
new_test_entity_map = entity_map(
    dataset_low["test"],
    entity_names_parsed,
    start_of_entity_indices,
    entity_index_to_name,
)

In [None]:
from collections import Counter

print("Instances with at least one entity of class:")

# Flatten the class collections stored as entity-map values and count how often
# each class occurs.
# NOTE(review): this equals "instances per class" only if each value lists a
# class at most once — confirm against entity_map.
entity_counts = Counter(cls for classes in train_entity_map.values() for cls in classes)
print("Old train", entity_counts)

new_entity_counts = Counter(cls for classes in new_train_entity_map.values() for cls in classes)
print("New train", new_entity_counts)

# The pre-sampling test distribution was computed but never shown; print it so
# the test split gets the same old-vs-new comparison as the train split.
old_test_counts = Counter(cls for classes in test_entity_map.values() for cls in classes)
print("Old test", old_test_counts)

new_test_counts = Counter(cls for classes in new_test_entity_map.values() for cls in classes)
print("New test", new_test_counts)

In [None]:
def count_entities_per_class(split, split_entity_map):
    """Count every entity occurrence in ``split``, grouped by class.

    Args:
        split: Iterable of instances, each exposing "tokens" and "ner_tags".
        split_entity_map: Entity map of the same split. The classes found in
            its values seed the result, so each of them appears in the output
            even when its count stays at 0.

    Returns:
        dict mapping class -> number of entities of that class in ``split``.

    Raises:
        KeyError: If an extracted entity has a class that is absent from
            ``split_entity_map``.

    Reads the notebook-level entity_names_parsed, start_of_entity_indices and
    entity_index_to_name when calling tokens_to_entities.
    """
    totals = {cls: 0 for classes in split_entity_map.values() for cls in classes}
    for instance in split:
        found = tokens_to_entities(
            instance["tokens"],
            instance["ner_tags"],
            entity_names_parsed,
            start_of_entity_indices,
            entity_index_to_name,
        )
        # Each extracted entity carries its class in the .entity attribute
        for entity in found:
            totals[entity.entity] += 1
    return totals


all_classes_train = count_entities_per_class(dataset_low["train"], new_train_entity_map)

print("Total entities train:")
print(all_classes_train)

all_classes_test = count_entities_per_class(dataset_low["test"], new_test_entity_map)

print("Total entities test:")
print(all_classes_test)

Save

In [None]:
# Write the subsampled train/test splits to disk
dataset_low.save_to_disk('multinerd_en_hf_low')