Libraries

In [None]:
import copy
import csv
import os
import sys
from collections import Counter

from datasets import Dataset, DatasetDict, ClassLabel, Sequence

# Add the folder to the Python path
sys.path.append(os.path.abspath("../../0. Helpers"))

from datasetProcessing import tokens_to_entities
from datasetBalancedSplit import balanced_multilabel_sample, entity_map

Read the annotated token file (CSV rows of `word,label`; an empty word marks a sentence boundary)

In [None]:
file_path = "..."

# Number of sentences echoed as a sanity check; printing the whole corpus floods the cell output.
PREVIEW_SENTENCES = 5


def to_bio_label(label, prev_type):
    """Convert one raw label into a BIO tag.

    Args:
        label: raw label read from the file. It may be "O", a bare entity type
            (e.g. "PERSON"), or a type already prefixed with "B-" / "I-".
        prev_type: entity type of the previous token in the same sentence,
            or "O" when there is none.

    Returns:
        A ``(bio_label, entity_type)`` tuple; ``entity_type`` is the value to
        pass as ``prev_type`` for the next token.
    """
    if label == "O":
        return "O", "O"

    # Strip an existing B-/I- prefix so that types are compared, not raw strings.
    # Comparing raw strings would turn "B-X, I-X" into two separate entities and
    # "B-X, B-X" into the invalid tag "I-B-X".
    prefix, sep, rest = label.partition("-")
    if sep and rest and prefix in ("B", "I"):
        entity_type = rest
        explicit_begin = prefix == "B"
    else:
        entity_type = label
        explicit_begin = False

    if explicit_begin or entity_type != prev_type:
        return f"B-{entity_type}", entity_type
    return f"I-{entity_type}", entity_type


sentences = []
labels = []

current_sentence = []
current_labels = []
prev_type = "O"

with open(file_path, 'r', newline='', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file)

    for row in reader:
        if not row:
            # csv.reader yields [] for a completely blank line: treat it as a boundary
            word = ""
        elif len(row) == 2:
            word, label = row
        else:
            raise ValueError(f"Expected 'word,label' on line {reader.line_num}, got {row!r}")

        if word == "":
            # Sentence boundary: store the sentence collected so far (if any)
            if current_sentence:
                sentences.append(current_sentence)
                labels.append(current_labels)
                current_sentence = []
                current_labels = []
            prev_type = "O"  # entities never span sentences
            continue

        bio_label, prev_type = to_bio_label(label, prev_type)
        current_sentence.append(word)
        current_labels.append(bio_label)

# The file may end without a trailing boundary row
if current_sentence:
    sentences.append(current_sentence)
    labels.append(current_labels)
    current_sentence = []
    current_labels = []

# Show a short preview instead of the whole corpus
for sentence, label_list in zip(sentences[:PREVIEW_SENTENCES], labels[:PREVIEW_SENTENCES]):
    print(" ".join(sentence))
    print(label_list)
    print()

print(f"{len(sentences)} sentences read")

Create HF dataset

In [None]:
# Wrap the parsed sentences in a DatasetDict with a single "train" split.
# Ids are 1-based and follow the order in which the sentences were read.
sentence_ids = list(range(1, len(sentences) + 1))
train_split = Dataset.from_dict({
    'id': sentence_ids,
    'tokens': sentences,
    'ner_tags': labels,
})
dataset = DatasetDict({'train': train_split})

print(dataset)

Confirm all entities

In [None]:
# Every distinct tag that appears anywhere in the corpus
entities = {tag for tag_list in labels for tag in tag_list}

print("Entities:", entities)

Process into default index labelling

In [None]:
# Tag vocabulary: "O" first, then a B-/I- pair per entity type.
entity_names = [
    "O",
    "B-PERSON", "I-PERSON",
    "B-COURT", "I-COURT",
    "B-BUSINESS", "I-BUSINESS",
    "B-GOVERNMENT", "I-GOVERNMENT",
    "B-LOCATION", "I-LOCATION",
    "B-LEGISLATION/ACT", "I-LEGISLATION/ACT",
    "B-MISCELLANEOUS", "I-MISCELLANEOUS",
]

# Raw entity type -> display name.
# NOTE(review): "Miscellaneous " carries a trailing space — confirm it is intentional.
entity_names_parsed = {
    "PERSON": "Person",
    "COURT": "Court",
    "BUSINESS": "Business",
    "GOVERNMENT": "Government",
    "LOCATION": "Location",
    "LEGISLATION/ACT": "Legislation/Act",
    "MISCELLANEOUS": "Miscellaneous ",
    "O": "-",
}

In [None]:
# Tag string -> integer id; the id is the tag's position in entity_names
tag_to_index = {tag: idx for idx, tag in enumerate(entity_names)}


def update_ner_tags(example):
    """Replace the tag strings of one example with their integer ids.

    Entries that are already integers are passed through unchanged, so this
    cell can be re-run after the column has been converted without raising
    ``KeyError``.

    Args:
        example: one dataset row; its "ner_tags" entry is a list of tags.

    Returns:
        The same row with "ner_tags" rewritten as a list of integer ids.

    Raises:
        KeyError: if a tag string is not listed in ``entity_names``.
    """
    example["ner_tags"] = [
        tag if isinstance(tag, int) else tag_to_index[tag]
        for tag in example["ner_tags"]
    ]
    return example


# Apply the mapping to the entire dataset
dataset = dataset.map(update_ner_tags)

# Declare the column as a sequence of ClassLabel so the tag names travel with the data
dataset = dataset.cast_column("ner_tags", Sequence(feature = ClassLabel(names = entity_names)))

Confirmation

In [None]:
# Dataset summary
print(dataset)

# First training example
print()
print("example")
print(dataset["train"][0])

# Tag names stored in the ClassLabel feature
print()
print("entities")
print(dataset["train"].features["ner_tags"].feature.names)

Save dataset!

In [None]:
# Write the full dataset to the local directory 'ener_hf_original' (relative to the working directory)
dataset.save_to_disk('ener_hf_original')

Prepare the train and test splits (both start as the full dataset)

In [None]:
# Shallow copy: a new container, so adding/removing splits here leaves `dataset` untouched
dataset_low = copy.copy(dataset)

# Keep the full data under "original"; "train" and "test" both start out as that same split
dataset_low["original"] = dataset_low.pop("train")
for split_name in ("train", "test"):
    dataset_low[split_name] = dataset_low["original"]

print(dataset_low)

Balanced version

In [None]:
from entities_eNER import entity_names, entity_names_parsed

# NOTE: entity_names / entity_names_parsed used from here on are the ones imported from
# entities_eNER in this cell; they shadow the definitions made earlier in the notebook.

# Indices of the tags that open an entity ("B-" or "U-" prefix)
start_of_entity_indices = [idx for idx, name in enumerate(entity_names) if name.startswith(("B-", "U-"))]

# Tag index -> entity type. Split only on the first hyphen so that a type which itself
# contains a hyphen keeps its full name.
entity_index_to_name = {idx: name.split("-", 1)[1] for idx, name in enumerate(entity_names) if name != "O"}
entity_index_to_name[0] = "O"

In [None]:
# Entity map of the train split before any sampling
train_entity_map = entity_map(dataset_low["train"], entity_names_parsed, start_of_entity_indices, entity_index_to_name)
print()

In [None]:
# Indices of the examples chosen for the balanced train subset.
# NOTE(review): 200 is passed straight to the helper — presumably the target sample size;
# confirm against datasetBalancedSplit, and check whether the helper needs a seed to be reproducible.
new_train_idx = balanced_multilabel_sample(train_entity_map, 200)

In [None]:
# Build the sampled train split from the untouched "original" split (the object "train"
# pointed to when the indices were computed) rather than from dataset_low["train"] itself,
# so re-running this cell does not index into an already reduced split.
dataset_low["train"] = dataset_low["original"].select(new_train_idx)

print(dataset_low)

Exclude new_train examples from new_test

In [None]:
# Rows already sampled into train; a set makes each membership test O(1)
exclude_indices = set(new_train_idx)

# Complement: every row position of the full data that was not sampled into train.
# Derived from "original" (the object "test" initially points to) so that re-running
# this cell always starts from the full data.
include_indices = [i for i in range(len(dataset_low["original"])) if i not in exclude_indices]

# Test candidates are all rows except the train ones
dataset_low["test"] = dataset_low["original"].select(include_indices)

print(dataset_low)

In [None]:
# Sample the test subset from the rows left after removing the train examples
remaining_test = dataset_low["test"]
test_entity_map = entity_map(remaining_test, entity_names_parsed, start_of_entity_indices, entity_index_to_name)
new_test_idx = balanced_multilabel_sample(test_entity_map, 600)
dataset_low["test"] = remaining_test.select(new_test_idx)

# The helper split is not part of the final dataset
dataset_low.pop("original")
print(dataset_low)

Confirm old vs new distribution

In [None]:
# Recompute the entity maps on the sampled splits
new_train_entity_map = entity_map(dataset_low["train"], entity_names_parsed, start_of_entity_indices, entity_index_to_name)
print()
new_test_entity_map = entity_map(dataset_low["test"], entity_names_parsed, start_of_entity_indices, entity_index_to_name)

In [None]:
def count_classes(split_entity_map):
    """Count how often each class appears across the values of an entity map.

    Args:
        split_entity_map: mapping whose values are iterables of class names.

    Returns:
        A ``Counter`` keyed by class name.
    """
    return Counter(cls for classes in split_entity_map.values() for cls in classes)


print("Instances with at least one entity of class:")

# Map computed before sampling
entity_counts = count_classes(train_entity_map)
print("Old train", entity_counts)

# Maps computed on the sampled splits
new_entity_counts = count_classes(new_train_entity_map)
print("New train", new_entity_counts)

new_test_counts = count_classes(new_test_entity_map)
print("New test", new_test_counts)

In [None]:
def count_total_entities(split, split_entity_map):
    """Count every entity occurrence per class in one dataset split.

    Args:
        split: iterable of examples, each exposing "tokens" and "ner_tags".
        split_entity_map: entity map of that split; the classes found in its
            values seed the result with a zero count.

    Returns:
        dict mapping class name -> number of entities returned by
        ``tokens_to_entities`` over the whole split.

    Raises:
        KeyError: if an extracted entity has a class absent from ``split_entity_map``.
    """
    totals = {cls: 0 for classes in split_entity_map.values() for cls in classes}
    for instance in split:
        entity_list = tokens_to_entities(instance["tokens"], instance["ner_tags"], entity_names_parsed, start_of_entity_indices, entity_index_to_name)
        for entity in entity_list:
            totals[entity.entity] += 1
    return totals


all_classes_train = count_total_entities(dataset_low["train"], new_train_entity_map)

print("Total entities train:")
print(all_classes_train)

all_classes_test = count_total_entities(dataset_low["test"], new_test_entity_map)

print("Total entities test:")
print(all_classes_test)

Save

In [None]:
# Write the sampled train/test dataset to the local directory 'ener_hf_low' (relative to the working directory)
dataset_low.save_to_disk('ener_hf_low')