Libraries

In [None]:
from datasets import Dataset, DatasetDict, ClassLabel, Sequence, concatenate_datasets

import copy
import os
import sys

# Add the folder to the Python path
sys.path.append(os.path.abspath("../../0. Helpers"))

from datasetProcessing import tokens_to_entities
from datasetBalancedSplit import balanced_multilabel_sample, entity_map

Read txt file

In [None]:
def process_dataset(file_path):
    """Parse a two-column, blank-line-delimited annotation file.

    Every non-blank line must hold exactly two whitespace-separated fields:
    the token followed by its label. One or more blank (or whitespace-only)
    lines end the sentence in progress; consecutive blank lines never produce
    empty sentences. A final sentence that is not followed by a blank line is
    still returned.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(sentences, labels)`` tuple of equally long lists.
        ``sentences[i]`` is the list of tokens of the i-th sentence and
        ``labels[i]`` is the list of label strings aligned with it.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
            The message names the file and the 1-based line number.
    """
    sentences = []
    labels = []

    current_sentence = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()

            # Blank line: close the sentence in progress, if there is one.
            if not fields:
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                    current_sentence = []
                    current_labels = []
                continue

            # Fail with the location instead of an anonymous unpacking error.
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}: line {line_number}: expected 'token label' "
                    f"but found {len(fields)} field(s): {line.rstrip()!r}"
                )

            word, label = fields
            current_sentence.append(word)
            current_labels.append(label)

    # The file may end without a trailing blank line.
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels


# Smoke test: parse one file and eyeball the result.
sentences, labels = process_dataset("...")

# One sentence per paragraph: the joined tokens, then the label list, then a blank line.
for tokens, tags in zip(sentences, labels):
    print(" ".join(tokens), tags, "", sep="\n")

Create HF dataset

In [None]:
# Parse the three annotation files (same order as before: dev, test, train).
dev_sentences, dev_labels = process_dataset("...")
test_sentences, test_labels = process_dataset("...")
train_sentences, train_labels = process_dataset("...")


def _build_split(split_sentences, split_labels):
    """Wrap parsed sentences/labels in a Dataset with 1-based sequential ids."""
    return Dataset.from_dict({
        'id': list(range(1, len(split_sentences) + 1)),
        'tokens': split_sentences,
        'ner_tags': split_labels,
    })


# Assemble the Hugging Face dataset; ner_tags are still label strings at this point.
dataset = DatasetDict({
    'train': _build_split(train_sentences, train_labels),
    'validation': _build_split(dev_sentences, dev_labels),
    'test': _build_split(test_sentences, test_labels),
})

print(dataset)

Confirm all entities

In [None]:
# Every distinct label string seen in any of the three splits.
entities = {
    tag
    for split_labels in [train_labels, dev_labels, test_labels]
    for sentence_tags in split_labels
    for tag in sentence_tags
}

print("Entities:", entities)

Convert string tags to integer class indices

In [None]:
# Display label for each entity type; "O" marks tokens outside any entity.
entity_names_parsed = {
    "PESSOA": "Pessoa",
    "ORGANIZACAO": "Organização",
    "LOCAL": "Localização",
    "TEMPO": "Tempo",
    "LEGISLACAO": "Legislação",
    "JURISPRUDENCIA": "Jurisprudência",
    "O": "-",
}

# Tag inventory: "O" first, then a B-/I- pair per entity type in the order above.
# The position of a tag in this list is later used as its integer class id.
entity_names = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in entity_names_parsed
    if entity_type != "O"
    for prefix in ("B", "I")
]

In [None]:
# The position of each tag in entity_names becomes its integer class id.
tag_to_index = dict(zip(entity_names, range(len(entity_names))))


def update_ner_tags(example):
    """Replace the string tags of one example with their integer ids.

    A tag that is not listed in ``entity_names`` raises ``KeyError``.
    """
    encoded_tags = []
    for tag in example["ner_tags"]:
        encoded_tags.append(tag_to_index[tag])
    example["ner_tags"] = encoded_tags
    return example


# NOTE: this cell rebinds `dataset`. Running it a second time would pass the
# already-encoded integer tags to update_ner_tags and fail the lookup.
dataset = dataset.map(update_ner_tags)

# Declare ner_tags as a sequence of ClassLabel so the tag names travel with the data.
ner_tags_feature = Sequence(feature=ClassLabel(names=entity_names))
dataset = dataset.cast_column("ner_tags", ner_tags_feature)

Confirmation

In [None]:
# Overview of all splits.
print(dataset)

# First training example.
print()
print("example")
print(dataset["train"][0])

# Tag names stored in the ner_tags feature.
print()
print("entities")
print(dataset["train"].features["ner_tags"].feature.names)

Save dataset!

In [None]:
# Persist the full train/validation/test dataset, relative to the working directory.
original_output_dir = 'lener_hf_original'
dataset.save_to_disk(original_output_dir)

Reduce to train and test splits (validation is merged into test)

In [None]:
# Build the two-split variant without touching `dataset`:
# train is reused as-is, and test becomes test followed by validation.
dataset_low = DatasetDict({
    "train": dataset["train"],
    "test": concatenate_datasets([dataset["test"], dataset["validation"]]),
})

print(dataset_low)

Remove zero-entity instances

In [None]:
def filter_no_entities(example):
    """Predicate for ``filter``: True (keep) when some tag differs from "O".

    An example whose tags are all "O", or that has no tags, yields False and is dropped.
    """
    return not all(tag == tag_to_index["O"] for tag in example["ner_tags"])


# Drop every instance that carries no entity at all.
dataset_low = dataset_low.filter(filter_no_entities)

print(dataset_low)

Balanced version

In [None]:
from entities_leNER import entity_names, entity_names_parsed

# NOTE(review): entity_names / entity_names_parsed used here come from the
# entities_leNER import at the top of this cell, which rebinds the lists defined
# earlier in the notebook. Confirm both definitions agree.

# Indices of the tags that open an entity (B- or U- prefixed).
start_of_entity_indices = [
    index
    for index, name in enumerate(entity_names)
    if name.startswith(("B-", "U-"))
]

# Tag index -> bare entity type (the part after the first "-"); index 0 is forced to "O".
entity_index_to_name = {
    index: name.split("-")[1]
    for index, name in enumerate(entity_names)
    if name != "O"
}
entity_index_to_name[0] = "O"

In [None]:
# Arguments shared by both entity_map calls.
entity_map_args = (entity_names_parsed, start_of_entity_indices, entity_index_to_name)

# Entity maps of the filtered splits, before balancing.
train_entity_map = entity_map(dataset_low["train"], *entity_map_args)
print()
test_entity_map = entity_map(dataset_low["test"], *entity_map_args)

In [None]:
# Second argument passed to balanced_multilabel_sample for each split
# (presumably the target number of instances — TODO confirm against the helper).
train_sample_arg, test_sample_arg = 200, 600

# NOTE(review): whether the sampling is seeded is decided inside the helper — confirm.
new_train_idx = balanced_multilabel_sample(train_entity_map, train_sample_arg)
new_test_idx = balanced_multilabel_sample(test_entity_map, test_sample_arg)

In [None]:
# Keep only the sampled rows of each split.
# NOTE: the splits are replaced in place, so this cell is not idempotent.
# Re-running it would apply the same indices to the already reduced splits.
for split_name, kept_indices in (("train", new_train_idx), ("test", new_test_idx)):
    dataset_low[split_name] = dataset_low[split_name].select(kept_indices)

print(dataset_low)

Confirm old vs new distribution

In [None]:
# Arguments shared by both entity_map calls.
entity_map_args = (entity_names_parsed, start_of_entity_indices, entity_index_to_name)

# Entity maps recomputed on the balanced splits.
new_train_entity_map = entity_map(dataset_low["train"], *entity_map_args)
print()
new_test_entity_map = entity_map(dataset_low["test"], *entity_map_args)

In [None]:
from collections import Counter

def _class_presence_counts(split_entity_map):
    """Tally how often each class appears across the values of an entity map.

    Presumably each value lists the classes present in one instance — verify
    against the entity_map helper.
    """
    tally = Counter()
    for classes in split_entity_map.values():
        tally.update(classes)
    return tally


print("Instances with at least one entity of class:")

# Train split before balancing.
entity_counts = _class_presence_counts(train_entity_map)
print("Old train", entity_counts)

# Train split after balancing.
new_entity_counts = _class_presence_counts(new_train_entity_map)
print("New train", new_entity_counts)

# Test split after balancing.
new_test_counts = _class_presence_counts(new_test_entity_map)
print("New test", new_test_counts)

In [None]:
def _count_entity_mentions(split, split_entity_map):
    """Count every entity found in a split, keyed by its ``.entity`` attribute.

    The counter starts at zero for each class that appears in the values of
    ``split_entity_map``. An entity whose class is not among them raises ``KeyError``.
    """
    totals = {cls: 0 for classes in split_entity_map.values() for cls in classes}
    for instance in split:
        found = tokens_to_entities(
            instance["tokens"],
            instance["ner_tags"],
            entity_names_parsed,
            start_of_entity_indices,
            entity_index_to_name,
        )
        for entity in found:
            totals[entity.entity] += 1
    return totals


all_classes_train = _count_entity_mentions(dataset_low["train"], new_train_entity_map)

print("Total entities train:")
print(all_classes_train)

all_classes_test = _count_entity_mentions(dataset_low["test"], new_test_entity_map)

print("Total entities test:")
print(all_classes_test)

Save

In [None]:
# Persist the balanced train/test dataset, relative to the working directory.
low_output_dir = 'lener_hf_low'
dataset_low.save_to_disk(low_output_dir)