Explore CrossNER dataset

In [None]:
from datasets import load_dataset
import numpy as np

# Argument passed to load_dataset for each corpus, keyed by the domain name
# that the rest of the notebook uses to address it.
_dataset_sources = {
    "conll": "...",
    "politics": "...",
    "science": "...",
    "music": "...",
    "literature": "...",
    "ai": "...",
}

# One loaded dataset (with all of its splits) per domain
raw_datasets = {
    domain: load_dataset(source) for domain, source in _dataset_sources.items()
}

In [1]:
import sys
import os

# Add the helper folders to the Python path.
# Each folder is appended only if it is not already present, so re-running this
# cell does not keep growing sys.path with duplicate entries.
# NOTE: the paths are resolved relative to the current working directory.
for _helper_dir in ("../0. Helpers", "../2. Data Processing/_dataset_entities"):
    _helper_path = os.path.abspath(_helper_dir)
    if _helper_path not in sys.path:
        sys.path.append(_helper_path)

# Import libraries
from datasetProcessing import tokens_to_sentence, tokens_to_entities
from entities_crossNER import entity_names, entity_names_parsed

# Build the lookup tables that are handed to tokens_to_entities further down.

# Indices of every tag whose name starts with "B-" or "U-"
start_of_entity_indices = [
    i for i, name in enumerate(entity_names) if name.startswith(("B-", "U-"))
]

# Tag index -> entity name without the scheme prefix.
# maxsplit=1 splits only on the first hyphen, so an entity name that itself
# contains a hyphen is kept whole instead of being cut at its second hyphen.
entity_index_to_name = {
    i: name.split("-", 1)[1] for i, name in enumerate(entity_names) if name != "O"
}

# Index 0 is always mapped to "O" (assumes the outside tag sits at index 0 — TODO confirm)
entity_index_to_name[0] = "O"

Entity processing function

In [None]:
# Tag-name list read from the "ner_tags" feature of the "conll" train split.
# NOTE(review): this rebinds `entity_names`, which an earlier cell imports from
# entities_crossNER and uses to build start_of_entity_indices / entity_index_to_name.
# Confirm both lists describe the same tag indices; otherwise results depend on
# the order in which the cells were run.
entity_names = raw_datasets["conll"]["train"].features["ner_tags"].feature.names

def extract_actual_entities(indices, names=None):
    """Return the sorted, de-duplicated entity types behind a set of tag indices.

    Args:
        indices: iterable of integer positions into the tag-name list.
        names: optional sequence of tag names to index into. When omitted, the
            module-level ``entity_names`` is used (looked up at call time), which
            is what existing callers rely on.

    Returns:
        A sorted list of unique tag names with a leading "B-", "I-", "L-" or "U-"
        scheme prefix removed. Names without such a prefix (e.g. "O") are kept
        unchanged.
    """
    if names is None:
        names = entity_names

    # Prefixes that only encode the position of a token inside an entity span
    scheme_prefixes = ("B-", "I-", "L-", "U-")

    # A set collapses the several positional tags of one entity type into one entry
    stripped = {
        name[2:] if name.startswith(scheme_prefixes) else name
        for name in (names[i] for i in indices)
    }

    return sorted(stripped)

# Entity types covered by the full tag list, followed by how many there are
unique_entity_types = extract_actual_entities(range(len(entity_names)))
print(unique_entity_types)
print(len(unique_entity_types))

Get metrics for each subset

In [None]:
class Metrics:
    """Plain container for three counts: sentences, tokens and entities."""

    # Class-level defaults; every instance overrides them in __init__
    sentences = tokens = entities = 0

    def __init__(self, sentences, tokens, entities):
        """Store the three counts on the instance."""
        self.sentences, self.tokens, self.entities = sentences, tokens, entities

    def __str__(self):
        """Render the counts as one comma-separated, human-readable line."""
        parts = (
            f"{self.sentences} sentences",
            f"{self.tokens} tokens",
            f"{self.entities} different entities",
        )
        return ", ".join(parts)

In [None]:
def _count_entity_types(tag_ids):
    """Number of distinct entity types among ``tag_ids``, not counting the "O" tag.

    Counting the prefix-stripped names is exact even when "O" is absent or an
    entity type occurs with only one of its positional tags; deriving the count
    as (number of tags - 1) / 2 is wrong in both of those cases.
    """
    return len(set(extract_actual_entities(tag_ids)) - {"O"})


# metrics_datasets[domain] holds one Metrics per split, a "total" Metrics over
# all splits, and "entities": the sorted entity names seen in the domain.
metrics_datasets = {}

for key, dataset in raw_datasets.items():
    metrics_datasets[key] = {}

    total_examples = 0
    total_tokens = 0
    # Only the distinct tag ids matter, so keep sets instead of one entry per token
    total_tag_ids = set()

    for split in dataset.keys():

        # examples
        split_instances = dataset[split].num_rows

        split_tokens = 0
        split_tag_ids = set()

        for instance in dataset[split]:
            split_tokens += len(instance['tokens'])
            split_tag_ids.update(instance['ner_tags'])

        # fold the split into the running totals
        total_examples += split_instances
        total_tokens += split_tokens
        total_tag_ids |= split_tag_ids

        # add split metrics
        metrics_datasets[key][split] = Metrics(
            split_instances, split_tokens, _count_entity_types(split_tag_ids)
        )

    # add total metrics
    metrics_datasets[key]["total"] = Metrics(
        total_examples, total_tokens, _count_entity_types(total_tag_ids)
    )

    # add set of all entities (this list still contains "O" when that tag occurs)
    metrics_datasets[key]["entities"] = extract_actual_entities(total_tag_ids)

In [None]:
# Header row; the column widths match the data rows printed below
print(f"{'Dataset':<12} {'Split':<15} {'Sentences':<12} {'Tokens':<12} {'Entities':<12} {'Avg Tokens/Sentence':<12}")
print()

# One row per (dataset, split), with a blank line after each dataset
for key, dataset in metrics_datasets.items():
    for split, metrics in dataset.items():
        # "entities" is a plain list and "total" is an aggregate: neither is a split
        if split in ("entities", "total"):
            continue

        # Guard against empty splits to avoid dividing by zero
        if metrics.sentences > 0:
            avg_tokens_per_sentence = metrics.tokens / metrics.sentences
        else:
            avg_tokens_per_sentence = 0

        print(f"{key:<12} {split:<15} {metrics.sentences:<12} {metrics.tokens:<12} {metrics.entities:<12} {round(avg_tokens_per_sentence,1):<12}")
    print()

In [None]:
print("Tokens per Entity")

for key, dataset in raw_datasets.items():

    # Length in tokens of every entity that tokens_to_entities returns for the train split
    entity_length = [
        len(entity.tokens)
        for instance in dataset['train']
        for entity in tokens_to_entities(
            instance['tokens'],
            instance['ner_tags'],
            entity_names_parsed,
            start_of_entity_indices,
            entity_index_to_name,
        )
    ]

    shortest, longest = np.min(entity_length), np.max(entity_length)
    mean_length = np.mean(entity_length)

    print(f"{key}: min={shortest}, avg={round(mean_length,2)}, max={longest}")

    # Persist the raw lengths next to the notebook, one .npy file per dataset
    np.save(f'entity_length_crossner_{key}.npy', np.array(entity_length))

In [None]:
print("Entities for each dataset")

# List the entity names collected for every dataset that has them
for key, dataset in metrics_datasets.items():
    if "entities" not in dataset:
        continue
    print(f"{key:<12} {dataset['entities']}")

Entities distribution

In [None]:
# Key into raw_datasets selecting the dataset that the following cells analyse
topic = "ai"

In [None]:
from collections import defaultdict

def count_entities(dataset_split):
    """Count entity classes in one dataset split.

    Args:
        dataset_split: iterable of instances, each providing 'tokens' and 'ner_tags'.

    Returns:
        A tuple ``(classes, spans, instances)``:
        - classes: alphabetically sorted list of every entity class encountered
        - spans: dict mapping class -> number of entities of that class
        - instances: dict mapping class -> number of instances that contain
          at least one entity of that class
    """
    spans_per_class = defaultdict(int)
    instances_per_class = defaultdict(int)

    for example in dataset_split:
        found = tokens_to_entities(
            example['tokens'],
            example['ner_tags'],
            entity_names_parsed,
            start_of_entity_indices,
            entity_index_to_name,
        )

        # Count every entity, and remember which classes this instance contains
        classes_here = set()
        for span in found:
            label = span.entity
            spans_per_class[label] += 1
            classes_here.add(label)

        # Each class counts once per instance, however many entities it has there
        for label in classes_here:
            instances_per_class[label] += 1

    labels = sorted(spans_per_class.keys() | instances_per_class.keys())

    return labels, dict(spans_per_class), dict(instances_per_class)

# Per-class counts for the train and test splits of the selected topic
train_entities, train_class_spans, train_class_instances = count_entities(raw_datasets[topic]['train'])
test_entities, test_class_spans, test_class_instances = count_entities(raw_datasets[topic]['test'])

# Classes seen in either split, in alphabetical order
all_entities = sorted(set(train_entities) | set(test_entities))

# One "&"-separated row per class (looks like LaTeX tabular rows); a class
# missing from a split is reported as 0
for ent in all_entities:
    train_instances, train_spans = train_class_instances.get(ent, 0), train_class_spans.get(ent, 0)
    test_instances, test_spans = test_class_instances.get(ent, 0), test_class_spans.get(ent, 0)
    print(f"{ent:<21} &  & {train_instances:<15} & {train_spans:<11} &  & {test_instances:<14} & {test_spans:<10} \\\\")


Find examples with specific criteria

In [None]:
# Optional filters; enable together with the matching commented-out keys below
# except_entities = ["Organisation", "Product", "Person", "Misc", "Algorithm", "Field", "Task"]
# contains_entities = ['Country', 'Event', 'Person', 'Protein']

# One record of filterable facts per instance in the test split of the selected topic
criteria = []
for i, instance in enumerate(raw_datasets[topic]["test"]):
    true_entities = tokens_to_entities(instance['tokens'], instance['ner_tags'], entity_names_parsed, start_of_entity_indices, entity_index_to_name)
    sentence = tokens_to_sentence(instance['tokens'])
    entity_classes = {entity.entity for entity in true_entities}

    criteria.append({
        "index": i,
        "sentence": sentence,
        "total_count": len(true_entities),
        "unique_count": len(entity_classes),
        # "except": all(entity.entity not in except_entities for entity in true_entities),
        # "contains_any": any(entity.entity in contains_entities for entity in true_entities),
        # "contains_strict": all(name in [entity.entity for entity in true_entities] for name in contains_entities),
        # "contains_unique_count": len(set(name for name in contains_entities if name in [entity.entity for entity in true_entities])),
        "sentence_contains": "genetic algorithm" in sentence,
    })

In [None]:
# Show every collected record whose sentence matched the substring filter
for crit in criteria:
    # Alternative filter (needs the optional "except" key to be enabled when collecting):
    # if crit["unique_count"] == 1 and crit["total_count"] == 2 and crit["except"] and len(crit["sentence"]) < 150:
    if not crit["sentence_contains"]:
        continue
    print(f"index: {crit['index']}, Total: {crit['total_count']}, Unique: {crit['unique_count']}, Sentence Length: {len(crit['sentence'])}")
    print("                                                              ", crit["sentence"])

Process specific examples

In [None]:
# Hand-picked (split, row index) examples for each topic.
# Selecting by `topic` keeps the examples in step with the dataset being read;
# previously the "science" selection was applied to whichever topic was active.
example_instances_by_topic = {
    "ai": [("validation", 7), ("validation", 12), ("train", 1), ("train", 15), ("train", 58)],
    "literature": [("validation", 5), ("validation", 13), ("train", 57), ("train", 90), ("train", 75)],
    "music": [("validation", 17), ("validation", 19), ("train", 30), ("train", 54), ("train", 20)],
    "politics": [("validation", 2), ("validation", 3), ("train", 111), ("train", 136), ("train", 83)],
    "science": [("validation", 3), ("validation", 5), ("train", 7), ("train", 90), ("train", 86)],
}

# Topics without a curated list fall back to the selection that used to be hard-coded here
instances = example_instances_by_topic.get(topic, example_instances_by_topic["science"])

# Print each selected example as a sentence plus its expected entity output
for idx, (split_name, row) in enumerate(instances):
    instance = raw_datasets[topic][split_name][row]
    sentence = tokens_to_sentence(instance['tokens'])
    true_entities = tokens_to_entities(instance['tokens'], instance['ner_tags'], entity_names_parsed, start_of_entity_indices, entity_index_to_name)

    entities_str = ", ".join([f"{{'span': '{entity.span}', 'entity': '{entity.entity}'}}" for entity in true_entities])
    print(f"Example #{idx + 1}: {sentence}")
    print(f"Expected output: 'entities: [{entities_str}]'\n")

Inconsistencies

In [None]:
# (split, row index) pairs of the instances to inspect
instances = [("test", 114)]

# For each one, print the joined sentence, the raw token list and the expected entities
for example_no, (split_name, row) in enumerate(instances, start=1):
    instance = raw_datasets[topic][split_name][row]
    tokens = instance['tokens']
    sentence = tokens_to_sentence(tokens)
    true_entities = tokens_to_entities(tokens, instance['ner_tags'], entity_names_parsed, start_of_entity_indices, entity_index_to_name)

    entities_str = ", ".join(
        f"{{'span': '{entity.span}', 'entity': '{entity.entity}'}}" for entity in true_entities
    )
    print(f"Example #{example_no}: {sentence}")
    print(f"Example #{example_no}: {tokens}")
    print(f"Expected output: 'entities: [{entities_str}]'\n")