Libraries

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import json
from pathlib import Path
from collections import Counter

# add path 
sys.path.append(os.path.abspath("../0. Helpers"))
sys.path.append(os.path.abspath("../2. Data Processing/_dataset_entities"))

from datasets import load_dataset, load_from_disk
from datasetProcessing import tokens_to_sentence, tokens_to_entities, join_datasets, recursive_fix

Process whole dataset

In [None]:
# Dataset to process. The loading cell below recognises "lener", "neuralshift", "ener",
# "multinerd_en" and "multinerd_pt"; any other value falls through to its CrossNER branch.
# The value is also used as a folder name under "classification/" for every output file.
topic = "multinerd_pt"

In [None]:
# Select the label inventory, the corpus and its language for the chosen topic.
# Each branch imports `entity_names` / `entity_names_parsed` from a topic-specific module.
if topic == "lener":
    from entities_leNER import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "portuguese"

elif topic == "neuralshift":
    from entities_neuralshift import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "portuguese"

elif topic == "ener":
    from entities_eNER import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "english"

elif topic == "multinerd_en":
    from entities_multinerd_en import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "english"

elif topic == "multinerd_pt":
    from entities_multinerd_pt import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "portuguese"

else:
    # Fallback for every topic not listed above (this one uses load_dataset, not load_from_disk)
    from entities_crossNER import entity_names, entity_names_parsed
    dataset = load_dataset("...")
    lang = "english"

# Splits used by the rest of the notebook
train_data = dataset["train"]
test_data = dataset["test"]

# Pair every label index with its label once, then derive both lookups from the pairs
indexed_labels = [(idx, entity_names[idx]) for idx in range(len(entity_names))]

# Indices whose label opens an entity, i.e. carries a "B-" or "U-" prefix
start_of_entity_indices = [idx for idx, label in indexed_labels if label.startswith(("B-", "U-"))]

# Label index -> second "-"-separated field of the label; index 0 is then pinned to "O"
entity_index_to_name = {idx: label.split("-")[1] for idx, label in indexed_labels if label != "O"}
entity_index_to_name[0] = "O"

Token classification (entity, context, other)

In [None]:
def classify_tokens(tokens, bio_tags, context_window=2, other_tag=0):
    """Split the tokens of one example into entity, context and other tokens.

    A token is an "entity" token when its tag differs from `other_tag`, a
    "context" token when it is not an entity but lies at most `context_window`
    positions away from some entity token, and an "other" token otherwise.

    Args:
        tokens: Sequence of tokens of the example.
        bio_tags: Sequence of tags, positionally aligned with `tokens`.
        context_window: Maximum distance (in tokens) to an entity token for a
            non-entity token to count as context.
        other_tag: Tag value that marks a non-entity token.

    Returns:
        Dict with the keys "entity", "context" and "other", each mapping to the
        list of matching tokens in their original order.
    """
    tagged_positions = {pos for pos, tag in enumerate(bio_tags) if tag != other_tag}

    # Entities take precedence: label them before looking at anything else
    labels = [''] * len(tokens)
    for pos in tagged_positions:
        labels[pos] = 'entity'

    def near_entity(pos):
        # True when some entity sits within the window on either side of `pos`
        return any(0 < abs(pos - ent) <= context_window for ent in tagged_positions)

    for pos in range(len(labels)):
        if labels[pos] == 'entity':
            continue
        labels[pos] = 'context' if near_entity(pos) else 'other'

    # Group the tokens by label, keeping sentence order inside each group
    buckets = {"entity": [], "context": [], "other": []}
    for token, label in zip(tokens, labels):
        buckets[label].append(token)

    return buckets

In [None]:
# Running token counts over the WHOLE train split (cached examples included)
total_entity_tokens = 0
total_context_tokens = 0
total_other_tokens = 0

# Create directory if it doesn't exist
os.makedirs(f"classification/{topic}/train/data", exist_ok=True)

for i, example in enumerate(train_data):

    # One JSON file per example, named after its position in the split
    file_path = f"classification/{topic}/train/data/{i}.json"

    if os.path.exists(file_path):
        # Resumed run: reuse the stored classification instead of skipping the example
        # outright, otherwise the totals would only cover the examples processed now.
        with open(file_path, "r", encoding="utf-8") as f:
            classified_tokens = json.load(f)["classification"]
        print(f"↪️ Example #{i} already processed")

    else:
        tokens = example['tokens']
        bio_tags = example['ner_tags']
        sentence = tokens_to_sentence(tokens)
        true_entities = tokens_to_entities(tokens, bio_tags, entity_names_parsed, start_of_entity_indices, entity_index_to_name)

        # Classify tokens
        classified_tokens = classify_tokens(tokens, bio_tags, context_window=2, other_tag=0)

        # Save to json file
        result_json = {
            "id": i,
            "sentence": sentence,
            "classification": classified_tokens,
            "tokens": tokens,
            "ner_tags": bio_tags,
            "true_entities": [entity.to_dict() for entity in true_entities],
        }

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(result_json, ensure_ascii=False, indent=4))

        print(f"✅ Example #{i} processed and saved to {file_path}")

    # Update totals (for both freshly processed and cached examples)
    total_entity_tokens += len(classified_tokens['entity'])
    total_context_tokens += len(classified_tokens['context'])
    total_other_tokens += len(classified_tokens['other'])

Save totals

In [None]:
# Summary of the token counts accumulated by the processing loop
total_json = {
    "total_entity_tokens": total_entity_tokens,
    "total_context_tokens": total_context_tokens,
    "total_other_tokens": total_other_tokens,
}

print(total_json)

# Save to json file.
# Mode "w" (not "a"): appending on a re-run would concatenate a second JSON object
# to the file and leave it unparseable.
file_path = f"classification/{topic}/train/_total.json"
with open(file_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(total_json, ensure_ascii=False, indent=4))

Build probability dictionary

In [None]:
# Build full dataset lists
entity_tokens = []
context_tokens = []
other_tokens = []

# Load all json files and append to lists
folder = Path(f"classification/{topic}/train/data")
for file_path in folder.glob("*.json"):
    if file_path.name == "_total.json":
        continue  # Skip the total file

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        entity_tokens.extend(data['classification']['entity'])
        context_tokens.extend(data['classification']['context'])
        other_tokens.extend(data['classification']['other'])

print(f"Total entity tokens: {len(entity_tokens)}")
print(f"Total context tokens: {len(context_tokens)}")
print(f"Total other tokens: {len(other_tokens)}")

In [None]:
# Counters
counter_e = Counter(entity_tokens)
counter_c = Counter(context_tokens)
counter_o = Counter(other_tokens)

# Vocabulary = all tokens
vocabulary = set(counter_e) | set(counter_c) | set(counter_o)

# Build the probability table
probs = {}
for token in vocabulary:

    token_total = counter_e[token] + counter_c[token] + counter_o[token]

    p_e = counter_e[token] / token_total
    p_c = counter_c[token] / token_total
    p_o = counter_o[token] / token_total

    probs[token] = {"prob_e": p_e, "prob_c": p_c, "prob_o": p_o}

print(probs)

# check if json file already exists and skip
file_path = f"classification/{topic}/train/_probs.json"

# Save to json file
with open(file_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(probs, ensure_ascii=False, indent=4))