Libraries

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import json
from pathlib import Path
from collections import defaultdict

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

from sklearn.metrics.pairwise import cosine_similarity

# Make the project-local helper modules importable from this notebook
sys.path.append(os.path.abspath("../0. Helpers"))
sys.path.append(os.path.abspath("../2. Data Processing/_dataset_entities"))

# An import statement may only list names; the loaders are called later with their paths
from datasets import load_dataset, load_from_disk
from datasetProcessing import tokens_to_sentence, tokens_to_entities, join_datasets, recursive_fix
from reflection_helpers import word_only_punctuation

Define the model

In [None]:
# Load the sentence-embedding model; its tokenizer is configured to pad on the left
model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-4B",
    # Optional settings, currently disabled:
    #   model_kwargs = {"device_map": "auto"}   (plus "attn_implementation": "flash_attention_2")
    tokenizer_kwargs=dict(padding_side="left"),
)

# Place the model on the CPU
model.to("cpu")

# Show which device the model reports
print(model.device)

Process whole dataset

In [None]:
# Dataset to process. The next cell recognises "lener", "neuralshift", "ener",
# "multinerd_en" and "multinerd_pt"; any other value takes the crossNER branch.
topic = "lener"

In [None]:
# Per topic: import the matching entity definitions, load the dataset and
# record the language used later for the stopword list.
if topic == "lener":
    from entities_leNER import entity_names, entity_names_parsed
    dataset, lang = load_from_disk("..."), "portuguese"

elif topic == "neuralshift":
    from entities_neuralshift import entity_names, entity_names_parsed
    dataset, lang = load_from_disk("..."), "portuguese"

elif topic == "ener":
    from entities_eNER import entity_names, entity_names_parsed
    dataset, lang = load_from_disk("..."), "english"

elif topic == "multinerd_en":
    from entities_multinerd_en import entity_names, entity_names_parsed
    dataset, lang = load_from_disk("..."), "english"

elif topic == "multinerd_pt":
    from entities_multinerd_pt import entity_names, entity_names_parsed
    dataset, lang = load_from_disk("..."), "portuguese"

else:
    # Every other topic value falls back to the crossNER definitions
    from entities_crossNER import entity_names, entity_names_parsed
    dataset, lang = load_dataset("..."), "english"

# Only the training split is used below
train_data = dataset["train"]

# Indices of the tags that open an entity (prefixed with "B-" or "U-")
start_of_entity_indices = [
    index
    for index in range(len(entity_names))
    if entity_names[index].startswith(("B-", "U-"))
]

# Tag index -> entity type, i.e. the piece that follows the first "-" of the tag;
# the "O" tag is left out of the comprehension and index 0 is then labelled "O".
entity_index_to_name = {
    index: entity_names[index].split("-")[1]
    for index in range(len(entity_names))
    if entity_names[index] != "O"
}
entity_index_to_name[0] = "O"

Entity gathering

In [None]:
from nltk.corpus import stopwords

# Distinct stopwords of the dataset language, followed by the possessive "'s"
STOPWORDS = [*set(stopwords.words(lang)), "'s"]

def collect_entities(dataset_split, remove_duplicates = True):
    """Gather the tokens of all annotated entities, grouped by entity type.

    Each instance's ``tokens`` / ``ner_tags`` are converted to entities with
    ``tokens_to_entities``. From every entity, tokens that consist only of
    punctuation (per ``word_only_punctuation``) or whose lowercase form is in
    ``STOPWORDS`` are dropped; the remaining tokens are appended to the list
    of that entity's type.

    Args:
        dataset_split: Iterable of instances exposing ``"tokens"`` and
            ``"ner_tags"``.
        remove_duplicates: When true, each token is kept once per entity
            type, in first-seen order.

    Returns:
        A plain dict mapping entity type to its list of tokens.
    """
    # Build the lookup once: set membership avoids scanning the list per token
    stopword_lookup = set(STOPWORDS)

    entities_by_type = defaultdict(list)

    for instance in dataset_split:
        extracted_entities = tokens_to_entities(instance["tokens"], instance["ner_tags"], entity_names_parsed, start_of_entity_indices, entity_index_to_name)

        for entity in extracted_entities:
            # Drop punctuation-only tokens and stopwords
            kept_tokens = [
                token
                for token in entity.tokens
                if not word_only_punctuation(token) and token.lower() not in stopword_lookup
            ]
            entities_by_type[entity.entity].extend(kept_tokens)

    if remove_duplicates:
        # dict.fromkeys keeps first-seen order, so the result is identical
        # across runs; list(set(...)) ordering varies with string hashing.
        return {
            entity_type: list(dict.fromkeys(tokens))
            for entity_type, tokens in entities_by_type.items()
        }

    return dict(entities_by_type)

In [None]:
# Collect the training-split entity tokens and dump them, followed by a blank line
entity_tokens_by_type = collect_entities(train_data)
print(entity_tokens_by_type, end="\n\n")

# Number of tokens gathered for each entity type
for entity_type, entities in entity_tokens_by_type.items():
    print(f"{entity_type}: {len(entities)}")

Compute point/center entities

In [None]:
def extract_point_entities(entities, model, k=4):
    """Pick up to ``k`` representative mentions from ``entities``.

    The mentions are embedded with ``model.encode`` and clustered with
    KMeans; for every cluster, the member whose embedding has the highest
    cosine similarity to the cluster centroid is returned.

    Args:
        entities: Sequence of mention strings.
        model: Object exposing ``encode(texts, convert_to_numpy=...,
            normalize_embeddings=...)``.
        k: Number of clusters, i.e. the maximum number of mentions returned.

    Returns:
        A list of mentions. When there are at most ``k`` distinct mentions,
        they are all returned (in first-seen order) without embedding
        anything; an empty input yields an empty list.
    """
    if not entities:
        return []

    # Count DISTINCT mentions: duplicates must not push the input past the
    # threshold, otherwise KMeans is asked for more clusters than there are
    # distinct points. dict.fromkeys also keeps the order reproducible.
    unique_entities = list(dict.fromkeys(entities))
    if len(unique_entities) <= k:
        return unique_entities

    # Get embeddings
    embeddings = model.encode(entities, convert_to_numpy=True, normalize_embeddings=True)

    # KMeans clustering (fixed seed)
    kmeans = KMeans(n_clusters=k, random_state=73, n_init=10)
    kmeans.fit(embeddings)

    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_

    point_entities = []

    for cluster_id in range(k):
        cluster_indices = np.where(labels == cluster_id)[0]

        # An empty cluster has no member to pick; skip it instead of failing
        if cluster_indices.size == 0:
            continue

        cluster_embeddings = embeddings[cluster_indices]

        # Similarity of every cluster member to the cluster centroid
        sims = cosine_similarity([centroids[cluster_id]], cluster_embeddings)[0]

        # Map the best position inside the cluster back to the input index
        best_idx = cluster_indices[np.argmax(sims)]
        point_entities.append(entities[best_idx])

    return point_entities

Run for all entity types

In [None]:
# Number of representative tokens requested per entity type
k = 6
point_entities_dict = {}

# Select the representatives of each entity type and show them as they are computed
for entity_type, entities in entity_tokens_by_type.items():
    point_entities_dict[entity_type] = point_entities = extract_point_entities(entities, model, k)
    print(entity_type, ">", point_entities)

Save to file

In [None]:
print(point_entities_dict)

point_type = "token"

# Create the output directory, including missing parents
Path(f"entity_info/point_entities/{point_type}/{topic}/train/data").mkdir(parents=True, exist_ok=True)

# Write every collected token, grouped by entity type
file_path = f"entity_info/point_entities/{point_type}/{topic}/train/data/all_entities.json"
with open(file_path, mode="w", encoding="utf-8") as f:
    payload = json.dumps(entity_tokens_by_type, ensure_ascii=False, indent=4)
    f.write(payload)

# Write the representative tokens selected per entity type
file_path = f"entity_info/point_entities/{point_type}/{topic}/train/_point_{point_type}_{k}.json"
with open(file_path, mode="w", encoding="utf-8") as f:
    payload = json.dumps(point_entities_dict, ensure_ascii=False, indent=4)
    f.write(payload)