Libraries

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import json
from pathlib import Path

from concurrent.futures import ThreadPoolExecutor

# add path 
sys.path.append(os.path.abspath("../0. Helpers"))
sys.path.append(os.path.abspath("../2. Data Processing/_dataset_entities"))

from datasets import load_dataset, load_from_disk
from datasetProcessing import tokens_to_sentence, tokens_to_entities, join_datasets, recursive_fix

Process whole dataset

In [None]:
# Topic to run. It selects the dataset / entity module in the next cell and is part of
# every path under classification/ and in_context/. Recognised values are "lener",
# "neuralshift", "ener", "multinerd_en" and "multinerd_pt"; anything else (such as
# "music") takes the crossNER branch.
topic = "music"

In [None]:
# Pick the entity-label module, the dataset and the language for the chosen topic.
# Every branch binds the same names (entity_names, entity_names_parsed, dataset, lang)
# so the code that follows does not need to know which topic is active.
# NOTE(review): the entities_* modules are presumably resolved through the sys.path
# entries appended in the imports cell — confirm.
if topic == "lener":
    from entities_leNER import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "portuguese"

elif topic == "neuralshift":
    from entities_neuralshift import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "portuguese"

elif topic == "ener":
    from entities_eNER import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "english"

elif topic == "multinerd_en":
    from entities_multinerd_en import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "english"

elif topic == "multinerd_pt":
    from entities_multinerd_pt import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "portuguese"

else:
    # Fallback for every other topic value: crossNER labels, and the only branch
    # that uses load_dataset instead of load_from_disk.
    from entities_crossNER import entity_names, entity_names_parsed
    dataset = load_dataset("...")
    lang = "english"

# Dataset splits
train_data = dataset["train"]
test_data = dataset["test"]

# Indices of the tags that open an entity span ("B-…" or "U-…")
start_of_entity_indices = [
    tag_index
    for tag_index in range(len(entity_names))
    if entity_names[tag_index].startswith(("B-", "U-"))
]

# Tag index -> entity label (the piece between the first and second hyphen of the tag).
# NOTE(review): a label that itself contains a hyphen would be cut short here, and
# index 0 is assumed to be the "O" tag — confirm against the entities_* modules.
entity_index_to_name = {
    tag_index: entity_names[tag_index].split("-")[1]
    for tag_index in range(len(entity_names))
    if entity_names[tag_index] != "O"
}
entity_index_to_name[0] = "O"

Read token probabilities for the topic

In [None]:
# Mixing weights of the final score:
#   s_label_qwen = lambda_token * s_token + lambda_embed * s_embed_qwen
lambda_token = 1.0
lambda_embed = 1.0

# Weights applied to a shared token's prob_e / prob_c / prob_o when it is in the
# vocabulary (out-of-vocabulary shared tokens count as 1).
# NOTE(review): e / c / o presumably stand for entity / context / other — confirm.
w_e = 1.0
w_c = 1.0
w_o = 0.01

In [None]:
# Per-token probabilities for the selected topic: a JSON object keyed by token whose
# values hold prob_e / prob_c / prob_o (the fields read by the token similarity).
probs_path = f"classification/{topic}/train/_probs.json"

# read file
with open(probs_path, "r", encoding="utf-8") as f:
    probs = json.load(f)

# Kept as a list because other cells refer to `vocabulary` by name.
vocabulary = list(probs)

# Show only a short preview: dumping the full table floods the cell output and
# bloats the saved notebook.
preview_size = 5
print(f"{len(vocabulary)} tokens loaded from {probs_path}")
print(dict(list(probs.items())[:preview_size]))
print(vocabulary[:preview_size])

Run for all test instances

In [None]:
# Number of instances per split; used as loop bounds by the cells below.
test_len, train_len = (len(dataset[split]) for split in ("test", "train"))

In [None]:
# Score every (test, train) pair with the label-aware similarity
#     s_label_qwen = lambda_token * s_token + lambda_embed * s_embed_qwen
# and store s_token / s_label_qwen back into the pair's JSON file.
# Pairs that already carry a score are left untouched, so the cell can be re-run
# to resume an interrupted pass.

train_token_sets = {}    # train_index -> set of that train instance's tokens (filled lazily)
missing_pair_files = 0   # pairs skipped because their similarity file does not exist

# Loop test
for test_index in range(test_len):

    test_instance = dataset["test"][test_index]
    # NOTE(review): not read by this cell; kept in case another cell relies on it.
    test_sentence = tokens_to_sentence(test_instance['tokens'])

    # A token's weight does not depend on the train instance, so compute it once per
    # test instance: 1 for out-of-vocabulary tokens, otherwise the weighted sum of
    # its three probabilities.
    test_token_weights = []
    for token in test_instance['tokens']:
        if token in probs:
            token_probs = probs[token]
            weight = token_probs['prob_e'] * w_e + token_probs['prob_c'] * w_c + token_probs['prob_o'] * w_o
        else:
            weight = 1
        test_token_weights.append((token, weight))

    # Create folder
    os.makedirs(f"in_context/{topic}/test/data/{test_index}", exist_ok=True)

    # For each test instance, loop through all train instances
    for train_index in range(train_len):

        print(f"\rtest {test_index+1}/{test_len} | train {train_index+1}/{train_len}", end='', flush=True)

        similarity_test_train_path = f"in_context/{topic}/test/data/{test_index}/{train_index}.json"

        # The pair file carries s_embed_qwen, which this cell only reads. Without the
        # file there is nothing to combine, so skip the pair explicitly instead of
        # reusing the previous pair's state.
        try:
            with open(similarity_test_train_path, "r", encoding="utf-8") as f:
                train_similarity_data = json.load(f)
        except FileNotFoundError:
            missing_pair_files += 1
            continue

        # Already scored in an earlier run. `is not None` so that a stored 0.0 counts
        # as a valid score.
        if train_similarity_data.get("s_label_qwen") is not None:
            continue

        # Tokens of the train instance, loaded once and reused for every test instance.
        # assumes tokens are strings (hashable) — they are used as keys of `probs`
        train_tokens = train_token_sets.get(train_index)
        if train_tokens is None:
            db_file_path = f"classification/{topic}/train/data/{train_index}.json"
            with open(db_file_path, "r", encoding="utf-8") as f:
                train_tokens = set(json.load(f)['tokens'])
            train_token_sets[train_index] = train_tokens

        # token similarity: summed weight of the test tokens that also occur in the
        # train instance (repeated test tokens count once per occurrence)
        s_token = sum(weight for token, weight in test_token_weights if token in train_tokens)

        # final similarity; stays None only when the embedding similarity is absent
        s_embed_qwen = train_similarity_data.get('s_embed_qwen')
        if s_embed_qwen is None:
            s_label_qwen = None
        else:
            s_label_qwen = lambda_token * s_token + lambda_embed * s_embed_qwen

        # Save to json file
        train_similarity_data["s_token"] = s_token
        train_similarity_data["s_label_qwen"] = s_label_qwen

        with open(similarity_test_train_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(train_similarity_data, ensure_ascii=False, indent=4))

# Terminate the "\r" progress line and report anything that could not be scored.
print()
if missing_pair_files:
    print(f"{missing_pair_files} (test, train) pairs had no similarity file and were skipped.")

Select the top-n demonstrations

In [None]:
# Demonstration counts to generate; use e.g. [10, 20] to produce several sizes.
all_n = [5]

# Make sure every n has its result folder before anything is written into it.
for n in all_n:
    result_dir = f"in_context/{topic}/test/label{n}/qwen"
    os.makedirs(result_dir, exist_ok=True)

In [None]:
def load_similarity_file(similarity_path):
    """Return the parsed content of the UTF-8 JSON file at ``similarity_path``."""
    raw_text = Path(similarity_path).read_text(encoding="utf-8")
    return json.loads(raw_text)
    
# Build the demonstration files: for every test instance, rank all train instances by
# s_label_qwen (highest first) and write the top n as "Example / Expected output"
# pairs — one file per n in all_n, under label{n}/qwen/.

def _label_score(similarity_file):
    """Sort key: the pair's s_label_qwen, with missing/None scores ranked last."""
    score = similarity_file.get('s_label_qwen')
    return float("-inf") if score is None else score


num_test = len(dataset["test"])

# One thread pool for the whole run instead of a new one per test instance.
with ThreadPoolExecutor(max_workers=8) as executor:
    for test_index in range(num_test):
        print(f"\rProcessing test instance {test_index+1}/{num_test}", end='', flush=True)

        # Each n has its own output file; only the ones still missing are generated.
        pending_outputs = {}
        for n in all_n:
            output_file = f"in_context/{topic}/test/label{n}/qwen/{test_index}.txt"
            if not os.path.exists(output_file):
                pending_outputs[n] = output_file

        if not pending_outputs:
            print(f" >>> Results for sentence #{test_index+1} already exist. Skipping...")
            continue

        # Build all file paths first, then load the similarity files in parallel.
        test_dir = f"in_context/{topic}/test/data/{test_index}"
        similarity_paths = [f"{test_dir}/{train_index}.json" for train_index in range(train_len)]
        similarity_files = list(executor.map(load_similarity_file, similarity_paths))

        # Sort by final similarity (qwen), best first; the sort is stable, so ties keep
        # their train-index order.
        similarity_files.sort(key=_label_score, reverse=True)

        for n, output_file in pending_outputs.items():
            qwen_demo_txt = "".join(
                f"Example #{i+1}: {similarity_file['sentence']}\n"
                f"Expected output: 'entities: {similarity_file['true_entities']}'\n\n"
                for i, similarity_file in enumerate(similarity_files[:n])
            )

            # Cheap safeguard in case the result folder for this n was never created.
            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(qwen_demo_txt)