Libraries

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import json
from pathlib import Path
from collections import Counter

from sentence_transformers import SentenceTransformer

# add path 
sys.path.append(os.path.abspath("../0. Helpers"))
sys.path.append(os.path.abspath("../2. Data Processing/_dataset_entities"))

from datasets import load_dataset, load_from_disk
from datasetProcessing import tokens_to_sentence, tokens_to_entities, join_datasets, recursive_fix

Define the model

In [None]:
# Load the Qwen3 embedding model directly on the CPU.
# Passing `device` to the constructor (instead of calling `.to("cpu")`
# afterwards) keeps the weights from being placed on an accelerator first,
# which is what happens by default when a GPU is available.
model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-4B",
    # Optional accelerator settings, kept for reference:
    # model_kwargs = {
    #     # "attn_implementation": "flash_attention_2",
    #     "device_map": "auto"},
    tokenizer_kwargs = {
        "padding_side": "left"},
    device = "cpu",
)

# Check model device
print(model.device)

Test one similarity

In [None]:
# Two toy questions and the passages expected to answer them.
queries = ["What is the capital of China?", "Explain gravity"]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# Documents are embedded without a prompt. Queries are embedded with the
# "query" prompt registered under `model.prompts`; a custom prompt could be
# supplied through the `prompt` argument instead.
document_embeddings = model.encode(documents)
query_embeddings = model.encode(queries, prompt_name="query")

# Score every query against every document, then show which similarity
# function the model used followed by the resulting score matrix.
similarity = model.similarity(query_embeddings, document_embeddings)
print(model.similarity_fn_name)
print(similarity)

Process whole dataset

In [None]:
# Selects which dataset / entity definitions the next cell loads.
# Explicitly handled values: "lener", "neuralshift", "ener", "multinerd_en",
# "multinerd_pt". Any other value (including "music") falls through to the
# crossNER branch.
topic = "music"

In [None]:
# Pick the entity definitions, the dataset and the language that belong to
# `topic`. Each branch imports `entity_names` / `entity_names_parsed` from the
# matching entities module and loads the corresponding dataset.
if topic == "lener":
    from entities_leNER import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "portuguese"

elif topic == "neuralshift":
    from entities_neuralshift import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "portuguese"

elif topic == "ener":
    from entities_eNER import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "english"

elif topic == "multinerd_en":
    from entities_multinerd_en import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "english"

elif topic == "multinerd_pt":
    from entities_multinerd_pt import entity_names, entity_names_parsed
    dataset = load_from_disk("...")
    lang = "portuguese"

else:
    # Every topic not listed above is treated as a crossNER topic.
    from entities_crossNER import entity_names, entity_names_parsed
    dataset = load_dataset("...")
    lang = "english"

# Convenience handles on the two splits
train_data = dataset["train"]
test_data = dataset["test"]

# Indices of the labels that open an entity span ("B-" or "U-" prefix).
start_of_entity_indices = [
    i for i in range(len(entity_names))
    if entity_names[i].startswith(("B-", "U-"))
]

# Map every non-"O" label index to its entity type with the positional prefix
# removed. `maxsplit=1` keeps entity types that contain a hyphen themselves
# intact ("B-foo-bar" -> "foo-bar" rather than "foo").
entity_index_to_name = {
    i: entity_names[i].split("-", 1)[1]
    for i in range(len(entity_names))
    if entity_names[i] != "O"
}
# NOTE(review): assumes the "O" label sits at index 0 of `entity_names` — confirm
# against the entities modules.
entity_index_to_name[0] = "O"

Get and save embeddings

In [None]:
def _read_cached_record(path):
    """Return the record cached at `path` as a dict, or {} if none is usable.

    The file is first parsed as a single JSON document. Only when that fails
    is the "two objects glued together" recovery attempted, keeping the last
    object after the final "}{" boundary. A missing, truncated or otherwise
    unreadable file yields {} so the caller simply recomputes the record.
    """
    if not os.path.exists(path):
        return {}

    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()

    try:
        record = json.loads(raw)
    except json.JSONDecodeError:
        # Parsing the whole file first matters: a valid sentence may itself
        # contain "}{", which must not be mistaken for a file boundary.
        boundary = raw.rfind("}{")
        if boundary == -1:
            return {}
        try:
            record = json.loads(raw[boundary + 1:])
        except json.JSONDecodeError:
            return {}

    return record if isinstance(record, dict) else {}


def _write_json_atomic(path, payload):
    """Write `payload` as JSON to `path` via a temp file and `os.replace`."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(payload, ensure_ascii=False, indent=4))
    # Swapping the finished file into place means an interrupted run cannot
    # leave a half-written JSON file under the final name.
    os.replace(tmp_path, path)


data_splits = ['train', 'test']

for split in data_splits:

    split_path = f"embeddings/{topic}/{split}"
    split_size = len(dataset[split])

    # Create folder
    os.makedirs(split_path, exist_ok=True)

    for i, instance in enumerate(dataset[split]):

        file_path = f"{split_path}/{i}.json"

        # Reuse whatever a previous run already stored for this instance and
        # only compute the missing pieces.
        cached = _read_cached_record(file_path)

        sentence = cached.get("sentence") or tokens_to_sentence(instance['tokens'])
        embedding_qwen = cached.get("embedding_qwen") or model.encode(sentence).tolist()

        # Save to json file (the id is always the position within the split)
        result_json = {
            "id": i,
            "sentence": sentence,
            "embedding_qwen": embedding_qwen,
        }
        _write_json_atomic(file_path, result_json)

        print(f"✅ {split} instance #{i+1}/{split_size} saved to {file_path}")
