In [1]:
import pandas as pd

from llm_inference.base import LLMInterface
from setting.db import SessionLocal
import logging
from models.entity import get_entity_model
from models.relationship import get_relationship_model


# Notebook-wide logger.
logger = logging.getLogger(__name__)

# Both model factories are called with a table name and the same second argument
# (presumably the embedding vector width — confirm against the model factories).
VECTOR_DIM = 1536
Entity = get_entity_model("entities_150001", VECTOR_DIM)
Relationship = get_relationship_model("relationships_150001", VECTOR_DIM)

# LLM client handed to the entity-merge helpers in later cells.
llm_client = LLMInterface("ollama", "deepseek-qwen-32b")

# SECURITY NOTE: unpickling can execute arbitrary code; only load a pickle file
# that this workflow produced itself.
cluster_df = pd.read_pickle("cluster_entities.pkl")
cluster_df

Unnamed: 0,cluster,entity_id,entity_name,entity_description,entity_metadata,processed
0,iFWJsx6B_iter_1_idx1,362741,Region,A Region is a basic unit of data storage and d...,{'details': {'function': 'Data partition in Ti...,False
1,iFWJsx6B_iter_1_idx1,390267,Region,"In TiKV, a Region is a contiguous segment of t...","{'context': 'TiKV', 'definition': 'a segment o...",False
2,iFWJsx6B_iter_1_idx2,56028,Region,A Region is the basic unit of data storage and...,{'details': ['Replicated across multiple TiKV ...,False
3,iFWJsx6B_iter_1_idx2,362741,Region,A Region is a basic unit of data storage and d...,{'details': {'function': 'Data partition in Ti...,False
4,iFWJsx6B_iter_1_idx3,390220,Region,A basic data storage unit in TiKV. Backups are...,"{'backup_process': ['Backed up by TiKV', 'Back...",False
...,...,...,...,...,...,...
137,Gt4UtkMP_iter_1_idx1,58482,Table,Derived from from relationship: Table -> Table...,{'status': 'need-revised'},False
138,gNTT5hHo_iter_1_idx1,363070,MySQL,Derived from relationships: MySQL provides sta...,{'status': 'need-revised'},False
139,gNTT5hHo_iter_1_idx1,47420,MySQL,Derived from from relationship: SHOW STATUS st...,{'status': 'need-revised'},False
140,gNTT5hHo_iter_1_idx1,47678,MySQL,Derived from from relationship: SHOW BINDINGS ...,{'status': 'need-revised'},False


In [2]:
import json
import openai

from typing import Mapping, Any

# Shared OpenAI client used by get_text_embedding. It is constructed without
# explicit arguments, so credentials/endpoint presumably come from the
# environment (e.g. OPENAI_API_KEY) — confirm for your deployment.
embedding_model = openai.OpenAI()

def get_text_embedding(text: str, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the given embedding model.

    Newlines in ``text`` are replaced by single spaces before the request is
    sent. The text is submitted as a one-element batch through the shared
    ``embedding_model`` client, and the embedding of that single item is
    returned.
    """
    single_line = " ".join(text.split("\n"))
    response = embedding_model.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding


def get_entity_description_embedding(name: str, description: str):
    """Embed an entity as the single string ``"<name>: <description>"``.

    The formatted text is passed to ``get_text_embedding`` with its default
    model, and that function's result is returned unchanged.
    """
    return get_text_embedding("{}: {}".format(name, description))


def get_entity_metadata_embedding(
    metadata: dict[str, Any]
):
    """Embed an entity's metadata by serializing it to JSON text first.

    Args:
        metadata: The entity metadata. It is passed straight to ``json.dumps``,
            so it must be JSON-serializable (a plain ``dict``, not an arbitrary
            ``Mapping``).

    Returns:
        Whatever ``get_text_embedding`` returns for the JSON string.

    Raises:
        TypeError: Propagated from ``json.dumps`` when a value is not
            JSON-serializable.
    """
    # The annotation used to read ``dict[Mapping, Any]``, i.e. a dict whose
    # *keys* are mappings; the value serialized here is a str-keyed dict.
    combined_text = json.dumps(metadata)
    return get_text_embedding(combined_text)

In [3]:
from entity_agg import merge_entities, should_merge_entities, group_mergeable_entities

# Group the not-yet-processed rows of cluster_df into {cluster name: set of Entity}.
cluster_mapping = {}
for _, record in cluster_df.iterrows():
    # Explicit comparison with True is kept on purpose: only rows whose flag
    # compares equal to True are skipped.
    already_processed = record['processed'] == True
    if already_processed:
        continue

    member = Entity(
        id=record['entity_id'],
        name=record['entity_name'],
        description=record['entity_description'],
        meta=record['entity_metadata'],
    )
    # Create the cluster's set on first sight, then register the entity in it.
    cluster_mapping.setdefault(record['cluster'], set()).add(member)

# Preview: print the members of the first cluster as a quick sanity check.
if cluster_mapping:
    first_cluster, first_members = next(iter(cluster_mapping.items()))
    print(f"Cluster: {first_cluster}")
    for member in first_members:
        print(f" - ID: {member.id}, Name: {member.name}, Description: {member.description}")
        print(f"   - Metadata: {member.meta}")

# Number of clusters that still need processing.
print(len(cluster_mapping))


Cluster: iFWJsx6B_iter_1_idx1
 - ID: 390267, Name: Region, Description: In TiKV, a Region is a contiguous segment of the key-value space, represented by a left-closed and right-open interval [StartKey, EndKey). It serves as the basic unit for data distribution and Raft replication. A Region consists of a series of adjacent keys and is replicated using the Raft algorithm to form a Raft Group.
   - Metadata: {'context': 'TiKV', 'definition': 'a segment of the key-value space, consisting of a series of adjacent keys', 'details': ['Consecutive segment of Key-Value pairs.', 'Represented by [StartKey, EndKey).', 'Default size limit is 96 MiB (configurable)', 'Basic unit for data distribution and Raft replication.'], 'further_information': 'TiDB Internal (I) - Data Storage (https://www.pingcap.com/blog/tidb-internal-data-storage/)', 'location': 'TiKV', 'note': 'Incorrect key counting in some cases', 'properties': ['basic unit of data distribution in TiKV', 'represents a range of keys', 'repli

In [4]:
# Tunables for the merge loop below.
MAX_PROMPT_TOKENS = 16384    # clusters whose merge prompt is larger than this are skipped
LARGE_PROMPT_TOKENS = 7000   # above this, the context window is sized from the prompt
CTX_HEADROOM = 1500          # tokens added on top of a large prompt when sizing num_ctx
DEFAULT_NUM_CTX = 8092       # NOTE(review): possibly a typo for 8192 — value kept unchanged
REQUIRED_MERGE_KEYS = ("name", "description", "meta")


def _model_args_for(token_count):
    """Build the keyword arguments passed to the LLM helpers for one cluster.

    Only ``num_ctx`` depends on the prompt size: prompts above
    ``LARGE_PROMPT_TOKENS`` get ``token_count + CTX_HEADROOM``, all others get
    ``DEFAULT_NUM_CTX``. The remaining options are fixed.
    """
    if token_count > LARGE_PROMPT_TOKENS:
        num_ctx = token_count + CTX_HEADROOM
    else:
        num_ctx = DEFAULT_NUM_CTX
    return {
        "options": {
            "num_ctx": num_ctx,
            "num_gpu": 80,
            "num_predict": 8192,
            "temperature": 0.1,
        }
    }


def _persist_merged_entity(merged_entity, entities):
    """Insert the merged entity and repoint relationships to it in one transaction.

    Args:
        merged_entity: dict containing at least "name", "description" and "meta".
        entities: the original entities of the cluster; relationships that
            reference any of their ids are updated to reference the new entity.

    Returns:
        The id assigned to the newly inserted entity.

    Raises:
        Exception: any embedding or database error is re-raised after the
            transaction has been rolled back.
    """
    # Compute the embeddings first so that a failed embedding request never
    # touches the database session.
    description_vec = get_entity_description_embedding(
        merged_entity["name"], merged_entity["description"]
    )
    meta_vec = get_entity_metadata_embedding(merged_entity["meta"])
    original_entity_ids = list({entity.id for entity in entities})

    # The context manager closes the session on exit, and `session` is only
    # referenced where it is guaranteed to be bound.
    with SessionLocal() as session:
        try:
            # Step 1: write the merged entity to the database.
            new_entity = Entity(
                name=merged_entity["name"],
                description=merged_entity["description"],
                meta=merged_entity["meta"],
                description_vec=description_vec,
                meta_vec=meta_vec,
            )
            print(new_entity.name)
            session.add(new_entity)
            session.flush()  # flush so the new entity's id is available before commit
            merged_entity_id = new_entity.id
            print(f"Merged entity created with ID: {merged_entity_id}")

            # Step 2: bulk-update relationships to reference the merged entity.
            # The original entity rows themselves are not deleted here.
            session.execute(
                Relationship.__table__.update().where(
                    Relationship.source_entity_id.in_(original_entity_ids)
                ).values(source_entity_id=merged_entity_id)
            )
            session.execute(
                Relationship.__table__.update().where(
                    Relationship.target_entity_id.in_(original_entity_ids)
                ).values(target_entity_id=merged_entity_id)
            )
            print(f"Relationships updated for merged entity {merged_entity_id}")

            session.commit()
        except Exception:
            session.rollback()
            raise
    return merged_entity_id


# Merge every pending cluster: ask the LLM whether to merge, build the merged
# entity, persist it, and mark the cluster as processed in the pickle file.
#
# NOTE(review): the same entity id can appear in more than one cluster (e.g.
# 362741 is listed under two clusters in the displayed frame). Once an earlier
# cluster has repointed that id's relationships, a later cluster containing it
# has nothing left to move for that id, and a separate merged entity is still
# created. Consider consolidating overlapping clusters before running this loop.
for cluster_name, entities in cluster_mapping.items():
    print(f"merge entities cluster {cluster_name}, count {len(entities)}")

    # First call only counts prompt tokens; nothing is generated yet.
    token_count = merge_entities(llm_client, entities, only_count_token=True)
    if token_count > MAX_PROMPT_TOKENS:
        print("prompt token exceeds 16384", token_count)
        continue

    model_args = _model_args_for(token_count)
    print("prompt token", token_count)

    try:
        # Only an explicit False skips the cluster; any other answer proceeds.
        if should_merge_entities(llm_client, entities, **model_args) is False:
            print(f"skip merge entities cluster {cluster_name}, count {len(entities)}")
            continue
        merged_entity = merge_entities(llm_client, entities, **model_args)
    except Exception as e:
        # Best effort: one failing cluster must not stop the whole run.
        logger.error(f"Error processing cluster {cluster_name}: {e}", exc_info=True)
        continue

    is_valid = isinstance(merged_entity, dict) and all(
        key in merged_entity for key in REQUIRED_MERGE_KEYS
    )
    if not is_valid:
        print(f"Merged entity {cluster_name} is invalid or empty.", merged_entity)
    else:
        try:
            _persist_merged_entity(merged_entity, entities)
            print(f"Merged entity {cluster_name} processing complete.")
            # Persist progress after every committed cluster so a re-run can
            # skip clusters that are already done.
            cluster_df.loc[cluster_df["cluster"] == cluster_name, "processed"] = True
            cluster_df.to_pickle("cluster_entities.pkl")
        except Exception as e:
            logger.error(f"Error processing cluster {cluster_name}: {e}", exc_info=True)
            print(f"Error processing cluster {cluster_name}: {e}")

    print("*" * 100)

merge entities cluster iFWJsx6B_iter_1_idx1, count 2
prompt token 1141
Region
Merged entity created with ID: 540060
Relationships updated for merged entity 540060
Merged entity iFWJsx6B_iter_1_idx1 processing complete.
****************************************************************************************************
merge entities cluster iFWJsx6B_iter_1_idx2, count 2
prompt token 1007
Region
Merged entity created with ID: 540061
Relationships updated for merged entity 540061
Merged entity iFWJsx6B_iter_1_idx2 processing complete.
****************************************************************************************************
merge entities cluster iFWJsx6B_iter_1_idx3, count 2
prompt token 1059
Region
Merged entity created with ID: 540062
Relationships updated for merged entity 540062
Merged entity iFWJsx6B_iter_1_idx3 processing complete.
****************************************************************************************************
merge entities cluster grbSNNAY_iter_1_i

```sql
-- Preview: list entities that no relationship row references, neither as
-- source nor as target.
SELECT e.id, e.name
FROM entities_150001 e
WHERE NOT EXISTS (
  SELECT 1
  FROM relationships_150001 r
  WHERE r.source_entity_id = e.id
     OR r.target_entity_id = e.id
);
```

```sql
-- Cleanup: delete every entity that no relationship row references, neither
-- as source nor as target. Destructive: the script commits right after the
-- DELETE, so check the preview SELECT first.
START TRANSACTION;

DELETE FROM entities_150001
WHERE NOT EXISTS (
  SELECT 1
  FROM relationships_150001 r
  WHERE r.source_entity_id = entities_150001.id
     OR r.target_entity_id = entities_150001.id
);

-- Report how many rows the DELETE above affected (MySQL-style ROW_COUNT()).
SELECT ROW_COUNT();

COMMIT;
```