In [1]:
import pandas as pd

from llm_inference.base import LLMInterface
from setting.db import SessionLocal
import logging
from models.entity import get_entity_model
from models.relationship import get_relationship_model


# Bind the ORM classes to the concrete tables used throughout this notebook.
# NOTE(review): the second argument (1536) is presumably the embedding vector
# dimension — confirm against models.entity / models.relationship.
Entity = get_entity_model("entities_150001", 1536)
Relationship = get_relationship_model("relationships_150001", 1536)

logger = logging.getLogger(__name__)

# LLM client handed to the entity_agg helpers in the merge cells below.
llm_client = LLMInterface("ollama", "deepseek-qwen-32b")
# Work list of clustered entities; the merge loop later flips `processed` and
# writes the frame back to this same file.
# SECURITY: unpickling can execute arbitrary code — only load a pickle that
# this project produced itself.
cluster_df = pd.read_pickle("cluster_entities.pkl")
# Bare last expression: rendered as the cell's output.
cluster_df

Unnamed: 0,cluster,entity_id,entity_name,entity_description,entity_metadata,processed
0,YbnSTRiH_iter_1_idx1_idx1,720296,TiKV,Derived from relationships: Pump queries TiKV ...,{'status': 'need-merged'},False
1,YbnSTRiH_iter_1_idx1_idx1,363280,tikv,The TiKV storage engine is a key-value compone...,"{'package': 'TiDB-community-server', 'role': '...",False
2,YbnSTRiH_iter_1_idx1_idx2,810020,TiKV,TiKV is a distributed key-value store designed...,{'status': 'need-revised'},False
3,YbnSTRiH_iter_1_idx1_idx2,720274,TiKV,TiKV is a distributed key-value storage engine...,{'Key Metrics': {'CPU': 'The CPU usage ratio o...,False
4,YbnSTRiH_iter_1_idx2_idx1,36322,TiKV,Derived from from relationship: TiKV -> TiKV c...,{'status': 'need-revised'},False
5,YbnSTRiH_iter_1_idx2_idx1,34246,TiKV,Derived from from relationship: TiKV -> TiKV i...,{'status': 'need-revised'},False
6,YbnSTRiH_iter_1_idx2_idx1,61260,TiKV,Derived from from relationship: tidb_session_a...,{'status': 'need-revised'},False
7,YbnSTRiH_iter_1_idx2_idx1,92782,TiKV,Derived from from relationship: enableDynamicC...,{'status': 'need-revised'},False
8,YbnSTRiH_iter_1_idx2_idx2,242242,TiKV,A component of TiDB Operator that should not b...,{'scaling_restriction': 'Should not be scaled ...,False
9,YbnSTRiH_iter_1_idx2_idx2,240738,TiKV,TiKV in TiDB 3.0.1 includes added statistics o...,"{'bug_fixes': ['Fix core dump issues'], 'featu...",False


In [2]:
import json
import openai

from typing import Mapping, Any

# Shared OpenAI client used by get_text_embedding below. No credentials are
# passed here — presumably the SDK reads them from the environment
# (e.g. OPENAI_API_KEY); confirm before running on a fresh machine.
embedding_model = openai.OpenAI()

def get_text_embedding(text: str, model="text-embedding-3-small"):
    """Return the embedding vector for ``text``.

    Every newline in ``text`` is replaced by a single space before the
    request is sent through the shared ``embedding_model`` client.

    Args:
        text: The text to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    response = embedding_model.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding


def get_entity_description_embedding(
    name: str, description: str
):
    """Embed an entity as the single string ``"<name>: <description>"``.

    Args:
        name: Entity name.
        description: Entity description.

    Returns:
        Whatever ``get_text_embedding`` returns for the combined string.
    """
    return get_text_embedding("{}: {}".format(name, description))


def get_entity_metadata_embedding(
    metadata: Mapping[str, Any]
):
    """Embed an entity's metadata by embedding its JSON serialization.

    Args:
        metadata: JSON-serializable mapping of metadata fields. (The previous
            annotation, ``dict[Mapping, Any]``, described a dict keyed by
            mappings, which is not what callers pass.)

    Returns:
        Whatever ``get_text_embedding`` returns for ``json.dumps(metadata)``.

    Raises:
        TypeError: If ``metadata`` contains values ``json.dumps`` cannot encode.
    """
    combined_text = json.dumps(metadata)
    return get_text_embedding(combined_text)

In [3]:
from entity_agg import merge_entities, should_merge_entities, group_mergeable_entities

# Group every not-yet-processed row into {cluster name -> set of Entity objects}.
cluster_mapping = {}
for _, row in cluster_df.iterrows():
    if row['processed'] == True:
        # Flagged as done by an earlier run of the merge loop.
        continue

    cluster_name = row['cluster']
    entity = Entity(
        id=row['entity_id'],
        name=row['entity_name'],
        description=row['entity_description'],
        meta=row['entity_metadata']
    )
    cluster_mapping.setdefault(cluster_name, set()).add(entity)

# Preview the first cluster as a quick sanity check of the grouping.
if cluster_mapping:
    first_cluster, first_entities = next(iter(cluster_mapping.items()))
    print(f"Cluster: {first_cluster}")
    for entity in first_entities:
        print(f" - ID: {entity.id}, Name: {entity.name}, Description: {entity.description}")
        print(f"   - Metadata: {entity.meta}")

# Number of clusters still waiting to be merged.
print(len(cluster_mapping))


Cluster: YbnSTRiH_iter_1_idx1_idx1
 - ID: 720296, Name: TiKV, Description: Derived from relationships: Pump queries TiKV for transaction status, and the `TIKV_STORE_STATUS` table provides information about TiKV nodes.
   - Metadata: {'status': 'need-merged'}
 - ID: 363280, Name: tikv, Description: The TiKV storage engine is a key-value component within the TiDB cluster, serving as the storage engine used by TiDB.
   - Metadata: {'package': 'TiDB-community-server', 'role': 'key-value storage', 'topic': ['Storage Engine', 'tikv']}
4


In [4]:
# Tunables for the merge run.
MAX_PROMPT_TOKENS = 16384     # clusters whose merge prompt is larger are skipped
LARGE_PROMPT_TOKENS = 7000    # above this, the context window is sized from the prompt
NUM_CTX_HEADROOM = 1500       # extra context added on top of a large prompt
DEFAULT_NUM_CTX = 8092        # NOTE(review): possibly meant to be 8192 — confirm
REQUIRED_MERGE_KEYS = ("name", "description", "meta")


def _build_model_args(token_count):
    """Return the LLM keyword arguments for a prompt of ``token_count`` tokens."""
    if token_count > LARGE_PROMPT_TOKENS:
        num_ctx = token_count + NUM_CTX_HEADROOM
    else:
        num_ctx = DEFAULT_NUM_CTX
    return {
        "options": {
            "num_ctx": num_ctx,
            "num_gpu": 80,
            "num_predict": 10000,
            "temperature": 0.1,
        }
    }


def _persist_merged_entity(merged_entity, entities):
    """Insert the merged entity and repoint the originals' relationships to it.

    Everything happens in one transaction: either the new entity exists and
    all relationships reference it, or nothing changed.

    Args:
        merged_entity: Dict with at least ``name``, ``description`` and ``meta``.
        entities: The original Entity objects that were merged.

    Returns:
        The database ID of the newly created entity.

    Raises:
        Exception: Whatever the embedding calls or the database raise; the
            transaction is rolled back before the exception propagates.
    """
    # Embeddings are remote calls; compute them before touching the database.
    description_vec = get_entity_description_embedding(
        merged_entity["name"], merged_entity["description"]
    )
    meta_vec = get_entity_metadata_embedding(merged_entity.get("meta", {}))
    original_entity_ids = [entity.id for entity in entities]

    # The context manager closes the session on every exit path, so no
    # separate finally/close is needed (and `session` is never referenced
    # where it might be unbound).
    with SessionLocal() as session:
        try:
            # Step 1: Write the merged entity to the database
            new_entity = Entity(
                name=merged_entity["name"],
                description=merged_entity["description"],
                meta=merged_entity.get("meta", {}),
                description_vec=description_vec,
                meta_vec=meta_vec,
            )
            print(new_entity.name)
            session.add(new_entity)
            session.flush()  # assigns new_entity.id without committing
            merged_entity_id = new_entity.id
            print(f"Merged entity created with ID: {merged_entity_id}")

            # Step 2: Update relationships to reference the merged entity
            # Bulk update source entity IDs
            session.execute(
                Relationship.__table__.update().where(
                    Relationship.source_entity_id.in_(original_entity_ids)
                ).values(source_entity_id=merged_entity_id)
            )

            # Bulk update target entity IDs
            session.execute(
                Relationship.__table__.update().where(
                    Relationship.target_entity_id.in_(original_entity_ids)
                ).values(target_entity_id=merged_entity_id)
            )

            print(f"Relationships updated for merged entity {merged_entity_id}")

            session.commit()  # Commit the new entity and the relationship updates
        except Exception:
            session.rollback()
            raise
    return merged_entity_id


# Merge each cluster: size the prompt, ask the LLM whether to merge, merge,
# persist, then mark the cluster as processed in the on-disk work list.
for cluster_name, entities in cluster_mapping.items():
    print(f"merge entities cluster {cluster_name}, count {len(entities)}")

    token_count = merge_entities(llm_client, entities, only_count_token=True)
    if token_count > MAX_PROMPT_TOKENS:
        print(f"prompt token exceeds {MAX_PROMPT_TOKENS}", token_count)
        continue

    model_args = _build_model_args(token_count)

    print("prompt token", token_count)
    try:
        check_result = should_merge_entities(llm_client, entities, **model_args)
        # Merging rewrites relationships, so proceed only on an explicit True;
        # a missing, null or non-boolean answer is treated as "do not merge".
        if check_result.get("should_merge") is not True:
            print(f"skip merge entities cluster {cluster_name}, count {len(entities)}, reason {check_result}")
            continue
        merged_entity = merge_entities(llm_client, entities, **model_args)
    except Exception as e:
        logger.exception(f"Error processing cluster {cluster_name}: {e}")
        continue

    if isinstance(merged_entity, dict) and all(key in merged_entity for key in REQUIRED_MERGE_KEYS):
        try:
            _persist_merged_entity(merged_entity, entities)
        except Exception as e:
            logger.exception(f"Error processing cluster {cluster_name}: {e}")
            print(f"Error processing cluster {cluster_name}: {e}")
        else:
            print(f"Merged entity {cluster_name} processing complete.")
            # Bookkeeping is kept out of the DB try-block: a failure here
            # happens after the commit and must not be reported as a DB error.
            try:
                cluster_df.loc[cluster_df["cluster"] == cluster_name, "processed"] = True
                cluster_df.to_pickle("cluster_entities.pkl")
            except Exception as e:
                logger.exception(
                    f"Cluster {cluster_name} was committed but its processed flag could not be saved: {e}"
                )
    else:
        print(f"Merged entity {cluster_name} is invalid or empty.", merged_entity)

    print("*"* 100)

merge entities cluster YbnSTRiH_iter_1_idx1_idx1, count 2
prompt token 526
TiKV
Merged entity created with ID: 810021
Relationships updated for merged entity 810021
Merged entity YbnSTRiH_iter_1_idx1_idx1 processing complete.
****************************************************************************************************
merge entities cluster YbnSTRiH_iter_1_idx1_idx2, count 2
prompt token 3893
TiKV
Merged entity created with ID: 810022
Relationships updated for merged entity 810022
Merged entity YbnSTRiH_iter_1_idx1_idx2 processing complete.
****************************************************************************************************
merge entities cluster YbnSTRiH_iter_1_idx2_idx1, count 4
prompt token 626
TiKV
Merged entity created with ID: 810023
Relationships updated for merged entity 810023
Merged entity YbnSTRiH_iter_1_idx2_idx1 processing complete.
****************************************************************************************************
merge entities clu

```sql
-- Preview: entities that no relationship references, neither as source nor
-- as target (for example originals whose relationships were repointed to a
-- merged entity).
SELECT e.id, e.name
FROM entities_150001 AS e
WHERE NOT EXISTS (
        SELECT 1
        FROM relationships_150001 AS src
        WHERE src.source_entity_id = e.id
      )
  AND NOT EXISTS (
        SELECT 1
        FROM relationships_150001 AS tgt
        WHERE tgt.target_entity_id = e.id
      );
```

```sql
-- Cleanup: delete every entity that no relationship references as source or
-- target. Destructive — run the SELECT with the same predicate first and
-- review its result before executing this.
START TRANSACTION;

-- Single-table DELETE; the subquery refers to the outer row through the
-- table name (entities_150001.id) rather than an alias.
DELETE FROM entities_150001
WHERE NOT EXISTS (
  SELECT 1
  FROM relationships_150001 r
  WHERE r.source_entity_id = entities_150001.id
     OR r.target_entity_id = entities_150001.id
);

-- Must directly follow the DELETE so it reports that statement's affected
-- rows (MySQL/TiDB ROW_COUNT() semantics — confirm for your server). Check
-- the number before the COMMIT below makes the deletion permanent.
SELECT ROW_COUNT();

COMMIT;
```