In [1]:
import pandas as pd


from llm_inference.base import LLMInterface
from setting.db import SessionLocal
import logging

# Logger named after the current module.
logger = logging.getLogger(__name__)

# LLM client handed to merge_entities further down; the two arguments are
# presumably the provider and the model name — confirm against LLMInterface.
llm_client = LLMInterface("openai", "gpt-4o")
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load cluster_entities.pkl when it comes from a trusted source.
cluster_df = pd.read_pickle("cluster_entities.pkl")
# Bare last expression so the notebook renders the frame as the cell output.
cluster_df

Unnamed: 0,cluster,entity_id,entity_name,entity_description,entity_metadata
0,mvqRcOzs_iter_1,1404,TiDB,The SQL processing component of a TiDB cluster...,"{'Component': True, 'Follower Read': {'details..."
1,mvqRcOzs_iter_1,34152,TiDB,"The distributed SQL layer of the TiDB cluster,...",{'config': 'Default port and global directory ...
2,mvqRcOzs_iter_1,48645,TiDB,TiDB is a distributed SQL database that uses T...,"{'functions': ['SQL Processing', 'Metadata Man..."
3,mvqRcOzs_iter_1,50066,TiDB,TiDB is a distributed SQL database compatible ...,"{'architecture': 'distributed', 'compatibility..."
4,mvqRcOzs_iter_1,53029,TiDB,"TiDB is an open-source, MySQL-compatible, dist...","{'attributes': {'architecture': 'distributed',..."
...,...,...,...,...,...
2503,Pfbb3ubI_iter_1,245560,TiDB Operator v1.2.6,"Version 1.2.6 of TiDB Operator, upgraded on Ja...","{'topic': 'TiDB Operator v1.2.6', 'upgrade_dat..."
2504,Pfbb3ubI_iter_1,270414,TiDB Operator 0.3.1,TiDB Operator version 0.3.1 is a release that ...,{'changes': {'bug_fixes': ['Fix parallel upgra...
2505,5KdHlJpa_iter_1,210110,podSecurityContext,The security context configuration for the Pod...,{'details': 'Configures the security context o...
2506,5KdHlJpa_iter_1,210163,podSecurityContext,A security context configuration for a Pod tha...,{'details': 'Configuration for the Pod that ru...


In [2]:
from models.entity import Entity

# Group every row of cluster_df by its cluster label:
# {cluster name -> set of Entity objects built from that cluster's rows}.
cluster_mapping = {}
for _, record in cluster_df.iterrows():
    member = Entity(
        id=record['entity_id'],
        name=record['entity_name'],
        description=record['entity_description'],
        meta=record['entity_metadata'],
    )
    # setdefault creates the bucket the first time a cluster label is seen.
    cluster_mapping.setdefault(record['cluster'], set()).add(member)

# Sanity check: print the members of the first cluster in insertion order.
if len(cluster_mapping) > 0:
    first_cluster, first_members = next(iter(cluster_mapping.items()))
    print(f"Cluster: {first_cluster}")
    for entity in first_members:
        print(f" - ID: {entity.id}, Name: {entity.name}, Description: {entity.description}")

Cluster: mvqRcOzs_iter_1
 - ID: 62851, Name: TiDB, Description: TiDB is an open-source distributed SQL database that offers horizontal scalability, high availability, real-time HTAP, and MySQL compatibility. It features strong consistency and is designed for distributed environments and large-scale data. TiDB also provides system tables for managing user privileges, cluster status, and execution plans.
 - ID: 48645, Name: TiDB, Description: TiDB is a distributed SQL database that uses TiKV as its storage engine to process SQL queries. It integrates with Titan through TiKV. TiKV serves as the storage layer for TiDB.
 - ID: 90758, Name: TiDB, Description: TiDB is a distributed SQL database that reads data from the storage layer (TiKV) in parallel.
 - ID: 60561, Name: TiDB, Description: TiDB is an open-source, distributed, NewSQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. It integrates with AI and is accessible via Python.  It supports MySQL sy

In [7]:
# Number of distinct cluster labels collected in cluster_mapping.
len(cluster_mapping)

353

In [4]:
import json
import openai

from typing import Mapping, Any

# Shared OpenAI client; get_text_embedding sends every embedding request through it.
# NOTE(review): no credentials are passed here, so the client presumably reads
# them from the environment — confirm before running on a fresh machine.
embedding_model = openai.OpenAI()

def get_text_embedding(text: str, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the shared client.

    Args:
        text: Text to embed. Every newline is replaced by a single space
            before the request is sent.
        model: Name of the embedding model passed to the client.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    response = embedding_model.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding


def get_entity_description_embedding(name: str, description: str):
    """Embed an entity's name and description as one ``"<name>: <description>"`` string.

    Args:
        name: Entity name, placed before the colon.
        description: Entity description, placed after the colon.

    Returns:
        Whatever ``get_text_embedding`` returns for the combined string.
    """
    return get_text_embedding(f"{name}: {description}")


def get_entity_metadata_embedding(
    metadata: Mapping[str, Any]
):
    """Embed an entity's metadata by serialising it to JSON text first.

    Args:
        metadata: JSON-serialisable metadata. The previous annotation,
            ``dict[Mapping, Any]``, described a dict keyed by Mapping
            objects, which is not what callers pass.

    Returns:
        Whatever ``get_text_embedding`` returns for the JSON text.

    Raises:
        TypeError: Propagated from ``json.dumps`` when ``metadata`` holds a
            value that cannot be serialised to JSON.
    """
    combined_text = json.dumps(metadata)
    return get_text_embedding(combined_text)

In [8]:
from entity_agg import merge_entities
from models.entity import Entity
from models.relationship import Relationship

def _is_valid_merged_entity(candidate) -> bool:
    """Return True when ``candidate`` is a dict with "name", "description" and "meta" keys."""
    return isinstance(candidate, dict) and all(
        key in candidate for key in ("name", "description", "meta")
    )


def _persist_merged_entity(cluster_name, merged_entity, entities) -> None:
    """Insert one merged entity and re-point the cluster's relationships to it.

    Args:
        cluster_name: Cluster label, used only in progress messages.
        merged_entity: Dict with "name", "description" and "meta" keys.
        entities: The original Entity objects of the cluster; their ids select
            the relationships to rewrite.

    Raises:
        Exception: Anything raised by the embedding calls or the database is
            re-raised after the session has been rolled back.
    """
    # Compute the embeddings and the id set up front so a failure here never
    # touches the database.
    description_vec = get_entity_description_embedding(
        merged_entity["name"], merged_entity["description"]
    )
    meta_vec = get_entity_metadata_embedding(merged_entity.get("meta", {}))
    original_entity_ids = {entity.id for entity in entities}

    with SessionLocal() as session:
        # Rollback lives inside the ``with`` so ``session`` is always bound
        # when it is used; leaving the ``with`` closes the session.
        try:
            # Step 1: Write the merged entity to the database
            new_entity = Entity(
                name=merged_entity["name"],
                description=merged_entity["description"],
                meta=merged_entity.get("meta", {}),
                description_vec=description_vec,
                meta_vec=meta_vec,
            )
            print(new_entity.name)
            session.add(new_entity)
            # Flush so that new_entity.id is available before the commit.
            session.flush()
            merged_entity_id = new_entity.id
            print(f"Merged entity created with ID: {merged_entity_id}")

            # Step 2: Update relationships to reference the merged entity.
            # Select every relationship whose source or target is one of the
            # original entities.
            relationships_to_update = session.query(Relationship).filter(
                (Relationship.source_entity_id.in_(original_entity_ids)) |
                (Relationship.target_entity_id.in_(original_entity_ids))
            ).all()

            for rel in relationships_to_update:
                if rel.source_entity_id in original_entity_ids:
                    rel.source_entity_id = merged_entity_id
                if rel.target_entity_id in original_entity_ids:
                    rel.target_entity_id = merged_entity_id

            session.commit()  # Persist the new entity and the relationship updates together
            print(f"Merged entity {cluster_name} processing complete.")
        except Exception:
            session.rollback()
            raise


# Merge each cluster with the LLM, then persist the result. A failure in one
# cluster is logged and the loop moves on to the next cluster.
for cluster_name, entities in cluster_mapping.items():
    print(f"merge entities cluster {cluster_name}, count {len(entities)}")
    try:
        merged_entity = merge_entities(llm_client, entities)
    except Exception as e:
        logging.error(f"Error processing cluster {cluster_name}: {e}", exc_info=True)
        continue

    if _is_valid_merged_entity(merged_entity):
        try:
            _persist_merged_entity(cluster_name, merged_entity, entities)
        except Exception as e:
            logging.error(f"Error processing cluster {cluster_name}: {e}", exc_info=True)
            print(f"Error processing cluster {cluster_name}: {e}")
    else:
        print(f"Merged entity {cluster_name} is invalid or empty.", merged_entity)

    print("*"* 100)

merge entities cluster mvqRcOzs_iter_1, count 39
TiDB
Merged entity created with ID: 360007
Merged entity mvqRcOzs_iter_1 processing complete.
****************************************************************************************************
merge entities cluster p2cBgtqE_iter_1, count 13
MySQL Client
Merged entity created with ID: 360008
Merged entity p2cBgtqE_iter_1 processing complete.
****************************************************************************************************
merge entities cluster swWnKYMn_iter_1, count 5
TiFlash Node
Merged entity created with ID: 360009
Merged entity swWnKYMn_iter_1 processing complete.
****************************************************************************************************
merge entities cluster 27gMxSdZ_iter_1, count 14
Dumpling
Merged entity created with ID: 360010
Merged entity 27gMxSdZ_iter_1 processing complete.
****************************************************************************************************
merge