In [1]:
from entity_agg import EntityAggregator
from setting.db import SessionLocal
import logging

# Logger named after the current module; used for error reporting in this notebook.
logger = logging.getLogger(__name__)

# One session shared by every cell below (presumably a database session, given
# that SessionLocal comes from setting.db — confirm).
session = SessionLocal()
# NOTE(review): "entities_150001" looks like the entity table/collection the
# aggregator operates on — confirm against EntityAggregator's constructor.
aggregator = EntityAggregator(session, "entities_150001")

In [2]:
# Paging state consumed by the clustering loop in the following cell.
iteration = 0  # batches fetched so far; multiplied by `batch` to get the fetch offset
batch = 5000  # number of entities requested per fetch
clusters_info = []  # one dict per (cluster, entity) pair, appended to by the clustering loop

In [3]:
import random
import string

def generate_random_string(length=8):
    """Return a random string made of ASCII letters and digits.

    Args:
        length: Number of characters to draw (default 8).

    Returns:
        A string of ``length`` characters, each drawn independently (with
        replacement) from ``string.ascii_letters + string.digits`` using the
        module-level ``random`` generator.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

# Entity names whose occurrences are fetched and clustered.
names = ['TiKV', 'PD', 'TiCDC', 'MySQL', 'Region', 'TiFlash', 'TiDB', 'Table', 'Table t ', 'Grafana', 'Prometheus', 'BR', 'Statistics']

# Page through the matching entities `batch` at a time until a fetch comes back
# empty, clustering each page independently.
while True:
    entities = aggregator.get_entities_by_name(names, batch*iteration, batch)
    if len(entities) == 0:
        print("cluster entities finished!")
        break
    iteration += 1

    print("start iteration", iteration)

    clusters = aggregator.cluster_entities(
        entities,
        embedding_weight=0.8,
        name_weight=0.2,
        desc_weight=0,
        similarity_threshold=0.75
    )
    for members in clusters:
        # A random prefix plus the iteration number labels each cluster.
        cluster_name = f"{generate_random_string()}_iter_{iteration}"
        # Flatten the cluster into one record per entity.
        clusters_info.extend(
            {
                'cluster': cluster_name,
                'entity_id': member.id,
                'entity_name': member.name,
                'entity_description': member.description,
                'entity_metadata': member.meta
            }
            for member in members
        )
        print(f"save cluster {cluster_name}, count {len(members)}")

start iteration 1


Computing similarity rows: 100%|██████████| 495/495 [00:14<00:00, 34.20it/s]
DBSCAN clustering: 100%|██████████| 1/1 [00:00<00:00, 174.70it/s]


-1 (Noise): Prometheus - Used to record detailed information of various operations in components.
-1 (Noise): TiDB - TiDB provides statement summary tables similar to MySQL's `events_statements_summary_by_digest` starting from v4.0.0-rc.1 and also provides statement summary tables in `information_schema`.
-1 (Noise): Table t - A sample table with columns a, b, c, and d used to illustrate column pruning.
-1 (Noise): MySQL - Derived from from relationship: sql_mode -> `sql_mode` controls MySQL compatibility behaviors. -> MySQL
-1 (Noise): TiFlash - Derived from from relationship: TiFlash -> TiFlash server uses the security.redact_info_log configuration item to control log redaction. -> security.redact_info_log (TiFlash Server)
-1 (Noise): Grafana - A monitoring tool used to visualize TiFlash encryption at rest status.
-1 (Noise): TiDB - TiDB interacts with the PD (Placement Driver) component to obtain timestamps (TSOs) and for other cluster management tasks.
-1 (Noise): BR - Derived from

In [4]:
from entity_agg import merge_entities, group_mergeable_entities

# Rebuild entity objects from the flat records and bucket them by cluster name.
cluster_mapping = {}
for info in clusters_info:
    cluster_name = info['cluster']
    rebuilt = aggregator._entity_model(
        id=info['entity_id'],
        name=info['entity_name'],
        description=info['entity_description'],
        meta=info['entity_metadata']
    )
    # Create the bucket on first sight of a cluster, then add the entity to it.
    cluster_mapping.setdefault(cluster_name, set()).add(rebuilt)

# Sanity check: print the members of the first cluster, if there is one.
first_cluster = next(iter(cluster_mapping), None)
if first_cluster is not None:
    print(f"Cluster: {first_cluster}")
    for entity in cluster_mapping[first_cluster]:
        print(f" - ID: {entity.id}, Name: {entity.name}, Description: {entity.description}")
        print(f"   - Metadata: {entity.meta}")

# Total number of clusters.
print(len(cluster_mapping))

Cluster: iFWJsx6B_iter_1
 - ID: 420239, Name: Region, Description: A geographical location where TiDB clusters are deployed, influencing factors such as availability zones, data processing costs, default domains, and cluster creation processes.
   - Metadata: {'details': {'influence': 'Affects the cost of data processing based on the location of the secondary cluster relative to the primary cluster'}, 'example': {'Region1': ['AZ1', 'AZ2'], 'Region2': ['AZ3', 'AZ4'], 'Region3': ['AZ5']}, 'labels': ['Region', 'AZ'], 'linked_entities': ['Default Domain'], 'properties': [{'property': 'relation', 'value': 'Determines the default domain'}, {'property': 'Selection', 'value': 'Chosen during cluster creation on the Create Cluster Page'}], 'topic': ['location', 'Data Processing Cost', 'Deployment', 'Region']}
 - ID: 35984, Name: Region, Description: A geographical area used to group stores, with four regions: North, East, West, and Central.  This grouping is used for partitioning employee data a

In [5]:
from llm_inference.base import LLMInterface

# Largest prompt (in tokens) that is sent to the model for one cluster.
MAX_PROMPT_TOKENS = 16384
# Prompts above this size get a context window sized to the prompt; smaller
# prompts use DEFAULT_NUM_CTX.
LARGE_PROMPT_TOKENS = 7000
DEFAULT_NUM_CTX = 8192
# Extra context added on top of the prompt size for large prompts.
NUM_CTX_HEADROOM = 1500

new_clusters_info = []
llm_client = LLMInterface("ollama", "deepseek-qwen-32b")

# For every cluster, ask the LLM to split it into groups of mergeable entities
# and record one row per (group, entity) pair in `new_clusters_info`.
for cluster_name, entities in cluster_mapping.items():
    print(f"merge entities cluster {cluster_name}, count {len(entities)}")

    # Shrink the cluster until its prompt fits within MAX_PROMPT_TOKENS.
    # Which half survives is arbitrary because sets are unordered.
    processed_entities = entities
    token_count = merge_entities(llm_client, processed_entities, only_count_token=True)
    while token_count > MAX_PROMPT_TOKENS and len(processed_entities) > 1:
        print(f"prompt token exceeds {MAX_PROMPT_TOKENS}", token_count)
        processed_entities = set(list(processed_entities)[:len(processed_entities)//2])
        print(f"prompt token exceeds {MAX_PROMPT_TOKENS}, reduced to", len(processed_entities))
        token_count = merge_entities(llm_client, processed_entities, only_count_token=True)

    if token_count > MAX_PROMPT_TOKENS:
        # A single entity is already too large; halving again would produce an
        # empty set, so skip the cluster instead of calling the LLM with nothing.
        logger.warning(
            "Skipping cluster %s: prompt still has %d tokens (limit %d) with %d entity left",
            cluster_name, token_count, MAX_PROMPT_TOKENS, len(processed_entities),
        )
        continue

    dropped_count = len(entities) - len(processed_entities)
    if dropped_count:
        # Make the truncation visible: these entities never reach the LLM and
        # therefore never appear in `new_clusters_info`.
        logger.warning(
            "Cluster %s: %d of %d entities dropped to fit the prompt limit",
            cluster_name, dropped_count, len(entities),
        )

    # Only the context window depends on the prompt size; the remaining
    # generation options are the same for every request.
    if token_count > LARGE_PROMPT_TOKENS:
        num_ctx = token_count + NUM_CTX_HEADROOM
    else:
        num_ctx = DEFAULT_NUM_CTX
    model_args = {
        "options": {
            "num_ctx": num_ctx,
            "num_gpu": 80,
            "num_predict": 8192,
            "temperature": 0.1,
        }
    }

    print("prompt token", token_count)
    try:
        merged_group = group_mergeable_entities(llm_client, processed_entities, **model_args)
        # Sub-clusters are numbered from 1 within their parent cluster.
        for cluster_idx, group in enumerate(merged_group, start=1):
            new_cluster_name = f"{cluster_name}_idx{cluster_idx}"
            for member in group:
                new_clusters_info.append(
                    {
                        'cluster': new_cluster_name,
                        'entity_id': member.id,
                        'entity_name': member.name,
                        'entity_description': member.description,
                        'entity_metadata': member.meta
                    }
                )
    except Exception as exc:
        # Best effort: log the failure with its traceback and move on to the
        # next cluster rather than aborting the whole run.
        logger.error(f"Error processing cluster {cluster_name}: {exc}", exc_info=True)

merge entities cluster iFWJsx6B_iter_1, count 31
prompt token 7390
merge entities cluster grbSNNAY_iter_1, count 60
prompt token exceeds 16384 33293
prompt token exceeds 20000, reduced to 30
prompt token exceeds 16384 22432
prompt token exceeds 20000, reduced to 15
prompt token 10635
merge entities cluster ycLOCgyU_iter_1, count 29
prompt token 4912
merge entities cluster L0HiZHjN_iter_1, count 16
prompt token 3534
merge entities cluster N8ssyFDY_iter_1, count 42
prompt token exceeds 16384 19662
prompt token exceeds 20000, reduced to 21
prompt token 9097
merge entities cluster qlUlRh9g_iter_1, count 8
prompt token 2106
merge entities cluster OxDK2dk9_iter_1, count 57
prompt token 8511
merge entities cluster PpqeOrGJ_iter_1, count 33
prompt token 6018
merge entities cluster z6Vp8pwH_iter_1, count 19
prompt token 3329
merge entities cluster wuetpTY5_iter_1, count 26
prompt token 7428
merge entities cluster WI8jEck8_iter_1, count 23
prompt token 5200
[ERROR in group_mergeable_entities]: N

In [None]:
# Display the per-entity rows collected by the merge-grouping loop (the whole list is rendered).
new_clusters_info

In [6]:
import pandas as pd

# One row per (cluster, entity) pair; every row starts out flagged as not yet
# processed. The frame is then persisted to "cluster_entities.pkl".
cluster_info_df = pd.DataFrame(new_clusters_info).assign(processed=False)
cluster_info_df.to_pickle("cluster_entities.pkl")

In [7]:
# Number of non-null values in each column of the saved frame.
cluster_info_df.count()

cluster               142
entity_id             142
entity_name           142
entity_description    142
entity_metadata       142
processed             142
dtype: int64