In [None]:
## Canonicalization for io-data and functional-step entities (both use the same implementation; switch the commented-out paths to choose which one to run)
import json
import torch
from transformers import AutoTokenizer, AutoModel
import os
import time
import pandas as pd

# --- Configuration ------------------------------------------------------------
# Hugging Face model identifier handed to both from_pretrained() calls below.
model_name = 'BAAI/bge-m3'

# Directory that holds the cached embeddings and the entity CSV. The paths are
# assembled with os.path.join (as the output path further down in this cell
# already is) so they resolve on platforms where '\' is not a path separator;
# on Windows the resulting strings are the same as the former raw literals.
canon_dir = 'canon'

# To canonicalize functional steps instead of io-data, swap in these two paths
# (and the matching output file name near the end of this cell):
# pth_file = os.path.join(canon_dir, 'step_entity_preprocessing.pth')
# input_csv_path = os.path.join(canon_dir, 'functional_step_entities_with_content.csv')
pth_file = os.path.join(canon_dir, 'data_entity_preprocessing.pth')
input_csv_path = os.path.join(canon_dir, 'io_data_entities_with_content.csv')

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# NOTE(review): the clustering code in this cell only reads precomputed
# embeddings from pth_file; tokenizer/model are not referenced by the code
# visible here. They are kept because other cells may rely on them - confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

def load_embeddings(path=None):
    """Load the cached embeddings, mapping every tensor onto the CPU.

    Args:
        path: File to read. Defaults to the module-level ``pth_file`` so
            existing zero-argument calls behave exactly as before.

    Returns:
        Whatever object was saved in the file (the caller indexes it by
        entity id), or an empty dict when the file does not exist.
    """
    if path is None:
        path = pth_file
    if not os.path.exists(path):
        return {}
    # NOTE: torch.load unpickles the file; only load caches you produced
    # yourself or otherwise trust.
    return torch.load(path, map_location=torch.device('cpu'))

def get_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python float.

    The similarity is computed along ``dim=1`` and then unwrapped with
    ``.item()``, so the result must hold exactly one element (for example two
    tensors of shape ``(1, d)``).
    """
    return torch.cosine_similarity(embedding1, embedding2, dim=1).item()

# Two entities whose cosine similarity is at or above this value are placed in
# the same cluster.
SIMILARITY_THRESHOLD = 0.76

df = pd.read_csv(input_csv_path)

# Read the ids straight from the column: DataFrame.iterrows() does not preserve
# dtypes across a row, whereas tolist() keeps each id as the column's own type.
ids = df['id'].tolist()
is_visited = [False] * len(ids)

count = len(ids)
embeddings = load_embeddings()

# Fail before the long pairwise loop starts, rather than part-way through it,
# when the cache does not cover every id in the CSV.
missing_ids = [entity_id for entity_id in ids if entity_id not in embeddings]
if missing_ids:
    raise KeyError(
        f"{len(missing_ids)} id(s) from {input_csv_path} have no embedding in "
        f"{pth_file}; first few: {missing_ids[:5]}"
    )

# Maps each cluster representative id -> list of the other ids in its cluster.
canon_dict = {}

start_time = time.time()
for i, anchor_id in enumerate(ids):
    # Decrement first in both branches so "N left" always means
    # "ids remaining after this one".
    count -= 1
    if is_visited[i]:
        print(f"{i}:{anchor_id} has been clustered, {count} left")
        continue
    print(f"Processing {i}:{anchor_id}, {count} left")

    # This id has not been absorbed by an earlier cluster, so it becomes the
    # representative of a new one; compare it with every later unclustered id.
    canon_dict.setdefault(anchor_id, [])
    embedding1 = embeddings[anchor_id]
    for j in range(i + 1, len(ids)):
        if is_visited[j]:
            continue
        cur_id = ids[j]
        similarity = get_similarity(embedding1, embeddings[cur_id])
        if similarity >= SIMILARITY_THRESHOLD:
            # Greedy assignment: the first representative that is similar
            # enough claims the id, and it is never compared again.
            is_visited[j] = True
            canon_dict[anchor_id].append(cur_id)
    is_visited[i] = True
    end_time = time.time()
    print(f"Total processing time: {end_time - start_time:.2f} seconds")


folder_path = r"canon"
# file_path = os.path.join(folder_path, 'functional_step.json')
file_path = os.path.join(folder_path, 'io_data.json')

# Written with a BOM ('utf-8-sig'); the loader cell reads it back with the same
# encoding.
with open(file_path, 'w', encoding='utf-8-sig') as json_file:
    json.dump(canon_dict, json_file, ensure_ascii=False, indent=4)

end_time = time.time()
print(f"Finish, total processing time: {end_time - start_time:.2f} seconds")


In [None]:
## Save kg_canonicalization to db to create entities and relations

import json
import sqlite3

# Step 1: Load each JSON cluster file as {key: set containing the key and its listed values}, then merge the two results
def load_json_to_set(file_path):
    """Load a JSON object of ``key -> list`` and return ``key -> set``.

    Each returned set holds the key itself together with every value listed
    under it. The file is decoded as 'utf-8-sig', so a leading BOM is accepted.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as handle:
        clusters = json.load(handle)

    members_by_key = {}
    for canonical_id, aliases in clusters.items():
        members_by_key[canonical_id] = set(aliases) | {canonical_id}
    return members_by_key

# Read io_data.json file
io_data_file_path = r"your/custom/path/io_data.json"
key_value_set = load_json_to_set(io_data_file_path)

# Read functional_step.json file and merge it into key_value_set
functional_step_file_path = r"your/custom/path/functional_step.json"
functional_step_set = load_json_to_set(functional_step_file_path)

# Merge the two mappings; a key present in both files gets the union of members
for k, v in functional_step_set.items():
    if k in key_value_set:
        key_value_set[k].update(v)
    else:
        key_value_set[k] = v

# Reverse index: member id -> key of the first cluster (in key_value_set order)
# that contains it. setdefault keeps the first key seen, which is the same
# answer a scan-and-break over key_value_set would give, without rescanning
# every cluster for every entity row.
member_to_key = {}
for key, value_set in key_value_set.items():
    for member in value_set:
        member_to_key.setdefault(member, key)

# Step 2: Connect to SQLite database and read raw_entity table
db_path = r"your/custom/path/sgmkg.db"
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Create a new table and copy data
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS canon_entity (
            id TEXT PRIMARY KEY,
            entity_type TEXT,
            short_name TEXT,
            descr TEXT,
            new_id TEXT
        )
    ''')

    # Copy data to the new table and set new_id from the cluster lookup
    cursor.execute('SELECT id, entity_type, short_name, descr FROM raw_entity')
    rows = cursor.fetchall()

    # Entities that belong to no cluster are stored with new_id = NULL.
    cursor.executemany('''
        INSERT OR REPLACE INTO canon_entity (id, entity_type, short_name, descr, new_id)
        VALUES (?, ?, ?, ?, ?)
    ''', [
        (entity_id, entity_type, short_name, descr, member_to_key.get(entity_id))
        for entity_id, entity_type, short_name, descr in rows
    ])

    # Commit only after every row was written
    conn.commit()
finally:
    # Always release the database handle, even when a statement above failed,
    # so a failed run does not leave the connection open in the kernel.
    conn.close()

print("Data has been successfully copied and updated into the canon_entity table.")

In [None]:
import sqlite3

# Relation types whose endpoints are rewritten to canonical entity ids; every
# other relation type is copied unchanged.
REMAPPED_RELATION_TYPES = ('invoke', 'transfer')


def _canonical_entity_id(cursor, entity_id):
    """Return canon_entity.new_id for entity_id, or entity_id if there is none.

    The original id is kept both when canon_entity has no row for it and when
    the row exists but its new_id is NULL. fetchone() returns the tuple
    (None,) in the latter case, which is truthy, so the value inside the tuple
    has to be checked explicitly or the endpoint would be replaced by NULL.
    """
    cursor.execute('SELECT new_id FROM canon_entity WHERE id = ?', (entity_id,))
    found = cursor.fetchone()
    if found is None or found[0] is None:
        return entity_id
    return found[0]


# Connect to SQLite database
db_path = r"your/custom/path/sgmkg.db"
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Create a new table canon_relation_init with the same fields as raw_relation
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS canon_relation_init (
            id TEXT PRIMARY KEY,
            output_entity_id TEXT,
            input_entity_id TEXT,
            relation_type TEXT
        )
    ''')

    # Read data from the raw_relation table
    cursor.execute('SELECT id, output_entity_id, input_entity_id, relation_type FROM raw_relation')
    rows = cursor.fetchall()

    # Update output_entity_id and input_entity_id using new_id from canon_entity table
    for relation_id, output_entity_id, input_entity_id, relation_type in rows:
        # Only look ids up for relation types that are actually remapped
        if relation_type in REMAPPED_RELATION_TYPES:
            output_entity_id = _canonical_entity_id(cursor, output_entity_id)
            input_entity_id = _canonical_entity_id(cursor, input_entity_id)

        # Insert the (possibly remapped) relation into canon_relation_init
        cursor.execute('''
            INSERT OR REPLACE INTO canon_relation_init (id, output_entity_id, input_entity_id, relation_type)
            VALUES (?, ?, ?, ?)
        ''', (relation_id, output_entity_id, input_entity_id, relation_type))

    # Commit only after every row was written
    conn.commit()
finally:
    # Always release the database handle, even when a statement above failed
    conn.close()

print("Data has been successfully copied and updated into the canon_relation_init table.")


In [None]:
import sqlite3

# Connect to SQLite database
db_path = r"your/custom/path/sgmkg.db"
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Create a new table canon_relation, including original fields and new fields: connectivity and frequency
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS canon_relation (
            id TEXT PRIMARY KEY,
            output_entity_id TEXT,
            input_entity_id TEXT,
            relation_type TEXT,
            connectivity INTEGER,
            frequency INTEGER
        )
    ''')

    # Read data from the canon_relation_init table
    cursor.execute('SELECT id, output_entity_id, input_entity_id, relation_type FROM canon_relation_init')
    rows = cursor.fetchall()

    # (output_entity_id, input_entity_id) -> aggregated record for that pair.
    # The first relation seen for a pair supplies the id and relation_type;
    # later rows for the same pair only increase its frequency.
    # NOTE(review): relation_type is not part of the key, so relations of
    # different types between the same two entities are merged - confirm this
    # is intended.
    relation_dict = {}

    for relation_id, output_entity_id, input_entity_id, relation_type in rows:
        key = (output_entity_id, input_entity_id)
        if key in relation_dict:
            relation_dict[key]['frequency'] += 1
        else:
            relation_dict[key] = {
                'relation_id': relation_id,
                'output_entity_id': output_entity_id,
                'input_entity_id': input_entity_id,
                'relation_type': relation_type,
                'frequency': 1,
                # Every record is written with connectivity = 1 here.
                'connectivity': 1
            }

    # Insert the aggregated records into the new canon_relation table
    cursor.executemany('''
        INSERT OR REPLACE INTO canon_relation (
            id, output_entity_id, input_entity_id, relation_type, connectivity, frequency
        ) VALUES (?, ?, ?, ?, ?, ?)
    ''', [
        (
            value['relation_id'],
            value['output_entity_id'],
            value['input_entity_id'],
            value['relation_type'],
            value['connectivity'],
            value['frequency'],
        )
        for value in relation_dict.values()
    ])

    # Commit only after every row was written
    conn.commit()
finally:
    # Always release the database handle, even when a statement above failed
    conn.close()

print("The canon_relation table has been successfully created and populated with data.")
