In [None]:
import sqlite3

# Count the distinct canon_entity.new_id values that start with "iodata".
db_path = r"your/custom/path/sgmkg.db"

# Query text kept separate from the connection handling below.
iodata_query = '''
    SELECT new_id FROM canon_entity
    WHERE new_id LIKE 'iodata%'
'''

conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute(iodata_query)

    # Each fetched row is a 1-tuple; keep only the id itself. A set is used,
    # so repeated ids (if the table contains any) collapse to a single entry.
    iodata_ids = {row[0] for row in cursor.fetchall()}
finally:
    # Always release the database handle, even when the query fails
    # (e.g. the canon_entity table does not exist).
    conn.close()

# Number of distinct matching ids (equals the row count when new_id is unique).
count = len(iodata_ids)

print(f"Number of matching rows: {count}")

In [None]:
import torch
import sqlite3
import json
from concurrent.futures import ThreadPoolExecutor

# Load the .pth file onto the CPU.
# NOTE(review): torch.load is pickle-based; depending on the installed torch
# version and its `weights_only` default it may execute code embedded in the
# file. Only load files that come from a trusted source.
pth_file_path = r"your/custom/path/data_entity_preprocessing.pth"
data = torch.load(pth_file_path, map_location=torch.device('cpu'))

# Connect to SQLite database and retrieve new_id values
db_path = r"your/custom/path/sgmkg.db"
iodata_query = '''
    SELECT new_id FROM canon_entity
    WHERE new_id LIKE 'iodata%'
'''

conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute(iodata_query)
    # Each fetched row is a 1-tuple; a set gives O(1) membership tests below.
    iodata_ids = {row[0] for row in cursor.fetchall()}
finally:
    # The database is only needed for this one query, so release it right away
    # instead of holding it open during the similarity computation. Calling
    # conn.close() again later is a harmless no-op.
    conn.close()

# Keep only the loaded entries whose key is one of the "iodata" ids.
# `data` is used as a mapping here (data.items()); ids that have no entry in
# it are simply absent from new_embeddings.
new_embeddings = {k: v for k, v in data.items() if k in iodata_ids}

# Define the function to calculate similarity
def calculate_similarity(key, embeddings, top_k=3):
    """Rank every other embedding by cosine similarity to ``embeddings[key]``.

    Args:
        key: Key of the target embedding; must be present in ``embeddings``.
        embeddings: Mapping from key to a torch tensor. Each tensor is
            presumably a single vector of shape ``(d,)`` or ``(1, d)`` --
            TODO confirm against the caller's data.
        top_k: Number of best matches to return. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        A ``(key, matches)`` tuple. ``matches`` is a list of at most ``top_k``
        dicts ``{"key": other_key, "similarity": float}`` ordered by
        descending similarity. The target key itself is never included.

    Raises:
        KeyError: If ``key`` is not in ``embeddings``.
    """
    def _as_row(tensor):
        # cosine_similarity reduces over dim=1 by default, which does not
        # exist on a 1-D tensor; promote plain vectors to a single row.
        # Tensors that already have 2+ dimensions are passed through untouched.
        return tensor.unsqueeze(0) if tensor.dim() == 1 else tensor

    target_embedding = _as_row(embeddings[key])
    similarities = {}

    # Gradients are never needed here, so tracking is disabled once for the
    # whole loop rather than once per comparison.
    with torch.no_grad():
        for other_key, other_embedding in embeddings.items():
            if other_key == key:
                continue
            # .item() requires a single-element result, i.e. one vector per key.
            similarities[other_key] = torch.nn.functional.cosine_similarity(
                target_embedding, _as_row(other_embedding)
            ).item()

    # Highest similarity first; ties keep the iteration order of `embeddings`.
    ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    print(f'Finish calculation for {key}')
    return key, [
        {"key": other_key, "similarity": similarity}
        for other_key, similarity in ranked[:top_k]
    ]

# The database handle is not used past this point, so release it before the
# long-running computation instead of after it. Closing an sqlite3 connection
# that is already closed is a harmless no-op.
conn.close()

# Use parallel computation to speed up similarity calculation.
# Only keys that actually have an embedding are submitted: iodata_ids comes
# from the database and may contain ids that are missing from new_embeddings,
# and calculate_similarity raises KeyError on embeddings[key] for those.
# Iterating the dict (instead of a set) also gives a stable output order.
results_dict = {}
with ThreadPoolExecutor() as executor:
    results = list(executor.map(lambda k: calculate_similarity(k, new_embeddings), list(new_embeddings)))
    for key, top_3 in results:
        results_dict[key] = top_3

# Save results to a JSON file
output_json_path = r"your/custom/path/similarity_results.json"
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(results_dict, f, ensure_ascii=False, indent=4)

print(f"Similarity calculation complete. Results saved to: {output_json_path}")


In [None]:
import sqlite3
import random
import json

# Build the aug_relation / aug_entity tables: copy the canonical tables, then
# add 'transfer' relations for entities that are similar to each other
# according to similarity_results.json.

# --- Configuration ---
db_path = r"your/custom/path/sgmkg.db"
json_file_path = r"your/custom/path/similarity_results.json"
MAX_INPUTS_PER_ENTITY = 5   # at most this many source relations are copied per entity
MIN_SIMILARITY = 0.68       # similar items scoring below this are skipped
RANDOM_SEED = 42            # fixed seed so the random sub-sampling is reproducible

COPY_RELATION_SQL = '''
    CREATE TABLE aug_relation AS
    SELECT * FROM canon_relation
'''
COPY_ENTITY_SQL = '''
    CREATE TABLE aug_entity AS
    SELECT * FROM canon_entity
'''
IODATA_IDS_SQL = '''
    SELECT new_id FROM aug_entity
    WHERE new_id LIKE 'iodata%'
'''


def _propagate_transfer_inputs(cursor, output_entity_id, similar_items, rng):
    """Copy the 'transfer' inputs of one entity onto its similar entities.

    Args:
        cursor: sqlite3 cursor on a database that contains canon_relation and
            aug_relation.
        output_entity_id: Entity whose 'transfer' inputs are the source.
        similar_items: Iterable of dicts with a 'key' (similar entity id) and
            a 'similarity' (number) entry.
        rng: ``random.Random`` instance used to sub-sample the inputs.
    """
    # Source relations are read from the untouched canon_relation table, not
    # from aug_relation: aug_relation is being written to by this very loop,
    # so reading it back would re-propagate synthetic relations to later
    # entities and make the result depend on the JSON iteration order.
    # ORDER BY makes the row order (and therefore the seeded sample) stable.
    cursor.execute('''
        SELECT input_entity_id FROM canon_relation
        WHERE relation_type = 'transfer'
        AND output_entity_id = ?
        ORDER BY input_entity_id
    ''', (output_entity_id,))
    input_entity_ids = [row[0] for row in cursor.fetchall()]

    # Cap the number of relations copied from a single entity.
    if len(input_entity_ids) > MAX_INPUTS_PER_ENTITY:
        input_entity_ids = rng.sample(input_entity_ids, MAX_INPUTS_PER_ENTITY)

    for item in similar_items:
        similar_output_entity_id = item['key']
        similarity = item['similarity']

        if similarity < MIN_SIMILARITY:
            print(f"Skipping {similar_output_entity_id} for {output_entity_id} due to low similarity: {similarity}")
            continue

        for input_entity_id in input_entity_ids:
            # Only insert when the similar entity does not already have this
            # 'transfer' relation (original or previously added).
            cursor.execute('''
                SELECT frequency FROM aug_relation
                WHERE output_entity_id = ?
                AND input_entity_id = ?
                AND relation_type = 'transfer'
            ''', (similar_output_entity_id, input_entity_id))

            if cursor.fetchone() is None:
                # The similarity score is stored as the new relation's connectivity.
                cursor.execute('''
                    INSERT INTO aug_relation (output_entity_id, input_entity_id, relation_type, connectivity, frequency)
                    VALUES (?, ?, 'transfer', ?, 1)
                ''', (similar_output_entity_id, input_entity_id, similarity))


# Read similarity_results.json first, so that a missing or malformed file
# fails before any table is dropped.
with open(json_file_path, 'r', encoding='utf-8') as file:
    similarity_data = json.load(file)

# Connect to SQLite database
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Step 1: Recreate aug_relation / aug_entity as copies of the canonical tables
    cursor.execute('DROP TABLE IF EXISTS aug_relation')  # Drop the old aug_relation table if it exists
    cursor.execute(COPY_RELATION_SQL)

    cursor.execute('DROP TABLE IF EXISTS aug_entity')  # Drop the old aug_entity table if it exists
    cursor.execute(COPY_ENTITY_SQL)

    # Collect the new_id values starting with "iodata" as a set.
    # Not used by the augmentation below; kept because it is a notebook-level
    # variable that other cells may read.
    cursor.execute(IODATA_IDS_SQL)
    iodata_ids = {row[0] for row in cursor.fetchall()}

    # Step 2: Propagate 'transfer' relations to similar entities
    rng = random.Random(RANDOM_SEED)
    for output_entity_id, similar_items in similarity_data.items():
        _propagate_transfer_inputs(cursor, output_entity_id, similar_items, rng)

    # Commit changes
    conn.commit()
finally:
    # Always release the database handle; if an error occurred before the
    # commit, the uncommitted inserts are discarded.
    conn.close()

print("Processing complete. All data has been updated in the aug_relation table.")
