In [None]:
import requests
import requests_cache
import urllib.parse
import sqlite3
import time
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Send HTTP calls made through `requests` via an in-memory cache named
# 'wikidata_cache'; cached responses expire after one day (86400 seconds).
requests_cache.install_cache(
    'wikidata_cache',
    backend='memory',
    expire_after=24 * 60 * 60,
)

def remove_disambiguator(label):
    """Return *label* without a trailing numeric disambiguator.

    A suffix of the form ``' (<digits>)'`` at the very end of the string is
    dropped, e.g. ``'father (1)'`` becomes ``'father'``. Labels without such
    a suffix are returned unchanged.
    """
    trailing_number = re.compile(r' \(\d+\)$')
    return trailing_number.sub('', label)

def safe_query(db_file, query, retries=50, delay=2):
    """Run *query* against a read-only SQLite database, retrying on errors.

    Args:
        db_file: Path of the SQLite database; opened via a ``file:`` URI with
            ``mode=ro``.
        query: SQL statement to execute. Only the first column of each
            returned row is used.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        A de-duplicated list (in arbitrary order) of the first-column values
        with any trailing numeric disambiguator removed, or an empty list
        when every attempt failed.
    """
    for attempt in range(retries):
        # Reset per attempt so the cleanup below never touches a connection
        # left over from an earlier iteration.
        conn = None
        try:
            conn = sqlite3.connect(f'file:{db_file}?mode=ro', uri=True)
            result = conn.execute(query).fetchall()
        except sqlite3.DatabaseError as e:
            print(f"[{db_file}] Attempt {attempt+1} failed: {e}")
            # No point in waiting once there is no further attempt to make.
            if attempt + 1 < retries:
                time.sleep(delay)
            continue
        finally:
            if conn is not None:
                conn.close()

        rows = list({remove_disambiguator(row[0]) for row in result})
        print(f"Total entities found in database: {len(rows)}")
        return rows

    print(f"[{db_file}] Failed after {retries} attempts.")
    return list()

def get_wikidata_id(entity_name, language="en", retries=50, delay=4):
    """Return the Wikidata ID of the first search hit for *entity_name*.

    Queries the ``wbsearchentities`` action of the Wikidata API.

    Args:
        entity_name: Label to search for.
        language: Language code passed to the search.
        retries: Maximum number of attempts when a request fails.
        delay: Seconds to wait between two attempts.

    Returns:
        The ``id`` of the first search result, or ``None`` when the search
        returned no result or every attempt failed.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": language,
        "search": entity_name
    }
    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=10)
            # Rate limiting and server errors are transient: retry them
            # instead of letting a body without 'search' be read as
            # "entity not found".
            if response.status_code == 429 or response.status_code >= 500:
                response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on requests versions
            # whose decode error is not a RequestException.
            print(f"Request failed for {entity_name} (attempt {attempt+1}): {e}")
            if attempt + 1 < retries:
                time.sleep(delay)
            continue

        hits = data.get('search')
        if hits:
            return hits[0]['id']
        return None
    return None

def get_statement_count(wikidata_id, retries=50, delay=4):
    """Return the size of the ``claims`` mapping of a Wikidata entity.

    Fetches ``Special:EntityData/<id>.json`` and measures the entity's
    ``claims`` entry.

    Args:
        wikidata_id: Entity ID, used both in the URL and as the key into
            the ``entities`` mapping of the response.
        retries: Maximum number of attempts when a request fails.
        delay: Seconds to wait between two attempts.

    Returns:
        ``len(claims)`` for the entity, or ``0`` when the entity is missing
        from the response, the server rejects the request, or every attempt
        failed.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    for attempt in range(retries):
        try:
            # The timeout keeps a stalled connection from blocking a worker
            # thread forever.
            response = requests.get(url, timeout=10)
            status = response.status_code
            if status == 429 or status >= 500:
                # Transient: raise so the attempt is retried below.
                response.raise_for_status()
            if 400 <= status < 500:
                # Permanent client error (e.g. unknown entity): retrying
                # cannot help, so report "no data".
                return 0
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Request failed for {wikidata_id} (attempt {attempt+1}): {e}")
            if attempt + 1 < retries:
                time.sleep(delay)
            continue

        try:
            entity = data['entities'][wikidata_id]
            # NOTE(review): in the Wikibase JSON format 'claims' appears to
            # map property IDs to lists of statements, so this would count
            # distinct properties rather than individual statements. Confirm
            # this is the intended measure before changing it.
            return len(entity.get('claims', {}))
        except KeyError:
            return 0
    return 0

def fetch_entity_info(name, language="en"):
    """Resolve *name* to a ``(name, wikidata_id, count)`` triple.

    The ID comes from ``get_wikidata_id`` and the count from
    ``get_statement_count``. When no ID is found the triple is
    ``(name, None, 0)`` and no count lookup is made.
    """
    qid = get_wikidata_id(name, language=language)
    if not qid:
        return (name, None, 0)
    return (name, qid, get_statement_count(qid))

def fetch_and_sort_by_statements(entities, language="en"):
    """Look up every entity on Wikidata and rank the hits by count.

    Lookups run through ``fetch_entity_info`` on a pool of ten worker
    threads with a progress bar. A short summary of found and missing
    entities is printed.

    Args:
        entities: Entity names to look up.
        language: Language code forwarded to the lookup.

    Returns:
        A list of ``(name, wikidata_id, count)`` tuples for the entities that
        were found, ordered by ``count`` descending. No top-k truncation is
        applied; the full ranking is returned.
    """
    found = []
    missing = []

    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = {
            pool.submit(fetch_entity_info, label, language): label
            for label in entities
        }
        progress = tqdm(
            as_completed(pending),
            total=len(pending),
            desc=f"Fetching statement counts from Wikidata for {len(entities)} entities",
        )
        for finished in progress:
            label, qid, n_statements = finished.result()
            if not qid:
                missing.append(label)
                continue
            found.append((label, qid, n_statements))

    # Highest count first; ties keep their completion order.
    ranked = sorted(found, key=lambda item: item[2], reverse=True)

    print(f"\nTotal entities also found in Wikidata: {len(ranked)}")
    print(f"Total not found in Wikidata: {len(missing)}")
    if missing:
        print("Some of the missing entities:")
        # Only a sample of at most ten names is listed.
        for absent in missing[:10]:
            print(f" - {absent}")

    return ranked

def bucket_by_percentiles(results):
    """Slice *results* into four consecutive quarter buckets.

    The cut points are ``int(len * 0.25)``, ``int(len * 0.5)`` and
    ``int(len * 0.75)``. The first quarter of the sequence is stored under
    ``"75-100%"`` and the last under ``"0-25%"``, so for input sorted in
    descending order ``"75-100%"`` holds the largest values.

    Returns:
        A dict with the keys ``"0-25%"``, ``"25-50%"``, ``"50-75%"`` and
        ``"75-100%"`` (in that order), each mapping to a slice of *results*.
    """
    size = len(results)
    first_cut = int(size * 0.25)
    second_cut = int(size * 0.5)
    third_cut = int(size * 0.75)
    return {
        "0-25%": results[third_cut:],
        "25-50%": results[second_cut:third_cut],
        "50-75%": results[first_cut:second_cut],
        "75-100%": results[:first_cut],
    }

In [None]:
# Selects the names of all nodes whose type is 'instance'.
# The literal uses single quotes: in SQL, double quotes denote identifiers,
# so "instance" would be read as a column reference whenever the table has a
# column of that name.
sql_query = "SELECT name FROM node WHERE type='instance';"

import re

def extract_language_code(db_file):
    """Derive a lowercase language code from a database file name.

    The code is the two uppercase letters found between ``_`` and ``_temp``
    in *db_file*, e.g. ``..._seed1_DE_temp0.db`` gives ``'de'``. File names
    without that pattern yield ``'en'``.
    """
    found = re.search(r'_([A-Z]{2})_temp', db_file)
    if found is None:
        return 'en'

    code = found.group(1).lower()
    # 'PO' in the file names is not a standard code; it stands for Polish.
    overrides = {'po': 'pl'}
    return overrides.get(code, code)

# SQLite databases to analyse, one per run; each is queried with `sql_query`.
# Exactly ten entries must be active: after the processing loop the per-run
# results are unpacked into ten `set_<k>_*` variables, which raises a
# ValueError for any other count.
# The commented-out groups are alternative experiment settings. Note that the
# seed_variation, backtranslation_EN and lang_variation groups list only nine
# files each, so enabling one of them as-is does not satisfy the ten-run
# unpacking.
db_files = [
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_1st.db',
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_2nd.db',
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_3rd.db',
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_4th.db',
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_5th.db',
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_6th.db',
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_7th.db',
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_8th.db',
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_9th.db',
    './base_setting/babylonGPTKB_termination_seed1_EN_temp0_10th.db',

    #'./seed_variation/babylonGPTKB_termination_seed2_EN_temp0.db',
    #'./seed_variation/babylonGPTKB_termination_seed3_EN_temp0.db',
    #'./seed_variation/babylonGPTKB_termination_seed4_EN_temp0.db',
    #'./seed_variation/babylonGPTKB_termination_seed5_EN_temp0.db',
    #'./seed_variation/babylonGPTKB_termination_seed6_EN_temp0.db',
    #'./seed_variation/babylonGPTKB_termination_seed7_EN_temp0.db',
    #'./seed_variation/babylonGPTKB_termination_seed8_EN_temp0.db',
    #'./seed_variation/babylonGPTKB_termination_seed9_EN_temp0.db',
    #'./seed_variation/babylonGPTKB_termination_seed10_EN_temp0.db',

    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_1st.db',
    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_2nd.db',
    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_3rd.db',
    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_4th.db',
    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_5th.db',
    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_6th.db',
    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_7th.db',
    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_8th.db',
    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_9th.db',
    #'./rand_variation/babylonGPTKB_termination_seed1_EN_temp1_10th.db',

    #'./lang_variation/backtranslation_EN/babylonGPTKB_DE-EN_20250630_100431.db',
    #'./lang_variation/backtranslation_EN/babylonGPTKB_ES-EN_20250616_091934.db',
    #'./lang_variation/backtranslation_EN/babylonGPTKB_FR-EN_20250627_151851.db',
    #'./lang_variation/backtranslation_EN/babylonGPTKB_IT-EN_20250628_132420.db',
    #'./lang_variation/backtranslation_EN/babylonGPTKB_PO-EN_20250623_102904.db',
    #'./lang_variation/backtranslation_EN/babylonGPTKB_PT-EN_20250623_104443.db',
    #'./lang_variation/backtranslation_EN/babylonGPTKB_RU-EN_20250612_163509.db',
    #'./lang_variation/backtranslation_EN/babylonGPTKB_SV-EN_20250703_095332.db',
    #'./lang_variation/backtranslation_EN/babylonGPTKB_TR-EN_20250620_104047.db',

    #'./lang_variation/ID_babylonGPTKB_seed1_DE_temp0.db',
    #'./lang_variation/ID_babylonGPTKB_seed1_ES_temp0.db',
    #'./lang_variation/ID_babylonGPTKB_seed1_FR_temp0.db',
    #'./lang_variation/ID_babylonGPTKB_seed1_IT_temp0.db',
    #'./lang_variation/ID_babylonGPTKB_seed1_PO_temp0.db',
    #'./lang_variation/ID_babylonGPTKB_seed1_PT_temp0.db',
    #'./lang_variation/ID_babylonGPTKB_seed1_RU_temp0.db',
    #'./lang_variation/ID_babylonGPTKB_seed1_SV_temp0.db',
    #'./lang_variation/ID_babylonGPTKB_seed1_TR_temp0.db',

    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_1.db',
    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_2.db',
    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_3.db',
    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_4.db',
    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_5.db',
    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_6.db',
    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_7.db',
    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_8.db',
    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_9.db',
    #'./topic_variation/TBBT/TBBT_GPTKB_seed1_EN_temp0_10.db',

    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_1.db',
    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_2.db',
    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_3.db',
    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_4.db',
    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_5.db',
    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_6.db',
    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_7.db',
    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_8.db',
    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_9.db',
    #'./topic_variation/dax40/dax40GPTKB_seed1_EN_temp0_10.db',
    ]

# Per-run label lists, one inner list per database file, grouped by bucket.
all_0_25_labels = []
all_25_50_labels = []
all_50_75_labels = []
all_75_100_labels = []
all_not_found_labels = []

# The unpacking at the end binds exactly ten runs. Check up front so a wrong
# file list fails before the Wikidata lookups instead of after them.
if len(db_files) != 10:
    raise ValueError(f"Expected exactly 10 database files, got {len(db_files)}")


def fetch_and_sort_by_statements_with_not_found(entities, language):
    """Look up *entities* on Wikidata; return ranked hits and the misses.

    Returns:
        ``(results, not_found)`` where ``results`` is a list of
        ``(name, wikidata_id, count)`` tuples sorted by ``count`` descending
        and ``not_found`` lists the names without a Wikidata ID.
    """
    results = []
    not_found = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_entity_info, name, language) for name in entities]
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Fetching statement counts from Wikidata for {len(entities)} entities"):
            name, wikidata_id, count = future.result()
            if wikidata_id:
                results.append((name, wikidata_id, count))
            else:
                not_found.append(name)

    results.sort(key=lambda x: x[2], reverse=True)
    return results, not_found


for db_file in db_files:
    # Lookups are pinned to English; extract_language_code(db_file) can be
    # used instead to derive the language from the file name.
    language = 'en'
    entity_list = safe_query(db_file, sql_query)

    # Fetch once per run (previously every run was looked up twice).
    top_entities, not_found_entities = fetch_and_sort_by_statements_with_not_found(entity_list, language)

    print(f"\nTotal entities also found in Wikidata: {len(top_entities)}")
    print(f"Total not found in Wikidata: {len(not_found_entities)}")
    if not_found_entities:
        print("Some of the missing entities:")
        for nf in not_found_entities[:10]:  # show first 10 missing entries
            print(f" - {nf}")

    buckets = bucket_by_percentiles(top_entities)

    bucket_0_25 = buckets["0-25%"]
    bucket_25_50 = buckets["25-50%"]
    bucket_50_75 = buckets["50-75%"]
    bucket_75_100 = buckets["75-100%"]

    # Print summaries
    for label, entities in buckets.items():
        print(f"\nBucket {label}: {len(entities)} entities.")
        for name, qid, count in entities[:5]:  # Show a few from each
            print(f" - {name} ({qid}) - {count} statements")

    # Keep only the entity names for the cross-run comparisons.
    all_0_25_labels.append([i[0] for i in bucket_0_25])
    all_25_50_labels.append([i[0] for i in bucket_25_50])
    all_50_75_labels.append([i[0] for i in bucket_50_75])
    all_75_100_labels.append([i[0] for i in bucket_75_100])
    all_not_found_labels.append(not_found_entities)

# Bind each run's label list to its own name for the analysis cells below.
(set_1_0_25, set_2_0_25, set_3_0_25, set_4_0_25, set_5_0_25,
 set_6_0_25, set_7_0_25, set_8_0_25, set_9_0_25, set_10_0_25
 ) = all_0_25_labels

(set_1_25_50, set_2_25_50, set_3_25_50, set_4_25_50, set_5_25_50,
 set_6_25_50, set_7_25_50, set_8_25_50, set_9_25_50, set_10_25_50
 ) = all_25_50_labels

(set_1_50_75, set_2_50_75, set_3_50_75, set_4_50_75, set_5_50_75,
 set_6_50_75, set_7_50_75, set_8_50_75, set_9_50_75, set_10_50_75
 ) = all_50_75_labels

(set_1_75_100, set_2_75_100, set_3_75_100, set_4_75_100, set_5_75_100,
 set_6_75_100, set_7_75_100, set_8_75_100, set_9_75_100, set_10_75_100
 ) = all_75_100_labels

(set_1_not_found, set_2_not_found, set_3_not_found, set_4_not_found, set_5_not_found,
 set_6_not_found, set_7_not_found, set_8_not_found, set_9_not_found, set_10_not_found
 ) = all_not_found_labels

In [None]:
# AVERAGE JACCARD SIMILARITY

from itertools import combinations
import numpy as np

def jaccard_similarity(list_a, list_b):
    """Return the Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of two collections.

    Both arguments are converted to sets first, so duplicates are ignored.
    Two empty collections are treated as identical and yield ``1.0`` rather
    than raising ``ZeroDivisionError``.
    """
    set_a, set_b = set(list_a), set(list_b)
    union = set_a | set_b
    if not union:
        return 1.0
    return len(set_a & set_b) / len(union)

# One entry per comparison group: (title used in the summary line, suffix of
# the variable names shown in the per-pair lines, the ten per-run label lists).
jaccard_groups = [
    ("BUCKET 0-25%", "0_25",
     [set_1_0_25, set_2_0_25, set_3_0_25, set_4_0_25, set_5_0_25,
      set_6_0_25, set_7_0_25, set_8_0_25, set_9_0_25, set_10_0_25]),
    ("BUCKET 25-50%", "25_50",
     [set_1_25_50, set_2_25_50, set_3_25_50, set_4_25_50, set_5_25_50,
      set_6_25_50, set_7_25_50, set_8_25_50, set_9_25_50, set_10_25_50]),
    ("BUCKET 50-75%", "50_75",
     [set_1_50_75, set_2_50_75, set_3_50_75, set_4_50_75, set_5_50_75,
      set_6_50_75, set_7_50_75, set_8_50_75, set_9_50_75, set_10_50_75]),
    ("BUCKET 75-100%", "75_100",
     [set_1_75_100, set_2_75_100, set_3_75_100, set_4_75_100, set_5_75_100,
      set_6_75_100, set_7_75_100, set_8_75_100, set_9_75_100, set_10_75_100]),
    ("NOT FOUND", "not_found",
     [set_1_not_found, set_2_not_found, set_3_not_found, set_4_not_found, set_5_not_found,
      set_6_not_found, set_7_not_found, set_8_not_found, set_9_not_found, set_10_not_found]),
]

for group_title, group_suffix, sets in jaccard_groups:
    set_names = [f"set_{run}_{group_suffix}" for run in range(1, len(sets) + 1)]
    total_similarity = 0
    pair_count = 0
    # Collected so the standard deviation reuses the values computed here
    # instead of recomputing every pairwise similarity.
    pair_similarities = []
    for (i, set_a), (j, set_b) in combinations(enumerate(sets), 2):
        sim = jaccard_similarity(set_a, set_b)
        print(f"Jaccard({set_names[i]} ∩ {set_names[j]}) = {sim:.4f}")
        total_similarity += sim
        pair_count += 1
        pair_similarities.append(sim)
    average_similarity = total_similarity / pair_count
    std_similarity = np.std(pair_similarities)
    print(f"\nAverage pairwise Jaccard similarity {group_title}: {average_similarity:.4f} ± {std_similarity:.4f}")

In [None]:
# COSINE-BASED HAUSDORFF SIMILARITY: BUCKETS vs FULL SETS OF OTHER RUNS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
import numpy as np

# Sentence-embedding model used for the cosine comparisons below.
# No device is forced: with the default, sentence-transformers uses a GPU
# when one is available and otherwise falls back to the CPU, so the cell
# also runs on machines without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Alternative model:
#model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

def average_cosine_hausdorff_similarity_buckets(set1, set2, match_threshold=0.95):
    """Measure how well the labels in *set1* are covered by *set2*.

    Both collections are embedded with the module-level ``model``. For every
    item of *set1* the closest item of *set2* by cosine distance is taken;
    the comparison is one-directional (*set1* to *set2* only).

    Args:
        set1: Labels whose coverage is measured.
        set2: Labels to match against.
        match_threshold: Minimum cosine similarity to the closest item of
            *set2* for an item of *set1* to count as matched.

    Returns:
        A 5-tuple ``(avg_similarity, matches, pct, len1, len2)``:
        one minus the mean nearest-neighbour cosine distance, the number of
        matched items of *set1*, that number as a percentage of
        ``len(set1)``, and the sizes of both inputs. When either input is
        empty the first three values are ``0``.
    """
    len1 = len(set1)
    len2 = len(set2)
    if not set1 or not set2:
        # Same arity as the regular return so callers can always unpack
        # five values.
        return 0, 0, 0, len1, len2

    embeddings1 = model.encode(list(set1), convert_to_numpy=True, batch_size=64)
    embeddings2 = model.encode(list(set2), convert_to_numpy=True, batch_size=64)

    # distances_1[r, c] is the cosine distance between item r of set1 and
    # item c of set2.
    distances_1 = cosine_distances(embeddings1, embeddings2)
    avg_distance1 = np.mean(np.min(distances_1, axis=1))
    avg_similarity = 1 - avg_distance1

    sim_matrix_1 = 1 - distances_1
    matches1 = np.sum(np.max(sim_matrix_1, axis=1) >= match_threshold)
    pct1 = 100 * matches1 / len1
    return avg_similarity, matches1, pct1, len1, len2

bucket_types = ['0_25', '25_50', '50_75', '75_100', 'not_found']
# Titles for the printed report; only the percentile ranges carry a '%'.
bucket_labels = {
    '0_25': '0-25%',
    '25_50': '25-50%',
    '50_75': '50-75%',
    '75_100': '75-100%',
    'not_found': 'not-found',
}
# Per bucket type, the label lists of the ten runs in run order.
buckets = {
    '0_25': [set_1_0_25, set_2_0_25, set_3_0_25, set_4_0_25, set_5_0_25, set_6_0_25, set_7_0_25, set_8_0_25, set_9_0_25, set_10_0_25],
    '25_50': [set_1_25_50, set_2_25_50, set_3_25_50, set_4_25_50, set_5_25_50, set_6_25_50, set_7_25_50, set_8_25_50, set_9_25_50, set_10_25_50],
    '50_75': [set_1_50_75, set_2_50_75, set_3_50_75, set_4_50_75, set_5_50_75, set_6_50_75, set_7_50_75, set_8_50_75, set_9_50_75, set_10_50_75],
    '75_100': [set_1_75_100, set_2_75_100, set_3_75_100, set_4_75_100, set_5_75_100, set_6_75_100, set_7_75_100, set_8_75_100, set_9_75_100, set_10_75_100],
    'not_found': [set_1_not_found, set_2_not_found, set_3_not_found, set_4_not_found, set_5_not_found, set_6_not_found, set_7_not_found, set_8_not_found, set_9_not_found, set_10_not_found]
}
# Full label list of each run: its buckets concatenated in `bucket_types`
# order (0-25, 25-50, 50-75, 75-100, not found).
full_sets = [
    [label for part in run_parts for label in part]
    for run_parts in zip(*(buckets[bucket_name] for bucket_name in bucket_types))
]
for bucket_name in bucket_types:
    bucket_label = bucket_labels[bucket_name]
    print(f"\n=== Cosine-based Hausdorff: {bucket_label} buckets vs full sets of other runs (bucket_i to full_j, i ≠ j, one direction only) ===")
    similarities = []
    match_percentages = []
    for i, bucket in enumerate(buckets[bucket_name]):
        for j, full in enumerate(full_sets):
            if i == j:
                continue  # skip comparing to own run
            similarity, matches, pct, len1, len2 = average_cosine_hausdorff_similarity_buckets(bucket, full)
            print(f"Bucket {bucket_name} of set {i+1} vs full set {j+1}: Similarity: {similarity:.4f}, Matches {i+1}→{j+1}: {matches}/{len1} ({pct:.1f}%)")
            similarities.append(similarity)
            match_percentages.append(pct)
    if similarities:
        print(f"Overall Avg Cosine-based Hausdorff Similarity {bucket_label}: {np.mean(similarities):.4f} ± {np.std(similarities):.4f}")
        print(f"Overall Avg Match Percentage {bucket_label}: {np.mean(match_percentages):.2f}% ± {np.std(match_percentages):.2f}%")