In [3]:
import pandas as pd
import networkx as nx

def check(seed, out_dir='/home/wangzy/wikidbs/wikidbs/out'):
    """Print every pair of distinct databases that has zero overlapping features.

    Reads ``pairwise_stats_seed{seed}.csv`` from ``out_dir``, discards rows
    where ``db1`` equals ``db2`` and prints each remaining row whose
    ``overlap_features`` value is 0 (one pandas Series per row, in file order).

    Args:
        seed: Seed identifier used to build the CSV file name.
        out_dir: Directory containing the pairwise statistics CSV. Defaults to
            the location the notebook originally hard-coded.

    Returns:
        None. The matching rows are written to stdout.

    Raises:
        FileNotFoundError: If the CSV for ``seed`` does not exist (raised by
            ``pd.read_csv``).
        KeyError: If the CSV lacks a ``db1``, ``db2`` or ``overlap_features``
            column.
    """
    df = pd.read_csv(f'{out_dir}/pairwise_stats_seed{seed}.csv')

    # NOTE: the overlap graph used to be parsed here and its connected
    # components computed, but nothing consumed them; the load was dropped
    # because it only added latency to every call.

    # Build both conditions as vectorised masks instead of testing each row
    # inside a Python-level loop over the whole table.
    is_distinct_pair = df['db1'] != df['db2']
    has_no_overlap = df['overlap_features'] == 0

    for _, row in df[is_distinct_pair & has_no_overlap].iterrows():
        print(row)

In [4]:
# Print the zero-overlap database pairs recorded in the pairwise stats for seed 0.
check(0)

db1                 35179_JAPAN_NATIONAL_ROUTE_CONNECTIONS_3
db2                          42198_MissGeorgiaPageantWinners
overlap_features                                           0
table1_features                                           92
table2_features                                           38
Name: 0, dtype: object
db1                 35179_JAPAN_NATIONAL_ROUTE_CONNECTIONS_3
db2                           74599_Chief_Physician_Profiles
overlap_features                                           0
table1_features                                           92
table2_features                                         1102
Name: 1, dtype: object
db1                 35179_JAPAN_NATIONAL_ROUTE_CONNECTIONS_3
db2                          43630_takeda_clinical_trials_db
overlap_features                                           0
table1_features                                           92
table2_features                                         1236
Name: 2, dtype: object
db1             

KeyboardInterrupt: 

In [6]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import os

# Location of the overlap graph to analyse and of the generated figures
graph_path = '/home/wangzy/wikidbs/wikidbs/out/graphs/overlap_graph_seed0.gexf'
output_dir = 'out/graph_analysis'
os.makedirs(output_dir, exist_ok=True)


def _save_histogram(values, bins, density, xlabel, title, filename):
    """Draw a histogram of `values` and save it as `filename` inside `output_dir`."""
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(values, bins=bins, edgecolor='black', density=density)
    ax.set_xlabel(xlabel)
    # A normalised histogram shows a probability density, not raw counts, so
    # the y-axis label has to follow the `density` flag.
    ax.set_ylabel('Density' if density else 'Frequency')
    ax.set_title(title)
    fig.savefig(f'{output_dir}/{filename}')
    plt.close(fig)


# Load the graph
G = nx.read_gexf(graph_path)
print(f'Graph loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges.')

# Compute the connected components
components = list(nx.connected_components(G))
print(f'Number of connected components: {len(components)}')

# Collect the hop-count distribution inside every connected component.
# Every ordered (source, target) pair with source != target adds one entry.
all_shortest_paths = []
component_sizes = []
for component in components:
    subgraph = G.subgraph(component)
    component_sizes.append(len(subgraph))
    # Stream the per-source distance maps instead of materialising the full
    # all-pairs dictionary for the component first.
    for source, targets in nx.all_pairs_shortest_path_length(subgraph):
        all_shortest_paths.extend(
            distance for target, distance in targets.items() if target != source
        )

# Plot the shortest-path-length distribution (raw counts)
_save_histogram(
    all_shortest_paths,
    bins=50,
    density=False,
    xlabel='Shortest Path Length',
    title='Distribution of Shortest Path Lengths within Connected Components',
    filename='shortest_path_distribution.png',
)
print('Saved shortest path length distribution plot.')

# Plot the connected-component size distribution (normalised)
_save_histogram(
    component_sizes,
    bins=20,
    density=True,
    xlabel='Component Size (Number of Nodes)',
    title='Distribution of Connected Component Sizes',
    filename='component_size_distribution.png',
)
print('Saved component size distribution plot.')

# Plot the node degree distribution (normalised)
degrees = [degree for _, degree in G.degree()]
_save_histogram(
    degrees,
    bins=50,
    density=True,
    xlabel='Node Degree',
    title='Node Degree Distribution in Graph',
    filename='node_degree_distribution.png',
)
print('Saved node degree distribution plot.')

# Print summary statistics
print('Graph Analysis Summary:')
print(f' - Total Nodes: {G.number_of_nodes()}')
print(f' - Total Edges: {G.number_of_edges()}')
print(f' - Number of Connected Components: {len(components)}')
print(f' - Average Component Size: {np.mean(component_sizes):.2f}')
print(f' - Max Component Size: {np.max(component_sizes)}')
print(f' - Min Component Size: {np.min(component_sizes)}')
if all_shortest_paths:
    print(f' - Average Shortest Path Length: {np.mean(all_shortest_paths):.2f}')
    print(f' - Max Shortest Path Length: {np.max(all_shortest_paths)}')
else:
    # Only isolated nodes: there are no paths, and np.max would raise on [].
    print(' - Average Shortest Path Length: n/a')
    print(' - Max Shortest Path Length: n/a')
print(f' - Average Node Degree: {np.mean(degrees):.2f}')
print(f' - Max Node Degree: {np.max(degrees)}')
print(f' - Min Node Degree: {np.min(degrees)}')

Graph loaded: 1000 nodes, 96487 edges.
Number of connected components: 3
Saved shortest path length distribution plot.
Saved component size distribution plot.
Saved node degree distribution plot.
Graph Analysis Summary:
 - Total Nodes: 1000
 - Total Edges: 96487
 - Number of Connected Components: 3
 - Average Component Size: 333.33
 - Max Component Size: 676
 - Min Component Size: 144
 - Average Shortest Path Length: 1.64
 - Max Shortest Path Length: 4
 - Average Node Degree: 192.97
 - Max Node Degree: 577
 - Min Node Degree: 3
