In [3]:
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity

def _min_pairwise_cosine(features):
    """Return the smallest pairwise cosine similarity among the rows of *features*."""
    norms = np.linalg.norm(features, axis=1, keepdims=True)
    # Treat all-zero rows as having unit norm so they yield similarity 0
    # instead of a division by zero.
    norms[norms == 0] = 1.0
    unit_rows = features / norms
    return float((unit_rows @ unit_rows.T).min())


def generate_synthetic_data(
    num_communities=3,
    nodes_per_community=20,
    feature_dim=5,
    r=0.85,
    intra_weight=1.2,
    inter_weight=0.3,
    edge_prob=0.15,
    noise_scale=0.1,
    seed=42,
    sparsify_percentile=70,
):
    """Generate a synthetic attributed graph with planted communities.

    Node ids are contiguous per community: community ``c`` owns ids
    ``c * nodes_per_community`` .. ``(c + 1) * nodes_per_community - 1``.
    Edges are undirected and listed once, always with ``source < target``.

    Parameters:
        num_communities: number of communities.
        nodes_per_community: number of nodes in each community.
        feature_dim: dimensionality of the node feature vectors.
        r: minimum cosine similarity required between any two nodes of the
            same community. It is only *checked* after generation; nothing in
            the sampling enforces it, so a large ``noise_scale`` makes the
            check fail.
        intra_weight: base weight of an edge inside a community.
        inter_weight: base weight of an edge between two communities.
        edge_prob: probability of an edge between two nodes of the same
            community; pairs from different communities use ``edge_prob / 5``.
        noise_scale: standard deviation of the Gaussian noise added to the
            community centroid when sampling node features.
        seed: value passed to ``np.random.seed``. Note that this reseeds
            NumPy's global random state.
        sparsify_percentile: percentile (0-100) of the positive edge weights
            used as the pruning threshold; edges strictly below it are
            dropped. With the default weights, inter-community edges are
            almost always lighter than intra-community ones, so a high
            percentile tends to remove most of them.

    Returns:
        A dict with two keys: ``"nodes"``, a list of
        ``{"id", "features", "community"}`` dicts, and ``"edges"``, a list of
        ``{"source", "target", "weight"}`` dicts.

    Raises:
        AssertionError: if the minimum intra-community cosine similarity of
            any community is below ``r``.
    """
    np.random.seed(seed)
    total_nodes = num_communities * nodes_per_community

    # One unit-length centroid per community.
    centroids = np.random.randn(num_communities, feature_dim)
    centroids /= np.linalg.norm(centroids, axis=1, keepdims=True)

    # Sample node features around each centroid.
    nodes = []
    community_features = []
    for comm_id in range(num_communities):
        comm_features = centroids[comm_id] + np.random.normal(
            scale=noise_scale,
            size=(nodes_per_community, feature_dim),
        )
        # Project onto the unit sphere. This does not by itself guarantee a
        # similarity of at least r; that is verified at the end.
        norms = np.linalg.norm(comm_features, axis=1)
        comm_features = comm_features / norms[:, None]
        # Small uniform jitter, applied after normalisation, so the stored
        # features are only approximately unit length.
        comm_features += np.random.uniform(-0.05, 0.05, size=comm_features.shape)
        community_features.append(comm_features)

        first_id = comm_id * nodes_per_community
        for offset, row in enumerate(comm_features):
            nodes.append({
                "id": first_id + offset,
                "features": row.tolist(),
                "community": comm_id,
            })

    # Only the upper triangle (i < j) of the adjacency matrix is filled.
    adj_matrix = np.zeros((total_nodes, total_nodes))

    # Intra-community edges: base weight plus exponential noise.
    for comm_id in range(num_communities):
        start = comm_id * nodes_per_community
        end = start + nodes_per_community
        for i in range(start, end):
            for j in range(i + 1, end):
                if np.random.rand() < edge_prob:
                    adj_matrix[i, j] = intra_weight + np.random.exponential(0.2)

    # Inter-community edges: five times less likely and lighter.
    inter_prob = edge_prob / 5
    for i in range(total_nodes):
        for j in range(i + 1, total_nodes):
            # The random draw happens only for cross-community pairs.
            if (
                nodes[i]["community"] != nodes[j]["community"]
                and np.random.rand() < inter_prob
            ):
                adj_matrix[i, j] = inter_weight + np.random.exponential(0.1)

    # Sparsify: drop edges lighter than the requested percentile. Skipped
    # when no edge was sampled, because a percentile of an empty array is
    # undefined.
    positive_weights = adj_matrix[adj_matrix > 0]
    if positive_weights.size:
        threshold = np.percentile(positive_weights, sparsify_percentile)
        adj_matrix[adj_matrix < threshold] = 0

    # Convert the surviving entries to an edge list (row-major order).
    rows, cols = np.nonzero(adj_matrix > 0)
    edges = [
        {
            "source": int(i),
            "target": int(j),
            "weight": float(adj_matrix[i, j]),
        }
        for i, j in zip(rows, cols)
    ]

    # Verify the intra-community similarity requirement.
    for comm_id, features in enumerate(community_features):
        if features.shape[0] == 0:
            continue
        min_sim = _min_pairwise_cosine(features)
        print(f"社区{comm_id}最小相似度：{min_sim:.4f} (要求≥{r})")
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        if not min_sim >= r:
            raise AssertionError("社区内相似度不足")

    return {
        "nodes": nodes,
        "edges": edges,
    }

if __name__ == "__main__":
    # Generate a 3-community graph with 50 nodes per community; the remaining
    # parameters keep their defaults.
    data = generate_synthetic_data(
        num_communities=3,
        nodes_per_community=50,
        r=0.8,
        edge_prob=0.1,
    )

    # Explicit encoding so the output does not depend on the platform's
    # default; indent=0 puts each item on its own line without indentation.
    with open("synthetic_graph.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=0)

    # Summary of the generated data.
    print("\n生成数据统计：")
    print(f"节点数：{len(data['nodes'])}")
    print(f"边数：{len(data['edges'])}")


社区0最小相似度：0.8149 (要求≥0.8)
社区1最小相似度：0.8444 (要求≥0.8)
社区2最小相似度：0.8424 (要求≥0.8)

生成数据统计：
节点数：150
边数：160
