In [12]:
import pandas as pd
import numpy as np
import torch
import os

In [13]:
# Set device
device = torch.device('cuda:1')
# Directory for data and logs
inputdir = '../data/'
precesseddir = '../data/processed/'
if not os.path.exists(precesseddir):
    os.makedirs(precesseddir)

In [14]:
geo_data = pd.read_csv(inputdir + 'geo_reference.csv', delimiter=';')
graph_data = geo_data[['iu_ac', 'iu_nd_amont', 'iu_nd_aval']]

In [15]:
def duplicate_check(data):
    # 对 'iu_ac' 进行分组，并计算 'iu_nd_amont' 和 'iu_nd_aval' 的唯一值数量
    unique_counts = data.groupby('iu_ac').agg({
        'iu_nd_amont': 'nunique',
        'iu_nd_aval': 'nunique'
    })
    # 检查所有 'iu_nd_amont' 和 'iu_nd_aval' 的唯一值数量是否都是 1
    all_unique = (unique_counts['iu_nd_amont'] == 1) & (unique_counts['iu_nd_aval'] == 1)
    # 如果 all_unique 中的所有值都是 True，则表示每个重复的 iu_ac 都有唯一对应的 iu_nd_amont 和 iu_nd_aval
    if all_unique.all():
        print("对于所有重复的 'iu_ac'，'iu_nd_amont' 和 'iu_nd_aval' 均唯一对应。")
    else:
        print("存在一些 'iu_ac'，其 'iu_nd_amont' 或 'iu_nd_aval' 不是唯一对应的。")

duplicate_check(graph_data)

对于所有重复的 'iu_ac'，'iu_nd_amont' 和 'iu_nd_aval' 均唯一对应。


In [16]:
u_graph_data = graph_data.drop_duplicates(subset=['iu_ac', 'iu_nd_amont', 'iu_nd_aval'])

In [17]:
# def create_adjacency_matrix(data):
#     # Extract unique nodes and map them to an index
#     node_ids = pd.concat([data['iu_ac'], data['iu_nd_amont'], data['iu_nd_aval']]).unique()
#     node_index = {node_id: idx for idx, node_id in enumerate(node_ids)}
#     # Initialize an adjacency matrix of size NxN where N is the number of unique nodes
#     num_nodes = len(node_ids)
#     print(f'node_numbers: {num_nodes}')
#     adjacency_matrix = torch.zeros(num_nodes, num_nodes, dtype=torch.float32)
#     # Set edges based on upstream and downstream relationships
#     for _, row in data.iterrows():
#         node_idx = node_index[row['iu_ac']]
#         if row['iu_nd_amont'] in node_index:  # Check if upstream node is present
#             upstream_idx = node_index[row['iu_nd_amont']]
#             adjacency_matrix[upstream_idx][node_idx] = 1  # From upstream to current
#         if row['iu_nd_aval'] in node_index:  # Check if downstream node is present
#             downstream_idx = node_index[row['iu_nd_aval']]
#             adjacency_matrix[node_idx][downstream_idx] = 1  # From current to downstream
#     return adjacency_matrix


In [18]:
def create_adjacency_matrix(data):
    # 提取唯一节点并映射到索引
    node_ids = pd.concat([data['iu_ac'], data['iu_nd_amont'], data['iu_nd_aval']]).unique()
    node_index = {node_id: idx for idx, node_id in enumerate(node_ids)}
    num_nodes = len(node_ids)
    
    # 反向映射：索引到 iu_ac 值
    index_to_iu_ac = {idx: node for node, idx in node_index.items()}

    print(f'节点数量: {num_nodes}')
    
    # 初始化邻接矩阵
    adjacency_matrix = torch.zeros(num_nodes, num_nodes, dtype=torch.float32)
    
    # 优化：直接操作向量化的DataFrame
    # 映射 iu_ac, iu_nd_amont, iu_nd_aval 到它们的索引
    mapped_data = data[['iu_ac', 'iu_nd_amont', 'iu_nd_aval']].applymap(lambda x: node_index.get(x, None))
    mapped_data = mapped_data.dropna()  # 移除任何含有 NaN 的行

    # 使用向量化方法填充邻接矩阵
    for _, row in mapped_data.iterrows():
        adjacency_matrix[int(row['iu_nd_amont']), int(row['iu_ac'])] = 1
        adjacency_matrix[int(row['iu_ac']), int(row['iu_nd_aval'])] = 1

    return adjacency_matrix, index_to_iu_ac

In [19]:
adj_matrix, index_iu_ac =  create_adjacency_matrix(u_graph_data)

节点数量: 4634


  mapped_data = data[['iu_ac', 'iu_nd_amont', 'iu_nd_aval']].applymap(lambda x: node_index.get(x, None))


In [20]:
total_elements = adj_matrix.numel()  # 计算矩阵中的总元素数
zero_elements = (adj_matrix == 0).sum().item()  # 计算矩阵中零元素的数量
# 计算稀疏度
non_zero = total_elements - zero_elements
sparsity = zero_elements / total_elements

print(f"Sparsity: {sparsity:.4f}, total_elements:{total_elements} ,non_zero_num:{non_zero}")
print(len(geo_data['iu_ac'].unique()))
print(len(geo_data['iu_nd_amont'].unique()))

Sparsity: 0.9997, total_elements:21473956 ,non_zero_num:6696
3348
1790


In [21]:
def save_data(adjacency_matrix, index_to_iu_ac, filename=precesseddir+"graph_data.npz"):
    # 将 PyTorch Tensor 转换为 NumPy array
    adjacency_matrix_np = adjacency_matrix.numpy()
    # 将字典转换为两个列表，用于保存
    keys = np.array(list(index_to_iu_ac.keys()))
    values = np.array(list(index_to_iu_ac.values()))
    
    # 使用 savez_compressed 来保存数据
    np.savez_compressed(filename, adjacency_matrix=adjacency_matrix_np, keys=keys, values=values)
    
# 保存数据
save_data(adj_matrix, index_iu_ac)

In [22]:
# npz Data Load example:
def load_data(filename=f"{precesseddir}graph_data.npz"):
    # 加载 npz 文件
    data = np.load(filename)
    
    # 读取邻接矩阵
    adjacency_matrix = data['adjacency_matrix']
    
    # 重构字典
    keys = data['keys']
    values = data['values']
    index_to_iu_ac = {key: value for key, value in zip(keys, values)}
    
    return adjacency_matrix, index_to_iu_ac

# 加载数据
loaded_adj_matrix, loaded_node_mapping = load_data()

# 打印结果，验证加载
print("Loaded Adjacency Matrix:\n", loaded_adj_matrix)
print("Loaded Node Mapping:", loaded_node_mapping)


Loaded Adjacency Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Loaded Node Mapping: {0: 1223, 1: 1139, 2: 5266, 3: 5450, 4: 5417, 5: 4429, 6: 6124, 7: 5931, 8: 978, 9: 936, 10: 1757, 11: 1729, 12: 1730, 13: 1466, 14: 4022, 15: 5189, 16: 5234, 17: 5460, 18: 6086, 19: 6185, 20: 1920, 21: 710, 22: 5476, 23: 4818, 24: 1639, 25: 6624, 26: 1329, 27: 330, 28: 601, 29: 4781, 30: 5026, 31: 797, 32: 1927, 33: 6346, 34: 6216, 35: 465, 36: 980, 37: 5901, 38: 4251, 39: 6930, 40: 6877, 41: 6943, 42: 6929, 43: 6947, 44: 6267, 45: 1945, 46: 6745, 47: 4724, 48: 1850, 49: 1851, 50: 1864, 51: 1925, 52: 4064, 53: 4953, 54: 4922, 55: 1682, 56: 5689, 57: 1862, 58: 279, 59: 4665, 60: 865, 61: 179, 62: 4529, 63: 6817, 64: 1442, 65: 4019, 66: 1555, 67: 1340, 68: 1687, 69: 1629, 70: 431, 71: 1596, 72: 435, 73: 5469, 74: 4963, 75: 6801, 76: 6502, 77: 6042, 78: 1764, 79: 4477, 80: 4746, 81: 1140,