In [27]:
import h5py

data_path = "./Camp.h5"
with h5py.File(data_path, "r") as f:
    def print_keys(name, obj):
        print(f"{name}: {obj}")
    f.visititems(print_keys)

X: <HDF5 dataset "X": shape (777, 16270), type "<f4">
obs: <HDF5 group "/obs" (2 members)>
obs/Group: <HDF5 dataset "Group": shape (777,), type "<i8">
obs/cell_id: <HDF5 dataset "cell_id": shape (777,), type "|O">
var: <HDF5 group "/var" (1 members)>
var/gene_id: <HDF5 dataset "gene_id": shape (16270,), type "|O">


In [32]:
import os
import numpy as np
import torch
import torch.nn.functional as F
import pandas as pd
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, GraphConv, GATConv  # 直接导入 GNN 层
from train import filter_data, make_graph_pyg
from models import GCNAE
import h5py
import random

# 固定随机种子
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

# 参数设置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
category = "real_data"
epochs = 10
batch_size = 128
pca_size = 50
path = "./"  # 当前目录
files = ["Camp"]
print("Datasets:", files)

nb_genes = 2000
hidden_dim = 200
activation = F.relu
results = pd.DataFrame()

# 定义模型映射
gnn_layer_map = {
    "GCNConv": GCNConv,
    "GraphConv": GraphConv,
    "GATConv": GATConv,
}

# 主循环
for dataset in files:
    print(f"Processing dataset: {dataset}")
    
    # 加载数据
    data_path = os.path.join(path, f"{dataset}.h5")
    with h5py.File(data_path, "r") as f:
        # 数据矩阵
        X = np.array(f['X'])  # 矩阵形状 (777, 16270)
        # 标签（Group列）
        Y = np.array(f['obs']['Group'])  # 标签形状 (777,)

    print(f"Data shape: {X.shape}, Labels shape: {Y.shape}")

    # 数据过滤和PCA
    genes_idx, cells_idx = filter_data(X, highly_genes=nb_genes)
    X = X[cells_idx][:, genes_idx]  # 过滤后的数据
    Y = Y[cells_idx]  # 过滤后的标签
    n_clusters = len(np.unique(Y))

    # 构建图数据
    edge_index, node_features, labels = make_graph_pyg(
        X,
        Y=Y,
        threshold=0,
        dense_dim=pca_size,
        normalize_weights="log_per_cell",
    )

    edge_index = torch.tensor(edge_index, dtype=torch.long)
    node_features = torch.tensor(node_features, dtype=torch.float)
    labels = torch.tensor(labels, dtype=torch.long)

    pyg_data = Data(x=node_features, edge_index=edge_index, y=labels)
    dataloader = DataLoader([pyg_data], batch_size=batch_size, shuffle=True)

    print(f"Graph Info - Nodes: {pyg_data.num_nodes}, Edges: {pyg_data.num_edges}")

    for model_name, gnn_layer in gnn_layer_map.items():
        for run in range(3):
            # 模型初始化
            torch.manual_seed(run)
            torch.cuda.manual_seed_all(run)
            np.random.seed(run)
            random.seed(run)

            model = GCNAE(
                in_feats=pca_size,
                n_hidden=hidden_dim,
                n_layers=1,
                activation=activation,
                dropout=0.1,
                hidden=None,
                hidden_relu=False,
                hidden_bn=False,
            ).to(device)

            # 替换 GNN 层
            for i in range(len(model.layers)):
                model.layers[i] = gnn_layer(
                    in_channels=pca_size if i == 0 else hidden_dim,
                    out_channels=hidden_dim,
                )

            optim = torch.optim.Adam(model.parameters(), lr=1e-5)
            print(f">> Training Model: {model_name} | Run: {run}")

            # 模型训练
            scores = train(
                model=model,
                optimizer=optim,
                n_epochs=epochs,
                dataloader=dataloader,
                n_clusters=n_clusters,
                plot=False,
                cluster=["KMeans", "Leiden"],
            )

            # 确保分数是字典类型，转为 DataFrame
            scores_df = pd.DataFrame([scores])

            # 记录结果
            scores_df["dataset"] = dataset
            scores_df["run"] = run
            scores_df["nb_genes"] = nb_genes
            scores_df["hidden_dim"] = hidden_dim
            scores_df["model_name"] = model_name
            results = pd.concat([results, scores_df], ignore_index=True)

            # 保存结果
            results.to_pickle(f"./{category}_graph_networks.pkl")

# 汇总和显示结果
print("Mean results grouped by model_name:")
numeric_results = results.select_dtypes(include=[np.number])  # 仅保留数值列
print(numeric_results.groupby(results["model_name"]).mean())

print("Mean results grouped by dataset:")
print(numeric_results.groupby(results["dataset"]).mean())

Datasets: ['Camp']
Processing dataset: Camp
Data shape: (777, 16270), Labels shape: (777,)


  X = np.array(f['X'])  # 矩阵形状 (777, 16270)
  Y = np.array(f['obs']['Group'])  # 标签形状 (777,)


Graph Info - Nodes: 2777, Edges: 414710
>> Training Model: GCNConv | Run: 0
Epoch 1/10, Loss: 0.4733
Epoch 2/10, Loss: 0.4733
Epoch 3/10, Loss: 0.4733
Epoch 4/10, Loss: 0.4733
Epoch 5/10, Loss: 0.4733
Epoch 6/10, Loss: 0.4733
Epoch 7/10, Loss: 0.4733
Epoch 8/10, Loss: 0.4733
Epoch 9/10, Loss: 0.4733
Epoch 10/10, Loss: 0.4733
>> Training Model: GCNConv | Run: 1
Epoch 1/10, Loss: 0.4734
Epoch 2/10, Loss: 0.4734
Epoch 3/10, Loss: 0.4734
Epoch 4/10, Loss: 0.4734
Epoch 5/10, Loss: 0.4734
Epoch 6/10, Loss: 0.4734
Epoch 7/10, Loss: 0.4734
Epoch 8/10, Loss: 0.4734
Epoch 9/10, Loss: 0.4734
Epoch 10/10, Loss: 0.4734
>> Training Model: GCNConv | Run: 2
Epoch 1/10, Loss: 0.4735
Epoch 2/10, Loss: 0.4735
Epoch 3/10, Loss: 0.4735
Epoch 4/10, Loss: 0.4735
Epoch 5/10, Loss: 0.4735
Epoch 6/10, Loss: 0.4735
Epoch 7/10, Loss: 0.4735
Epoch 8/10, Loss: 0.4735
Epoch 9/10, Loss: 0.4735
Epoch 10/10, Loss: 0.4735
>> Training Model: GraphConv | Run: 0
Epoch 1/10, Loss: 0.3293
Epoch 2/10, Loss: 0.3289
Epoch 3/10,