In [2]:
import os
import sys
import time
import pickle
import random
import h5py
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import scanpy as sc
from sklearn.metrics import (
    adjusted_rand_score,
    calinski_harabasz_score,
    normalized_mutual_info_score,
    silhouette_score,
)
from tqdm import tqdm
import matplotlib.pyplot as plt

# 添加自定义模块路径
sys.path.append("../")
import train
import models
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader

# 固定随机种子
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# 确保在CPU上运行
device = torch.device("cpu")
print(f"Using device: {device}")

# 配置全局参数
category = "real_data"
epochs = 10
batch_size = 128
pca_size = 50
nb_genes = 3000
hidden_dim = 200
hidden = [300]
model_name = "GraphConv"
normalize_weights = "log_per_cell"
node_features = "scale"
same_edge_values = False
edge_norm = True
hidden_relu = False
hidden_bn = False
activation = F.relu
n_layers = 1

# 创建保存路径
output_dir = f"../output/pickle_results/{category}"
os.makedirs(output_dir, exist_ok=True)  # 确保目录存在

# 文件路径设置
path = "../"
files = ["Quake_10x_Bladder", "Quake_Smart-seq2_Trachea"]

results = pd.DataFrame()

# 主循环：逐个数据集处理
for dataset in files:
    print(f">> Processing dataset: {dataset}")

    # 加载数据
    path = './Camp.h5'
    data = sc.read_h5ad(path)

    X_all = data.X
    y_all = data.obs.values[:, 0]

    Y = y_all
    X = X_all

    n_clusters = len(np.unique(Y))
    print(f"Number of clusters: {n_clusters}")

    # 数据过滤
    genes_idx, cells_idx = train.filter_data(X, highly_genes=nb_genes)
    X = X[cells_idx][:, genes_idx]
    Y = Y[cells_idx]

    # 构建图
    t0 = time.time()
    edge_index, node_feats, labels = train.make_graph_pyg(
        X,
        Y,
        dense_dim=pca_size,  # 使用 dense_dim 控制 PCA 维度
        normalize_weights=normalize_weights  # 保留 normalize_weights 参数
    )

    # 创建 PyG Data 对象
    graph_data = Data(
        x=torch.tensor(node_feats, dtype=torch.float),
        edge_index=torch.tensor(edge_index, dtype=torch.long),
        y=torch.tensor(labels, dtype=torch.long),
    )

    # 定义训练节点 ID
    train_ids = (graph_data.y != -1).nonzero(as_tuple=False).view(-1)

    # 使用 PyG 的 NeighborLoader 进行采样
    sampler_loader = NeighborLoader(
        graph_data,
        input_nodes=train_ids,
        num_neighbors=[-1],  # 全邻居
        batch_size=batch_size,
        shuffle=True,
    )

    print(f"INPUT: {model_name} {hidden_dim}, {hidden}, {same_edge_values}, {edge_norm}")
    t1 = time.time()

    # 不同分辨率下运行Leiden聚类
    for resolution in [0.05, 0.1, 0.2, 0.5, 0.9, 1, 1.2]:
        print(f"Running Leiden with resolution: {resolution}")
        t_start = time.time()
        torch.manual_seed(42)

        # 初始化模型
        model = models.GCNAE(
            in_feats=pca_size,
            n_hidden=hidden_dim,
            n_layers=n_layers,
            activation=activation,
            dropout=0.1,
            hidden=hidden,
            hidden_relu=hidden_relu,
            hidden_bn=hidden_bn,
        ).to(device)

        optim = torch.optim.Adam(model.parameters(), lr=1e-5)

        # 训练模型
        scores = train.train(
            model,
            optim,
            epochs,
            sampler_loader,
            n_clusters,
            plot=False,
            save=False,
            cluster=["Leiden"],
            cluster_params={
                "Leiden": {
                    "resolution": resolution,
                }
            },
        )

        # 收集结果
        scores["dataset"] = dataset
        scores["resolution"] = resolution
        scores["nb_pred_clusters"] = np.unique(scores["leiden_pred"]).shape[0]
        scores["true_nb_clusters"] = np.unique(Y).shape[0]
        results = pd.concat([results, pd.DataFrame([scores])], ignore_index=True)  # 使用 pd.concat

        # 保存中间结果
        results.to_pickle(os.path.join(output_dir, f"{category}_leiden.pkl"))

        print(f"Completed resolution {resolution} for dataset {dataset}.")

# 显示最终结果
print("Results summary:")
print(results[["dataset", "true_nb_clusters", "resolution", "nb_pred_clusters", "leiden_ari", "leiden_nmi"]].round(2))

Using device: cpu
>> Processing dataset: Quake_10x_Bladder
Number of clusters: 7




INPUT: GraphConv 200, [300], False, True
Running Leiden with resolution: 0.05
Epoch 1/10, Loss: 2.2575
Epoch 2/10, Loss: 2.2582
Epoch 3/10, Loss: 2.2545
Epoch 4/10, Loss: 2.2560
Epoch 5/10, Loss: 2.2570
Epoch 6/10, Loss: 2.2537
Epoch 7/10, Loss: 2.2546
Epoch 8/10, Loss: 2.2547
Epoch 9/10, Loss: 2.2541
Epoch 10/10, Loss: 2.2535
Completed resolution 0.05 for dataset Quake_10x_Bladder.
Running Leiden with resolution: 0.1
Epoch 1/10, Loss: 2.2575
Epoch 2/10, Loss: 2.2582
Epoch 3/10, Loss: 2.2545
Epoch 4/10, Loss: 2.2560
Epoch 5/10, Loss: 2.2570
Epoch 6/10, Loss: 2.2537
Epoch 7/10, Loss: 2.2546
Epoch 8/10, Loss: 2.2547
Epoch 9/10, Loss: 2.2541
Epoch 10/10, Loss: 2.2535
Completed resolution 0.1 for dataset Quake_10x_Bladder.
Running Leiden with resolution: 0.2
Epoch 1/10, Loss: 2.2575
Epoch 2/10, Loss: 2.2582
Epoch 3/10, Loss: 2.2545
Epoch 4/10, Loss: 2.2560
Epoch 5/10, Loss: 2.2570
Epoch 6/10, Loss: 2.2537
Epoch 7/10, Loss: 2.2546
Epoch 8/10, Loss: 2.2547
Epoch 9/10, Loss: 2.2541
Epoch 10/1



INPUT: GraphConv 200, [300], False, True
Running Leiden with resolution: 0.05
Epoch 1/10, Loss: 2.2576
Epoch 2/10, Loss: 2.2583
Epoch 3/10, Loss: 2.2546
Epoch 4/10, Loss: 2.2562
Epoch 5/10, Loss: 2.2572
Epoch 6/10, Loss: 2.2538
Epoch 7/10, Loss: 2.2546
Epoch 8/10, Loss: 2.2548
Epoch 9/10, Loss: 2.2542
Epoch 10/10, Loss: 2.2536
Completed resolution 0.05 for dataset Quake_Smart-seq2_Trachea.
Running Leiden with resolution: 0.1
Epoch 1/10, Loss: 2.2576
Epoch 2/10, Loss: 2.2583
Epoch 3/10, Loss: 2.2546
Epoch 4/10, Loss: 2.2562
Epoch 5/10, Loss: 2.2572
Epoch 6/10, Loss: 2.2538
Epoch 7/10, Loss: 2.2546
Epoch 8/10, Loss: 2.2548
Epoch 9/10, Loss: 2.2542
Epoch 10/10, Loss: 2.2536
Completed resolution 0.1 for dataset Quake_Smart-seq2_Trachea.
Running Leiden with resolution: 0.2
Epoch 1/10, Loss: 2.2576
Epoch 2/10, Loss: 2.2583
Epoch 3/10, Loss: 2.2546
Epoch 4/10, Loss: 2.2562
Epoch 5/10, Loss: 2.2572
Epoch 6/10, Loss: 2.2538
Epoch 7/10, Loss: 2.2546
Epoch 8/10, Loss: 2.2548
Epoch 9/10, Loss: 2.2