In [1]:
import os
import pickle
import random
import sys
import time

import h5py
import numpy as np
import pandas as pd
import scanpy as sc
import torch
import torch.nn.functional as F
from sklearn.metrics import (
    adjusted_rand_score,
    normalized_mutual_info_score,
    silhouette_score,
)
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader
from tqdm import tqdm

import models
import train

# Fix the random seeds so that runs are reproducible.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

# Global parameters.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

category = "imbalanced_data"
epochs = 10
batch_size = 128
pca_size = 50
nb_genes = 3000

path = "../"

# Directory holding the simulated datasets for this category. Fail early with
# the resolved absolute path, because `path` is relative to the current
# working directory and is easy to get wrong.
data_dir = os.path.join(path, "R", "simulated_data", category)
if not os.path.isdir(data_dir):
    raise FileNotFoundError(
        f"Simulated data directory not found: {data_dir!r} "
        f"(resolved to {os.path.abspath(data_dir)!r}). "
        "Adjust `path` or run from the expected working directory."
    )

# The listed names already end in ".h5", so they are joined as-is (appending
# the extension again would produce non-existent "*.h5.h5" paths). Sorting
# makes the processing order deterministic; os.listdir order is arbitrary.
files = [
    os.path.join(data_dir, file_name)
    for file_name in sorted(os.listdir(data_dir))
    if file_name.endswith(".h5")
]

results = pd.DataFrame()

model_name = "GraphConv"
normalize_weights = "per_cell"
node_features = "scale"
hidden_relu = False
hidden_bn = False
n_layers = 1
hidden_dim = 200
hidden = [300]
activation = F.relu

# Create the output directory (derived from `path`, like the input directory).
output_dir = os.path.join(path, "output", "pickle_results", category)
os.makedirs(output_dir, exist_ok=True)

# Loop over the datasets: load, filter, build the graph, then train and
# evaluate the autoencoder for several seeded runs per dataset.
for dataset_path in files:
    dataset_name = os.path.basename(dataset_path).replace(".h5", "")
    print(f">> Processing dataset: {dataset_name}")

    # Load the "X" and "Y" datasets into memory. The context manager closes
    # the HDF5 handle as soon as the arrays are copied, so handles do not
    # accumulate across datasets.
    with h5py.File(dataset_path, "r") as data_mat:
        Y = np.array(data_mat["Y"])
        X = np.array(data_mat["X"])

    # Filter the data: keep the rows/columns selected by train.filter_data.
    genes_idx, cells_idx = train.filter_data(X, highly_genes=nb_genes)
    X = X[cells_idx][:, genes_idx]
    Y = Y[cells_idx]
    n_clusters = len(np.unique(Y))

    # Build the graph. t0/t1 bracket graph construction plus loader setup so
    # that this shared cost can be added to every run's total time below.
    t0 = time.time()
    edge_index, node_feats, labels = train.make_graph_pyg(
        X,
        Y,
        dense_dim=pca_size,
        normalize_weights=normalize_weights,
    )

    # Wrap the graph in a PyG Data object.
    # NOTE(review): the tensors stay on the CPU while the model is moved to
    # `device`; confirm that train.train moves each batch to the model's device.
    graph_data = Data(
        x=torch.tensor(node_feats, dtype=torch.float),
        edge_index=torch.tensor(edge_index, dtype=torch.long),
        y=torch.tensor(labels, dtype=torch.long),
    )

    # Training node IDs: every node whose label is not -1.
    train_ids = (graph_data.y != -1).nonzero(as_tuple=False).view(-1)

    # Sample mini-batches with NeighborLoader.
    sampler_loader = NeighborLoader(
        graph_data,
        input_nodes=train_ids,
        num_neighbors=[-1],  # all neighbours
        batch_size=batch_size,
        shuffle=True,
    )

    print(f"INPUT: {model_name} {hidden_dim}, {hidden}")

    t1 = time.time()

    # Model training and evaluation; each run is re-seeded with its index.
    for run in range(3):
        t_start = time.time()
        torch.manual_seed(run)
        torch.cuda.manual_seed_all(run)
        np.random.seed(run)
        random.seed(run)

        model = models.GCNAE(
            in_feats=pca_size,
            n_hidden=hidden_dim,
            n_layers=n_layers,
            activation=activation,
            dropout=0.1,
            hidden=hidden,
            hidden_relu=hidden_relu,
            hidden_bn=hidden_bn,
        ).to(device)
        if run == 0:
            print(">", model)

        optim = torch.optim.Adam(model.parameters(), lr=1e-5)

        # Call the training function.
        scores = train.train(
            model,
            optim,
            epochs,
            sampler_loader,
            n_clusters,
            plot=False,
            cluster=["KMeans", "Leiden"],
            cluster_params={"Leiden": {"resolution": 1.0}},
        )

        # Record the run's metadata alongside the returned scores.
        scores["dataset"] = dataset_name
        scores["run"] = run
        scores["nb_genes"] = nb_genes
        scores["hidden"] = str(hidden)
        scores["hidden_dim"] = hidden_dim

        # Total time = graph/loader setup + time until scores["ae_end"]
        # + the clustering time reported for each method (0 if absent).
        setup_time = t1 - t0
        ae_time = scores["ae_end"] - t_start
        scores["tot_kmeans_time"] = setup_time + ae_time + scores.get("kmeans_time", 0)
        scores["tot_leiden_time"] = setup_time + ae_time + scores.get("leiden_time", 0)

        # Append the run to the results DataFrame.
        results = pd.concat([results, pd.DataFrame([scores])], ignore_index=True)

        # Save after every run so partial results survive an interruption.
        results.to_pickle(os.path.join(output_dir, f"{category}_gae.pkl"))
        print(f"Completed run {run} for dataset {dataset_name}.")

# Print the per-dataset mean of every numeric score column.
print("Final results summary:")
if results.empty:
    # No dataset was processed, so there is no "dataset" column to group by.
    print("No results were produced; nothing to summarize.")
else:
    # numeric_only=True skips string columns such as "hidden", which would
    # otherwise make mean() raise a TypeError on pandas >= 2.0.
    print(results.groupby("dataset").mean(numeric_only=True))

Python executable: /opt/anaconda3/envs/pytorch/bin/python
Python path: ['/opt/anaconda3/envs/pytorch/lib/python39.zip', '/opt/anaconda3/envs/pytorch/lib/python3.9', '/opt/anaconda3/envs/pytorch/lib/python3.9/lib-dynload', '', '/opt/anaconda3/envs/pytorch/lib/python3.9/site-packages', '/opt/anaconda3/envs/pytorch/lib/python3.9/site-packages/setuptools/_vendor', '/var/folders/3k/wtktqhhs2szgg7m8yyz29zgr0000gn/T/tmpjl0yie3z', '/opt/anaconda3/envs/pytorch/lib/python3.9/site-packages']
Scanpy imported successfully!
Using device: cpu


FileNotFoundError: [Errno 2] No such file or directory: '../R/simulated_data/imbalanced_data'