https://chatgpt.com/canvas/shared/67ac31ddc9b081919abfb9ac86613635

In [1]:
import desc as DESC
import scanpy as sc

# Report the installed version of each library used by this notebook.
# NOTE: the output recorded for this cell shows a NumPy C-API version mismatch
# and a pandas AttributeError, i.e. the environment that produced it had
# incompatible package versions installed.
for pkg_label, pkg in (("Scanpy", sc), ("DESC", DESC)):
    print(f"{pkg_label} version:", pkg.__version__)


RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xd . Check the section C-API incompatibility at the Troubleshooting ImportError section at https://numpy.org/devdocs/user/troubleshooting-importerror.html#c-api-incompatibility for indications on how to solve this problem .

AttributeError: module 'pandas.arrays' has no attribute 'BooleanArray'

In [None]:
# Import the required modules
import desc as DESC
import numpy as np
import pandas as pd
import scanpy as sc
print(sc.__version__)
# Expected output: 1.4.5

import matplotlib.pyplot as plt

# Configure matplotlib and Scanpy
%matplotlib inline
sc.settings.verbosity = 3  # verbosity level: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()  # print version information



In [None]:
import scanpy as sc

# Load the 10x matrix directory with scanpy: gene symbols become the variable
# names and the parsed result is cached.
adata = sc.read_10x_mtx(
    'filtered_gene_bc_matrices/hg19', var_names='gene_symbols', cache=True
)
adata.var_names_make_unique()  # guarantee unique variable names

# Quick overview of what was loaded.
print("Data contains {} cells and {} genes.".format(adata.n_obs, adata.n_vars))


In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt

# QC thresholds used in this cell, gathered in one place so they are easy to tune.
MIN_GENES_PER_CELL = 200    # cells expressing fewer genes than this are removed
MIN_CELLS_PER_GENE = 3      # genes detected in fewer cells than this are removed
MAX_GENES_PER_CELL = 2500   # cells at or above this gene count are removed (possible doublets)
MAX_PERCENT_MITO = 0.05     # cells at or above this mitochondrial fraction (5%) are removed

# 1. Load the PBMC data. Reloading here means re-running this cell always
#    starts from the unfiltered matrix.
adata = sc.read_10x_mtx(
    'filtered_gene_bc_matrices/hg19',  # data directory
    var_names='gene_symbols',         # use gene symbols as variable names
    cache=True                        # cache the parsed result
)
# Same uniqueness step as the standalone loading cell, so both loads agree.
adata.var_names_make_unique()

# Overview of the freshly loaded data.
print(f"Initial dataset: {adata.n_obs} cells and {adata.n_vars} genes.")
print(adata)

# 2.1 Filter cells and genes.
sc.pp.filter_cells(adata, min_genes=MIN_GENES_PER_CELL)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)

# Overview after the basic filters.
print(f"After filtering: {adata.n_obs} cells and {adata.n_vars} genes.")
print(adata)

# Per-cell QC metrics. The row sums are computed once and flattened with
# np.asarray(...).ravel(), which works whether .sum() returns an np.matrix
# (sparse X) or a plain ndarray (dense X).
mito_genes = adata.var_names.str.startswith('MT-')  # genes whose name starts with "MT-"
n_counts = np.asarray(adata.X.sum(axis=1)).ravel()  # total counts per cell
mito_counts = np.asarray(adata[:, mito_genes].X.sum(axis=1)).ravel()
adata.obs['percent_mito'] = mito_counts / n_counts
adata.obs['n_counts'] = n_counts

# Violin plots of gene count, total counts and mitochondrial fraction.
sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True)

# Remove low-quality cells. Both criteria are combined into a single mask and
# the subset is materialized with .copy(): the later preprocessing steps modify
# `adata` in place, which should not be done on an AnnData view.
keep_cells = (
    (adata.obs['n_genes'] < MAX_GENES_PER_CELL)
    & (adata.obs['percent_mito'] < MAX_PERCENT_MITO)
)
adata = adata[keep_cells, :].copy()

# Overview after QC filtering.
print(f"After QC filtering: {adata.n_obs} cells and {adata.n_vars} genes.")
print(adata)


In [None]:
# 2.2 Normalization
# Normalize each cell with DESC's normalize_per_cell, using
# counts_per_cell_after=1e4. The scanpy alternative noted for this step is
# sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4).
DESC.normalize_per_cell(adata, counts_per_cell_after=1e4)

# Report the dataset dimensions once normalization has finished.
print("Normalization complete.")
print("Dataset now contains {} cells and {} genes.".format(adata.n_obs, adata.n_vars))


In [None]:
# 2.3 Logarithm Transformation
# Log-transform the data with DESC's log1p. The scanpy alternative noted for
# this step is sc.pp.log1p(adata).
DESC.log1p(adata)

# Store the log-transformed data in the .raw attribute so later analyses
# (e.g. differential expression or pseudotime analysis) can use it.
adata.raw = adata

# Report the dataset dimensions after the transformation.
print("Logarithm transformation complete. Data is now stored in `adata.raw` for downstream analysis.")
print("Dataset now contains {} cells and {} genes.".format(adata.n_obs, adata.n_vars))


In [None]:

# 2.4 Select highly variable genes (subset=True, so `adata` itself is reduced).
sc.pp.highly_variable_genes(
    adata,
    min_mean=0.0125,
    max_mean=3,
    min_disp=0.5,
    subset=True,
)
print(f"Number of highly variable genes selected: {adata.n_vars}")

# 2.5 Scale the data with DESC, zero-centered and capped at max_value=3.
DESC.scale(adata, zero_center=True, max_value=3)
print("Preprocessing complete.")
print("Final dataset: {} cells and {} highly variable genes.".format(adata.n_obs, adata.n_vars))


In [None]:
# Train the DESC model.
# The keyword options are collected first so they can be inspected or tweaked
# before the (long-running) training call.
desc_train_options = dict(
    dims=[adata.n_vars, 32, 16],    # network layer structure; first entry is the number of genes
    tol=0.005,                      # early-stopping threshold for training
    n_neighbors=10,                 # number of neighbors
    batch_size=256,                 # samples per training batch
    louvain_resolution=[0.8],       # resolution for Louvain clustering
    save_dir="result_pbmc3k",       # directory where results are saved
    do_tsne=True,                   # whether to compute tSNE
    learning_rate=300,              # learning rate (the DESC docs describe this as the tSNE parameter - confirm)
    do_umap=True,                   # whether to compute UMAP
    num_Cores_tsne=4,               # number of cores used for tSNE
    save_encoder_weights=True,      # whether to save the encoder weights
)
adata = DESC.train(adata, **desc_train_options)

# Announce that training has finished.
print("DESC analysis complete.")
print("Clustering results and embeddings are now stored in the `adata` object.")
