In [123]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import h5py
import scipy.sparse as sp
import anndata
from scipy.sparse import csc_matrix

In [124]:
umap_dir = "./degs/projection.csv"
cluster_dir = "./degs/clusters.csv"
h5_path = "./data/hKidney_cancer_section_outs/cell_feature_matrix.h5"
label_path = "./result/dapi_patches_20um_vali_55848.h5"

In [125]:
## 读取测试机中预测的label信息
label_prediction = pd.read_csv("./result/df_pred.csv")
print(label_prediction.head)

<bound method NDFrame.head of          Barcode  TrueLabel  PredLabel
0     edghmkbj-1          1          0
1     ofeodmln-1          1          1
2     fohankib-1          1          1
3     echiaaml-1          0          1
4     eagkpefk-1          0          0
...          ...        ...        ...
1187  gbhlmjld-1          0          0
1188  gcpacgfj-1          1          0
1189  gkodjmjc-1          1          1
1190  mdmjmjjo-1          1          1
1191  keppjdol-1          1          1

[1192 rows x 3 columns]>


In [126]:
## 读取umap坐标和cluster信息
df_umap = pd.read_csv(umap_dir)
df_cluster = pd.read_csv(cluster_dir)

df_merged = pd.merge(df_umap, df_cluster, on="Barcode", how = "inner")

In [127]:
print(df_merged.head())

      Barcode    UMAP-1    UMAP-2  Cluster
0  aaaanpll-1  0.830944 -2.130780       17
1  aaabddcj-1  2.347253 -6.709230        9
2  aaadbdpm-1 -0.023815 -4.861421        3
3  aaaebncm-1  0.957426 -1.994512       17
4  aaaegoge-1  1.423321 -6.029667        9


In [128]:
print(label_prediction["Barcode"].head())

0    edghmkbj-1
1    ofeodmln-1
2    fohankib-1
3    echiaaml-1
4    eagkpefk-1
Name: Barcode, dtype: object


In [129]:
print(df_merged["Barcode"].head())

0    aaaanpll-1
1    aaabddcj-1
2    aaadbdpm-1
3    aaaebncm-1
4    aaaegoge-1
Name: Barcode, dtype: object


In [130]:
df_merged_final = pd.merge(label_prediction, df_merged, on="Barcode", how="inner")
print(df_merged_final.head())

      Barcode  TrueLabel  PredLabel    UMAP-1    UMAP-2  Cluster
0  edghmkbj-1          1          0 -2.100206 -1.393516        3
1  ofeodmln-1          1          1 -5.288403  3.568874        1
2  fohankib-1          1          1 -1.167949  2.913191        2
3  echiaaml-1          0          1 -3.909014  3.843636        1
4  eagkpefk-1          0          0 -3.504418 -2.385618        3


In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_merged,
    x="UMAP-1", y="UMAP-2",
    hue="Cluster",
    palette="tab10",  # 可选: "Set2", "tab20", etc.
    s=2,  # 点的大小
    linewidth=0,
    alpha=0.8
)
plt.title("UMAP")
plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [131]:
## 读取基因表达谱
f = h5py.File(h5_path, "r")
data = f['matrix/data'][:]           # 非零值
indices = f['matrix/indices'][:]     # 列索引
indptr = f['matrix/indptr'][:]       # 行指针
shape = f['matrix/shape'][:]         # 原始矩阵维度

X = csc_matrix((data, indices, indptr), shape=tuple(shape))

In [132]:
barcodes = [x.decode() for x in f["matrix/barcodes"][:]]
genes = [g.decode() for g in f['matrix/features/name'][:]]

# 构建 AnnData
adata = anndata.AnnData(
    X=X.T,
    obs=pd.DataFrame(index=barcodes),
    var=pd.DataFrame(index=genes)
)

In [133]:
print(adata.shape)           # 应该是 (n_cells, n_genes)
print(adata.obs_names[:5])   # 一些 cell IDs
print(adata.var_names[:5])   # 一些 gene names

(56510, 541)
Index(['aaaanpll-1', 'aaabddcj-1', 'aaadbdpm-1', 'aaaebncm-1', 'aaaegoge-1'], dtype='object')
Index(['ABCC11', 'ACE2', 'ACKR1', 'ACTA2', 'ACTG2'], dtype='object')


In [134]:
adata.raw = adata

In [135]:
sc.pp.normalize_total(adata, target_sum=1e4)  # CPM-like
sc.pp.log1p(adata)  # log(count + 1)

  return fn(*args_all, **kw)


In [141]:
# 1. 确保 Barcode 是字符串类型（如果不是的话）
df_merged_final["Barcode"] = df_merged_final["Barcode"].astype(str)
adata.obs.index = adata.obs.index.astype(str)

# 2. 设定 df_merged_final 的 index 为 Barcode
df_meta = df_merged_final.set_index("Barcode")

# 3. 从 adata 中筛选出这些细胞
adata_sub = adata[df_meta.index].copy()

# 4. 替换 metadata
adata_sub.obs = df_meta

# ✅ 结果：adata_sub 是你需要的只包含预测过细胞的 AnnData，且 metadata 来自 df_merged_final


In [142]:
print(adata_sub.obs.head())

            TrueLabel  PredLabel    UMAP-1    UMAP-2  Cluster
Barcode                                                      
edghmkbj-1          1          0 -2.100206 -1.393516        3
ofeodmln-1          1          1 -5.288403  3.568874        1
fohankib-1          1          1 -1.167949  2.913191        2
echiaaml-1          0          1 -3.909014  3.843636        1
eagkpefk-1          0          0 -3.504418 -2.385618        3


In [None]:
from statsmodels.formula.api import ols
from scipy.stats import ttest_ind
import numpy as np
import statsmodels.api as sm

# 提取表达矩阵 (log-normalized)
expr = adata_sub.to_df()  # cells × genes
meta = adata_sub.obs

# 设定比较组
group1 = (meta['Cluster'] == 1)
group2 = (meta['Cluster'] == 2)

results = []

for gene in expr.columns:
    df = pd.DataFrame({
        'expr': expr.loc[group1 | group2, gene],
        'cluster': meta.loc[group1 | group2, 'Cluster'],
        'dividing': meta.loc[group1 | group2, 'PredLabel']
    })

    # 将 cluster 映射为数值变量
    df['cluster'] = df['cluster'].astype("category")
    df['dividing'] = df['dividing'].astype("category")

    # 回归模型：表达量 ~ cluster + dividing
    model = ols("expr ~ C(cluster) + C(dividing)", data=df).fit()
    pval = model.pvalues.get("C(cluster)[T.2]", np.nan)  # Cluster2 vs Cluster1
    coef = model.params.get("C(cluster)[T.2]", np.nan)

    results.append((gene, coef, pval))

# 构建结果表格
res_df = pd.DataFrame(results, columns=["Gene", "LogFC", "PValue"])
res_df["FDR"] = sm.stats.multipletests(res_df["PValue"], method='fdr_bh')[1]
res_df = res_df.sort_values("PValue")
res_df.head(10)


Unnamed: 0,Gene,LogFC,PValue,FDR
203,IL7R,3.960711,6.574554e-43,
84,CD8A,-3.675794,2.893844e-42,
57,CCL5,-3.474216,4.899e-38,
186,GZMK,-3.522969,1.773308e-36,
261,NKG7,-3.025696,3.017608e-28,
184,GZMA,-2.948768,4.145043e-27,
188,HAVCR2,-2.375097,9.311201e-20,
309,SELL,1.845991,3.192795e-16,
60,CCR7,1.707959,1.118994e-14,
217,LAG3,-1.240553,2.021682e-09,
