In [12]:
# Loading the Packages
%reload_ext autoreload
%autoreload 2

# basic packages
import os
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# data processing
import numpy as np
import pandas as pd
import scanpy as sc

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({
    "pgf.texsystem": "xelatex",      # 使用 XeLaTeX，如果不需要 LaTeX 公式渲染，可以省略
    'font.family': 'serif',          # 字体设置为衬线字体
    'text.usetex': False,            # 禁用 LaTeX，使用 Matplotlib 内置文字渲染
    'pgf.rcfonts': False,            # 禁用 pgf 的默认字体管理
    'pdf.fonttype': 42,              # 确保字体为 TrueType 格式，可被 Illustrator 编辑
    'ps.fonttype': 42,               # EPS 文件也使用 TrueType 格式
    'figure.dpi': 300,               # 设置图形分辨率
    'savefig.dpi': 300,              # 保存的图形文件分辨率
    'axes.unicode_minus': False,     # 避免负号问题
})

In [13]:
# workdir 
BASE_DIR = Path(r'G:\spatial_data\processed')
RUN_ID = '20230523_HCC_PRISM_probe_refined'

# Load one slide exp
base_path = BASE_DIR / f'{RUN_ID}_processed'
data_path = base_path / "segmented"
typ_path = base_path / "cell_typing"
STAGATE_path = base_path / "STAGATE"

output_path = base_path / "interaction_regression"
output_path.mkdir(exist_ok=True)

In [14]:
adata_STAGATE = sc.read(STAGATE_path / 'rad_cutoff_400' / 'adata_STAGATE.h5ad')
adata_STAGATE = adata_STAGATE[adata_STAGATE.obs['type'] != 'other']
adata_STAGATE

View of AnnData object with n_obs × n_vars = 60329 × 31
    obs: 'dataset', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'n_genes', 'n_counts', 'type', 'leiden', 'sample', 'tissue', 'tmp_leiden', 'leiden_subtype', 'subtype', 'leiden_type', 'Y', 'X', 'mclust_2', 'mclust_3', 'mclust_4', 'mclust_5', 'mclust_6', 'mclust_7', 'mclust_8', 'mclust_9', 'mclust_10', 'mclust_11', 'mclust_12', 'mclust_13', 'mclust_14', 'mclust_15', 'mclust_16', 'mclust_17', 'mclust_18', 'mclust_19'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'mean', 'std'
    uns: 'Spatial_Net', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'STAGATE', 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [15]:
spatial_net = adata_STAGATE.uns['Spatial_Net']
spatial_net = spatial_net[spatial_net['Cell1'].isin(adata_STAGATE.obs_names) & spatial_net['Cell2'].isin(adata_STAGATE.obs_names)]
adata_STAGATE.uns['Spatial_Net'] = spatial_net
adata_STAGATE.uns['Spatial_Net'].head()

Unnamed: 0,Cell1,Cell2,Distance
0,0,4,98.737024
1,0,2,85.146932
2,0,29,204.787695
4,0,15,135.003704
5,0,27,225.002222


In [16]:
# 定义分析参数 ----------------------------------------------------------
cell_type_col = 'subtype'  # 可修改为 'type' 或 'subtype'
small_radius = 100       # 相互作用邻域半径
large_radius = 400      # 微环境邻域半径

In [17]:
# 步骤1：统计每个细胞的局部相互作用（最终正确版）---------------------------
# 提取小邻域网络（保留所有Cell1记录）
small_net = adata_STAGATE.uns['Spatial_Net'].query(f"Distance <= {small_radius}").copy()

# 添加细胞类型信息（直接使用obs数据避免映射错误）
small_net = (
    small_net.merge(adata_STAGATE.obs[[cell_type_col]].rename(columns={cell_type_col: f'{cell_type_col}1'}),
                    left_on='Cell1', right_index=True, how='left')
    .merge(adata_STAGATE.obs[[cell_type_col]].rename(columns={cell_type_col: f'{cell_type_col}2'}),
           left_on='Cell2', right_index=True, how='left'))

# 按Cell1和type2分组统计（关键修正：移除type1分组）
cell_interactions = (small_net.groupby(['Cell1', f'{cell_type_col}2'])  # 仅按Cell1和对方的类型分组
    .size().unstack(fill_value=0)  # 展开为宽表，缺失值填0
    .add_prefix(f'interact_'))

# 确保包含所有细胞（即使无交互记录）
all_cells = adata_STAGATE.obs_names
cell_interactions = cell_interactions.reindex(all_cells, fill_value=0)

# 验证非零数据存在
print("非零交互示例:")
cell_interactions.loc[cell_interactions.sum(axis=1) > 0].head()

非零交互示例:


subtype2,interact_B_CD79A+,"interact_B_CD79A+, MS4A1+",interact_B_MS4A1+,interact_CAF_ACTA2+,interact_Cyto_T_CD4+,interact_Cyto_T_CD8+,interact_Endo_PECAM1+,interact_Ep_EPCAM+,interact_Liver,interact_Macrophage_C1QA+,...,"interact_T_CD8+, GZMA+, CXCL13+","interact_T_CD8+, PD1+",interact_T_reg,interact_Tumor_AFP+,interact_Tumor_GPC3+,interact_Tumor_proliferation,interact_cDC1_CLEC9A+,interact_cDC2_CD1C+,interact_other_cell_proliferation,interact_pDC_LILRA4+
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [18]:
# 步骤2：计算微环境组成 ----------------------------------------------

from scipy.sparse import csr_matrix

# 假设 Spatial_Net 是预存的邻接表，包含三列：Cell1, Cell2, Distance
spatial_net = adata_STAGATE.uns['Spatial_Net'].query(f"Distance <= {large_radius}").copy()

# 预生成类型编码矩阵
cell_types = adata_STAGATE.obs[cell_type_col]
unique_types = cell_types.unique()
n_types = len(unique_types)
type_onehot = pd.get_dummies(cell_types)  # 自动生成one-hot编码

# 将邻接表转换为稀疏矩阵（高效存储邻居关系）
cells = adata_STAGATE.obs_names
cell_to_idx = {cell: i for i, cell in enumerate(cells)}

# 构建稀疏邻接矩阵（仅包含在large_radius内的连接）
rows = [cell_to_idx[cell] for cell in spatial_net['Cell1']]
cols = [cell_to_idx[cell] for cell in spatial_net['Cell2']]
adj_matrix = csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(len(cells), len(cells)))

# 关键加速步骤：利用稀疏矩阵乘法一次性计算所有组成
comp_counts = adj_matrix.dot(type_onehot.values)  # (n_cells, n_types)
row_sums = comp_counts.sum(axis=1)
comp_matrix = comp_counts / row_sums[:, np.newaxis]  # 行归一化

# 转换为DataFrame
microenv_df = pd.DataFrame(comp_matrix, columns=[f"{t}" for t in unique_types], index=cells).fillna(0)
microenv_df.head()

Unnamed: 0,Macrophage_LYVE1+,Mast_CPA3+,"T_CD4+, PD1+, CTLA4+","T_CD8+, GZMA+, CXCL13+",cDC1_CLEC9A+,"T_CD4+, CXCL13+",pDC_LILRA4+,Cyto_T_CD8+,Monocyte_CD14+,T_reg,...,B_MS4A1+,cDC2_CD1C+,Endo_PECAM1+,"T_CD4+, CTLA4+",B_CD79A+,"T_CD8+, CTLA4+","B_CD79A+, MS4A1+",Macrophage_C1QA+,"Neutrophil_CSF3R+, S100A8+",CAF_ACTA2+
0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,...,0.111111,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.111111
2,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,...,0.076923,0.0,0.153846,0.0,0.0,0.0,0.0,0.0,0.076923,0.153846
4,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,...,0.076923,0.0,0.153846,0.0,0.0,0.0,0.0,0.0,0.076923,0.153846
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.076923,0.076923,0.0,0.0,0.0,0.0,...,0.076923,0.0,0.153846,0.0,0.0,0.0,0.0,0.0,0.076923,0.076923


In [19]:
# 步骤：合并对称的交互对（如A-B和B-A视为同一对）

# 定义合并函数
def merge_symmetric_interactions(interaction_series):
    # 生成排序后的元组作为新索引
    sorted_index = interaction_series.index.map(
        lambda x: tuple(sorted(x, key=lambda y: y.lower()))
    )
    
    # 按排序后的索引分组求和
    merged = interaction_series.groupby(sorted_index).sum()
    
    # 保留原始交互方向中的最大值（可选）
    # merged = interaction_series.groupby(sorted_index).max() 
    
    return merged

In [20]:
# 步骤3：关联分析 ----------------------------------------------------
interaction_counts = (
    small_net
    # small_net[small_net[f'{cell_type_col}1'] != small_net[f'{cell_type_col}2']]  # 过滤同型
    .groupby([f'{cell_type_col}1', f'{cell_type_col}2']).size())
# 合并对称交互
interaction_counts_merged = merge_symmetric_interactions(interaction_counts)
interaction_counts_merged

(B_CD79A+, B_CD79A+)                  252
(B_CD79A+, B_CD79A+, MS4A1+)          458
(B_CD79A+, B_MS4A1+)                  348
(B_CD79A+, CAF_ACTA2+)                564
(B_CD79A+, Cyto_T_CD4+)               104
                                     ... 
(pDC_LILRA4+, T_reg)                  388
(pDC_LILRA4+, Tumor_AFP+)             574
(pDC_LILRA4+, Tumor_GPC3+)            238
(pDC_LILRA4+, Tumor_proliferation)    236
(pDC_LILRA4+, pDC_LILRA4+)            156
Length: 630, dtype: int64

In [21]:
# 相互作用排序
top_interactions = interaction_counts_merged.nlargest(1000).index
top_interactions

Index([               ('Tumor_AFP+', 'Tumor_AFP+'),
                        ('Ep_EPCAM+', 'Ep_EPCAM+'),
             ('Tumor_AFP+', 'Tumor_proliferation'),
                      ('CAF_ACTA2+', 'CAF_ACTA2+'),
                     ('Tumor_AFP+', 'Tumor_GPC3+'),
        ('Macrophage_LYVE1+', 'Macrophage_LYVE1+'),
            ('Macrophage_LYVE1+', 'Mait_SLC4A10+'),
                    ('Macrophage_LYVE1+', 'T_reg'),
                    ('Ep_EPCAM+', 'Mait_SLC4A10+'),
                 ('Mait_SLC4A10+', 'T_CD4+, PD1+'),
       ...
           ('Liver', 'Neutrophil_CSF3R+, S100A8+'),
               ('Macrophage_C1QA+', 'Tumor_GPC3+'),
                       ('Liver', 'T_CD8+, CTLA4+'),
       ('Macrophage_C1QA+', 'Tumor_proliferation'),
                             ('B_CD79A+', 'Liver'),
                           ('CAF_ACTA2+', 'Liver'),
       ('B_CD79A+, MS4A1+', 'Tumor_proliferation'),
                     ('B_CD79A+, MS4A1+', 'Liver'),
                            ('Liver', 'NK_NCAM1+'),
 

In [None]:
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import NegativeBinomial  # 正确导入路径


for i, top_interaction in tqdm(enumerate(top_interactions), total=len(top_interactions)):
    target_type1, target_type2 = top_interaction

    # 准备数据
    y = cell_interactions[f'interact_{target_type2}']
    X = sm.add_constant(microenv_df.reindex(y.index).fillna(0))

    # 使用NegativeBinomial回归
    model = NegativeBinomial(y, X)
    results = model.fit(disp=False) # 关闭优化过程打印
            
    # 可视化特征重要性（原代码逻辑）
    coef_df = pd.DataFrame({
        'feature': X.columns[1:],
        'coef': results.params[1:-1],
        'abs_coef': np.abs(results.params[1:-1])
    }).nlargest(100, 'abs_coef')
    
    plt.figure(figsize=(10,6))
    plt.barh(coef_df.feature, coef_df.coef, color='steelblue')
    plt.xlabel('Coefficient Magnitude')
    plt.title(f'Microenvironment Impact on {top_interaction} Interactions')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig(output_path / '{}_{}.png'.format(i, ', '.join(top_interaction)))
    plt.close()

100%|██████████| 630/630 [21:31<00:00,  2.05s/it]
