# STAGATE segment

In [1]:
# Loading the Packages
%reload_ext autoreload
%autoreload 2

import os
from pathlib import Path
import pickle
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import scanpy as sc
from scipy.signal import argrelextrema
from scipy.signal import find_peaks

import seaborn as sns
import matplotlib.pyplot as plt
# Global Matplotlib settings for the figures produced in this notebook.
plt.rcParams.update({
    "pgf.texsystem": "xelatex",      # Use XeLaTeX; can be omitted if LaTeX formula rendering is not needed
    'font.family': 'serif',          # Use a serif font family
    'text.usetex': False,            # Disable LaTeX; use Matplotlib's built-in text rendering
    'pgf.rcfonts': False,            # Disable pgf's default font management
    'pdf.fonttype': 42,              # Keep fonts as TrueType so the text stays editable in Illustrator
    'ps.fonttype': 42,               # Use TrueType fonts for EPS files as well
    'figure.dpi': 300,               # Figure resolution
    'savefig.dpi': 300,              # Resolution of saved figure files
    'axes.unicode_minus': False,     # Avoid minus-sign rendering problems
})

In [None]:
# Working directories.
# The data root defaults to the path this notebook was written against; set the
# SPATIAL_DATA_DIR environment variable to point the notebook at another location.
BASE_DIR = Path(os.environ.get('SPATIAL_DATA_DIR', r'G:\spatial_data'))

RUN_ID = '20230523_HCC_PRISM_probe_refined'
src_path = BASE_DIR / 'processed' / RUN_ID
data_path = src_path / "segmented"

analysis_path = BASE_DIR / 'analysis' / RUN_ID
typ_path = analysis_path / "cell_typing"
STAGATE_path = analysis_path / "STAGATE"
# parents=True: also create the missing parent directories instead of raising
# FileNotFoundError when the analysis folder does not exist yet.
STAGATE_path.mkdir(parents=True, exist_ok=True)

## Preprocessing of data

In [None]:
# Load 'combine_adata_st.h5ad' from the cell-typing directory; the bare name on
# the last line displays the object's summary.
combine_adata_st = sc.read_h5ad(Path(typ_path, 'combine_adata_st.h5ad'))
combine_adata_st

AnnData object with n_obs × n_vars = 80396 × 30
    obs: 'dataset', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'n_genes', 'n_counts', 'type', 'leiden', 'sample', 'tissue', 'tmp_leiden', 'leiden_subtype', 'subtype', 'leiden_type', 'Y_pos', 'X_pos'
    var: 'n_cells_by_counts-CNP0000650', 'mean_counts-CNP0000650', 'log1p_mean_counts-CNP0000650', 'pct_dropout_by_counts-CNP0000650', 'total_counts-CNP0000650', 'log1p_total_counts-CNP0000650', 'n_cells-CNP0000650', 'mean-CNP0000650', 'std-CNP0000650', 'n_cells_by_counts-GSE140228', 'mean_counts-GSE140228', 'log1p_mean_counts-GSE140228', 'pct_dropout_by_counts-GSE140228', 'total_counts-GSE140228', 'log1p_total_counts-GSE140228', 'n_cells-GSE140228', 'mean-GSE140228', 'std-GSE140228', 'n_cells_by_counts-GSE151530', 'mean_counts-GSE151530', 'log1p_mean_counts-GSE151530', 'pct_dropout_by_counts-GSE151530', 'total_counts-GSE151530', 'log1p_total_counts-GSE151530', 'n_cells-GSE151530', 'mean-GSE151530', '

In [None]:
# Keep only the cells of 'adata_leiden_res_1.h5ad' that also occur in
# ``combine_adata_st`` and give them the obs table of ``combine_adata_st``.
adata_direct = sc.read_h5ad(typ_path / 'adata_leiden_res_1.h5ad')
# .copy() turns the subset into a standalone AnnData object, so the assignments
# that follow do not act on a view of ``adata_direct``.
adata = adata_direct[adata_direct.obs.index.isin(combine_adata_st.obs.index)].copy()
# Pick the annotation rows by cell name: assigning the whole frame directly is
# only correct when both objects list the cells in exactly the same order.
adata.obs = combine_adata_st.obs.loc[adata.obs_names].copy()
adata

AnnData object with n_obs × n_vars = 80396 × 31
    obs: 'dataset', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'n_genes', 'n_counts', 'type', 'leiden', 'sample', 'tissue', 'tmp_leiden', 'leiden_subtype', 'subtype', 'leiden_type', 'Y_pos', 'X_pos'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'mean', 'std'
    uns: 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [123]:
# Format for later analysis: rename the position columns to 'X' / 'Y' and store
# the same two columns as the 'spatial' entry of ``obsm``.
adata.obs = adata.obs.rename(columns={'X_pos': 'X', 'Y_pos': 'Y'})
adata.obsm['spatial'] = adata.obs[['X', 'Y']].to_numpy()
adata.obs.head()

Unnamed: 0,dataset,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,n_genes,n_counts,type,leiden,sample,tissue,tmp_leiden,leiden_subtype,subtype,leiden_type,Y,X
0,PRISM_HCC,9,2.302585,22.0,3.135494,9,22.0,Macrophage,0,,non_liver,0,18,Macrophage_LYVE1+,11,1109,21214
2,PRISM_HCC,10,2.397895,16.0,2.833213,10,16.0,Mast,13,,non_liver,13,12,Mast_CPA3+,8,1156,21143
3,PRISM_HCC,6,1.94591,8.0,2.197225,6,8.0,other,73,,non_liver,-2,-2,other,-2,1157,11785
4,PRISM_HCC,6,1.94591,7.0,2.079442,6,7.0,CD4+,28,,non_liver,28,23,"T_CD4+, PD1+, CTLA4+",12,1164,21132
5,PRISM_HCC,13,2.639057,41.0,3.73767,13,41.0,CD8+,45,,non_liver,45,28,"T_CD8+, GZMA+, CXCL13+",13,1177,11828


## training of STAGATE and build graph


In [None]:
# # Training of STAGATE and construction of the spatial graph (disabled; the cells below re-load the saved adata_STAGATE.h5ad files)
# import STAGATE
# import STAGATE_pyG

# def Stats_Spatial_Net_manual(adata, outpath=None):
#     import matplotlib.pyplot as plt
#     Num_edge = adata.uns['Spatial_Net']['Cell1'].shape[0]
#     Mean_edge = Num_edge/adata.shape[0]
#     plot_df = pd.value_counts(pd.value_counts(adata.uns['Spatial_Net']['Cell1']))
#     plot_df = plot_df/adata.shape[0]
#     fig, ax = plt.subplots(figsize=[3,2])
#     plt.ylabel('Percentage')
#     plt.xlabel('')
#     plt.title('Number of Neighbors (Mean=%.2f)'%Mean_edge)
#     ax.bar(plot_df.index, plot_df)
#     if not outpath is None:
#         plt.savefig(outpath)
#         plt.close()

# for rad_cutoff in tqdm([50,100,150,200,250,300,400]):
#     current_path = STAGATE_path / f'rad_cutoff_{str(rad_cutoff)}'
#     current_path.mkdir(exist_ok=True)
#     STAGATE_pyG.Cal_Spatial_Net(adata, rad_cutoff=rad_cutoff)
#     # stats of spatial network
#     Stats_Spatial_Net_manual(adata, outpath=current_path / 'stats_spatial_net.png')

#     adata_STAGATE = STAGATE_pyG.train_STAGATE(adata)
#     adata_STAGATE.write(current_path / 'adata_STAGATE.h5ad')
#     adata_STAGATE = sc.read_h5ad(current_path / 'adata_STAGATE.h5ad')
#     sc.pp.neighbors(adata_STAGATE, n_neighbors=50, use_rep='STAGATE')
#     sc.tl.umap(adata_STAGATE)
#     adata_STAGATE.write_h5ad(current_path / 'adata_STAGATE.h5ad')

#     mclust_dir = current_path / 'mClust_test'
#     mclust_dir.mkdir(exist_ok=True)
#     for cluster in range(2, 20):
#         adata_STAGATE = sc.read_h5ad(current_path / 'adata_STAGATE.h5ad')
#         adata_STAGATE = STAGATE.mclust_R(adata_STAGATE, used_obsm='STAGATE', used_savename=f'mclust_{cluster}', num_cluster=cluster)
#         adata_STAGATE.obs[f'mclust_{cluster}'] = adata_STAGATE.obs[f'mclust_{cluster}'].astype(str)
#         adata_STAGATE.write(current_path / 'adata_STAGATE.h5ad')
        
#         fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))
#         sc.pl.umap(adata_STAGATE, color=f'mclust_{cluster}', palette=colors, s=10, show=False, ax=ax[0])
#         sc.pl.embedding(adata_STAGATE, basis="spatial", color=f'mclust_{cluster}', palette=colors, s=10, show=False, ax=ax[1]) #, legend_loc=False)
#         plt.tight_layout()
#         plt.savefig(mclust_dir / f'cluster_{cluster}.png')
#         plt.close()

## clustering of ROIs


In [4]:
import yaml

# Read the annotation parameters stored next to the cell-typing results.
# NOTE(review): FullLoader resolves more YAML tags than SafeLoader; if the file
# only holds plain mappings/lists, yaml.safe_load would be the safer choice - confirm.
with open(typ_path / 'annotation_params.yaml', 'r') as handle:
    annotaiton_params = yaml.load(handle, Loader=yaml.FullLoader)
type_colormap = annotaiton_params['type_colormap']
print(annotaiton_params.keys())

# Palette: every colour of type_colormap except the last one, with the entries
# at positions 5 and 6 replaced by (0, 0, 0) and (0, 1, 1).
colors = list(type_colormap.values())[:-1]
colors[5] = (0, 0, 0)
colors[6] = (0, 1, 1)

dict_keys(['leiden_annotation', 'marker_gene_dict', 'type_colormap'])


In [10]:
# Draw the UMAP and the spatial embedding coloured by one mclust result and
# save both panels as a single image.
rad_cutoff = 250
current_path = STAGATE_path / f'rad_cutoff_{rad_cutoff}'
mclust_dir = current_path / 'mClust_test'
cluster = 13
adata_STAGATE = sc.read_h5ad(current_path / 'adata_STAGATE.h5ad')
# The only code that creates this folder lives in the commented-out training
# cell above, so make sure it exists before saving into it.
mclust_dir.mkdir(parents=True, exist_ok=True)
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))
sc.pl.umap(adata_STAGATE, color=f'mclust_{cluster}', palette=colors, s=10, show=False, ax=ax[0])
sc.pl.embedding(adata_STAGATE, basis="spatial", color=f'mclust_{cluster}', palette=colors, s=10, show=False, ax=ax[1]) #, legend_loc=False)
# Work on the figure object explicitly instead of pyplot's "current figure".
fig.tight_layout()
fig.savefig(mclust_dir / f'cluster_{cluster}.png')
plt.close(fig)

## analysis: gene distribution on umap

In [128]:
# Load the saved STAGATE result for the chosen radius cutoff.
rad_cutoff = 250
adata_STAGATE = sc.read_h5ad(
    Path(STAGATE_path, f'rad_cutoff_{rad_cutoff}', 'adata_STAGATE.h5ad')
)

In [129]:
# Genes to show, one UMAP panel per gene.
gene_list = [
    "HBV", "AFP", "GPC3", "MKI67", "PECAM1", "EPCAM", "ACTA2",
    "CLEC9A", "CD1C", "LILRA4", "SLC4A10", "CPA3", "C1QA", "FCGR3A", "S100A8", "CSF3R", "LYVE1",
    "CD3D", "CD4", "CD8A", "GZMA", "GZMB", "CTLA4", "PDCD1", "CXCL13",
    "FOXP3", "PRF1", "CD79A", "MS4A1", "NCAM1", "MZB1"
]

figure_path = STAGATE_path / f'rad_cutoff_{rad_cutoff}'
fig, ax = plt.subplots(ncols=6, nrows=6, figsize=(30, 25))
# Flatten the axes grid once instead of on every iteration.
panels = ax.flatten()
# A named index is used because `_` is IPython's "last output" variable.
for panel_idx, gene in enumerate(gene_list):
    ax_tmp = panels[panel_idx]
    sc.pl.umap(adata_STAGATE, color=gene, ax=ax_tmp, show=False, s=3, alpha=0.8, vmax=5)
# The grid has more panels than genes; hide the unused ones so the saved image
# does not contain empty frames.
for unused_ax in panels[len(gene_list):]:
    unused_ax.set_visible(False)
fig.tight_layout()
fig.savefig(figure_path / 'umap_gene.png')
plt.close(fig)

# Cell composition

In [None]:
# Settings for the cell-composition analysis and the matching STAGATE result.
rad_cutoff, cluster_num = 250, 13
current_path = STAGATE_path / f'rad_cutoff_{rad_cutoff}'
adata_STAGATE = sc.read_h5ad(Path(current_path, 'adata_STAGATE.h5ad'))

In [9]:
import yaml

# Read the annotation parameters stored next to the cell-typing results.
# NOTE(review): FullLoader resolves more YAML tags than SafeLoader; if the file
# only holds plain mappings/lists, yaml.safe_load would be the safer choice - confirm.
with open(typ_path / 'annotation_params.yaml', 'r') as handle:
    annotaiton_params = yaml.load(handle, Loader=yaml.FullLoader)
print(annotaiton_params.keys())

dict_keys(['leiden_annotation', 'marker_gene_dict', 'type_colormap'])


In [10]:
leiden_annotation = annotaiton_params['leiden_annotation']

# Flatten the nested {type: {subtype: values}} annotation into two lookups:
#   leiden_subtype_dict: subtype -> its values
#   leiden_type_dict:    type    -> the values of all its subtypes, concatenated
leiden_subtype_dict = {
    subtype_name: members
    for subtype_map in leiden_annotation.values()
    for subtype_name, members in subtype_map.items()
}
leiden_type_dict = {
    type_name: [member for members in subtype_map.values() for member in members]
    for type_name, subtype_map in leiden_annotation.items()
}

# Give every subtype the colour of the first type name (in type_colormap order)
# that occurs as a substring of the subtype label; a subtype that matches no
# type name gets no entry.
type_colormap = annotaiton_params['type_colormap']
subtype_colormap = {}
for subtype_name in leiden_subtype_dict:
    first_match = next(
        (type_name for type_name in type_colormap if type_name in subtype_name),
        None,
    )
    if first_match is not None:
        subtype_colormap[subtype_name] = type_colormap[first_match]
subtype_colormap

{'Liver': [1, 0.392, 0],
 'Tumor_AFP+': [0.751, 0.491, 0],
 'Tumor_GPC3+': [0.751, 0.491, 0],
 'Tumor_proliferation': [0.751, 0.491, 0],
 'other_cell_proliferation': [0, 0.5, 0.636],
 'Endo_PECAM1+': [1, 0, 1],
 'Ep_EPCAM+': [0, 1, 0],
 'CAF_ACTA2+': [0, 0, 1],
 'cDC1_CLEC9A+': [1, 0.259, 0],
 'cDC2_CD1C+': [1, 0.259, 0],
 'pDC_LILRA4+': [1, 0.259, 0],
 'Mait_SLC4A10+': [1, 0, 0.434],
 'Mast_CPA3+': [1, 0, 0],
 'Monocyte_CD14+': [0, 0.471, 1],
 'Monocyte_CD14+, CD16+': [0, 0.471, 1],
 'Monocyte_CD16+': [0, 0.471, 1],
 'Neutrophil_CSF3R+, S100A8+': [1, 1, 0],
 'Neutrophil_CSF3R+': [1, 1, 0],
 'Macrophage_LYVE1+': [0.5, 0.8, 0],
 'Macrophage_C1QA+': [0.5, 0.8, 0],
 'Cyto_T_CD4+': [0.5, 0.5, 0.5],
 'T_CD4+, PD1+': [0.5, 0.5, 0.5],
 'T_CD4+, CXCL13+': [0.5, 0.5, 0.5],
 'T_CD4+, PD1+, CTLA4+': [0.5, 0.5, 0.5],
 'T_CD4+, CTLA4+': [0.5, 0.5, 0.5],
 'Cyto_T_CD8+': [1, 0.8, 0],
 'T_CD8+, PD1+': [1, 0.8, 0],
 'T_CD8+, CTLA4+': [1, 0.8, 0],
 'T_CD8+, GZMA+, CXCL13+': [1, 0.8, 0],
 'T_reg': [0, 1,

In [15]:
# Manual colour overrides (and additions) for individual subtypes.
subtype_colormap.update({
    'Tumor_AFP+': [1, 0.5, 0],
    'Tumor_GPC3+': [0.75, 0.5, 0.5],
    'Tumor_proliferation': [1, 0.5, 0],

    'B_CD79A+, MS4A1+': [0, 0.5, 0.5],
    'cDC2_CD1C+': [1, 0.6, 0],
    'Macrophage_C1QA+': [0, 0.8, 0.5],
    'Mait_SLC4A10+': [0, 0, 1],
    'Neutrophil_CSF3R+': [0.5, 0.5, 0],
    'T_CD4+, CXCL13+': [0.8, 0.8, 0.8],
    'T_CD4+, PD1+, CTLA4+': [0.8, 0.8, 0.8],
    'T_CD8+, GZMA+, CXCL13+': [1, 0, 0],
    'Monocyte_CD14+, CD16+': [0, 0.8, 1],
})

In [12]:
# Subtypes treated as non-immune; every other key of subtype_colormap counts as immune.
non_immune_subtype = [
    'Liver',
    'Tumor_AFP+',
    'Tumor_GPC3+',
    'Tumor_proliferation',
    'other_cell_proliferation',
    'Endo_PECAM1+',
    'Ep_EPCAM+',
    'CAF_ACTA2+',
]
immune_subtype = [name for name in subtype_colormap if name not in non_immune_subtype]

In [16]:
# Split the cells by whether their 'subtype' label is in the immune or the
# non-immune list.
immune_data = adata_STAGATE[adata_STAGATE.obs['subtype'].isin(immune_subtype)]
non_immune_data = adata_STAGATE[adata_STAGATE.obs['subtype'].isin(non_immune_subtype)]

# def plot_stacked_bar(data, subtype_list, mclust_order, title, colormap, ax):
#     # Count the cells of each subtype within every cluster of the mclust_13 column
#     subtype_counts = data.groupby([f'mclust_{cluster_num}', 'subtype']).size().unstack(fill_value=0)
#     subtype_counts = subtype_counts.loc[mclust_order]
#     subtype_counts = subtype_counts.loc[:, subtype_counts.columns.isin(subtype_list)]    
#     subtype_counts_normalized = subtype_counts.div(subtype_counts.sum(axis=1), axis=0)
#     colors = [colormap[subtype] for subtype in subtype_counts_normalized.columns]    
#     subtype_counts_normalized.plot(kind='bar', stacked=True, ax=ax, color=colors, width=1.0)
    
#     ax.set_title(title)
#     ax.set_xlabel('Cluster')
#     ax.set_ylabel('Normalized Proportion')
#     ax.legend(title='Cell Type', bbox_to_anchor=(1.05, 1), loc='upper left')

def plot_stacked_bar(data, subtype_list, mclust_order, title, colormap, ax, cluster_col=None):
    """Draw the per-cluster subtype composition as a stacked bar chart on ``ax``.

    Rows of ``data`` are counted per (cluster, subtype) pair, the clusters are
    arranged in ``mclust_order``, only the subtypes in ``subtype_list`` are
    kept, and each cluster row is divided by its own total so the bars show
    proportions.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'subtype' column and the cluster column.
    subtype_list : list
        Subtype labels that are kept as bar segments.
    mclust_order : list
        Cluster labels in the order in which the bars are drawn. A label
        without any row in ``data`` yields an empty bar.
    title : str
        Title of the axes.
    colormap : mapping
        Colour of every plotted subtype, looked up by subtype label.
    ax : matplotlib axes
        Axes to draw on.
    cluster_col : str, optional
        Name of the cluster column. When omitted, ``f'mclust_{cluster_num}'``
        is built from the notebook-level ``cluster_num``.
    """
    if cluster_col is None:
        cluster_col = f'mclust_{cluster_num}'

    # Count the rows of every subtype within each cluster.
    # observed=False pins the handling of categorical columns (unused
    # categories stay in the table) independently of the pandas default.
    subtype_counts = (
        data.groupby([cluster_col, 'subtype'], observed=False)
        .size()
        .unstack(fill_value=0)
    )
    # reindex instead of .loc: a cluster that has no rows in this subset
    # becomes an all-zero row rather than raising KeyError.
    subtype_counts = subtype_counts.reindex(mclust_order, fill_value=0)
    subtype_counts = subtype_counts.loc[:, subtype_counts.columns.isin(subtype_list)]
    # Rows whose total is zero would turn into NaN (0/0); show them as empty bars.
    subtype_counts_normalized = subtype_counts.div(subtype_counts.sum(axis=1), axis=0).fillna(0)
    colors = [colormap[subtype] for subtype in subtype_counts_normalized.columns]

    # Draw the stacked bars with the automatic legend disabled.
    subtype_counts_normalized.plot(kind='bar', stacked=True, ax=ax, color=colors, width=1.0, legend=False)

    ax.set_title(title)
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Normalized Proportion')

    # Fetch the handles and labels and reverse their order for the legend.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], title='Cell Type', bbox_to_anchor=(1.05, 1), loc='upper left')

# Two panels side by side: immune subtypes on the left, non-immune on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
# mclust_order = [str(_) for _ in [1, 9, 3, 11, 10, 12, 8, 13, 7, 2, 4, 5, 6]]
mclust_order = list(map(str, range(1, cluster_num + 1)))
plot_stacked_bar(immune_data.obs, immune_subtype, mclust_order, 'Immune Subtypes', subtype_colormap, axes[0])
plot_stacked_bar(non_immune_data.obs, non_immune_subtype, mclust_order, 'Non-Immune Subtypes', subtype_colormap, axes[1])
fig.tight_layout()
fig.savefig(current_path / f'mClust{cluster_num}_subtype_distribution.png')
plt.close(fig)

In [26]:
# Split the cells by whether their 'subtype' label is in the immune or the
# non-immune list.
immune_data = adata_STAGATE[adata_STAGATE.obs['subtype'].isin(immune_subtype)]
non_immune_data = adata_STAGATE[adata_STAGATE.obs['subtype'].isin(non_immune_subtype)]

def plot_stacked_bar(data, title, colors, ax, cluster_col=None):
    """Draw the number of rows per cluster, stacked by 'subtype_group', on ``ax``.

    NOTE(review): this function has the same name as the proportion-plotting
    helper defined earlier in this notebook but a different signature, and it
    replaces that helper once this cell has run.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'subtype_group' column and the cluster column.
    title : str
        Title of the axes.
    colors : list
        Colours passed to the bar plot, one per 'subtype_group' column.
    ax : matplotlib axes
        Axes to draw on.
    cluster_col : str, optional
        Name of the cluster column. When omitted, ``f'mclust_{cluster_num}'``
        is built from the notebook-level ``cluster_num``.
    """
    if cluster_col is None:
        cluster_col = f'mclust_{cluster_num}'

    subtype_counts = data.groupby([cluster_col, 'subtype_group']).size().unstack(fill_value=0)
    # Raw counts are plotted; to show proportions instead, divide each row by
    # its total: subtype_counts.div(subtype_counts.sum(axis=1), axis=0)
    subtype_counts.plot(kind='bar', stacked=True, ax=ax, color=colors, width=1.0)
    ax.set_title(title)
    ax.set_xlabel('Cluster')
    # The bars hold counts, so the axis must not be labelled as a proportion.
    ax.set_ylabel('Number of Cells')
    ax.legend(title='Cell Type', bbox_to_anchor=(1.05, 1), loc='upper left')


def _subtype_group(subtype_name):
    """Return 'immune', 'non-immune' or 'other' for one subtype label."""
    if subtype_name in immune_subtype:
        return 'immune'
    if subtype_name in non_immune_subtype:
        return 'non-immune'
    return 'other'


# One stacked bar per cluster, split into the immune / non-immune groups.
fig, ax = plt.subplots(figsize=(8, 6))
colors = ['red', 'blue']
combined_data = pd.concat([immune_data.obs, non_immune_data.obs])
combined_data['subtype_group'] = combined_data['subtype'].apply(_subtype_group)
plot_stacked_bar(combined_data, 'Immune vs Non-Immune Subtypes', colors, ax)

fig.tight_layout()
fig.savefig(current_path / f'mClust{cluster_num}_im_non_im_cell_num.png')
plt.close(fig)