In [1]:
# Loading the Packages
%reload_ext autoreload
%autoreload 2

# basic
import warnings
warnings.filterwarnings('ignore')
import os
from pathlib import Path
from tqdm import tqdm

# data process
import numpy as np
import pandas as pd
import scanpy as sc

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({
    "pgf.texsystem": "xelatex",      # 使用 XeLaTeX，如果不需要 LaTeX 公式渲染，可以省略
    'font.family': 'serif',          # 字体设置为衬线字体
    'text.usetex': False,            # 禁用 LaTeX，使用 Matplotlib 内置文字渲染
    'pgf.rcfonts': False,            # 禁用 pgf 的默认字体管理
    'pdf.fonttype': 42,              # 确保字体为 TrueType 格式，可被 Illustrator 编辑
    'ps.fonttype': 42,               # EPS 文件也使用 TrueType 格式
    'figure.dpi': 300,               # 设置图形分辨率
    'savefig.dpi': 300,              # 保存的图形文件分辨率
    'axes.unicode_minus': False,     # 避免负号问题
})

In [2]:
# workdir 
BASE_DIR = Path(r'G:\spatial_data')
RUN_ID = '20230523_HCC_PRISM_probe_refined'
src_path = BASE_DIR / 'processed' / f'{RUN_ID}'
analysis_path = BASE_DIR / 'analysis' / f'{RUN_ID}'

# Load one slide exp
segmend_path = src_path / "segmented"
typ_path = analysis_path / "cell_typing"
output_path = analysis_path / "roi_variation"
output_path.mkdir(exist_ok=True)

# ROI define

In [3]:
from skimage import io, transform, morphology

def ROI_mask_load(input_path, out_path, show=True, save=False):
    ROI_mask = {}
    for mask_file in os.listdir(input_path):
        image = io.imread(os.path.join(input_path, mask_file))
        image = transform.rotate(image, angle=90, resize=True)
        image = morphology.binary_dilation(image, footprint=morphology.disk(5))
        ROI_mask[mask_file.replace('.tif','').replace('Mask', 'ROI')] = image
        
    ncols = int(-(-len(ROI_mask)**(1/2)//1))
    nrows = -(-len(ROI_mask)//ncols)
    fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*4, nrows*4))
    for pos, mask_name in enumerate(list(ROI_mask.keys())):
        ax[pos // ncols][pos % ncols].imshow(ROI_mask[mask_name], cmap='gray')
        ax[pos // ncols][pos % ncols].set_title(mask_name)
        ax[pos // ncols][pos % ncols].set_xlabel("")
        ax[pos // ncols][pos % ncols].set_ylabel("")
    fig.suptitle('Mask_of_ROIs', fontsize=20)
    plt.tight_layout()

    if save: plt.savefig(out_path)
    elif show: plt.show()
    plt.close()

    return ROI_mask

In [4]:
ROI_mask = ROI_mask_load(input_path=os.path.join(output_path, 'roi_mask'), 
                         show=False, save=True, out_path=os.path.join(output_path, 'roi_mask.png'))

In [6]:
adata = sc.read_h5ad(typ_path / 'adata.h5ad')
ROI_mask = ROI_mask_load(input_path=output_path/'roi_mask', show=False, save=False, out_path=None)
adata.obs['Y_pos'] = adata.obsm['spatial'][:, 0]
adata.obs['X_pos'] = adata.obsm['spatial'][:, 1]
adata.obs['ROI'] = pd.Categorical(['other']*len(adata), categories=list(ROI_mask.keys()) + ['other'], ordered=False)    
for _, mask in ROI_mask.items():
    yrange = mask.shape[0]
    for cell in tqdm(adata.obs.index, desc=_):
        if mask[yrange - int(adata.obs['Y_pos'].loc[cell]/100), int(adata.obs['X_pos'].loc[cell]/100)]:
            adata.obs['ROI'].loc[cell] = _
adata = adata[adata.obs['type'] != 'other']
adata

ROI_1: 100%|██████████| 80396/80396 [00:01<00:00, 53902.23it/s]
ROI_2: 100%|██████████| 80396/80396 [00:01<00:00, 56511.72it/s]
ROI_3: 100%|██████████| 80396/80396 [00:01<00:00, 59343.69it/s]
ROI_4: 100%|██████████| 80396/80396 [00:01<00:00, 65010.42it/s]
ROI_5: 100%|██████████| 80396/80396 [00:01<00:00, 60161.11it/s]


View of AnnData object with n_obs × n_vars = 60329 × 31
    obs: 'dataset', 'n_genes', 'n_counts', 'leiden_type', 'type', 'leiden_subtype', 'subtype', 'Y_pos', 'X_pos', 'ROI'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'mean', 'std'
    uns: 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

# ROI gene exp

In [7]:
import pandas as pd
# from scipy.sparse import isspparse

# 确认目标基因存在于adata.raw中
target_genes = ['AFP', 'GPC3', 'MKI67', 'HBV']
missing_genes = [gene for gene in target_genes if gene not in adata.raw.var_names]
if missing_genes:
    raise ValueError(f"Genes not found in adata.raw: {missing_genes}")

# 获取所有唯一ROI
rois = adata.obs['ROI'].unique()

# 存储结果的字典
mean_exp_dict = {}

for roi in rois:
    # 获取当前ROI的细胞掩码
    mask = adata.obs['ROI'] == roi
    # 提取对应细胞和基因的表达数据
    roi_data = adata.raw[mask, target_genes].X
    
    # 计算均值（处理稀疏矩阵）
    # if issparse(roi_data):
    #     mean_exp = roi_data.mean(axis=0).A1  # 转换为numpy数组
    # else:
    mean_exp = roi_data.mean(axis=0)
    
    mean_exp_dict[roi] = mean_exp

# 转换为DataFrame
result_df = pd.DataFrame.from_dict(
    mean_exp_dict, 
    orient='index', 
    columns=target_genes
)
result_df = result_df.loc[['ROI_1', 'ROI_2', 'ROI_3', 'ROI_4', 'ROI_5']]
# normalize by gene
result_df = result_df.div(result_df.sum(axis=1), axis=0)

In [16]:
plt.figure(figsize=(5, 2))
sns.heatmap(result_df.T, cmap='coolwarm', vmax=0.7)
plt.savefig(os.path.join(output_path, 'mean_exp.pdf'))
plt.close()

# ROI_cell type

In [29]:
import yaml

with open(analysis_path / "cell_typing_params.yaml") as f:
    params = yaml.load(f, Loader=yaml.FullLoader)

cell_type_dict = {key: list(value.keys()) for key, value in params['leiden_annotation'].items()}
type_colormap = params['type_colormap']

In [30]:
from collections import Counter

ROI_cluster = pd.DataFrame(index=list(type_colormap.keys())[:-1])

for _, mask in ROI_mask.items():
    yrange = mask.shape[0]
    tmp_adata_st = adata[adata.obs.ROI == _]
    tmp = Counter(tmp_adata_st.obs.type)
    df = pd.DataFrame(tmp, index=[0]).T
    df.columns = [_]
    ROI_cluster = pd.concat([ROI_cluster, df], axis=1)

In [31]:
import matplotlib.ticker as mtick
from matplotlib import cm
from matplotlib.colors import LinearSegmentedColormap

colors = [_ for _ in type_colormap.values()][:-1]
cmap = LinearSegmentedColormap.from_list('my_colormap', colors)

fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(15, 7))
sc.pl.umap(adata, size=2, show=False, palette=type_colormap, 
           color="type", legend_loc="on data", legend_fontweight=50, legend_fontsize=10, ax=ax[0])

df = ROI_cluster.T
df = df.div(df.sum(axis=1), axis=0) * 100
ax[1] = df.plot(kind='bar', stacked=True, colormap=cmap, ax=ax[1])
ax[1].yaxis.set_major_formatter(mtick.PercentFormatter())
ax[1].set_title('content of different cluster on ROIs')

plt.legend(loc='upper center', bbox_to_anchor=(1.2, 1), ncols=1)
plt.tight_layout()
plt.savefig(os.path.join(output_path, 'roi_type_cell_comp.png'))
plt.close()

In [32]:
ROI_cluster_immune = ROI_cluster[ROI_cluster.index.isin(['Liver','Tumor','Endo','Ep','CAF'])]
ROI_cluster_nonimmune = ROI_cluster[~ROI_cluster.index.isin(['Liver','Tumor','Endo','Ep','CAF'])]

In [33]:
import matplotlib.ticker as mtick
from matplotlib.colors import LinearSegmentedColormap


fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(30, 10))

colors = [_ for _ in type_colormap.values()][:-1]
cmap1 = LinearSegmentedColormap.from_list('my_colormap', colors[:5])
cmap2 = LinearSegmentedColormap.from_list('my_colormap', colors[5:])

df = ROI_cluster_immune.T
df = df.div(df.sum(axis=1), axis=0) * 100
df.plot(kind='bar', stacked=True, figsize=(20,10), colormap=cmap1, ax=ax[0])
ax[0].yaxis.set_major_formatter(mtick.PercentFormatter())
ax[0].set_title('content of different cluster on ROIs')

df = ROI_cluster_nonimmune.T
df = df.div(df.sum(axis=1), axis=0) * 100
df.plot(kind='bar', stacked=True, figsize=(20,10), colormap=cmap2, ax=ax[1])
ax[1].yaxis.set_major_formatter(mtick.PercentFormatter())
ax[1].set_title('content of different cluster on ROIs')
plt.savefig(os.path.join(output_path, 'roi_type_cell_comp_div.png'))
plt.close()

# Ro/e

In [34]:
adata = adata[adata.obs['type'] != 'other']
adata

View of AnnData object with n_obs × n_vars = 60329 × 31
    obs: 'dataset', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'n_genes', 'n_counts', 'tissue', 'leiden', 'type', 'sample', 'tmp_leiden', 'leiden_subtype', 'subtype', 'leiden_type', 'Y_pos', 'X_pos', 'ROI'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'mean', 'std'
    uns: 'leiden', 'log1p', 'neighbors', 'pca', 'umap', 'type_colors'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [35]:
with open(output_path/'params.yaml') as f:
    params = yaml.load(f, Loader=yaml.FullLoader)
    type_of_interest = params['type_of_interest']
# type_of_interest = [[subtype] for subtype_list in cell_type_dict.values() for subtype in subtype_list]

ROIs = [f'ROI_{i}' for i in [1, 2, 3, 4, 5]]
R_oe = pd.DataFrame(columns=ROIs)
R_oe['type_of_in'] = [', '.join(_) for _ in type_of_interest]
R_oe.set_index('type_of_in', inplace = True)

adata_for_ROE = adata[adata.obs.ROI.isin(ROIs)]
total_cell_num = len(adata_for_ROE)

for subtype_list in type_of_interest:
    for region in ROIs:
        adata_for_ROI = adata_for_ROE[adata_for_ROE.obs.ROI == region]
        adata_for_ROI_subtype = adata_for_ROI[adata_for_ROI.obs.subtype.isin(subtype_list)]

        observed_num = len(adata_for_ROI_subtype)
        expect_num = (len(adata_for_ROE[adata_for_ROE.obs.ROI==region]) * len(adata_for_ROE[adata_for_ROE.obs.subtype.isin(subtype_list)]))/total_cell_num
        if expect_num == 0:
            R_oe.loc[', '.join(subtype_list), region] = 0
            continue

        R_oe.loc[', '.join(subtype_list), region] = observed_num/expect_num

plt.figure(figsize=(7, 10))
sns.heatmap([list(_) for _ in np.array(R_oe)], cmap="coolwarm", 
            xticklabels=R_oe.columns, yticklabels=R_oe.index, vmax=2.5) # vmin=-0.5,
plt.title(f"R observed/expected")
plt.tight_layout()
plt.savefig(output_path / 'roe.pdf')
plt.close()