# Cell typing

## Environment

In [124]:
# Loading the Packages
%reload_ext autoreload
%autoreload 2

import os
from pathlib import Path
import pickle
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import scanpy as sc
from scipy.signal import argrelextrema
from scipy.signal import find_peaks

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({
    "pgf.texsystem": "xelatex",      # use XeLaTeX for pgf output; can be omitted if LaTeX formula rendering is not needed
    'font.family': 'serif',          # use a serif font family
    'text.usetex': False,            # disable LaTeX; use Matplotlib's built-in text rendering
    'pgf.rcfonts': False,            # disable pgf's default font management
    'pdf.fonttype': 42,              # embed fonts as TrueType so text stays editable in Illustrator
    'ps.fonttype': 42,               # use TrueType fonts for EPS output as well
    'figure.dpi': 300,               # figure resolution
    'savefig.dpi': 300,              # resolution of saved figure files
    'axes.unicode_minus': False,     # avoid minus-sign rendering problems
})

# workdir
# NOTE(review): absolute, machine-specific path; adjust BASE_DIR / RUN_ID to point at another run.
BASE_DIR = Path(r'F:\spatial_data\processed')
RUN_ID = '20230523_HCC_PRISM_probe_refined'
src_dir = BASE_DIR / f'{RUN_ID}_processed'

# Load one slide exp
base_path = BASE_DIR / f'{RUN_ID}_processed'
# Kept as plain strings because the cells below build file names with os.path.join.
data_path = str(base_path / "segmented")
typ_path = str(base_path / "cell_typing")
figure_path = str(base_path / "figures")
# makedirs(exist_ok=True) also creates missing parent folders and is safe to re-run,
# whereas os.mkdir raises when base_path itself does not exist yet.
os.makedirs(typ_path, exist_ok=True)
os.makedirs(figure_path, exist_ok=True)

# Expression table read by the "load exp data" cell.
expr_path = os.path.join(data_path, "expression_matrix.csv")
# Label used in figure titles.
FOI = 'HCC'
# Genes kept for the analysis; names are upper case to match the upper-cased var index.
gene_list=[
    "HBV", "AFP", "GPC3", "MKI67",
    "PECAM1", "EPCAM", "ACTA2",
    "CLEC9A", "CD1C", "LILRA4",
    "SLC4A10", "CPA3",
    "C1QA", "FCGR3A", "S100A8", "CSF3R", "LYVE1",
    "CD3D", "CD4", "CD8A",
    "GZMA", "GZMB", "CTLA4", "PDCD1", "CXCL13",
    "FOXP3", "PRF1",
    "CD79A", "MS4A1",
    "NCAM1", "MZB1"
]

## load exp data

In [2]:
# Read the expression table; the first CSV column becomes the row index.
raw_adata = pd.read_csv(expr_path, index_col=0)
# Rows of the table become observations, columns become variables.
adata = sc.AnnData(raw_adata)
# Tag every observation with its dataset of origin; later cells select and batch on this column.
adata.obs['dataset'] = ["PRISM_HCC"] * len(adata)

## Preprocessing


In [None]:
# Preprocessing
def adata_filter(adata, min_genes, min_counts, max_counts, min_cells):
    """Filter ``adata`` in place by cell- and gene-level thresholds and return it.

    Cells are filtered by ``min_genes``, then ``min_counts``, then ``max_counts``;
    afterwards genes are filtered by ``min_cells``.
    """
    # One threshold per filter_cells call, applied in the same order as before.
    cell_thresholds = (
        {"min_genes": min_genes},
        {"min_counts": min_counts},
        {"max_counts": max_counts},
    )
    for threshold in cell_thresholds:
        sc.pp.filter_cells(adata, **threshold)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    return adata


def QC_plot(adata, hue, min_counts='nan', max_counts='nan', min_genes='nan', min_cells='nan'):
    """Show a joint scatter/KDE plot of total_counts vs. n_genes_by_counts, coloured by ``hue``.

    The threshold arguments are only echoed in the figure title; the default
    ``'nan'`` marks a threshold that was not applied.
    """
    title = "QC_by_{}, cell_num={}, gene_num={}\nmin_counts={}, max_counts={}, min_genes={}, min_cells={}\n\n\n\n\n".format(
        hue, len(adata), len(adata.var.index), min_counts, max_counts, min_genes, min_cells)

    grid = sns.JointGrid(data=adata.obs, x="total_counts", y="n_genes_by_counts", height=5, ratio=2, hue=hue)
    grid.fig.suptitle(title)
    # scatter in the centre, KDE curves on the margins
    grid.plot_joint(sns.scatterplot, s=40, alpha=0.3)
    grid.plot_marginals(sns.kdeplot)
    grid.set_axis_labels("total_counts", "n_genes_by_counts", fontsize=16)
    grid.fig.set_figwidth(6)
    grid.fig.set_figheight(6)
    plt.show()


def general_preprocess(adata, min_genes=2, min_counts=5, max_counts=1300, min_cells=3, auto_filter=False):
    """Compute QC metrics, plot them, filter ``adata`` and plot again.

    Parameters
    ----------
    adata : AnnData to process; QC columns are added and the filters are applied in place.
    min_genes, min_counts, max_counts, min_cells : thresholds forwarded to ``adata_filter``.
    auto_filter : if True, ``min_counts`` is replaced by ``int(first KDE peak) - 1`` of the
        ``total_counts`` distribution; when no peak is found the given ``min_counts`` is kept.

    Returns
    -------
    (adata, origin_cell_num, filtered_cell_num) : the filtered object and the number of
    observations before and after filtering.
    """
    # Plot the 10 most highly expressed genes
    sc.pl.highest_expr_genes(adata, n_top=10)
    # Calculate QC metrics
    sc.pp.calculate_qc_metrics(adata, percent_top=None, inplace=True)

    fig, ax = plt.subplots(figsize=(20,5))
    a = adata.obs.total_counts
    sns.histplot(a, bins=100, stat='density', alpha=1, kde=True, edgecolor='white', linewidth=0.5,
                ax=ax, line_kws=dict(color='black', alpha=0.7, linewidth=1.5, label='KDE'),) # log=True, binrange=[0,100]
    # Read the peak positions from the KDE line's own x data instead of
    # re-deriving them from the sample index, which is only approximate.
    kde_line = ax.get_lines()[0]
    kde_x = np.asarray(kde_line.get_xdata(), dtype=float)
    kde_y = np.asarray(kde_line.get_ydata(), dtype=float)
    # local minima of -y are the local maxima of the density
    maxima = [float(kde_x[i]) for i in argrelextrema(-kde_y, np.less)[0]]
    print(f'maxima: {maxima}')
    plt.show()

    # QC plot
    origin_cell_num = len(adata)
    QC_plot(adata, hue='dataset')

    # Filter
    if auto_filter:
        if maxima:
            min_counts = int(maxima[0]) - 1
        else:
            # No interior peak in the KDE: keep the caller's threshold instead of failing on maxima[0].
            print(f'auto_filter: no KDE maximum found, keeping min_counts={min_counts}')
    adata = adata_filter(adata, min_genes, min_counts, max_counts, min_cells)
    filtered_cell_num = len(adata)
    QC_plot(adata, hue='dataset', min_genes=min_genes, min_counts=min_counts, max_counts=max_counts, min_cells=min_cells)
    return adata, origin_cell_num, filtered_cell_num

In [None]:
# process of gene name
# process of gene name
adata.var.index = adata.var.index.str.upper()
# .copy() turns the column subset into an independent object before it is filtered in place
adata = adata[:, gene_list].copy()

# general preprocess
adata, origin_cell_num, filtered_cell_num = general_preprocess(adata, min_genes=2, min_counts=5, max_counts=200, min_cells=1, auto_filter=False)

# threshold for liver: observations with an HBV value >= 5 are labelled "liver", all others "non_liver"
adata.obs["tissue"] = ["non_liver"] * len(adata)
adata_liver = adata[adata[:, "HBV"].X >= 5]
# One .loc assignment instead of chained indexing (obs[col][rows] = ...),
# which is not guaranteed to write back into adata.obs.
adata.obs.loc[adata_liver.obs.index, "tissue"] = "liver"

# keep a copy of the filtered data (before normalization/scaling) in .raw
adata.raw = adata.copy()

## direct leiden


In [125]:
def preprocess_of_UMAP(adata, regress=True):
    """Normalize, log-transform, optionally regress out ``total_counts``, and scale ``adata``.

    All steps modify ``adata`` in place; the same object is returned.
    """
    # per-observation total normalization followed by X = log(1 + X)
    for transform in (sc.pp.normalize_total, sc.pp.log1p):
        transform(adata)
    # optional linear-model regression on total_counts
    if regress:
        sc.pp.regress_out(adata, ["total_counts"])
    # unit variance and zero mean
    sc.pp.scale(adata)
    return adata


# def preprocess_of_UMAP(adata, pseudo_count=1, regress=True):
#     # Step 1: normalize the total counts of each cell
#     sc.pp.normalize_total(adata, target_sum=1e4)
    
#     # Step 2: log-transform with an added pseudo count
#     adata.X = np.log1p(adata.X + pseudo_count)
    
#     # Step 3: gene-level normalization (median normalization)
#     gene_medians = np.median(adata.X, axis=0)  # median of each gene
#     adata.X = adata.X / gene_medians  # normalize each gene by its median
#     adata.X = np.nan_to_num(adata.X)  # remove any NaN that may appear

#     # Step 4: regression correction, if requested
#     if regress: sc.pp.regress_out(adata, ['total_counts'])  # correct for the effect of total counts

#     # Step 5: standardize the data
#     sc.pp.scale(adata)  # scale to zero mean and unit variance
#     return adata


def save_pos_on_UMAP(adata, out_dir):
    """Write the UMAP coordinates and Leiden label of every observation to a CSV file.

    Parameters
    ----------
    adata : object exposing ``obsm["X_umap"]`` (n x 2) and ``obs`` with a ``leiden`` column.
    out_dir : path of the CSV *file* to write (the parameter name is kept for compatibility).

    The CSV has one row per observation, indexed by the obs index, with the columns
    ``Coor_X``, ``Coor_Y``, ``cell_id`` and ``leiden``. If ``X_umap`` is missing, a hint
    is printed and nothing is written.
    """
    try:
        umap_xy = adata.obsm["X_umap"]
    except KeyError:
        print('X_umap not found, please perform umap first.')
        return
    # Build every column on the obs index. The previous pd.concat mixed the obs index
    # with a 0..n-1 index, which produced 2n rows padded with NaN.
    df = pd.DataFrame(umap_xy, columns=["Coor_X", "Coor_Y"], index=adata.obs.index)
    df["cell_id"] = adata.obs.index.to_numpy()
    df["leiden"] = adata.obs["leiden"].to_numpy()
    df.to_csv(out_dir)


def save_cell_cluster(adata, out_path, st_point, cell_num, name="leiden"):
    """Pickle a ``{cell slot: cluster id}`` dictionary to ``out_path``.

    Parameters
    ----------
    adata : object whose ``obs[name]`` maps integer-like cell ids to integer-like cluster labels.
    out_path : destination of the pickle file.
    st_point : offset subtracted from every cell id to obtain its slot.
    cell_num : number of slots; slots ``0 .. cell_num-1`` without a clustered cell get ``-1``.
    name : obs column holding the cluster labels.
    """
    raw_clu = dict(adata.obs[name])
    # Pre-fill every slot with -1. The earlier loop reused `cell_num` as its loop
    # variable, so the parameter was never used and the defaults were keyed by cell id.
    cluster = {slot: -1 for slot in range(cell_num)}
    for cell, label in raw_clu.items():
        cluster[int(cell) - st_point] = int(label)
    with open(out_path, "wb") as handle:
        pickle.dump(cluster, handle)


def UMAP_genes_plot(adata, FOI, size=0.1, show=True, save=False, out_path='./UMAP_genes.png', dpi=300, datatype='direct', dataset=[]):
    """Draw one UMAP panel per gene for the observations of the selected dataset(s).

    Parameters
    ----------
    adata : AnnData with PCA, neighbors and UMAP already computed (their parameters are
        read from ``adata.uns`` for the title).
    FOI, datatype : labels used in the figure title only.
    size : marker size passed to ``sc.pl.umap``.
    show, save : display the figure and/or write it to ``out_path``.
    dpi : resolution used when ``out_path`` ends with ``.png``.
    dataset : values of ``obs.dataset`` to plot; the default empty list selects no observations.
    """
    n_pcs = len(adata.uns['pca']['variance'])
    n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
    # resolution = adata.uns['leiden']['params']['resolution']
    # Plot Gene distribution
    gene_names = list(adata.var_names)
    # near-square grid: ncols = ceil(sqrt(n_genes)), nrows = ceil(n_genes / ncols)
    ncols = int(-(-len(gene_names)**(1/2)//1))
    nrows = -(-len(gene_names)//ncols)
    # squeeze=False keeps `ax` two-dimensional even for a single row or column,
    # so ax[row][col] also works for one or two genes.
    fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*4, nrows*4), squeeze=False)
    # The selection does not depend on the gene, so build it once.
    selected = adata[adata.obs.dataset.isin(dataset)]
    for pos, gene_name in enumerate(gene_names):
        panel = ax[pos // ncols][pos % ncols]
        sc.pl.umap(
            selected,
            ax=panel,
            size=size, color=gene_name, legend_fontweight=100, legend_fontsize=20,
            show=False, vmax=5, vmin=0)
        panel.set_xticklabels("")
        panel.set_yticklabels("")

    fig.suptitle(
        "{}\nexp:{}\nUMAP:{}\n".format(
        f"{FOI}_{datatype}_{dataset}_UMAP",
        f"cell_num={len(adata)}",
        f"n_neighbors={n_neighbors}, n_pcs={n_pcs}"),
        fontsize=20,
    )
    plt.tight_layout()
    if save:
        if out_path.endswith('.png'): plt.savefig(f"{out_path}", bbox_inches = 'tight', dpi=dpi)
        else: plt.savefig(f"{out_path}", bbox_inches = 'tight')
    if show: plt.show()
    plt.close()


def UMAP_leiden_plot(adata, FOI='', color='leiden', show=True, save=False, out_path='./UMAP_leiden.png',dpi=300, datatype='direct', DOI=['PRISM_HCC'], legend_loc='on data',palette=False):
    """Draw two UMAP panels: ``color`` for the datasets in ``DOI`` (left) and ``dataset`` for all observations (right).

    PCA, neighbors and Leiden parameters are read from ``adata.uns`` for the title.
    ``palette`` is forwarded to the left panel only when it is truthy. The figure is
    optionally saved to ``out_path`` and/or shown, then closed.
    """
    uns = adata.uns
    n_pcs = len(uns['pca']['variance'])
    n_neighbors = uns['neighbors']['params']['n_neighbors']
    resolution = uns['leiden']['params']['resolution']

    # Plot Cluster
    fig, (cluster_ax, dataset_ax) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
    cluster_kwargs = dict(color=color, legend_loc=legend_loc, legend_fontsize=7, ax=cluster_ax, show=False)
    if palette:
        cluster_kwargs['palette'] = palette
    sc.pl.umap(adata[adata.obs.dataset.isin(DOI)], **cluster_kwargs)
    sc.pl.umap(adata, color="dataset", legend_fontweight=100, legend_fontsize=20, ax=dataset_ax, show=False)

    header = f"{FOI}_{datatype}_cluster"
    exp_info = f"cell_num={len(adata)}"
    umap_info = f"n_neighbors={n_neighbors}, n_pcs={n_pcs}, resolution={resolution}"
    fig.suptitle("{}\nexp:{}\nUMAP:{}\n".format(header, exp_info, umap_info), fontsize=20)
    plt.tight_layout()

    if save:
        save_kwargs = {'bbox_inches': 'tight'}
        # dpi is only applied to PNG output
        if out_path.endswith('.png'):
            save_kwargs['dpi'] = dpi
        plt.savefig(f"{out_path}", **save_kwargs)
    if show:
        plt.show()
    plt.close()


def leiden_QC_plot(adata, color='leiden'):
    """Show a small joint scatter/KDE plot of total_counts vs. n_genes_by_counts, coloured by ``color``."""
    # cluster QC
    axis_names = ("total_counts", "n_genes_by_counts")
    grid = sns.JointGrid(data=adata.obs, x=axis_names[0], y=axis_names[1], height=5, ratio=2, hue=color)
    grid.plot_joint(sns.scatterplot, s=40, alpha=0.3)
    grid.plot_marginals(sns.kdeplot)
    grid.set_axis_labels(*axis_names, fontsize=8)
    figure = grid.fig
    figure.set_figwidth(3)
    figure.set_figheight(3)
    plt.show()

In [None]:
# preprocess of UMAP
# preprocess of UMAP: normalize, log1p, regress out total_counts, scale (see preprocess_of_UMAP)
adata = preprocess_of_UMAP(adata, regress=True)

# compute pca with scanpy's default settings and inspect the variance explained per component
sc.tl.pca(adata)
sc.pl.pca_variance_ratio(adata, log=False)

In [None]:
# PCA with proper components
# PCA with proper components (recomputed with 20 components)
sc.tl.pca(adata, n_comps=20)

# Run UMAP                                                                                   
# neighbor graph with 50 neighbors, then the UMAP embedding
sc.pp.neighbors(adata, n_neighbors=50)
sc.tl.umap(adata)
# one UMAP panel per gene, saved to the cell_typing folder without being displayed
UMAP_genes_plot(adata, FOI='HCC', size=1, show=False, save=True, out_path=os.path.join(typ_path, 'Genes_UMAP.png'), datatype='direct', dataset=["PRISM_HCC"])

In [None]:
# Run Leiden cluster
# Run Leiden cluster
leiden_resolution=1
sc.tl.leiden(adata, resolution=leiden_resolution)

fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(10,4))
adata_tmp = adata.copy()
adata_tmp.write_h5ad(os.path.join(typ_path, f'adata_leiden_res_{leiden_resolution}.h5ad'))
# cluster sizes of the unfiltered clustering, listed in order of first appearance
cluster_sizes = adata_tmp.obs.leiden.value_counts()
a = [int(cluster_sizes[label]) for label in adata_tmp.obs.leiden.unique()]
sns.histplot(a, bins=100, stat='count', alpha=1, edgecolor='white', linewidth=0.5, ax=ax[0],
            line_kws=dict(color='black', alpha=0.7, linewidth=1.5))# binrange=[0,100], log=True,
ax[0].set_title(f'leiden_cluster_size_distribution_res={leiden_resolution}')

# keep only clusters with more than leiden_threshold observations
leiden_threshold = 400
kept_clusters = [label for label in adata_tmp.obs.leiden.unique() if cluster_sizes[label] > leiden_threshold]
adata_tmp = adata_tmp[adata_tmp.obs.leiden.isin(kept_clusters)]
adata_tmp.write_h5ad(os.path.join(typ_path, f'adata_leiden_res_{leiden_resolution}_filtered_{leiden_threshold}.h5ad'))
# cluster sizes after filtering
cluster_sizes = adata_tmp.obs.leiden.value_counts()
a = [int(cluster_sizes[label]) for label in adata_tmp.obs.leiden.unique()]
sns.histplot(a, bins=100, stat='count', alpha=1, edgecolor='white', linewidth=0.5, ax=ax[1],
            line_kws=dict(color='black', alpha=0.7, linewidth=1.5))# binrange=[0,100], log=True,
ax[1].set_title(f'leiden_threshold={leiden_threshold}')
plt.tight_layout()
plt.savefig(os.path.join(typ_path, f'ClusterSize_hist.png'), bbox_inches = 'tight', dpi=300)
plt.close()

In [None]:
# Plot Gene distribution
UMAP_leiden_plot(adata, FOI='HCC', color='leiden', show=False, save=False, out_path=os.path.join(typ_path, 'leiden_UMAP.png'), datatype='direct')
QC_plot(adata, hue='leiden')

# Integration with scRNA-seq using Harmony


## load and preprocess of sc data


### load sc data1


In [None]:
# path
sc_path = r"F:\spatial_data\reference\scRNAseq"
file = "HCC_GSE151530"

# load sc-rna data
adata_sc1_meta = sc.read_mtx(os.path.join(sc_path, file, "matrix.mtx"))
adata_sc1_meta = adata_sc1_meta.T

adata_temp_features = pd.read_csv(os.path.join(sc_path, file, "genes.tsv"), sep="\t", header=None)
adata_sc1_meta.var["gene_name"] = adata_temp_features[1].tolist()
adata_sc1_meta.var.set_index("gene_name", inplace=True)
adata_sc1_meta.var_names_make_unique()

info = pd.read_csv(os.path.join(sc_path,file,'Info.txt'), sep='\t')
adata_sc1_meta.obs['sample'] = list(info['Sample'])
adata_sc1_meta.obs['type'] = list(info['Type'])
adata_sc1_meta.obs['dataset'] = ['GSE151530'] * len(adata_sc1_meta)

# general process
adata_sc1 = adata_sc1_meta.copy()
adata_sc1, _, _ = general_preprocess(adata=adata_sc1, min_genes=2000, min_counts=10000, max_counts=300000, min_cells=1,)

### load sc data2


In [None]:
# load sc-rna data GSE140228
# load sc-rna data GSE140228
sc_path = r"F:\spatial_data\reference\scRNAseq"
file = "HCC_GSE140228_immune"

# the CSV is transposed before building the AnnData, so its columns become observations
adata_sc2_meta = pd.read_csv(os.path.join(sc_path, file, "GSE140228_read_counts_Smartseq2.csv"), index_col=0)
adata_sc2_meta = sc.AnnData(adata_sc2_meta.T)
adata_sc2_meta.var_names_make_unique()

# per-cell annotations are assigned by position — assumes the info rows are in the same order as the cells; TODO confirm
info = pd.read_csv(os.path.join(sc_path, file, "GSE140228_cell_info_Smartseq2.tsv"), sep="\t")
adata_sc2_meta.obs["type"] = info["celltype_sub"].tolist()
adata_sc2_meta.obs["sample"] = info["Sample"].tolist()
adata_sc2_meta.obs["tissue"] = info["Tissue"].tolist()
adata_sc2_meta.obs['dataset'] = ['GSE140228'] * len(adata_sc2_meta)

# general process
# work on a copy so adata_sc2_meta keeps the unfiltered data
adata_sc2 = adata_sc2_meta.copy()
adata_sc2, _, _ = general_preprocess(adata=adata_sc2, min_genes=1000, min_counts=10000, max_counts=1500000, min_cells=1,)

### load sc data3


In [None]:
sc_path = r"F:\spatial_data\reference\scRNAseq"
file = "HCC_CNP0000650_CD45-"
# NOTE(review): the file name suggests log-TPM values rather than raw counts, while the filters below and the later normalize/log1p steps treat them like counts — confirm
sc_data_meta = pd.read_csv(os.path.join(sc_path, file, 'HCC_log_tpm_expression_matrix.txt'), sep='\t', index_col=0)
# the table is transposed, so its columns become observations
adata_sc3_meta = sc.AnnData(sc_data_meta.T)
adata_sc3_meta.var_names_make_unique()

info = pd.read_csv(os.path.join(sc_path, file,'HCC_cell_metadata.txt'), sep='\t')
# NOTE(review): unlike the two cells above, these are pandas Series assignments, which align on the index
# instead of by position; if info's index does not hold the cell names the columns become NaN — verify
adata_sc3_meta.obs['sample']=info['sample_name']
adata_sc3_meta.obs['type']=info['cell_type']
adata_sc3_meta.obs['tissue']=info['tissue_source']
adata_sc3_meta.obs['dataset']=['CNP0000650']*len(adata_sc3_meta)

# general process
# work on a copy so adata_sc3_meta keeps the unfiltered data
adata_sc3 = adata_sc3_meta.copy()
adata_sc3, _, _ = general_preprocess(adata=adata_sc3, min_genes=2000, min_counts=1000, max_counts=25000, min_cells=1,)

## harmony combine


### preprocess of sc data

In [None]:
# adata.var.index = adata.var.index.str.upper()
# adata.var.index = adata.var.index.str.upper()
# upper-case the gene names of every reference so they match the upper-case gene_list;
# upper-casing can create duplicates, hence var_names_make_unique afterwards
adata_sc1.var.index = adata_sc1.var.index.str.upper()
adata_sc1.var_names_make_unique()
adata_sc2.var.index = adata_sc2.var.index.str.upper()
adata_sc2.var_names_make_unique()
adata_sc3.var.index = adata_sc3.var.index.str.upper()
adata_sc3.var_names_make_unique()

# gene_list[1:] drops the first entry ("HBV") — presumably because it is absent from the scRNA-seq references; TODO confirm
list_of_variable_names = gene_list[1:]
adata_subset = adata[:, list_of_variable_names]
adata_subset.var.index = adata_subset.var.index.str.upper()

In [None]:
# work on a copy; all genes are kept (the per-gene subsetting below is commented out)
adata_sc1_subset = adata_sc1.copy()
# adata_sc_subset = adata_sc[:, list_of_variable_names]
# Normalization scaling sc
sc.pp.normalize_total(adata_sc1_subset)
sc.pp.log1p(adata_sc1_subset)
# Calculate QC metrics
# NOTE(review): computed after normalize_total/log1p, so total_counts here is taken from the transformed matrix — confirm this is intended
sc.pp.calculate_qc_metrics(adata_sc1_subset, percent_top=None, inplace=True)
# Regress out total_counts, then scale data to unit variance and zero mean
sc.pp.regress_out(adata_sc1_subset, ["total_counts"])
sc.pp.scale(adata_sc1_subset)

In [None]:
# work on a copy; all genes are kept (the subsetting variants below are commented out)
adata_sc2_subset = adata_sc2.copy()
# adata_sc2_subset = adata_sc2[adata_sc2[:,'CPA3'].X > 0][:, list_of_variable_names]
# adata_sc2_subset = adata_sc2[:, list_of_variable_names]
# Normalization scaling sc
sc.pp.normalize_total(adata_sc2_subset)
sc.pp.log1p(adata_sc2_subset)
# Calculate QC metrics
# NOTE(review): computed after normalize_total/log1p, so total_counts here is taken from the transformed matrix — confirm this is intended
sc.pp.calculate_qc_metrics(adata_sc2_subset, percent_top=None, inplace=True)
# Regress out total_counts, then scale data to unit variance and zero mean
sc.pp.regress_out(adata_sc2_subset, ["total_counts"])
sc.pp.scale(adata_sc2_subset)

In [None]:
# work on a copy; all genes are kept (the commented-out lines below are left over from the sc2 cell)
adata_sc3_subset = adata_sc3.copy()
# adata_sc2_subset = adata_sc2[adata_sc2[:,'CPA3'].X > 0][:, list_of_variable_names]
# adata_sc2_subset = adata_sc2[:, list_of_variable_names]
# Normalization scaling sc
sc.pp.normalize_total(adata_sc3_subset)
sc.pp.log1p(adata_sc3_subset)
# Calculate QC metrics
# NOTE(review): computed after normalize_total/log1p, so total_counts here is taken from the transformed matrix — confirm this is intended
sc.pp.calculate_qc_metrics(adata_sc3_subset, percent_top=None, inplace=True)
# Regress out total_counts, then scale data to unit variance and zero mean
sc.pp.regress_out(adata_sc3_subset, ["total_counts"])
sc.pp.scale(adata_sc3_subset)

### harmony

In [None]:
# Stack the spatial subset with the three references; "dataset" becomes the batch label with the given category names
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in favour of anndata.concat — check the installed version
combine_adata = adata_subset.concatenate(adata_sc1_subset, adata_sc2_subset, adata_sc3_subset, batch_key="dataset", batch_categories=["PRISM_HCC", "GSE151530", "GSE140228", "CNP0000650"])
# combine_adata = adata_subset.concatenate(adata_sc1_subset, batch_key="dataset", batch_categories=["st", "scrna1"])
# sc.pp.combat(combine_adata, key="dataset")

# Compare the gene sets before and after combining to see which genes were lost
print("origin_gene:", adata.var.index)
print("origin_gene_num:", len(adata.var.index))
print("combine_gene:", combine_adata.var.index)
print("combine_gene_num:", len(combine_adata.var.index))
print("Genes_not_matched:", set(adata.var.index) - set(combine_adata.var.index))

In [None]:
# PCA with default settings on the combined data; inspect the variance explained per component
sc.tl.pca(combine_adata)
sc.pl.pca_variance_ratio(combine_adata, log=False)

In [None]:
# number of principal components computed here and then used as Harmony input
h_pcs = 29
sc.tl.pca(combine_adata, n_comps=h_pcs)
# number of reference (non-spatial) observations in the combined object
sc_cell_num = len(combine_adata) - len(adata)
print(combine_adata)

In [None]:
import scanpy.external as sce
import random

# seed both numpy's and Python's global random generators before running Harmony
np.random.seed(42)
random.seed(42)
# batch key "dataset"; input embedding "X_pca"; the corrected embedding is stored as obsm["X_pca_harmony"]
sce.pp.harmony_integrate(combine_adata, "dataset", "X_pca", "X_pca_harmony", max_iter_harmony=30) # max_iter_kmeans=30,

## umap and leiden


In [None]:
neighbor = 50
# build the neighbor graph on the Harmony-corrected embedding instead of the raw PCA
sc.pp.neighbors(combine_adata, n_neighbors=neighbor, use_rep="X_pca_harmony")
sc.tl.umap(combine_adata)

In [None]:
leiden_resolution=4
sc.tl.leiden(combine_adata, resolution=leiden_resolution)

# cluster sizes before filtering, listed in order of first appearance
cluster_sizes = combine_adata.obs.leiden.value_counts()
a = [int(cluster_sizes[label]) for label in combine_adata.obs.leiden.unique()]
fig, ax = plt.subplots(figsize=(10,2))
sns.histplot(a, bins=100, stat='count', alpha=1, kde=True, edgecolor='white', linewidth=0.5,ax=ax,
            line_kws=dict(color='black', alpha=0.7, linewidth=1.5, label='KDE')) # log=True, # binrange=[0,100]
plt.show()

# keep only clusters holding more than 1/200 of all observations
min_cluster_size = len(combine_adata)/200
kept_clusters = [label for label in combine_adata.obs.leiden.unique() if cluster_sizes[label] > min_cluster_size]
combine_adata = combine_adata[combine_adata.obs.leiden.isin(kept_clusters)]
# cluster sizes after filtering
cluster_sizes = combine_adata.obs.leiden.value_counts()
a = [int(cluster_sizes[label]) for label in combine_adata.obs.leiden.unique()]
fig, ax = plt.subplots(figsize=(10,2))
sns.histplot(a, bins=100, stat='count', alpha=1, kde=True, edgecolor='white', linewidth=0.5,
            ax=ax, line_kws=dict(color='black', alpha=0.7, linewidth=1.5, label='KDE')) # log=True, # binrange=[0,100]
plt.show()

UMAP_genes_plot(combine_adata, FOI, dataset=['PRISM_HCC'], datatype='harmony', size=1, show=False, save=True, out_path=os.path.join(typ_path,'UMAP_genes.png'))
UMAP_leiden_plot(combine_adata, FOI, color='leiden', datatype='harmony', show=False, save=True, out_path=os.path.join(typ_path,'UMAP_leiden.png'))

In [None]:
# re-cluster the filtered object at a higher resolution; this overwrites obs['leiden'] from the previous cell
leiden_resolution=8
sc.tl.leiden(combine_adata, resolution=leiden_resolution)
UMAP_leiden_plot(combine_adata, FOI, color='leiden', save=True, out_path=os.path.join(typ_path,f'UMAP_leiden_res={leiden_resolution}.png'), datatype='harmony')

In [None]:
# persist the clustered, combined object
# NOTE(review): the "Threshold" section later reads 'combine_adata_leiden.h5ad', which is a different file name — verify
combine_adata.write_h5ad(os.path.join(typ_path, 'combine_adata_leiden_res=8.h5ad'))

In [None]:
# cluster labels sorted numerically but kept as strings, because obs['leiden'] is compared against strings below
leiden_list = [str(i) for i in sorted([int(_) for _ in combine_adata.obs.leiden.unique()])]

# one row of per-gene boxplots for every cluster
# NOTE(review): with a single cluster plt.subplots returns one Axes rather than an array, so ax[_] would fail
fig, ax = plt.subplots(ncols=1, nrows=len(leiden_list),figsize =(25, 1.5*len(leiden_list)))
for _ in range(len(leiden_list)):
    cluster_num = leiden_list[_]
    # expression matrix of the observations in this cluster; boxplot draws one box per column (gene)
    data = combine_adata[combine_adata.obs['leiden'] == cluster_num].X
    ax[_].boxplot(data, flierprops={'marker': 'o', 'markersize': 2, 'markerfacecolor': 'fuchsia'})
    ax[_].set_xticklabels(list(combine_adata.var_names))
    ax[_].set_ylabel(f'cluster{cluster_num}')
    # fixed y-range so that all rows are comparable
    ax[_].set_ylim(-2, 5)
plt.savefig(os.path.join(typ_path,'leiden_boxplot_res=8.png'), bbox_inches='tight',dpi=300)
plt.close()

In [None]:
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results\cluster\2023.8.29_HCC_all\GSE151530\HCC_GSE_harmony.h5ad')
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results\cluster\2023.9.6_harmony-cluster_and_thre-in-cluster\without_CLEC9A, MS4A1\harmony_GSE151530_without_CLEC9A, MS4A1.h5ad')
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results/cluster/2023.9.6_harmony-cluster_and_thre-in-cluster/without_CLEC9A, MS4A1/harmony_GSE151530&GSE140228_without_CLEC9A, MS4A1.h5ad')
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results\cluster\2023.9.6_harmony-cluster_and_thre-in-cluster\without_CLEC9A, MS4A1, CPA3\combine_adata.h5ad')
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results\cluster\2023.9.7_HCC_harmony_loose_filter\sc_GSE151530_30genes\combine_adata_with_GSE151530_30genes.h5ad')
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results\cluster\2023.9.7_HCC_harmony_loose_filter\sc_GSE151530&GSE140228&CNP0000650_30genes\combine_adata_with_GSE151530&GSE140228&CNP0000650_30genes.h5ad')
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results\cluster\2023.9.10\combine_adata_with_GSE151530&GSE140228&CNP0000650_30genes_res=5.5.h5ad')
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results\cluster\2023.9.12\combine_adata_with_GSE151530&GSE140228&CNP0000650_30genes.h5ad')
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results\cluster\2023.9.12\combine_adata_with_GSE151530&GSE140228&CNP0000650_30genes, res=5.5.h5ad')
# combine_adata = sc.read_h5ad(r'E:\TMC\cell_typing\results\cluster\2023.9.13\combine_adata_res=5.5, filtered, typed.h5ad')

# Thresholding within clusters of interest


In [None]:
# reload the intermediate results written by earlier cells
adata = sc.read_h5ad(os.path.join(typ_path,'adata_leiden_res_1_filtered_400.h5ad'))
# NOTE(review): the cell above writes 'combine_adata_leiden_res=8.h5ad'; no visible cell writes 'combine_adata_leiden.h5ad' — verify the file name
combine_adata = sc.read_h5ad(os.path.join(typ_path, 'combine_adata_leiden.h5ad'))

## Clusters of interest

In [58]:
import yaml

# NOTE(review): yaml.FullLoader can construct more than plain data; yaml.safe_load would be the safer choice
# if the annotation file holds only mappings and lists — confirm before loading files from other sources
with open(os.path.join(typ_path, 'leiden_annotation.yaml'), 'r') as f:
    leiden_annotation = yaml.load(f, Loader=yaml.FullLoader)

# Flatten the two-level annotation {type: {subtype: [values]}} into two lookups:
#   leiden_subtype_dict: subtype -> its values
#   leiden_type_dict:    type    -> the values of all its subtypes, concatenated
leiden_type_dict = {}
leiden_subtype_dict = {}
for type_key, subtypes in leiden_annotation.items():
    type_values = []
    for subtype_key, values in subtypes.items():
        type_values.extend(values)
        leiden_subtype_dict[subtype_key] = values
    leiden_type_dict[type_key] = type_values

## threshold

In [59]:
def threshold_in_cluster(adata, marker_gene=[], thre_gene=['AFP','GPC3','ACTA2','PECAM1'], type_name=[], cluster_dict={}):
    """Mark observations of selected clusters that fail per-gene expression thresholds.

    The clusters are those entries of ``cluster_dict`` whose key contains one of
    ``type_name``. For every gene a threshold is taken from the local minima of the
    expression KDE inside these clusters (see ``_pick_kde_threshold``). An observation
    is kept ('1') when it exceeds the threshold of a marker gene and is rejected ('-1')
    again when it exceeds the threshold of any gene in ``thre_gene``. Rejected
    observations get ``adata.obs['tmp_leiden'] = '-2'``.

    Parameters
    ----------
    adata : AnnData with ``obs['leiden']`` and an existing ``obs['tmp_leiden']`` column.
    marker_gene : genes an observation must express above threshold to be kept.
    thre_gene : genes whose expression above threshold rejects an observation.
    type_name : substrings selecting keys of ``cluster_dict``.
    cluster_dict : mapping of cluster name to a list of Leiden ids.

    Returns
    -------
    The same ``adata`` object, modified in place.

    Raises
    ------
    KeyError : if ``adata.obs`` has no ``tmp_leiden`` column.
    """
    if 'tmp_leiden' not in adata.obs:
        raise KeyError("adata.obs['tmp_leiden'] must exist before calling threshold_in_cluster")

    # True = marker gene (keeps observations), False = threshold gene (rejects observations)
    thre_min = [True] * len(marker_gene) + [False] * len(thre_gene)
    genes = marker_gene + thre_gene
    minima_dict = {}

    # Leiden ids of every cluster whose name contains one of the requested type names
    cluster_list_temp = []
    for cluster_key, leiden_ids in cluster_dict.items():
        for name in type_name:
            if name in cluster_key:
                cluster_list_temp += [str(leiden_id) for leiden_id in leiden_ids]

    # .copy(): the temporary labels below are written to an independent obs table, not to a view of adata
    cluster = adata[adata.obs.leiden.isin(cluster_list_temp)].copy()

    # squeeze=False keeps the axes indexable even when only one gene is plotted
    fig, axes = plt.subplots(nrows=1, ncols=len(thre_min), figsize=(24, 4), squeeze=False)
    ax = axes[0]
    for i, gene in enumerate(genes):
        a = _gene_values(cluster, gene)
        sns.histplot(a, bins=20, stat='density', alpha= 1, kde=True,
                     edgecolor='white', linewidth=0.5, ax=ax[i],
                    line_kws=dict(color='black', alpha=0.7, linewidth=1.5, label='KDE')) # log=True,
        kde_line = ax[i].get_lines()[0]
        kde_line.set_color('red') # edit line color due to bug in sns v 0.11.0
        ax[i].set_xlabel(gene)

        # local minima of the KDE, located on the line's own x data
        # (previously approximated from the sample index)
        kde_x = np.asarray(kde_line.get_xdata(), dtype=float)
        kde_y = np.asarray(kde_line.get_ydata(), dtype=float)
        minima_dict[gene] = [float(kde_x[j]) for j in argrelextrema(kde_y, np.less)[0]]
        # print(f'{gene}_minima: {minima_dict[gene]}')
    # figure-level layout and title only need to be set once
    fig.subplots_adjust(hspace=0.4)
    fig.subplots_adjust(wspace=0.4)
    fig.suptitle(f'distribution of cluster, marker gene={marker_gene}')
    plt.show()

    cluster.obs['tmp_leiden'] = '-1'
    for keep_above, gene in zip(thre_min, genes):
        threshold = _pick_kde_threshold(minima_dict[gene])
        print(f'{gene}_thre: {threshold}')

        above = _gene_values(cluster, gene) > threshold
        # .loc instead of chained indexing, so the write reliably reaches cluster.obs
        cluster.obs.loc[above, 'tmp_leiden'] = '1' if keep_above else '-1'

    rejected = cluster.obs.index[(cluster.obs['tmp_leiden'] == '-1').to_numpy()]
    adata.obs.loc[rejected, 'tmp_leiden'] = '-2'

    cell_to_plot = int((cluster.obs['tmp_leiden'] == '1').sum())
    print(f'marker_gene={marker_gene}, {cell_to_plot} cells of {len(cluster)} cells left\n')
    return adata


def _gene_values(adata, gene):
    """Return the expression of ``gene`` as a flat float array (dense or sparse ``X``)."""
    expr = adata[:, gene].X
    expr = expr.toarray() if hasattr(expr, 'toarray') else np.asarray(expr)
    return expr.ravel().astype(float)


def _pick_kde_threshold(minima):
    """Choose the threshold from KDE minima: skip minima below -0.5; use the first remaining
    one if it is at most 0.5, otherwise (or when none remain) fall back to 0."""
    for candidate in minima:
        if candidate < -0.5:
            continue
        return 0 if candidate > 0.5 else candidate
    return 0


def collect_liver(combine_adata_st, tissue_obs='tissue', in_out_leiden='tmp_leiden'):
    """Relabel unassigned liver observations from '-2' to '-1'.

    An observation is relabelled when ``obs[in_out_leiden] == '-2'`` and
    ``obs[tissue_obs] == 'liver'``. The label column is first rebuilt from a plain list
    so the new value can be written even if the column was categorical.

    Returns the same object, modified in place.
    """
    obs = combine_adata_st.obs
    unassigned_liver = ((obs[in_out_leiden] == '-2') & (obs[tissue_obs] == "liver")).to_numpy()
    combine_adata_st.obs[in_out_leiden] = list(combine_adata_st.obs[in_out_leiden])
    # single .loc assignment instead of chained indexing (obs[col][rows] = ...),
    # which is not guaranteed to write back into obs
    combine_adata_st.obs.loc[unassigned_liver, in_out_leiden] = "-1"
    return combine_adata_st


def re_num_leiden(combine_adata_st, cluster_dict, in_leiden='tmp_leiden', out_leiden='new_leiden', out_type='type'):
    """Translate the labels in ``obs[in_leiden]`` into a numbered group and a group name.

    Parameters
    ----------
    combine_adata_st : object with an ``obs`` DataFrame holding the ``in_leiden`` column.
    cluster_dict : mapping of group name to a list of labels; each label is compared as ``str``.
    in_leiden : obs column with the input labels.
    out_leiden : obs column receiving the position of the group in ``cluster_dict`` (as a
        string), or '-2' for observations that belong to no group.
    out_type : obs column receiving the group name, or 'other' for observations that
        belong to no group.

    Returns the same object, modified in place.
    """
    combine_adata_st.obs[out_leiden] = "-2"
    combine_adata_st.obs[out_type] = "other"
    for cluster_num, (cluster_name, sub_clusters) in enumerate(cluster_dict.items()):
        members = [str(sub_cluster) for sub_cluster in sub_clusters]
        in_group = combine_adata_st.obs[in_leiden].isin(members).to_numpy()
        # .loc instead of chained indexing, so the writes reliably reach obs
        combine_adata_st.obs.loc[in_group, out_leiden] = str(cluster_num)
        combine_adata_st.obs.loc[in_group, out_type] = str(cluster_name)
    return combine_adata_st

In [None]:
# initialization of combine_adata_st and cluster_to_plot
# initialization of combine_adata_st and cluster_to_plot
# .copy(): the index rename and the new column below are applied to an independent object, not to a view of combine_adata
combine_adata_st = combine_adata[combine_adata.obs.dataset == 'PRISM_HCC'].copy()
# rename obs index: remove the '-PRISM_HCC' batch suffix so the names match adata again
sub = [_.replace('-PRISM_HCC','') for _ in combine_adata_st.obs.index]
combine_adata_st.obs_names = sub
# collect liver: restrict adata to the same observations (same order) and copy over their tissue label
adata = adata[combine_adata_st.obs.index]
# bracket assignment always writes the column; attribute assignment only does so when the column already exists
combine_adata_st.obs['tissue'] = adata.obs['tissue']
combine_adata_st.write_h5ad(os.path.join(typ_path, 'combine_adata_st.h5ad'))

In [None]:
# # threshold for marker gene clusters
# # combine_adata_st = threshold_in_cluster(combine_adata_st, marker_gene=['AFP', 'GPC3'], thre_gene=[], type_name=['Tumor'], cluster_dict=cluster_dict)
# cluster_to_filter = ['CPA3', 'LYVE1', 'SLC4A10', 'CLEC9A', 'CD1C']
# for marker_gene in cluster_to_filter:
#     combine_adata_st = threshold_in_cluster(combine_adata_st, [marker_gene], thre_gene=['AFP','GPC3','ACTA2','PECAM1'], type_name=[marker_gene], cluster_dict=cluster_dict)

In [61]:
combine_adata_st = sc.read_h5ad(os.path.join(typ_path, 'combine_adata_st.h5ad'))
# not in cluster_dict is other: start with '-2' everywhere ...
combine_adata_st.obs['tmp_leiden'] = ['-2']*len(combine_adata_st)
# ... then copy the Leiden label for every observation whose cluster appears in the annotation
defined_leiden = [str(_) for j in leiden_subtype_dict.values() for _ in j]
defined_adata = combine_adata_st[combine_adata_st.obs.leiden.isin(defined_leiden)]
combine_adata_st.obs.loc[defined_adata.obs.index, 'tmp_leiden'] = defined_adata.obs['leiden']
# liver cells: still-unassigned ('-2') observations with tissue == 'liver' become '-1'
combine_adata_st = collect_liver(combine_adata_st, tissue_obs='tissue', in_out_leiden='tmp_leiden')

# rename leidens based on cluster_dict
# fine-grained labels (leiden_subtype / subtype) and coarse labels (leiden_type / type) from the same tmp_leiden column
combine_adata_st = re_num_leiden(combine_adata_st, leiden_subtype_dict, in_leiden='tmp_leiden', out_leiden='leiden_subtype', out_type='subtype')
combine_adata_st = re_num_leiden(combine_adata_st, leiden_type_dict, in_leiden='tmp_leiden', out_leiden='leiden_type', out_type='type')

In [62]:
# overwrite the file read at the top of the previous cell, now including the type / subtype columns
combine_adata_st.write_h5ad(os.path.join(typ_path, 'combine_adata_st.h5ad'))

# Plot: dotplot, umap

In [126]:
adata = sc.read_h5ad(os.path.join(typ_path, 'adata_leiden_res_1_filtered_400.h5ad'))
combine_adata_st = sc.read_h5ad(os.path.join(typ_path, 'combine_adata_st.h5ad'))
# restrict adata to the annotated observations (same order) and take over their obs annotations
adata = adata[combine_adata_st.obs.index]
adata.obs = combine_adata_st.obs.copy()
# drop .raw — presumably so that the plots below read adata.X instead of the stored raw matrix; TODO confirm
adata.raw = None

## dotplot

In [110]:
# genes shown on the dotplot, in display order
tmp_var_names=[
    'HBV', 'AFP', 'GPC3', 'MKI67', 'PECAM1', 'EPCAM', 'ACTA2',
    'FOXP3',    
    'CD3D', 'CD4', 'PDCD1', 'CTLA4', 'CXCL13', 'NCAM1', 'GZMA', 'GZMB', 'PRF1', 
    'CD8A',
    'CD79A', 'MS4A1', 'MZB1',
    'LILRA4', 
    'CLEC9A', 'CD1C', 'LYVE1',
    'C1QA', 'FCGR3A', 'S100A8', 'CSF3R',    
    'SLC4A10', 'CPA3',
    ]

# cell types shown on the dotplot, in display order
tmp_category = [
    'Liver', 'Tumor', 'other_cell_proliferation', 'Endo', 'Ep', 'CAF',
    'T_reg', 'NK', 'CD4+', 'CD8+', 
    'B', 'DC', 'Macrophage','Monocyte','Neutrophil',
    'Mait','Mast', 
    ]

# exclude unassigned observations; the categorical fixes the row order of the dotplot
tmp = adata[combine_adata_st.obs.type!='other']
tmp.obs.type = pd.Categorical(tmp.obs.type, categories=tmp_category)

fig,ax = plt.subplots(figsize=(12,10))
sc.pl.dotplot(tmp, var_names=tmp_var_names, groupby='type', ax=ax, show=False)
plt.tight_layout()
plt.savefig(os.path.join(figure_path, 'cell_type_dotplot.pdf'))
plt.savefig(os.path.join(figure_path, 'cell_type_dotplot.png'))
plt.close()
# sanity check: types present in the data but not plotted, and plotted categories absent from the data
print('not_drown:', set(combine_adata_st.obs.type.unique())-set(tmp_category))
print('not_in:', set(tmp_category)-set(combine_adata_st.obs.type.unique()))

not_drown: {'other'}
not_in: set()


In [111]:
# genes shown on the dotplot, in display order (same list as in the cell-type dotplot)
tmp_var_names=[
    'HBV', 'AFP', 'GPC3', 'MKI67', 'PECAM1', 'EPCAM', 'ACTA2',
    'FOXP3',
    'CD3D', 'CD4', 'PDCD1', 'CTLA4', 'CXCL13', 'NCAM1', 'GZMA', 'GZMB','PRF1',
    'CD8A',
    'CD79A', 'MS4A1', 'MZB1',
    'LILRA4',
    'CLEC9A', 'CD1C', 'LYVE1',
    'C1QA', 'FCGR3A', 'S100A8','CSF3R',    
    'SLC4A10', 'CPA3',
    ]

# cell subtypes shown on the dotplot, in display order
tmp_category = [
    'Liver',
    'Tumor_AFP+', 'Tumor_GPC3+', 'Tumor_proliferation', 'other_cell_proliferation', 
    'Endo_PECAM1+', 'Ep_EPCAM+', 'CAF_ACTA2+',
    'T_reg', 
    'T_CD4+, PD1+', 'T_CD4+, PD1+, CTLA4+',
    'T_CD4+, CTLA4+','T_CD4+, CXCL13+', 'Cyto_T_CD4+',
    'NK_NCAM1+',
    'Cyto_T_CD8+', 'T_CD8+, PD1+', 'T_CD8+, GZMA+, CXCL13+', 'T_CD8+, CTLA4+',
    'B_CD79A+', 'B_CD79A+, MS4A1+', 'B_MS4A1+', 'Plasma_B_CD79A+, MZB1+',
    'pDC_LILRA4+', 'cDC1_CLEC9A+', 'cDC2_CD1C+',
    'Macrophage_LYVE1+', 'Macrophage_C1QA+',
    'Monocyte_CD16+', 'Monocyte_CD14+, CD16+', 'Monocyte_CD14+',
    'Neutrophil_CSF3R+, S100A8+', 'Neutrophil_CSF3R+',
    'Mait_SLC4A10+', 'Mast_CPA3+',
]

# exclude unassigned observations; the categorical fixes the row order of the dotplot
tmp = adata[combine_adata_st.obs.subtype!='other']
tmp.obs.subtype = pd.Categorical(tmp.obs.subtype, categories=tmp_category)

fig,ax = plt.subplots(figsize=(14,12))
sc.pl.dotplot(tmp, var_names=tmp_var_names, groupby='subtype', ax=ax, show=False)
plt.tight_layout()
plt.savefig(os.path.join(figure_path, 'cell_subtype_dotplot.pdf'))
plt.savefig(os.path.join(figure_path, 'cell_subtype_dotplot.png'))
plt.close()
# sanity check: subtypes present in the data but not plotted, and plotted categories absent from the data
print('not_drown:', set(combine_adata_st.obs.subtype.unique())-set(tmp_category))
print('not_in:', set(tmp_category)-set(combine_adata_st.obs.subtype.unique()))

not_drown: {'other'}
not_in: set()


## umap

In [127]:
# RGB colour (components in 0..1) for every coarse cell type
# NOTE(review): 'Mast' and 'NK' are both (1,0,0) and cannot be told apart on a plot — confirm this is intended
type_colormap = {
    'Liver':(1,0.392,0),
    'Tumor':(0.751,0.491,0),
    'other_cell_proliferation': (0,0.5,0.636),
    'Endo':(1,0,1),
    'Ep':(0,1,0),
    'CAF':(0,0,1),
    'DC':(1,0.259,0),
    'Mait':(1,0,0.434),
    'Mast':(1,0,0),
    'Monocyte':(0,0.471,1),
    'Neutrophil':(1,1,0),
    'Macrophage':(0.5,0.8,0),
    'CD4+':(0.5,0.5,0.5),
    'CD8+':(1,0.8,0),
    'T_reg':(0,1,0.672),
    'B':(0,1,1),
    'NK':(1,0,0),
    'other':(0.9,0.9,0.9),
}

# Each subtype inherits the colour of the first coarse type (in the dict order above) whose
# name occurs as a substring of the subtype name; subtypes without a match get no entry.
subtype_colormap = dict()
for subtype in leiden_subtype_dict.keys():
    for rough_type in type_colormap.keys():
        if rough_type in subtype:
            subtype_colormap[subtype] = type_colormap[rough_type]
            break

In [128]:
# drop unassigned observations, then save the UMAP coloured by coarse cell type using type_colormap
tmp = combine_adata_st[~combine_adata_st.obs['type'].isin(['other'])]
UMAP_leiden_plot(adata=tmp, DOI=['PRISM_HCC'], datatype='harmony', color='type', FOI='HCC', palette=type_colormap, legend_loc='right margin', 
                 show=False, save=True, out_path=os.path.join(figure_path, 'cell_type_UMAP.png'))
# UMAP_genes_plot(adata=tmp, FOI='HCC', dataset=['PRISM_HCC'], datatype='harmony', size=1, 
#                 show=False, save=True, out_path=os.path.join(figure_path, 'genes_UMAP.png'))

# Plot: heatmap

In [119]:
def UMAP_genes_plot(adata, FOI, size=0.1, show=True, save=False, out_path='./UMAP_genes.png', dpi=300, datatype='direct', dataset=[]):
    """Draw one UMAP panel per gene in ``adata.var_names``, coloured by expression.

    Parameters
    ----------
    adata : AnnData with a computed UMAP; ``uns['pca']`` and ``uns['neighbors']``
        are read for the figure title.
    FOI, datatype : str, only used to build the figure title.
    size : marker size forwarded to ``sc.pl.umap``.
    show : display the figure with ``plt.show()``.
    save : write the figure to ``out_path`` (``dpi`` is applied to ``.png`` files only).
    dataset : values of ``adata.obs['dataset']`` to plot. An empty value (the
        default) plots every cell instead of an empty selection. The default list
        is never mutated.

    Raises
    ------
    ValueError : if ``adata`` has no variables to plot.
    """
    n_pcs = len(adata.uns['pca']['variance'])
    n_neighbors = adata.uns['neighbors']['params']['n_neighbors']

    genes = list(adata.var_names)
    if not genes:
        raise ValueError("adata has no variables to plot")

    # Subset once instead of once per gene; an empty filter means "all cells".
    if dataset is not None and len(dataset) > 0:
        plot_data = adata[adata.obs.dataset.isin(dataset)]
    else:
        plot_data = adata

    # Near-square grid: ceil(sqrt(n)) columns and as many rows as needed.
    ncols = int(np.ceil(np.sqrt(len(genes))))
    nrows = -(-len(genes) // ncols)
    # squeeze=False keeps a 2-D axes array even when the grid has one row or column.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*4, nrows*4), squeeze=False)
    panels = axes.ravel()
    for panel, gene_name in zip(panels, genes):
        sc.pl.umap(
            plot_data,
            ax=panel,
            size=size, color=gene_name, legend_fontweight=100, legend_fontsize=20,
            show=False, vmax=5, vmin=0)
        panel.set_xticklabels("")
        panel.set_yticklabels("")
    # Hide the grid cells that have no gene assigned.
    for panel in panels[len(genes):]:
        panel.axis('off')

    fig.suptitle(
        "{}\nexp:{}\nUMAP:{}\n".format(
        f"{FOI}_{datatype}_{dataset}_UMAP",
        f"cell_num={len(adata)}",
        f"n_neighbors={n_neighbors}, n_pcs={n_pcs}"),
        fontsize=20,
    )
    plt.tight_layout()
    if save:
        if out_path.endswith('.png'): fig.savefig(f"{out_path}", bbox_inches = 'tight', dpi=dpi)
        else: fig.savefig(f"{out_path}", bbox_inches = 'tight')
    if show: plt.show()
    plt.close(fig)


def UMAP_leiden_plot(adata, FOI='', color='leiden', show=True, save=False, out_path='./UMAP_leiden.png',dpi=300, datatype='direct', DOI=['PRISM_HCC'], legend_loc='on data',palette=False):
    """Two-panel UMAP: cells of the datasets in ``DOI`` coloured by ``color`` (left)
    and all cells coloured by ``obs['dataset']`` (right).

    Parameters
    ----------
    adata : AnnData with a computed UMAP; ``uns['pca']`` and ``uns['neighbors']``
        are read for the figure title.
    FOI, datatype : str, only used to build the figure title.
    color : obs column (or gene) used to colour the left panel.
    show : display the figure with ``plt.show()``.
    save : write the figure to ``out_path`` (``dpi`` is applied to ``.png`` files only).
    DOI : values of ``adata.obs['dataset']`` shown in the left panel. The default
        list is never mutated.
    legend_loc : forwarded to ``sc.pl.umap`` for the left panel.
    palette : optional palette forwarded to ``sc.pl.umap``; a falsy value keeps
        scanpy's default colours.
    """
    n_pcs = len(adata.uns['pca']['variance'])
    n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
    # The resolution only appears in the title; do not fail when no leiden run is
    # recorded (for example when colouring by an annotation column).
    resolution = adata.uns.get('leiden', {}).get('params', {}).get('resolution', 'NA')

    # Plot Cluster
    fig, (cluster_ax, dataset_ax) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
    cluster_kwargs = dict(color=color, legend_loc=legend_loc, legend_fontsize=7,
                          ax=cluster_ax, show=False)
    if palette:
        cluster_kwargs['palette'] = palette
    sc.pl.umap(adata[adata.obs.dataset.isin(DOI)], **cluster_kwargs)
    sc.pl.umap(adata,
        color="dataset", legend_fontweight=100, legend_fontsize=20,
        ax=dataset_ax, show=False)

    fig.suptitle(
        "{}\nexp:{}\nUMAP:{}\n".format(
        f"{FOI}_{datatype}_cluster",
        f"cell_num={len(adata)}",
        f"n_neighbors={n_neighbors}, n_pcs={n_pcs}, resolution={resolution}"),
        fontsize=20,
    )
    plt.tight_layout()
    if save:
        if out_path.endswith('.png'): fig.savefig(f"{out_path}", bbox_inches = 'tight', dpi=dpi)
        else: fig.savefig(f"{out_path}", bbox_inches = 'tight')
    if show: plt.show()
    plt.close(fig)


def leiden_QC_plot(adata, color='leiden'):
    """Show a joint plot of ``total_counts`` vs ``n_genes_by_counts`` from ``adata.obs``.

    Points are coloured by the obs column named ``color``; the marginals are KDE curves.
    """
    grid_kwargs = dict(data=adata.obs, x="total_counts", y="n_genes_by_counts",
                       height=5, ratio=2, hue=color)
    grid = sns.JointGrid(**grid_kwargs)
    grid.plot_joint(sns.scatterplot, s=40, alpha=0.3)
    grid.plot_marginals(sns.kdeplot)
    grid.set_axis_labels("total_counts", "n_genes_by_counts", fontsize=8)
    # The figure is resized to 3 x 3 inches after the grid has been built.
    qc_fig = grid.fig
    qc_fig.set_figwidth(3)
    qc_fig.set_figheight(3)
    plt.show()

In [118]:
def _cluster_mean_matrix(X, labels, clusters):
    """Stack the mean row of ``X`` for each cluster label into an (n_clusters, n_vars) array."""
    rows = [
        # np.asarray(...).ravel() also flattens the 2-D np.matrix returned for sparse input.
        np.asarray(np.mean(X[np.asarray(labels == str(cluster))], axis=0)).ravel()
        for cluster in clusters
    ]
    if not rows:
        return np.empty((0, X.shape[1]))
    return np.vstack(rows)


def matrix_for_heatmap(adata_1, adata_2, adata_ori, obs_1="leiden_type", obs_2="leiden", 
                       cluster_of_intere=None, sc_cluster_of_intere=None, save=False, whole=False):
    """Per-cluster mean expression of two datasets and their cross-correlation.

    Parameters
    ----------
    adata_1 : object with ``.X`` and ``.obs``; rows are grouped by ``obs[obs_1]``.
    adata_2 : object with ``.X`` and ``.obs``; rows are grouped by ``obs[obs_2]``.
    adata_ori : object whose ``.X`` is averaged with the groups of ``adata_1`` when
        ``whole=True``; it must have the same rows as ``adata_1``.
    cluster_of_intere, sc_cluster_of_intere : iterables of cluster labels for
        ``adata_1`` and ``adata_2``. Labels are compared as ``str(label)``. Both are
        required despite the keyword form.
    save : unused; kept so existing calls keep working.
    whole : return the means computed on ``adata_ori.X`` instead of ``adata_1.X``
        as the first element. The correlation always uses ``adata_1.X``.

    Returns
    -------
    (raw_means, sc_means, corr) where ``corr[i, j]`` is the Pearson correlation
    between the mean profile of raw cluster ``i`` and sc cluster ``j``. A cluster
    with no cells yields a NaN row.

    Raises
    ------
    TypeError : if either cluster list is missing.
    """
    missing = (None, list)  # `list` (the type) was the historical, unusable default
    if any(cluster_of_intere is m for m in missing) or any(sc_cluster_of_intere is m for m in missing):
        raise TypeError("cluster_of_intere and sc_cluster_of_intere must both be given as lists of cluster labels")

    cluster_of_intere = list(cluster_of_intere)
    sc_cluster_of_intere = list(sc_cluster_of_intere)
    raw_cluster_num = len(cluster_of_intere)
    sc_cluster_num = len(sc_cluster_of_intere)

    raw_labels = adata_1.obs[obs_1]
    raw_data_matrix = _cluster_mean_matrix(adata_1.X, raw_labels, cluster_of_intere)
    sc_data_matrix = _cluster_mean_matrix(adata_2.X, adata_2.obs[obs_2], sc_cluster_of_intere)

    # Correlate all cluster profiles at once, then keep only the raw-vs-sc quadrant.
    matrix = np.concatenate((raw_data_matrix, sc_data_matrix), axis=0)
    corr_matrix = np.atleast_2d(np.corrcoef(matrix))
    cross_corr = corr_matrix[0 : raw_cluster_num, raw_cluster_num : raw_cluster_num + sc_cluster_num]

    if whole:
        raw_data_whole_matrix = _cluster_mean_matrix(adata_ori.X, raw_labels, cluster_of_intere)
        return raw_data_whole_matrix, sc_data_matrix, cross_corr
    return raw_data_matrix, sc_data_matrix, cross_corr

## cluster of sc data

In [None]:
# Merge the three single-cell references; obs['dataset'] records where each cell came from.
sc_batch_names = ["GSE151530", "GSE140228", "CNP0000650"]
adata_sc_subset = adata_sc1_subset.concatenate(
    adata_sc2_subset,
    adata_sc3_subset,
    batch_key="dataset",
    batch_categories=sc_batch_names,
)
# adata_sc_subset = combine_adata[combine_adata.obs.dataset.isin(['GSE151530','GSE140228','CNP0000650'])]

In [None]:
import scanpy.external as sce

# Restrict to the panel genes; take a real copy because the steps below write
# results (PCA, neighbours, UMAP) into the object.
adata_sc_subset = adata_sc_subset[:, list_of_variable_names].copy()

# PCA is computed once and reused for both the variance-ratio plot and harmony.
h_pcs = 29
sc.tl.pca(adata_sc_subset, n_comps=h_pcs)
sc.pl.pca_variance_ratio(adata_sc_subset, log=False)
print(adata_sc_subset)

# Batch-correct the PCA embedding across the values of obs['dataset'].
sce.pp.harmony_integrate(adata_sc_subset, "dataset", "X_pca", "X_pca_harmony", max_iter_harmony=30) # max_iter_kmeans=30,

# Neighbour graph and UMAP are built on the harmony-corrected embedding.
neighbor = 50
sc.pp.neighbors(adata_sc_subset, n_neighbors=neighbor, use_rep="X_pca_harmony")
sc.tl.umap(adata_sc_subset)

In [None]:
# Leiden clustering of the single-cell reference; labels are stored in obs['leiden'].
leiden_resolution = 4
sc.tl.leiden(
    adata_sc_subset,
    resolution=leiden_resolution,
)

In [None]:
# Figures go to figure_path like the other plots in this notebook; out_path must
# be a file path, not a directory.
sc_datasets = ['GSE151530', 'GSE140228', 'CNP0000650']
UMAP_genes_plot(adata=adata_sc_subset, size=1, FOI=FOI, save=False,
                out_path=os.path.join(figure_path, 'sc_genes_UMAP.png'),
                datatype='sc_harmony', dataset=sc_datasets)
UMAP_leiden_plot(adata=adata_sc_subset, FOI=FOI, color='leiden', save=True,
                 out_path=os.path.join(figure_path, 'sc_leiden_UMAP.pdf'),
                 datatype='sc_harmony',
                #  legend_loc='right margin', 
                 DOI=sc_datasets)

In [None]:
# One box plot panel per leiden cluster showing the expression of every gene.
# Cluster labels are sorted numerically and kept as strings for the obs comparison.
leiden_list = [str(i) for i in sorted(int(label) for label in adata_sc_subset.obs['leiden'].unique())]

# squeeze=False keeps a 2-D axes array even when there is a single cluster.
fig, ax = plt.subplots(ncols=1, nrows=len(leiden_list), figsize=(20, 3*len(leiden_list)), squeeze=False)
gene_labels = list(adata_sc_subset.var_names)
for panel, cluster_num in zip(ax[:, 0], leiden_list):
    # NOTE(review): assumes .X is a dense array that boxplot can consume — confirm.
    data = adata_sc_subset[adata_sc_subset.obs['leiden'] == cluster_num].X
    panel.boxplot(data,
                  flierprops={'marker': 'o', 'markersize': 2, 'markerfacecolor': 'fuchsia'})
    panel.set_xticklabels(gene_labels)
    panel.set_title(f'cluster{cluster_num}')
plt.show()

In [None]:
# Dendrogram over the leiden clusters (single linkage), computed before the dot
# plot that is drawn with dendrogram=True.
sc.tl.dendrogram(adata_sc_subset, groupby='leiden', linkage_method='single')

fig, ax = plt.subplots(figsize=(13, 15))
sc.pl.dotplot(
    adata_sc_subset,
    var_names=list_of_variable_names,
    groupby='leiden',
    vmax=5,
    ax=ax,
    dendrogram=True,
    show=False,
)
plt.show()

In [25]:
# Manual annotation: fine cell subtype -> leiden cluster ids of the sc reference.
# NOTE(review): cluster 20 is listed under both "T_CD8+_other" and "Cyto_T_CD8+" —
# confirm which subtype it belongs to.
sc_cluster_dict = {
    # tumour / stroma
    "Tumor_AFP+": [67, 12, 54, 11],
    "Tumor_GPC3+": [69, 37, 21, 24, 29, 41, 47],
    "Tumor_proliferation": [2],
    "Endo_PECAM1+": [35],
    "Ep_EPCAM+": [10],
    "CAF_ACTA2+": [5],

    # CD4 T cells
    "T_reg": [28, 31],
    "T_CD4+, CTLA4+": [25],
    "T_CD4+, PD1+, CTLA4+": [],
    "T_CD4+, PD1+": [30],
    "T_CD4+, PD1+, CXCL13+": [],
    "T_CD4+, CXCL13+": [],
    "Cyto_T_CD4+": [16, 53],
    "T_CD4+_other": [60],

    # CD8 T cells
    "T_CD8+_other": [20],
    "T_CD8+, PD1+": [42, 44],
    "T_CD8+, GZMA+, CXCL13+": [0, 46],
    "Cyto_T_CD8+": [27, 17, 20, 23, 33],

    "T_proliferation": [],
    "NK_NCAM1+": [9],

    # B cells
    "B_CD79A+": [58, 38, 70, 14],
    "B_MS4A1+": [32],
    "B_CD79A+, MS4A1+": [7],
    "B_proliferation": [],

    # dendritic cells
    "cDC1_CLEC9A+": [39],
    "cDC2_CD1C+": [3],
    "pDC_LILRA4+": [36],

    # other myeloid / innate
    "Mait_SLC4A10+": [6],
    "Mast_CPA3+": [45],
    "Monocyte_CD14+": [50],
    "Monocyte_CD14+, CD16+": [],
    "Monocyte_CD16+": [64, 51, 8, 15, 18],
    "Neutrophil_CSF3R+, S100A8+": [1],
    "Neutrophil_CSF3R+": [4, 56],

    "Macrophage_C1QA+": [40, 55],
    "Macrophage_LYVE1+": [13],

    "others_or_ILCs_MZB1+": [],
    "others": [],
}

# Coarse types, in display order. "Tumor" is kept as one group (not split into
# Tumor_AFP+ / Tumor_GPC3+ / Tumor_proliferation) and 'MZB1+' is not a coarse type.
sc_rough_type_names = (
    "Liver", "Tumor", "Endo", "Ep", "CAF", "DC", "Mait", "Mast",
    "Monocyte", "Neutrophil", "Macrophage", "CD4+", "CD8+", "T_reg",
    "T_proliferation", "B", "NK",
)

# A coarse type collects the clusters of every subtype whose name contains the
# coarse name as a substring, in sc_cluster_dict order.
# NOTE(review): "B" also matches "others_or_ILCs_MZB1+"; harmless while that
# subtype has no clusters, but it would be counted as B once it gets any.
sc_cluster_rough_dict = {
    rough_type: [
        cluster
        for fine_type, clusters in sc_cluster_dict.items()
        if rough_type in fine_type
        for cluster in clusters
    ]
    for rough_type in sc_rough_type_names
}

In [None]:
# initialization of combine_adata_st and cluster_to_plot
adata_sc_subset.obs['tmp_leiden'] = list(adata_sc_subset.obs['leiden'])

# rename leidens based on cluster_dict
adata_sc_subset = re_num_leiden(adata_sc_subset, sc_cluster_dict, in_leiden='tmp_leiden', out_leiden='leiden_subtype', out_type='subtype')
adata_sc_subset = re_num_leiden(adata_sc_subset, sc_cluster_rough_dict, in_leiden='tmp_leiden', out_leiden='leiden_type', out_type='type')

In [None]:
# Marker genes shown on the dot plot, in display order ('HBV' is left out).
tmp_var_names=[
    # 'HBV',
    'AFP', 'GPC3', 'MKI67', 'PECAM1', 'EPCAM', 'ACTA2', 'FOXP3',
    'CD3D', 'CD4', 'PDCD1', 'CTLA4', 'CXCL13', 'NCAM1', 'GZMA', 'GZMB', 'PRF1',
    'CD8A', 'CD79A', 'MS4A1','MZB1', 
    'LILRA4', 'CLEC9A', 'CD1C', 'LYVE1', 'C1QA', 'FCGR3A', 'S100A8', 'CSF3R', 'SLC4A10', 'CPA3',
    ]

# Coarse types shown on the dot plot, in display order.
# NOTE(review): this list has 'proliferation' while sc_cluster_rough_dict uses
# 'T_proliferation'; values missing from the categories become NaN in
# pd.Categorical, so such cells would silently drop out — confirm the intended name.
tmp_category = [
    # 'Liver',
    'Tumor', 'Endo', 'Ep', 'CAF', 'T_reg', 'proliferation', 'NK',
    'CD4+', 'CD8+',  'B', 'DC', 'Macrophage', 'Monocyte', 'Neutrophil', 'Mait', 'Mast',
    ]

# Explicit copy: obs is modified below, which must not happen on a view.
tmp = adata_sc_subset[adata_sc_subset.obs['type'] != 'other'].copy()
tmp.obs['type'] = pd.Categorical(tmp.obs['type'], categories=tmp_category)
fig,ax = plt.subplots(figsize=(12,10))
sc.pl.dotplot(tmp, var_names=tmp_var_names, show=False, groupby='type', vmax=5, ax=ax)
plt.show()

In [None]:
# Save the typed sc reference under typ_path, which is where the
# "calculation of corr matrix" section reads it back from.
adata_sc_subset.write(os.path.join(typ_path, 'adata_sc_subset_res=4,retyped_by_zch.h5ad'))

## calculation of corr matrix

In [114]:
# Reload the typed sc reference from disk.
sc_typed_path = os.path.join(typ_path, 'adata_sc_subset_res=4,retyped_by_zch.h5ad')
adata_sc_subset = sc.read_h5ad(sc_typed_path)
# Display only: the filtered view is not assigned, so adata_sc_subset still holds all cells.
adata_sc_subset[adata_sc_subset.obs['type'] != 'other']

View of AnnData object with n_obs × n_vars = 30628 × 30
    obs: 'sample', 'type', 'dataset', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'n_genes', 'n_counts', 'tissue', 'leiden', 'tmp_leiden', 'leiden_subtype', 'subtype', 'leiden_type'
    var: 'n_cells_by_counts-CNP0000650', 'mean_counts-CNP0000650', 'log1p_mean_counts-CNP0000650', 'pct_dropout_by_counts-CNP0000650', 'total_counts-CNP0000650', 'log1p_total_counts-CNP0000650', 'n_cells-CNP0000650', 'mean-CNP0000650', 'std-CNP0000650', 'n_cells_by_counts-GSE140228', 'mean_counts-GSE140228', 'log1p_mean_counts-GSE140228', 'pct_dropout_by_counts-GSE140228', 'total_counts-GSE140228', 'log1p_total_counts-GSE140228', 'n_cells-GSE140228', 'mean-GSE140228', 'std-GSE140228', 'n_cells_by_counts-GSE151530', 'mean_counts-GSE151530', 'log1p_mean_counts-GSE151530', 'pct_dropout_by_counts-GSE151530', 'total_counts-GSE151530', 'log1p_total_counts-GSE151530', 'n_cells-GSE151530', 'mean-GSE151530', 'std-GSE151

## corr type

In [120]:
# Compare on every panel gene except the first entry of gene_list ("HBV").
list_of_variable_names = gene_list[1:]
combine_adata_st = combine_adata_st[:, list_of_variable_names]
adata_sc_subset = adata_sc_subset[:, list_of_variable_names]

# The same coarse type names are looked up in both the spatial and the sc data.
cluster_of_intere = list(leiden_type_dict)
raw_data_matrix, sc_data_matrix, corr_matrix = matrix_for_heatmap(
    adata_1=combine_adata_st,
    adata_2=adata_sc_subset,
    adata_ori=combine_adata_st,
    obs_1="type",
    obs_2="type",
    cluster_of_intere=cluster_of_intere,
    sc_cluster_of_intere=cluster_of_intere,
    save=False,
    whole=False,
)

In [121]:
# map_plot of raw vs. sc cluster
map_plot = pd.DataFrame(corr_matrix, columns=cluster_of_intere, index=cluster_of_intere)
plt.figure(figsize=(24, 18))
heatmap = sns.heatmap(map_plot, cmap="coolwarm", annot=False)
plt.savefig(os.path.join(figure_path, 'corr_sc_type.pdf'), bbox_inches = 'tight')
plt.close()

## corr subtype

In [122]:
# Restrict both datasets to the shared gene list before comparing subtypes.
combine_adata_st = combine_adata_st[:, list_of_variable_names]
adata_sc_subset = adata_sc_subset[:, list_of_variable_names]

# The same subtype names are looked up in both the spatial and the sc data.
cluster_of_intere = list(leiden_subtype_dict)
raw_data_matrix, sc_data_matrix, corr_matrix = matrix_for_heatmap(
    adata_1=combine_adata_st,
    adata_2=adata_sc_subset,
    adata_ori=combine_adata_st,
    obs_1="subtype",
    obs_2="subtype",
    cluster_of_intere=cluster_of_intere,
    sc_cluster_of_intere=cluster_of_intere,
    save=False,
    whole=False,
)

In [123]:
# Heat map of the subtype-level correlation: rows come from the first (spatial)
# matrix passed to matrix_for_heatmap, columns from the second (sc) one.
map_plot = pd.DataFrame(corr_matrix, index=cluster_of_intere, columns=cluster_of_intere)
df = map_plot.copy()

# A subtype whose row or column is entirely NaN is removed from BOTH axes, so the
# matrix stays square.
nan_mask = df.isna()
all_nan_columns = df.columns[nan_mask.all(axis=0)].tolist()
all_nan_rows = df.index[nan_mask.all(axis=1)].tolist()
union_set = set(all_nan_columns) | set(all_nan_rows)
df = df.drop(index=union_set, columns=union_set)

fig, ax = plt.subplots(figsize=(24, 18))
heatmap = sns.heatmap(df, cmap="coolwarm", annot=False, ax=ax)
fig.savefig(os.path.join(figure_path, 'corr_sc_subtype.pdf'), bbox_inches = 'tight')
plt.close(fig)