In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import bbknn
import os
from scipy import sparse
import matplotlib.pyplot as plt
# from scanpy_base_moudle_update2 import *
# import scrublet as scr
import datetime
import harmonypy as hm

sc.settings.verbosity = 3  # scanpy logging verbosity
# sc.logging.print_versions()

# Default resolution and styling for scanpy figures.
sc.settings.set_figure_params(dpi=150, figsize=(4, 3), fontsize=12)

# Print the font families matplotlib knows about, to check that the family
# requested below is installed. `get_fontconfig_fonts()` was deprecated and then
# removed from matplotlib, so the font manager's own registry is queried instead.
import matplotlib.font_manager
flist = [font.fname for font in matplotlib.font_manager.fontManager.ttflist]
names = sorted({font.name for font in matplotlib.font_manager.fontManager.ttflist})
print(names)

params = {
    # 'font.style': 'italic',
    'font.weight': 'normal',  # or 'bold'
}
plt.rcParams.update(params)

plt.rcParams['font.family'] = 'Arial'

In [None]:
matplotlib.matplotlib_fname()

In [None]:
adata = sc.read('/home/wangyue/basic-calculation_data/c_Project_outputs/qilu_CRSwNP/IMM.h5ad')
adata

In [None]:
sc.settings.set_figure_params(dpi=600, figsize = (4, 4), fontsize=7)
sc.pl.umap(adata, color=['annotation'], add_outline=True, outline_width = (0.2, 0.05), palette="tab20", frameon=False)

In [None]:
adata.obs['annotation'].cat.categories

In [None]:
np.array(adata.uns['annotation_colors'])

In [None]:
# Show the category order first: the palette below is edited by position.
print(adata.obs['annotation'].cat.categories)

# Build an array from the palette currently stored for 'annotation'.
old_colors = np.array(adata.uns['annotation_colors'])

# NOTE: this binds a second name to the same array; no copy is made here.
new_colors = old_colors
# NOTE(review): the cell-type names in the comments below come from the original
# cell. They do not obviously match the 'C0x-...' annotation labels used
# elsewhere in this notebook — confirm which category each index addresses.
# B cells
new_colors[[17]] = '#1f77b4'
# Endothelial cells
new_colors[[18]] = '#aec7e8'
# Epithelium
new_colors[[19]] = '#ff7f0e'
# Fibroblasts
new_colors[[20]] = '#ffbb78'
# Glia cells
new_colors[[21]] = '#2ca02c'
# Granulocytes
new_colors[22] = '#98df8a'
# MNPs/DCs
new_colors[[23]] = '#d62728'
# Mast cells
new_colors[[24]] = '#ff9896'
# Pericytes
new_colors[[25]] = '#9467bd'
# Plasma cells
new_colors[[26]] = '#f7b6d2'

# Store the edited palette back under the same key.
adata.uns['annotation_colors'] = new_colors

In [None]:
sc.settings.set_figure_params(dpi=600, figsize = (4, 4), fontsize=7)
sc.pl.umap(adata, color=['annotation'], add_outline=True, outline_width = (0.2, 0.05), frameon=False)

In [None]:
sc.settings.set_figure_params(fontsize=12)
# Original note (translated): "chemokine receptors".
# NOTE(review): confirm that this label matches the gene list plotted below.
# Dot plot of the listed genes grouped by 'annotation', using adata.raw
# (use_raw=True) with expression scaled per variable (standard_scale='var').
# return_fig=True returns the plot object so totals and grid can be added.
mp = sc.pl.dotplot(adata, 
              ['NOTUM','ST3GAL1','LTB4R','LTB4R2','CYSLTR1','CYSLTR2'], 
              'annotation', 
              dendrogram=False, 
              #figsize=(8,10),
              use_raw=True,
              cmap = 'RdYlBu_r',
              var_group_rotation=45,
              swap_axes=False,
              standard_scale='var',
              colorbar_title='Scaled expression in var',
              return_fig=True
              # , save='_' + sample_name + '_fig11.png'
             ) 
# Add the per-group totals, draw the grid, then render.
mp.add_totals().style(grid=True).show()

In [None]:
sc.settings.set_figure_params(dpi=400, figsize = (4, 4), fontsize=7)
sc.pl.umap(adata, color=['tissue'], frameon=False, title='')
sc.pl.umap(adata, color=['Health'], frameon=False, title='')

In [None]:
# Collect the obs index (cell barcodes) of each tissue / health group.
# NOTE(review): E01_index is not used by the labelling cell that follows, which
# uses 'Rhi-IT' as the default label instead.
E01_index = adata.obs.loc[adata.obs["tissue"].isin(['inferior turbinate']), :].index

E02_index = adata.obs.loc[adata.obs["tissue"].isin(['middle turbinate']), :].index

E03_index = adata.obs.loc[adata.obs["tissue"].isin(['polyp']), :].index

# Selected on the 'Health' column rather than 'tissue'.
E04_index = adata.obs.loc[adata.obs["Health"].isin(['healthy control']), :].index

In [None]:
# Label every cell with its sample group. 'Rhi-IT' is the default; the later
# assignments override it, and healthy controls are applied last so they take
# precedence over the tissue-based labels.
# `.loc` is used because `.at` is a scalar accessor and is not meant to receive
# a whole Index of row labels.
adata.obs['fig2_barplot'] = 'Rhi-IT'
adata.obs.loc[E02_index, 'fig2_barplot'] = 'Rhi-MT'
adata.obs.loc[E03_index, 'fig2_barplot'] = 'Rhi-NP'
adata.obs.loc[E04_index, 'fig2_barplot'] = 'HC-IT'

In [None]:
sc.settings.set_figure_params(dpi=600, figsize = (4, 4), fontsize=7)
sc.pl.umap(adata, color=['fig2_barplot'], frameon=False, title='')

In [None]:
sc.pl.umap(adata, color=['fig2_barplot'], groups = ['HC-IT','Rhi-IT'], frameon=False, title ='')

In [None]:
sc.pl.umap(adata, color=['fig2_barplot'], groups = ['Rhi-MT','Rhi-NP'], frameon=False, title ='')

In [None]:
adata.uns['annotation_colors']

In [None]:
adata.obs['annotation'].cat.categories

In [None]:
adata

In [None]:
# Collect the obs index of each immune compartment from the fine-grained
# 'annotation' labels.
T_index = adata.obs.loc[adata.obs["annotation"].isin(['C02-T01-NK cells', 'C02-T02-ILC1', 'C02-T03-ILC2', 'C02-T04-ILC3',
       'C02-T05-CD4+ Naive T', 'C02-T06-CD4+ Treg', 'C02-T07-CD4+Th2',
       'C02-T08-CD4+ Th17', 'C02-T09-CD4+ Trm', 'C02-T10-CD8+ T cells',
       'C02-T11-CD8+ GNLY+ T cells', 'C02-T12-Cycling T']), :].index

B_index = adata.obs.loc[adata.obs["annotation"].isin(['C03-B01-Naive B cells', 'C03-B02-Memory B cells', 'C03-B03-Cycling B']), :].index

P_index = adata.obs.loc[adata.obs["annotation"].isin(['C03-B04-IgA Plasma cells', 'C03-B05-IgG Plasma cells']), :].index

MNPs_index = adata.obs.loc[adata.obs["annotation"].isin(['C04-M01-Macrophages', 'C04-M02-CCL18+ Macrophages',
       'C04-M03-CCL13+ Macrophages', 'C04-M04-CXCL8+ Macrophages',
       'C04-M05-Monocytes', 'C04-M06-DC1', 'C04-M07-DC2',
       'C04-M08-Migratory DCs', 'C04-M09-pDC']), :].index

Mast_index = adata.obs.loc[adata.obs["annotation"].isin(['C05-Mast cell']), :].index

# '1-T/ILCs' is the default label, so any cell not matched by the B, plasma,
# MNP or mast selections keeps it (T_index itself is not used for assignment).
adata.obs['compartment'] = '1-T/ILCs'

# `.loc` is used because `.at` is a scalar accessor and is not meant to receive
# a whole Index of row labels.
adata.obs.loc[B_index, 'compartment'] = '2-B'
adata.obs.loc[P_index, 'compartment'] = '3-Plasma'
adata.obs.loc[MNPs_index, 'compartment'] = '4-MNPs-DCs'
adata.obs.loc[Mast_index, 'compartment'] = '5-Mast'

In [None]:
sc.settings.set_figure_params(dpi=600, figsize = (4, 4), fontsize=7)
sc.pl.umap(adata, color=['compartment'], add_outline=True, outline_width = (0.2, 0.05), palette="tab20", frameon=False)

In [None]:
adata.uns['compartment_colors']

In [None]:
adata.obs['compartment'].cat.categories

In [None]:
# Stacked bar chart: for each sample group, the fraction of its cells that
# falls in each compartment.
Groups_tab_1 = pd.crosstab(index=adata.obs['fig2_barplot'],
                           columns=adata.obs['compartment'], margins=True)
# Divide every row by its 'All' margin (the row total) to turn counts into
# fractions, then drop the margin row and column.
MyTab_1 = Groups_tab_1.div(Groups_tab_1["All"], axis=0)
MyTab2_1 = MyTab_1.drop(columns="All")
MyTab2_1 = MyTab2_1.drop(index="All")

# Fix the left-to-right order of the bars.
order = ['HC-IT', 'Rhi-IT', 'Rhi-MT', 'Rhi-NP']
MyTab2_1 = MyTab2_1.loc[order]

# Sort the stacked segments by column label.
MyTab2_1.columns = pd.CategoricalIndex(MyTab2_1.columns.values)
MyTab2_1 = MyTab2_1.sort_index(axis=1)

ax = MyTab2_1.plot.bar(
    figsize=(3.7, 7),
    stacked=True,
    edgecolor='#000000',
    linewidth=0.4,
    width=0.8,
    fontsize=15,
    color={"1-T/ILCs": "#1f77b4",
           '2-B': "#98df8a",
           '3-Plasma': "#8c564b",
           '4-MNPs-DCs': "#c7c7c7",
           '5-Mast': "#9edae5"},
)

plt.title("", fontsize=12)
plt.ylabel("Fraction of cells", fontsize=15)
plt.xlabel("", fontsize=12)
# Set the limit through the Axes method; assigning a number to `plt.ylim` would
# overwrite the pyplot function instead of setting a limit.
ax.set_ylim(0, 1.0)

# Reverse the legend entries so they read top-down in the same order as the stack.
handles, labels = ax.get_legend_handles_labels()
ax.legend(reversed(handles), reversed(labels), loc='center left', bbox_to_anchor=(1, 0.6), fontsize=8)
plt.grid(False)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

In [None]:
# Select the cells annotated as NK / ILC / T populations.
T_index = adata.obs.loc[adata.obs["annotation"].isin(['C02-T01-NK cells', 'C02-T02-ILC1', 'C02-T03-ILC2', 'C02-T04-ILC3',
       'C02-T05-CD4+ Naive T', 'C02-T06-CD4+ Treg', 'C02-T07-CD4+Th2',
       'C02-T08-CD4+ Th17', 'C02-T09-CD4+ Trm', 'C02-T10-CD8+ T cells',
       'C02-T11-CD8+ GNLY+ T cells', 'C02-T12-Cycling T']), :].index
# Take an independent copy rather than a view: later steps write new embeddings
# into this object, which would otherwise trigger an implicit copy of the view.
adata_T = adata[T_index, :].copy()
adata_T

In [None]:
def progress_pca_harmony_umap(adata, batch_key='batch_name', n_neighbors=10, n_pcs=20,
                              max_iter_harmony=20):
    """Harmony-correct an existing PCA, then rebuild the neighbour graph and UMAP.

    PCA is not recomputed here; ``adata.obsm['X_pca']`` must already exist.

    Parameters
    ----------
    adata : AnnData
        Object to update in place.
    batch_key : str
        Column of ``adata.obs`` given to Harmony as the batch variable.
    n_neighbors, n_pcs : int
        Forwarded to ``sc.pp.neighbors``.
    max_iter_harmony : int
        Forwarded to ``hm.run_harmony``.

    Returns
    -------
    AnnData
        The same ``adata`` object, with ``obsm['X_pca_harmony']`` added and the
        neighbour graph and UMAP computed on it.

    Raises
    ------
    KeyError
        If ``adata.obsm['X_pca']`` is missing.
    """
    if 'X_pca' not in adata.obsm:
        raise KeyError("adata.obsm['X_pca'] is missing; run sc.tl.pca before calling this function")

    # Batch-effect correction with Harmony on the existing principal components.
    harmony_out = hm.run_harmony(adata.obsm['X_pca'], adata.obs, vars_use=[batch_key],
                                 max_iter_harmony=max_iter_harmony)
    # Presumably Z_corr is (components x cells), hence the transpose — TODO confirm
    # against the installed harmonypy version.
    adata.obsm['X_pca_harmony'] = harmony_out.Z_corr.T
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, use_rep='X_pca_harmony', n_pcs=n_pcs)

    # Compute the UMAP embedding from the corrected neighbour graph.
    sc.tl.umap(adata)

    return adata

adata_T = progress_pca_harmony_umap(adata_T)

In [None]:
sc.settings.set_figure_params(dpi=400, figsize = (4, 4), fontsize=7)
sc.pl.umap(adata_T, color=['annotation'], add_outline=True, outline_width = (0.2, 0.05), palette="tab20", frameon=False, legend_loc='on data')

In [None]:
adata_T.uns['annotation_colors']

In [None]:
adata_T.obs['annotation'].cat.categories

In [None]:
# Stacked bar chart: for each sample group, the fraction of its cells (within
# adata_T) that carries each annotation.
Groups_tab_1 = pd.crosstab(index=adata_T.obs['fig2_barplot'],
                           columns=adata_T.obs['annotation'], margins=True)
# Divide every row by its 'All' margin (the row total) to turn counts into
# fractions, then drop the margin row and column.
MyTab_1 = Groups_tab_1.div(Groups_tab_1["All"], axis=0)
MyTab2_1 = MyTab_1.drop(columns="All")
MyTab2_1 = MyTab2_1.drop(index="All")

# Fix the left-to-right order of the bars.
order = ['HC-IT', 'Rhi-IT', 'Rhi-MT', 'Rhi-NP']
MyTab2_1 = MyTab2_1.loc[order]

# Sort the stacked segments by column label.
MyTab2_1.columns = pd.CategoricalIndex(MyTab2_1.columns.values)
MyTab2_1 = MyTab2_1.sort_index(axis=1)

ax = MyTab2_1.plot.bar(
    figsize=(3.7, 5.7),
    stacked=True,
    edgecolor='#000000',
    linewidth=0.4,
    width=0.8,
    fontsize=15,
    color={"C02-T01-NK cells": "#1f77b4",
           'C02-T02-ILC1': "#aec7e8",
           'C02-T03-ILC2': "#ffbb78",
           'C02-T04-ILC3': "#98df8a",
           'C02-T05-CD4+ Naive T': "#ff9896",
           'C02-T06-CD4+ Treg': "#c5b0d5",
           'C02-T07-CD4+Th2': "#8c564b",
           'C02-T08-CD4+ Th17': "#e377c2",
           'C02-T09-CD4+ Trm': "#7f7f7f",
           'C02-T10-CD8+ T cells': "#bcbd22",
           'C02-T11-CD8+ GNLY+ T cells': "#17becf",
           'C02-T12-Cycling T': "#9edae5"},
)

plt.title("", fontsize=12)
plt.ylabel("Fraction of cells", fontsize=15)
plt.xlabel("", fontsize=12)
# Set the limit through the Axes method; assigning a number to `plt.ylim` would
# overwrite the pyplot function instead of setting a limit.
ax.set_ylim(0, 1.0)

# Reverse the legend entries so they read top-down in the same order as the stack.
handles, labels = ax.get_legend_handles_labels()
ax.legend(reversed(handles), reversed(labels), loc='center left', bbox_to_anchor=(1, 0.6), fontsize=8)
plt.grid(False)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

In [None]:
# Gene symbols kept as a list. As bare comma-separated names this line would be
# evaluated as a tuple of undefined variables and raise NameError.
# NOTE(review): the source of this gene set is not recorded here — add a reference.
gene_set = 'PSMB1,PSMC4,PSMA4,CUL1,TAB2,MAP2K4,PSME4,FBXW11,NFKB2,TOLLIP,PSMC5,IRAK3,PSME1,PSMD5,PSMD8,TAB1,RBX1,PSMC6,PSMA3,PSMC1,PSMB5,PSMA6,NFKBIA,PSME2,PSMA7,PSMD10,PSMD7,RIPK2,IKBKB,NFKBIB,NOD1,PSMA2,MAP3K8,PSMD3,PSMD11,MAP2K6,NFKB1,PSMD9,SKP1,IL1A,PSMD14,IL1R2,IL1R1,IL1B,PSMF1,PSMB2,SEM1,PSMA1,PSME3,IRAK2,MAP3K7,IL1RN,PSMB7,PELI2,APP,PSMB6,PSMA5,RPS27A,UBC,PSMA8,TAB3,PSMD4,PSMB4,S100B,SQSTM1,PSMC2,S100A12,PSMD6,PSMC3,BTRC,NOD2,NKIRAS2,TNIP2,MAP2K1,UBB,MYD88,RELA,SAA1,PSMD1,PELI3,TRAF6,PSMD2,UBE2N,IRAK1,PSMD13,HMGB1,IL1RAP,PSMD12,PELI1,NKIRAS1,IRAK4,MAP3K3,PSMB8,AGER,PSMB10,CHUK,UBA52,PSMB11,PSMB9,UBE2V1,IKBKG,PSMB3'.split(',')


In [None]:
# Select the cells annotated as B-cell or plasma-cell populations.
B_index = adata.obs.loc[adata.obs["annotation"].isin(['C03-B01-Naive B cells', 'C03-B02-Memory B cells', 'C03-B03-Cycling B',
                                                     'C03-B04-IgA Plasma cells', 'C03-B05-IgG Plasma cells']), :].index
# Take an independent copy rather than a view: later steps write new embeddings
# into this object, which would otherwise trigger an implicit copy of the view.
adata_B = adata[B_index, :].copy()
adata_B

In [None]:
def progress_pca_harmony_umap(adata, batch_key='batch_name', n_neighbors=10, n_pcs=20,
                              max_iter_harmony=20):
    """Harmony-correct an existing PCA, then rebuild the neighbour graph and UMAP.

    PCA is not recomputed here; ``adata.obsm['X_pca']`` must already exist.

    Parameters
    ----------
    adata : AnnData
        Object to update in place.
    batch_key : str
        Column of ``adata.obs`` given to Harmony as the batch variable.
    n_neighbors, n_pcs : int
        Forwarded to ``sc.pp.neighbors``.
    max_iter_harmony : int
        Forwarded to ``hm.run_harmony``.

    Returns
    -------
    AnnData
        The same ``adata`` object, with ``obsm['X_pca_harmony']`` added and the
        neighbour graph and UMAP computed on it.

    Raises
    ------
    KeyError
        If ``adata.obsm['X_pca']`` is missing.
    """
    if 'X_pca' not in adata.obsm:
        raise KeyError("adata.obsm['X_pca'] is missing; run sc.tl.pca before calling this function")

    # Batch-effect correction with Harmony on the existing principal components.
    harmony_out = hm.run_harmony(adata.obsm['X_pca'], adata.obs, vars_use=[batch_key],
                                 max_iter_harmony=max_iter_harmony)
    # Presumably Z_corr is (components x cells), hence the transpose — TODO confirm
    # against the installed harmonypy version.
    adata.obsm['X_pca_harmony'] = harmony_out.Z_corr.T
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, use_rep='X_pca_harmony', n_pcs=n_pcs)

    # Compute the UMAP embedding from the corrected neighbour graph.
    sc.tl.umap(adata)

    return adata

adata_B = progress_pca_harmony_umap(adata_B)

In [None]:
sc.settings.set_figure_params(dpi=400, figsize = (4, 4), fontsize=7)
sc.pl.umap(adata_B, color=['annotation'], add_outline=True, outline_width = (0.2, 0.05), palette="tab20", frameon=False, legend_loc='on data')

In [None]:
adata_B.uns['annotation_colors']

In [None]:
adata_B.obs['annotation'].cat.categories

In [None]:
# Stacked bar chart: for each sample group, the fraction of its cells (within
# adata_B) that carries each annotation.
Groups_tab_1 = pd.crosstab(index=adata_B.obs['fig2_barplot'],
                           columns=adata_B.obs['annotation'], margins=True)
# Divide every row by its 'All' margin (the row total) to turn counts into
# fractions, then drop the margin row and column.
MyTab_1 = Groups_tab_1.div(Groups_tab_1["All"], axis=0)
MyTab2_1 = MyTab_1.drop(columns="All")
MyTab2_1 = MyTab2_1.drop(index="All")

# Fix the left-to-right order of the bars.
order = ['HC-IT', 'Rhi-IT', 'Rhi-MT', 'Rhi-NP']
MyTab2_1 = MyTab2_1.loc[order]

# Sort the stacked segments by column label.
MyTab2_1.columns = pd.CategoricalIndex(MyTab2_1.columns.values)
MyTab2_1 = MyTab2_1.sort_index(axis=1)

ax = MyTab2_1.plot.bar(
    figsize=(3.7, 5.7),
    stacked=True,
    edgecolor='#000000',
    linewidth=0.4,
    width=0.8,
    fontsize=15,
    color={"C03-B01-Naive B cells": "#1f77b4",
           'C03-B02-Memory B cells': "#98df8a",
           'C03-B03-Cycling B': "#8c564b",
           'C03-B04-IgA Plasma cells': "#c7c7c7",
           'C03-B05-IgG Plasma cells': "#9edae5"},
)

plt.title("", fontsize=12)
plt.ylabel("Fraction of cells", fontsize=15)
plt.xlabel("", fontsize=12)
# Set the limit through the Axes method; assigning a number to `plt.ylim` would
# overwrite the pyplot function instead of setting a limit.
ax.set_ylim(0, 1.0)

# Reverse the legend entries so they read top-down in the same order as the stack.
handles, labels = ax.get_legend_handles_labels()
ax.legend(reversed(handles), reversed(labels), loc='center left', bbox_to_anchor=(1, 0.6), fontsize=8)
plt.grid(False)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

In [None]:
# Select the cells annotated as macrophage / monocyte / dendritic-cell populations.
M_index = adata.obs.loc[adata.obs["annotation"].isin(['C04-M01-Macrophages', 'C04-M02-CCL18+ Macrophages',
       'C04-M03-CCL13+ Macrophages', 'C04-M04-CXCL8+ Macrophages',
       'C04-M05-Monocytes', 'C04-M06-DC1', 'C04-M07-DC2',
       'C04-M08-Migratory DCs', 'C04-M09-pDC']), :].index
# Take an independent copy rather than a view: later steps write new embeddings
# into this object, which would otherwise trigger an implicit copy of the view.
adata_M = adata[M_index, :].copy()
adata_M

In [None]:
def progress_pca_harmony_umap(adata, batch_key='batch_name', n_neighbors=10, n_pcs=20,
                              max_iter_harmony=20):
    """Harmony-correct an existing PCA, then rebuild the neighbour graph and UMAP.

    PCA is not recomputed here; ``adata.obsm['X_pca']`` must already exist.

    Parameters
    ----------
    adata : AnnData
        Object to update in place.
    batch_key : str
        Column of ``adata.obs`` given to Harmony as the batch variable.
    n_neighbors, n_pcs : int
        Forwarded to ``sc.pp.neighbors``.
    max_iter_harmony : int
        Forwarded to ``hm.run_harmony``.

    Returns
    -------
    AnnData
        The same ``adata`` object, with ``obsm['X_pca_harmony']`` added and the
        neighbour graph and UMAP computed on it.

    Raises
    ------
    KeyError
        If ``adata.obsm['X_pca']`` is missing.
    """
    if 'X_pca' not in adata.obsm:
        raise KeyError("adata.obsm['X_pca'] is missing; run sc.tl.pca before calling this function")

    # Batch-effect correction with Harmony on the existing principal components.
    harmony_out = hm.run_harmony(adata.obsm['X_pca'], adata.obs, vars_use=[batch_key],
                                 max_iter_harmony=max_iter_harmony)
    # Presumably Z_corr is (components x cells), hence the transpose — TODO confirm
    # against the installed harmonypy version.
    adata.obsm['X_pca_harmony'] = harmony_out.Z_corr.T
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, use_rep='X_pca_harmony', n_pcs=n_pcs)

    # Compute the UMAP embedding from the corrected neighbour graph.
    sc.tl.umap(adata)

    return adata

adata_M = progress_pca_harmony_umap(adata_M)

In [None]:
sc.settings.set_figure_params(dpi=400, figsize = (4, 4), fontsize=7)
sc.pl.umap(adata_M, color=['annotation'], add_outline=True, outline_width = (0.2, 0.05), palette="tab20", frameon=False)

In [None]:
# Stacked bar chart: for each sample group, the fraction of its cells (within
# adata_M) that carries each annotation.
Groups_tab_1 = pd.crosstab(index=adata_M.obs['fig2_barplot'],
                           columns=adata_M.obs['annotation'], margins=True)
# Divide every row by its 'All' margin (the row total) to turn counts into
# fractions, then drop the margin row and column.
MyTab_1 = Groups_tab_1.div(Groups_tab_1["All"], axis=0)
MyTab2_1 = MyTab_1.drop(columns="All")
MyTab2_1 = MyTab2_1.drop(index="All")

# Fix the left-to-right order of the bars.
order = ['HC-IT', 'Rhi-IT', 'Rhi-MT', 'Rhi-NP']
MyTab2_1 = MyTab2_1.loc[order]

# Sort the stacked segments by column label.
MyTab2_1.columns = pd.CategoricalIndex(MyTab2_1.columns.values)
MyTab2_1 = MyTab2_1.sort_index(axis=1)

# No explicit colour mapping here: pandas' default colour cycle is used.
ax = MyTab2_1.plot.bar(
    figsize=(3.7, 5.7),
    stacked=True,
    edgecolor='#000000',
    linewidth=0.4,
    width=0.8,
    fontsize=15,
)

plt.title("", fontsize=12)
plt.ylabel("Fraction of cells", fontsize=15)
plt.xlabel("", fontsize=12)
# Set the limit through the Axes method; assigning a number to `plt.ylim` would
# overwrite the pyplot function instead of setting a limit.
ax.set_ylim(0, 1.0)

# Reverse the legend entries so they read top-down in the same order as the stack.
handles, labels = ax.get_legend_handles_labels()
ax.legend(reversed(handles), reversed(labels), loc='center left', bbox_to_anchor=(1, 0.6), fontsize=8)
plt.grid(False)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

In [None]:
sc.settings.set_figure_params(dpi=150, figsize = (4, 4), fontsize=12)

from matplotlib import cm, colors
import colorcet as cc

# Build a continuous colormap from the colorcet CET_L20 colour list.
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', cc.CET_L20)

# Sample 128 colours from the 0.2-1.0 (20%-100%) range of that map.
colors2 = mymap(np.linspace(0.2, 1, 128))
# Five colours from the 0.8-0.9 range of matplotlib's reversed grey map.
colors3 = plt.cm.Greys_r(np.linspace(0.8,0.9,5))
# Stack the greys first, followed by the CET_L20 colours, and rebuild the map.
colorsComb = np.vstack([colors3, colors2])
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)

# Colour the UMAP by IL1A and IL1B expression using the combined colormap.
sc.pl.umap(adata, color=['IL1A','IL1B'], frameon=False, color_map = mymap)

In [None]:
from matplotlib.pyplot import rc_context
sc.settings.set_figure_params(dpi=300, fontsize=20)
with rc_context({'figure.figsize': (12, 3)}):
    sc.pl.violin(adata_M, ['IL1A'], groupby='annotation', stripplot=False, inner='box')

with rc_context({'figure.figsize': (12, 3)}):
    sc.pl.violin(adata_M, ['IL1B'], groupby='annotation', stripplot=False, inner='box')

In [None]:
from matplotlib.pyplot import rc_context
sc.settings.set_figure_params(dpi=300, fontsize=12)
with rc_context({'figure.figsize': (12, 3)}):
    sc.pl.violin(adata_M, ['IL1B'], groupby='annotation', stripplot=False, inner='box')

In [None]:
from matplotlib.pyplot import rc_context

with rc_context({'figure.figsize': (4, 3)}):
    sc.pl.violin(adata_M, ['IL1A'], groupby='fig2_barplot', stripplot=False, inner='box')

with rc_context({'figure.figsize': (4, 3)}):
    sc.pl.violin(adata_M, ['IL1B'], groupby='fig2_barplot', stripplot=False, inner='box')

In [None]:
# Load one sample's alevin output (matrix plus row and column label files).
# NOTE: `adata` is rebound at the end of this cell, replacing whatever object
# the name referred to before.
readpath = '/mnt/data2/Datasets/Human_non_intestine_datasets/Qilu_Otorhinolaryngology_surgery_data/HRA000772/matrix/HRR222794/alevin/'

# NOTE(review): the labels read from quants_mat_rows.txt become the obs index
# and those from quants_mat_cols.txt become the var index of the AnnData built
# below; the variable names `genes` / `barcodes` suggest the opposite — confirm
# which file holds which.
genes = pd.read_csv(readpath+'quants_mat_rows.txt', sep='\t', header=None)
barcodes = pd.read_csv(readpath+'quants_mat_cols.txt', sep='\t', header=None)

genes_list = genes[0].values.tolist()
barcodes_list = barcodes[0].values.tolist()

import scipy.io as sp_io
matrix = sp_io.mmread(readpath+'quants_mat.mtx')
# The whole matrix is densified here before being wrapped in a DataFrame.
matrix_dense = matrix.todense()

dataframe_adult = pd.DataFrame(data=matrix_dense, index=genes_list, columns=barcodes_list)
adata = sc.AnnData(dataframe_adult)

import scipy
# Store X as a CSR sparse matrix again.
adata.X = scipy.sparse.csr_matrix(adata.X)
adata

In [None]:
# 共用模块
    
def change_obs_index_v1(x):
    """
    Prefix a barcode with the current sample identifier.

    Reads the module-level ``barcode_name`` and returns it joined to ``x`` with
    a '.' separator, so that barcodes carry their sample information.
    """
    return '.'.join((barcode_name, x))

def change_obs_index_v2(x):
    """
    Undo the barcode renaming introduced when samples or datasets are concatenated.

    Returns ``x`` without its last four characters.
    """
    suffix_length = 4
    return x[:-suffix_length]


def concatenate_samples(file_name_list, file_output_h5ad, joint='outer'):
    """
    Merge the per-sample h5ad files of one dataset into a single AnnData.

    Parameters
    ----------
    file_name_list : list of str
        Sample identifiers; ``file_output_h5ad + sample_id + '.h5ad'`` is read
        for each of them.
    file_output_h5ad : str
        Path prefix (directory with trailing separator) of the per-sample files.
    joint : str
        Passed to ``AnnData.concatenate`` as ``join``.

    Returns
    -------
    AnnData
        The concatenated object, with the batch suffix that concatenation adds
        to every barcode removed again.
    """
    adata_list = [sc.read(file_output_h5ad + sample_id + '.h5ad') for sample_id in file_name_list]

    # Concatenation appends a separator plus the batch category to every barcode.
    # Zero-padded, fixed-width categories give that suffix a known length so it
    # can be stripped afterwards. The width is 3 digits and grows only when the
    # dataset has 1000 samples or more.
    n_samples = len(file_name_list)
    width = max(3, len(str(n_samples)))
    batch_list = [str(position + 1).zfill(width) for position in range(n_samples)]

    separator = '-'
    adata = adata_list[0].concatenate(adata_list[1:], join=joint,
                                      batch_categories=batch_list, index_unique=separator)

    # Remove the trailing separator and batch number from the barcode names.
    suffix_length = len(separator) + width
    adata.obs.rename(index=lambda name: name[:-suffix_length], inplace=True)

    return adata

In [None]:
# This module is for the alevin-fry mtx data format.
def file_name(file_path):
    """
    Return the names of the immediate sub-directories of ``file_path``.

    Each sub-directory is treated as one sample; the count and the names are
    printed. The order is whatever ``os.walk`` reports.

    Raises
    ------
    NotADirectoryError
        If ``file_path`` cannot be walked (missing, unreadable, or not a
        directory), instead of silently returning None.
    """
    top_level = next(os.walk(file_path), None)
    if top_level is None:
        raise NotADirectoryError('Cannot list sample directories under: ' + str(file_path))

    _root, dirs, _files = top_level
    print('The dataset has '+str(len(dirs))+' samples.')
    print('sub_dirs:', dirs)  # all sub-directories directly under file_path
    file_name_list = dirs
    return file_name_list
    
def mtx_to_adata(file_name_list, # dirs
                 dataset_name,  # 'qilu_'
                 file_path, 
                 file_output_h5ad):
    """
    Convert each sample's alevin matrix to an AnnData and write it as h5ad.

    For every ``sample_id`` the files under ``file_path + sample_id + '/alevin/'``
    are read, the barcodes are prefixed with ``dataset_name + sample_id`` (via
    the module-level ``barcode_name`` used by ``change_obs_index_v1``), and the
    result is written to ``file_output_h5ad + sample_id + '.h5ad'``.

    Returns the AnnData of the last sample processed, which is the whole
    dataset when it contains a single sample.
    """
    import scipy
    import scipy.io as sp_io
    global barcode_name

    for sample_id in file_name_list:
        alevin_dir = file_path + sample_id + '/alevin/'

        # Labels from the rows file become the obs index, those from the cols
        # file the var index.
        row_labels = pd.read_csv(alevin_dir + 'quants_mat_rows.txt', sep='\t', header=None)[0].values.tolist()
        col_labels = pd.read_csv(alevin_dir + 'quants_mat_cols.txt', sep='\t', header=None)[0].values.tolist()

        dense_counts = sp_io.mmread(alevin_dir + 'quants_mat.mtx').todense()
        adata = sc.AnnData(pd.DataFrame(data=dense_counts, index=row_labels, columns=col_labels))
        # Store X as a CSR sparse matrix again.
        adata.X = scipy.sparse.csr_matrix(adata.X)

        print('The anndata of '+sample_id+' is:')
        print(adata)

        # change_obs_index_v1 reads this global to build the new barcode names.
        barcode_name = dataset_name + sample_id
        adata.obs.rename(index=change_obs_index_v1, inplace=True)
        adata.write(file_output_h5ad + sample_id + '.h5ad')

    return adata

def solution_concatenate_alevin(file_path, 
                             file_output_h5ad, 
                             dataset_name, 
                             dataset_output, 
                             adata_name,
                             joint):
    """
    Batch-convert the alevin-fry matrices of one dataset and merge its samples.

    Every sub-directory of ``file_path`` is treated as a sample. Each sample is
    converted to an h5ad file in ``file_output_h5ad`` with dataset and sample
    information added to the barcode names. When there is more than one sample
    they are concatenated. The final object is written to
    ``dataset_output + adata_name + '.h5ad'``.

    Parameters
    ----------
    file_path : str
        Directory containing one sub-directory per sample.
    file_output_h5ad : str
        Path prefix for the per-sample h5ad files.
    dataset_name : str
        Prefix added to the barcodes together with the sample name.
    dataset_output : str
        Path prefix for the merged h5ad file.
    adata_name : str
        File name (without extension) of the merged h5ad file.
    joint : str
        Join mode forwarded to ``concatenate_samples`` when merging samples.

    Returns
    -------
    AnnData
        The merged dataset, or the single sample when there is only one.
    """
    print('using the framework of scRNA-seq analysis developed by Yue Wang, qilu hospital of Shandong University')
    file_name_list = file_name(file_path)

    adata_last = mtx_to_adata(file_name_list, dataset_name, file_path, file_output_h5ad)

    # Merge only when the dataset has more than one sample.
    if len(file_name_list) > 1:
        # Forward the caller's join mode instead of a hard-coded 'outer'.
        adata = concatenate_samples(file_name_list, file_output_h5ad, joint=joint)
    else:
        # Single-sample dataset: use the object returned by the conversion step.
        adata = adata_last
    adata.write(dataset_output + adata_name + '.h5ad')

    print('The final adata is: ')
    print(adata)

    return adata

In [None]:
# Input and output locations and naming for dataset HRA000772.
# Directory with one sub-directory per sample.
file_path = '/mnt/data2/Datasets/Human_non_intestine_datasets/Qilu_Otorhinolaryngology_surgery_data/HRA000772/matrix/'
# Destination of the per-sample h5ad files.
file_output_h5ad = '/mnt/data2/Datasets/Human_non_intestine_datasets/Qilu_Otorhinolaryngology_surgery_data/HRA000772/H5ad/'
# Prefix added to barcode names.
dataset_name = 'HRA000772_'
# File name (without extension) of the merged dataset.
adata_name = 'HRA000772'
# Destination of the merged h5ad file.
dataset_output = '/mnt/data2/Datasets/Human_non_intestine_datasets/Qilu_Otorhinolaryngology_surgery_data/HRA000772/dataset_output/'
joint = 'outer' # every sample in this dataset has the same number of genes

In [None]:
# AnnData object with n_obs × n_vars = 85162 × 27319
adata = solution_concatenate_alevin(file_path, file_output_h5ad, dataset_name, dataset_output, adata_name, joint = 'outer')

In [None]:
adata = sc.read('/mnt/data2/Datasets/Human_non_intestine_datasets/Qilu_Otorhinolaryngology_surgery_data/HRA000772/dataset_output/HRA000772.h5ad')

In [None]:
adata

In [None]:
# Call the method; without the parentheses the expression only evaluated to the
# bound method and changed nothing.
adata.var_names_make_unique()

In [None]:
adata.var['gene_ids'] = adata.var_names
adata.var

In [None]:
# 函数定义
def convert_genes_with_ids(adata_file, REFERENCE_DICT_DIR = "/mnt/data/project/qilu_singlecell_1/ref_geneset/Ensembl_ID2gene_Symbol_hg38.txt"):
    """
    Rename the variables of an AnnData from gene IDs to gene symbols.

    ``adata_file.var`` must have a ``gene_ids`` column. The reference table is a
    tab-separated file with at least the columns ``gene_ids`` and ``Gene name``.
    Genes absent from the reference, and genes whose symbol occurs more than
    once among the matched rows, are dropped.

    Parameters
    ----------
    adata_file : AnnData
        Input object; it is not modified.
    REFERENCE_DICT_DIR : str
        Path of the ID-to-symbol reference table.

    Returns
    -------
    AnnData
        A copy restricted to the retained genes, with gene symbols as the var
        index (named ``Gene name``) and the barcodes also stored in
        ``obs['barcode_name']``.
    """
    adata = adata_file
    print('adata before gene convert')
    print(adata)

    ref_hg38 = pd.read_csv(REFERENCE_DICT_DIR, sep='\t')

    # Step 1: match the gene IDs of adata.var against the reference.
    intersection = pd.merge(adata.var, ref_hg38, on=['gene_ids'])

    # Step 2: drop every symbol that occurs more than once. Duplicated symbols
    # cannot be used as unique variable names.
    duplicated_genes_list = intersection[intersection['Gene name'].duplicated()]['Gene name'].to_list()
    print('duplicated genes list:')
    print(duplicated_genes_list)
    intersection_after_duplication_removed = intersection[~(intersection['Gene name'].isin(duplicated_genes_list))]

    # Step 3: build an explicit ID -> symbol lookup, so the renaming below does
    # not depend on the row order of the merged table matching adata.var.
    id_to_symbol = (
        intersection_after_duplication_removed
        .drop_duplicates(subset='gene_ids')
        .set_index('gene_ids')['Gene name']
    )

    # Step 4: keep only the variables that have a symbol. Copy so that the
    # index assignment below does not write into a view of the input.
    keep_vars = adata.var['gene_ids'].isin(id_to_symbol.index).to_numpy()
    adata = adata[:, keep_vars].copy()

    # Step 5: rename each retained variable through the lookup.
    adata.var.index = pd.Index(adata.var['gene_ids'].map(id_to_symbol).to_numpy(), name='Gene name')

    # save barcode names
    adata.obs['barcode_name'] = adata.obs_names.to_list()
    print('adata after gene convert')
    print(adata)

    return adata

In [None]:
adata = convert_genes_with_ids(adata_file = adata)
adata

In [None]:
adata.var

In [None]:
# Drop cells with fewer than 250 detected genes.
sc.pp.filter_cells(adata, min_genes=250)
# Flag mitochondrial ('MT-') and ribosomal-protein ('RPS' / 'RPL') genes by name prefix.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
adata.var['rp'] = adata.var_names.str.startswith(("RPS","RPL"))
# Compute the QC metrics for both gene flags in a single pass.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'rp'], percent_top=None, log1p=False, inplace=True)

In [None]:
adata

In [None]:
sc.pl.highest_expr_genes(adata, n_top=20, )
sc.pl.violin(adata, ['n_genes', 'total_counts', 'pct_counts_mt'],jitter=0.4, multi_panel=True)

In [None]:
def quality_control(adata, max_pct_mt=20, min_cells=20):
    """
    Filter cells by mitochondrial fraction and remove MT / ribosomal genes.

    Parameters
    ----------
    adata : AnnData
        Must have ``obs['pct_counts_mt']``. It is not modified.
    max_pct_mt : float
        Cells are kept only when ``pct_counts_mt`` is strictly below this value.
    min_cells : int
        Forwarded to ``sc.pp.filter_genes``.

    Returns
    -------
    AnnData
        A filtered copy without genes whose names start with 'MT-', 'RPS' or
        'RPL', and without genes failing the ``min_cells`` filter.
    """
    keep_cells = (adata.obs['pct_counts_mt'] < max_pct_mt).to_numpy()

    # Remove mitochondrial and ribosomal-protein genes, matched by name prefix.
    gene_names = adata.var_names
    is_mt = gene_names.str.startswith('MT-')
    is_rp = gene_names.str.startswith(("RPS", "RPL"))
    keep_genes = ~(is_mt | is_rp)

    # Copy before the in-place gene filter so it does not operate on a view.
    adata = adata[keep_cells, keep_genes].copy()

    sc.pp.filter_genes(adata, min_cells=min_cells)

    return adata

In [None]:
adata = quality_control(adata)
adata

In [None]:
# Normalise each cell to a total of 1e4 counts, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Keep the object in its current (normalised, log-transformed) state as .raw.
adata.raw = adata

In [None]:
adata.var

In [None]:
adata

In [None]:
adata.write('/mnt/data2/Datasets/Human_non_intestine_datasets/Qilu_Otorhinolaryngology_surgery_data/HRA000772/dataset_output/HRA000772_nor.h5ad')

In [None]:
def cycle_score(adata, cycle_gene_file='/mnt/data/project/qilu_singlecell_1/ref_geneset/regev_lab_cell_cycle_genes.txt'):
    """
    Score cell-cycle phase for every cell and plot a PCA of the cell-cycle genes.

    Parameters
    ----------
    adata : AnnData
        Scored in place by ``sc.tl.score_genes_cell_cycle``.
    cycle_gene_file : str
        Text file with one gene symbol per line. The first 43 lines are used as
        the S-phase set and the remaining lines as the G2/M set.

    Returns
    -------
    AnnData
        The same ``adata`` object.
    """
    # Close the file deterministically after reading.
    with open(cycle_gene_file) as gene_file:
        cell_cycle_genes = [line.strip() for line in gene_file]
    s_genes = cell_cycle_genes[:43]
    g2m_genes = cell_cycle_genes[43:]

    # Keep only the genes present in this dataset.
    cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]
    s_genes = [x for x in s_genes if x in adata.var_names]
    g2m_genes = [x for x in g2m_genes if x in adata.var_names]

    # sc.pp.scale(adata, zero_center=False)  # scaling before scoring is not required
    sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

    # PCA on the cell-cycle genes only, coloured by the assigned phase. A copy is
    # used so the PCA results are written to a separate object, not a view.
    adata_cc_genes = adata[:, cell_cycle_genes].copy()
    sc.tl.pca(adata_cc_genes)
    sc.pl.pca_scatter(adata_cc_genes, color='phase')

    return adata

def EIS_score(adata):
    """
    Add one mean marker-expression score per compartment to ``adata.obs``.

    For each of the six compartments (epi, T_ILCs, B, MNPs, Mast, Stromal) the
    expression of its marker genes found in ``adata.var_names`` is summed per
    cell and divided by the length of the filtered marker list. The result is
    stored as ``obs['mean_<compartment>_score']``.

    Returns ``(adata, compartments_values_list)``, where the second item holds
    the six scores of every cell as a list of rows.
    """
    # Original note (translated): the B gene set can cause follicular B cells
    # to be assigned to the MNPs compartment; use CD79A expression to tell
    # them apart.
    # NOTE(review): 'IL7R' is listed twice in the T_ILCs markers, so it is
    # counted twice in that score's denominator but only once in the gene mask.
    # NOTE(review): 'DNASE1E3' (MNPs markers) looks like a typo, possibly for
    # 'DNASE1L3'; if it is absent from the data it is silently dropped — confirm.
    marker_sets = {
        'epi': 'KRT8,EPCAM,KRT18,KRT19,CLDN3,CLDN7,ELF3,S100A6,C15orf48,CLDN4,SMIM22,S100A14,SPINK2'.split(','),
        'T_ILCs': 'TRAC,IL32,CD3D,CCL5,CD2,TRBC2,CD3E,CD3G,EVL,IL7R,CD7,HCST,KLRB1,LCK,FYB1,CXCR4,CORO1A,TRDC,GNLY,GZMA,IFNG,PRF1,TNF,CCL3,CCL4,XCL1,XCL2,NKG7,IL22,IL2RA,IL7R,IL23R'.split(','),
        'B': 'CD79A,IGHA1,MZB1,DERL3,IGHA2,HERPUD1,SSR4,TNFRSF17,SEC11C,UBE2J1,PRDX4,GNG7,XBP1,EAF2,PLPP5,CD27,IGLC3,SSR3,TNFRSF13B'.split(','),
        'MNPs': 'HLA-DRA,CST3,HLA-DPB1,CD74,HLA-DPA1,AIF1,LYZ,HLA-DQA1,C1QA,MS4A6A,HLA-DMA,C1QC,C1QB,FCGRT,DNASE1E3,LST1,SELENOP,FGL2,HLA-DMB,CTSB,GRN'.split(','),
        'Mast': 'TPSAB1,CPA3,TPSB2,CD9,HPGDS,ANXA1,NFKBIA,MS4A2,CD63,LAPTM4A,SRGN,LMNA,LTC4S,FCER1G,VWA5A,CTSG,KIT,CLU'.split(','),
        'Stromal': 'IGFBP7,IFITM3,SPARC,A2M,CALD1,GSN,LGALS1,VIM,SPARCL1,CXCL14,COL1A2,COL3A1,COL6A2,TIMP1,C1S,S100A13,C1R,PLAT,MFAP4,RARRES2,COL1A1'.split(','),
    }

    score_columns = []
    for compartment, markers in marker_sets.items():
        # Restrict the marker list to genes present in this dataset.
        present_markers = [gene for gene in markers if gene in adata.var_names]
        gene_mask = adata.var_names.isin(present_markers)

        # Mean score: per-cell sum over the marker genes / number of listed markers.
        column = 'mean_' + compartment + '_score'
        adata.obs[column] = np.sum(adata[:, gene_mask].X, axis=1) / len(present_markers)
        score_columns.append(column)

    # Per-cell scores of all six compartments, one row per cell.
    compartments_values_list = adata.obs.loc[:, score_columns].values.tolist()

    return adata, compartments_values_list

def hvg_regress_scale(adata):
    """
    Select highly variable genes and scale them without zero-centering.

    The input matrix must already be log-transformed, as required by the
    highly-variable-gene step. Returns the object restricted to the genes
    flagged in ``var['highly_variable']``, after scaling.
    """
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    hvg_adata = adata[:, adata.var['highly_variable']]
    print(hvg_adata)

    # sc.pp.regress_out(hvg_adata, ['S_score', 'G2M_score'])

    sc.pp.scale(hvg_adata, zero_center=False)
    return hvg_adata

def progress_pca_bbknn_umap_tsne(adata, n_pcs=20, batch_key='batch'):
    """
    Run PCA, build a batch-balanced neighbour graph with BBKNN, then compute UMAP.

    ``adata`` is updated in place and also returned. The t-SNE step suggested by
    the function name is disabled.
    """
    # Dimensionality reduction by PCA.
    sc.tl.pca(adata, svd_solver='arpack')

    # BBKNN handles the batch effect and takes the place of sc.pp.neighbors.
    # It expects data that has already been normalised and reduced by PCA.
    sc.external.pp.bbknn(adata, n_pcs=n_pcs, batch_key=batch_key)

    # UMAP embedding on the BBKNN graph.
    sc.tl.umap(adata)
    # sc.tl.tsne(adata, n_jobs=6)

    return adata

def progress_pca_harmony_umap(adata, batch_key='batch_name', n_neighbors=10, n_pcs=20):
    """
    Run PCA, correct it with Harmony, then build the neighbour graph and UMAP.

    Unlike the variant defined earlier in this notebook, this version recomputes
    the PCA and plots the PCA scatter and variance ratio.

    Parameters
    ----------
    adata : AnnData
        Object to update in place.
    batch_key : str
        Column of ``adata.obs`` given to Harmony as the batch variable.
    n_neighbors, n_pcs : int
        Forwarded to ``sc.pp.neighbors``.

    Returns
    -------
    AnnData
        The same ``adata`` object, with ``obsm['X_pca_harmony']`` added and the
        neighbour graph and UMAP computed on it.
    """
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pl.pca(adata)
    sc.pl.pca_variance_ratio(adata, log=True)

    # Batch-effect correction with Harmony on the principal components.
    harmony_out = hm.run_harmony(adata.obsm['X_pca'], adata.obs, vars_use=[batch_key])
    # Presumably Z_corr is (components x cells), hence the transpose — TODO confirm
    # against the installed harmonypy version.
    adata.obsm['X_pca_harmony'] = harmony_out.Z_corr.T
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, use_rep='X_pca_harmony', n_pcs=n_pcs)

    # Compute the UMAP embedding from the corrected neighbour graph.
    sc.tl.umap(adata)
    # sc.tl.tsne(adata, n_jobs=6)

    return adata

def score_compartments(adata, compartments_values_list):
    """
    Assign each cell the compartment with the highest score.

    Parameters
    ----------
    adata : AnnData
        Receives the labels in ``obs['Compartments']``.
    compartments_values_list : sequence of sequences of float
        One row per cell, with the scores in the order
        epi, T_ILCs, B, MNPs, Mast, Stromal.

    Returns
    -------
    AnnData
        The same ``adata`` object.

    Raises
    ------
    ValueError
        If a row has more scores than there are compartment labels.
    """
    compartment_labels = ('epi', 'T_ILCs', 'B', 'MNPs', 'Mast', 'Stromal')

    cell_compartments_list = []
    for scores in compartments_values_list:
        if len(scores) > len(compartment_labels):
            raise ValueError(
                'expected at most %d scores per cell, got %d' % (len(compartment_labels), len(scores))
            )
        # argmax returns the first position of the maximum, so ties go to the
        # compartment listed first.
        cell_compartments_list.append(compartment_labels[int(np.argmax(scores))])

    adata.obs['Compartments'] = cell_compartments_list

    return adata

def add_leiden(adata):
    """Run Leiden clustering at resolutions 0.5 and 1 and keep both results.

    Each run writes to ``adata.obs['leiden']``; the result is then stored
    under ``'leiden-all-<resolution>'``. After the call ``obs['leiden']``
    holds the clustering of the last resolution (1).

    Returns
    -------
    The same ``adata`` object.
    """
    for resolution in (0.5, 1):
        print("Performing clustering with a resolution of {}".format(resolution))
        sc.tl.leiden(adata, resolution=resolution)
        adata.obs['leiden-all-{}'.format(resolution)] = adata.obs['leiden']

    return adata

In [None]:
# cycle_score and EIS_score are defined earlier in the notebook
# (cycle_score presumably adds cell-cycle scores - confirm against its definition).
adata = cycle_score(adata = adata)
# EIS_score returns the AnnData plus one row per cell holding the six mean compartment scores.
adata, compartments_values_list = EIS_score(adata)
# Restrict to highly variable genes and scale them.
adata = hvg_regress_scale(adata)

In [None]:
# PCA -> BBKNN neighbour graph -> UMAP with the helper's defaults (n_pcs=20, batch_key='batch').
adata = progress_pca_bbknn_umap_tsne(adata)

In [None]:
# Re-read the six mean compartment scores from obs, one row per cell, in the
# column order that score_compartments maps to its labels.
compartments_values_list = adata.obs.loc[:, ['mean_epi_score', 'mean_T_ILCs_score', 'mean_B_score','mean_MNPs_score','mean_Mast_score','mean_Stromal_score']].values.tolist()

# Assign every cell the highest-scoring compartment -> obs['Compartments'].
adata = score_compartments(adata, compartments_values_list = compartments_values_list)

# Leiden clustering at resolutions 0.5 and 1 -> obs['leiden-all-0.5'] and obs['leiden-all-1'].
adata = add_leiden(adata)

In [None]:
sc.settings.set_figure_params(dpi=150, figsize = (4, 3), fontsize=10)
# One UMAP per stored Leiden resolution, cluster labels drawn on the embedding.
for leiden_key in ('leiden-all-0.5', 'leiden-all-1'):
    sc.pl.umap(adata, color=[leiden_key], add_outline=True,
               palette=sc.pl.palettes.vega_20_scanpy, outline_width=(0.2, 0.05),
               frameon=False, legend_loc='on data')

In [None]:
sc.settings.set_figure_params(dpi=200, figsize = (4, 4), fontsize=7)
# UMAP coloured by the score-derived 'Compartments' label, drawn without frame or title.
sc.pl.umap(adata, color=['Compartments'], add_outline=True,frameon=False, title='')

In [None]:
sc.settings.set_figure_params(dpi=200, figsize = (4, 4), fontsize=15)
# Each panel of genes is drawn as one multi-panel UMAP figure.
gene_panels = (
    ['FCGR3B', 'HCAR3', 'CLC', 'IL1B'],
    ['IL1A', 'PI3', 'BHLHE40', 'BATF2'],
)
for gene_panel in gene_panels:
    sc.pl.umap(adata, color=gene_panel)

# FigS6

## 小提琴图

In [None]:
# Load the IMM object and rebuild a fresh AnnData from its .raw slot.
# Only X, var and obs are passed on, so other slots (obsm, uns, ...) are not carried over.
adata = sc.read('/home/wangyue/basic-calculation_data/c_Project_outputs/qilu_CRSwNP/IMM.h5ad')
adata = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs = adata.obs)
adata

In [None]:
# Second dataset to merge in (HRA000772).
# NOTE(review): the file name suggests a normalised neutrophil subset - confirm.
adata_1 = sc.read('/home/wangyue/basic-calculation_data/b_Datasets/sc_Datasets/RNA_data/Human_non_intestine_datasets/Nasal_Xiehe_HRA000772/dataset_output/HRA000772_nor_neutrophils.h5ad')
adata_1

In [None]:
# Give every cell of adata_1 the same annotation label so it forms a single group after merging.
adata_1.obs['annotation'] = 'C04-M10-Neutrophils'

In [None]:
# Merge the two objects with join="inner", i.e. only variables present in both are kept.
adata = adata.concatenate(adata_1,join="inner")
# Drop all var annotation columns; only the var index is kept.
adata.var = adata.var[[]]
adata

In [None]:
# Store the current state of the merged object in .raw.
adata.raw = adata

In [None]:
# Make 'annotation' categorical (required for the .cat accessor) and list its categories.
adata.obs['annotation'] = adata.obs['annotation'].astype('category')
adata.obs['annotation'].cat.categories

In [None]:
# Keep only the cells annotated as one of the C04-M01 ... C04-M10 groups.
selected_annotations = [
    'C04-M01-Macrophages',
    'C04-M02-CCL18+ Macrophages',
    'C04-M03-CCL13+ Macrophages',
    'C04-M04-CXCL8+ Macrophages',
    'C04-M05-Monocytes',
    'C04-M06-DC1',
    'C04-M07-DC2',
    'C04-M08-Migratory DCs',
    'C04-M09-pDC',
    'C04-M10-Neutrophils',
]
is_selected = adata.obs["annotation"].isin(selected_annotations)
E_index = adata.obs.index[is_selected.values]
adata = adata[E_index, :]
adata

In [None]:
import cosg as cosg

# Rank marker genes per 'annotation' group with COSG (50 genes per group requested);
# key_added='cosg' is the key the dot plot below reads the results from.
cosg.cosg(adata,
    key_added='cosg',
        mu=1,
        n_genes_user=50,
               groupby='annotation')

sc.settings.set_figure_params(dpi=200, figsize = (4, 4), fontsize=20)
# Dot plot of the top 15 ranked genes per group; standard_scale='var' rescales each gene across groups.
sc.pl.rank_genes_groups_dotplot(adata,groupby='annotation',
                                cmap='Spectral_r',
                                 standard_scale='var',
                                       n_genes=15,key='cosg')

In [None]:
import seaborn as sns
sns.set(style="ticks",rc={'figure.figsize':(13,6)},font_scale=3)

# show=False keeps the figure open so that the spines can be trimmed afterwards.
violin_options = dict(
    groupby="annotation",
    linewidth=0.3,
    rotation=90,
    size=0.8,
    show=False,
)
ax = sc.pl.violin(adata, "IL1B", **violin_options)
sns.despine(trim=True)

In [None]:
sc.settings.set_figure_params(dpi=100, figsize = (4, 4), fontsize=20)
import seaborn as sns
sns.set(style="ticks",rc={'figure.figsize':(13,6)},font_scale=3)

# One violin panel per gene, grouped by annotation; show=False keeps the
# figure open so that the spines can be trimmed afterwards.
violin_genes = ['NR1H4','NR1H3','VDR','GPBAR1','IL1R1']
violin_options = dict(
    groupby="annotation",
    linewidth=0.3,
    rotation=90,
    size=0.8,
    show=False,
)
ax = sc.pl.violin(adata, violin_genes, **violin_options)
sns.despine(trim=True)

In [None]:
from matplotlib.pyplot import rc_context
sc.settings.set_figure_params(dpi=300, fontsize=15)
# Widen the figure for this one plot only; rc_context restores rcParams on exit.
with rc_context({'figure.figsize': (12, 5)}):
    # IL1B per annotation as violins with an inner box plot and no per-cell strip plot.
    sc.pl.violin(adata, ['IL1B'], groupby='annotation', stripplot=False, inner='box',rotation=90)

In [None]:
# adata.X as a DataFrame: one row per cell, one column per gene.
adata_raw_matrix = adata.to_df()

In [None]:
# Attach each cell's annotation (aligned on the cell index) as an extra column.
adata_raw_matrix['annotation'] = adata.obs['annotation']

In [None]:
# Long format: one row per cell with its annotation, the gene name (column 'AMPs')
# and that gene's value taken from adata.X (column 'counts').
# NOTE(review): the column names 'AMPs' and 'counts' look inherited from another analysis.
adata_raw_matrix_1 = pd.melt(adata_raw_matrix, value_vars=["IL1B"],id_vars = ['annotation'],
             var_name='AMPs', value_name='counts')

In [None]:
# Display the long-format table.
adata_raw_matrix_1

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="ticks",font_scale=1.8)

# Create the figure and axes (linear scale; the log-scale call below is disabled).
f, ax = plt.subplots(figsize=(14, 7))
# ax.set_xscale("log")

# Bar plot of the 'counts' column per annotation, using seaborn's default estimate and capped error bars.
sns.barplot(x='annotation',y='counts',data=adata_raw_matrix_1,
            capsize=.1,
            errwidth = 1.5,  # thickness of the error bars
            edgecolor=".2",
            linewidth=1.0,
            # hue_order = ["Healthy","Uninflamed","Inflamed"],
            #order = ["Stem cells","TA","Paneth cells"],
            #palette=["#ffffff","#0d00ff","#ff0000"]
           ) # palette: a named palette such as "vlag" is passed as a plain string; '#'-prefixed colours go in a list

# add_stat_annotation(ax, plot='barplot', data=adata_pre_matrix_1, x='AMPs', y='counts', hue='treatment',box_pairs=box_pairs, test='t-test_ind', loc='inside', verbose=2)
# Optional overlays showing each observation (disabled):
#sns.stripplot(x='AMPs',y='counts',hue='treatment',data=adata_pre_matrix_1, size=1.5, color=".3", linewidth=0)
#sns.swarmplot(data=adata_pre_matrix_1, x='AMPs',y='counts',hue='treatment')
# Tweak the visual presentation
#ax.xaxis.grid(True)
# Empty handles and labels, so no legend entries are drawn.
plt.legend([],[], frameon=False)
sns.despine() # trim=True
# Remove both axis labels.
ax.set(ylabel="",xlabel="")

## label transfer

In [None]:
# Load the Fig1_All_219716 object that provides the reference annotations for label transfer.
adata = sc.read('/home/wangyue/basic-calculation_data/c_Project_outputs/qilu_CRSwNP/Fig1_All_219716.h5ad')
adata

In [None]:
# Rebuild a fresh AnnData from the .raw slot; only X, var and obs are passed on,
# so other slots (obsm, uns, ...) are not carried over.
adata = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs = adata.obs)
adata

In [None]:
# Load the HRA000772 neutrophil file and give all of its cells one annotation label.
adata_1 = sc.read('/home/wangyue/basic-calculation_data/b_Datasets/sc_Datasets/RNA_data/Human_non_intestine_datasets/Nasal_Xiehe_HRA000772/dataset_output/HRA000772_nor_neutrophils.h5ad')
adata_1.obs['annotation'] = 'C04-M10-Neutrophils'
adata_1

In [None]:
# Merge the two objects with join="inner", i.e. only variables present in both are kept.
adata = adata.concatenate(adata_1,join="inner")
# Drop all var annotation columns; only the var index is kept.
adata.var = adata.var[[]]
adata

In [None]:
# Make 'annotation' categorical (required for the .cat accessor) and list its categories.
adata.obs['annotation'] = adata.obs['annotation'].astype('category')
adata.obs['annotation'].cat.categories

In [None]:
# Container for the per-annotation subsets built in the next cell.
adata_list = []

In [None]:
    # Build one AnnData per annotation category, capped at 1000 cells each.
    # The list is (re)initialised here so that re-running this cell does not
    # append the same subsets a second time.
    adata_list = []
    for j in list(adata.obs['annotation'].cat.categories):

        print(j + ' is being processed ...')
        # Copy the slice so the in-place subsampling below acts on an
        # independent object rather than on a view of `adata`.
        adata_single = adata[adata.obs['annotation'] == j, :].copy()

        print('The number of '+ j + ' is '+ str(adata_single.n_obs))
        if adata_single.n_obs > 1000:
            # Randomly keep 1000 cells; the fixed seed (1) makes the draw reproducible.
            sc.pp.subsample(adata_single, n_obs=1000, random_state=1, copy=False)

        adata_list.append(adata_single)

In [None]:
def change_obs_index_v2(x):
    """Undo the barcode renaming done when samples or datasets are concatenated.

    Returns ``x`` without its last four characters, i.e. without a suffix
    such as "-001" (separator plus a three-character batch id).
    """
    suffix_length = 4
    return x[:-suffix_length]


# AnnData.concatenate appends the batch category to every barcode name.
# Zero-padded three-character ids make that suffix a fixed four characters
# ("-001"), which change_obs_index_v2 strips again below. The fixed width only
# holds while there are fewer than 1000 entries in adata_list.
batch_list = [str(batch_number).zfill(3) for batch_number in range(1, len(adata_list) + 1)]

adata = adata_list[0].concatenate(adata_list[1:], join='outer', batch_categories=batch_list)
# Remove the batch suffix from the barcode names.
adata.obs.rename(index=change_obs_index_v2,inplace=True)
adata

In [None]:
# Normalise every cell to a total of 10,000, apply log1p, then store the result in .raw.
# NOTE(review): this assumes adata.X still holds un-normalised counts for both merged
# sources (the HRA000772 file name contains "nor") - confirm.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata

In [None]:
import celltypist
from celltypist import models
# Train a CellTypist model on the 'annotation' labels, with feature selection enabled and 16 jobs.
new_model = celltypist.train(adata, labels = 'annotation', n_jobs = 16, feature_selection = True)

In [None]:
# Save the trained model to disk.
# NOTE(review): the cell count in the file name is hard-coded - confirm it matches the training set.
new_model.write('/home/wangyue/basic-calculation_data/c_Project_outputs/qilu_CRSwNP/CellTypist/model_42276cells_annotation.pkl')

In [None]:
# Load the HRA000772 dataset that will receive the transferred labels.
adata = sc.read('/home/wangyue/basic-calculation_data/b_Datasets/sc_Datasets/RNA_data/Human_non_intestine_datasets/Nasal_Xiehe_HRA000772/dataset_output/HRA000772_nor.h5ad')
adata

In [None]:
import celltypist
from celltypist import models

# Reload the saved model and predict one label per cell ('best match' mode, no majority voting).
new_model = models.Model.load('/home/wangyue/basic-calculation_data/c_Project_outputs/qilu_CRSwNP/CellTypist/model_42276cells_annotation.pkl')
predictions = celltypist.annotate(adata, model = new_model, majority_voting = False, mode = 'best match')
# Replace adata with the AnnData returned by the prediction result
# ('predicted_labels' is read from its obs further down).
adata = predictions.to_adata()
adata

In [None]:
# NOTE(review): this writes to the same path the dataset was read from, so the
# input file is overwritten with the annotated object and a later re-run of the
# notebook starts from the already-annotated file. Consider a separate output path.
adata.write('/home/wangyue/basic-calculation_data/b_Datasets/sc_Datasets/RNA_data/Human_non_intestine_datasets/Nasal_Xiehe_HRA000772/dataset_output/HRA000772_nor.h5ad')
adata

In [None]:
sc.settings.set_figure_params(dpi=400, figsize = (4, 4), fontsize=7)
# UMAP of the annotated dataset coloured by the transferred labels.
sc.pl.umap(adata, color=['predicted_labels'], add_outline=True,
           palette="tab20_r", outline_width = (0.2, 0.05), frameon=False)

In [None]:
# Cross-tabulate the predicted labels (rows) against the 'health' column (columns);
# margins=True appends the row and column totals.
Groups_tab_1 = pd.crosstab(index=adata.obs['predicted_labels'],
                        columns=adata.obs['health'], margins=True)
Groups_tab_1

In [None]:
sc.settings.set_figure_params(dpi=400, figsize = (4, 4), fontsize=7)
# UMAP coloured by IL1B, using the default colormap.
sc.pl.umap(adata, color=['IL1B'], add_outline=True)

In [None]:
sc.settings.set_figure_params(dpi=300, figsize = (4, 4), fontsize=12)

from matplotlib import cm, colors
import colorcet as cc

# Continuous colormap built from colorcet's CET_L20 colour list.
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', cc.CET_L20)

# 128 colours sampled from the 20%-100% range of that colormap.
colors2 = mymap(np.linspace(0.2, 1, 128))
# 5 light greys (the 80%-90% range of Greys_r) that form the low end of the final map.
colors3 = plt.cm.Greys_r(np.linspace(0.8,0.9,5))
# Greys first, then the CET_L20 range; the combined list replaces the first mymap.
colorsComb = np.vstack([colors3, colors2])
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)

# IL1B on the UMAP with the custom colormap.
sc.pl.umap(adata, color=['IL1B'], frameon=False, color_map = mymap)