## Annotation of cells from tumor samples

In [None]:
import numpy as np
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.legend import Legend
import matplotlib.colors as colors
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
import pandas as pd
import scipy
import scanpy as sc
import anndata as ad
    
from sklearn import datasets
from sklearn.decomposition import PCA

from numba import jit

import celltypist
from celltypist import models

from matplotlib.cm import ScalarMappable

In [None]:
# Custom 25-colour qualitative colormap ('op_tab25'):
# 9 colours sampled from the upper part of tab20b followed by 16 from tab20c.

# NOTE(review): register_cmap is not used in this cell and is deprecated since
# Matplotlib 3.7; the import is kept only in case other cells rely on it.
from matplotlib.cm import register_cmap
from matplotlib.colors import ListedColormap

tab20b = matplotlib.colormaps['tab20b']
tab20c = matplotlib.colormaps['tab20c']
colors1 = tab20b(np.linspace(3.001/5., 1, 9))
colors2 = tab20c(np.linspace(0, 3.999/5., 16))

# Use a dedicated name: binding the array to `colors` would shadow the
# `matplotlib.colors` module imported as `colors` at the top of the notebook.
op_colors = np.concatenate([colors1, colors2])

map_name = 'op_tab25'
op_cmap = ListedColormap(op_colors, name=map_name)
# force=True lets the cell be re-run without "already registered" errors.
matplotlib.colormaps.register(name=map_name, cmap=op_cmap, force=True)

In [None]:
# Global scanpy figure defaults: 9x8 inch figures, PDF output, opaque background.
sc.set_figure_params(
    scanpy=True,
    dpi=300,
    dpi_save=1200,
    frameon=True,
    vector_friendly=True,
    fontsize=14,
    figsize=(9, 8),
    format='pdf',
    facecolor=None,
    transparent=False,
    ipython_format='png2x',
)

In [None]:
# Tissue label reused in plot titles further down
tissue = 'tumor'

# Load the integrated tumor dataset
adata = sc.read("maranou_032024_tumor_integrated.h5ad")

In [None]:
# Build '<sample>_<old index>' labels in a temporary obs column ...
adata.obs['unique_cell_name'] = adata.obs['sample'].astype(str) + '_' + adata.obs.index.astype(str)

# ... then move that column out of obs and install it as the index in one step
# (pop() removes the column and returns it, so no separate drop is needed).
adata.obs.index = adata.obs.pop('unique_cell_name')

In [None]:
# Display the boolean mask marking wild-type cells
adata.obs['WT/KO'].eq('wt')

In [None]:
# Side-by-side UMAPs of Calr expression: WT cells (left) and KO cells (right).
# Number of Axes & plot size
ncols = 2
nrows = 1
figsize = 8
wspace = 0.1
fig, axs = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(ncols * figsize + figsize * wspace * (ncols - 1), nrows * figsize),
)
plt.subplots_adjust(wspace=wspace)

# show=False on BOTH panels: without it scanpy may display the figure as soon as
# the second panel is drawn, i.e. before tight_layout() (or a savefig) has run.
sc.pl.umap(adata[adata.obs['WT/KO']=='wt'], color=['Calr'],palette='tab20',cmap='coolwarm',ax=axs[0], show=False)
sc.pl.umap(adata[adata.obs['WT/KO']=='ko'], color=['Calr'],palette='tab20',cmap='coolwarm',ax=axs[1], show=False)
plt.tight_layout()
# plt.savefig('Calr_tumor.pdf',dpi=600)
plt.show()

### Cancer cell and Cancer stem cell markers

In [None]:
cancer_genes = ['Mlana','Tyr', 'Pmel']


def _max_scaled_expression(adata, gene):
    """Return the per-cell expression of `gene` divided by its maximum, as a 1-D array.

    A gene with no expression at all yields zeros instead of 0/0 = NaN.
    """
    expression = np.asarray(adata[:, gene].X.sum(1), dtype=float).ravel()
    peak = expression.max()
    if peak > 0:
        return expression / peak
    return np.zeros_like(expression)


# Cancer-cell score: mean max-scaled expression of the melanocytic markers,
# down-weighted by (1 - max-scaled Ptprc), where Ptprc is the gene encoding CD45.
# The exact mean over `cancer_genes` replaces the former hand-written 0.333 factor,
# so the gene list above is now what actually drives the score.
marker_signal = np.mean([_max_scaled_expression(adata, gene) for gene in cancer_genes], axis=0)
adata.obs['cancer_cell_score'] = marker_signal * (1 - _max_scaled_expression(adata, 'Ptprc'))

# Cancer-stem-cell score: summed expression of the stem-cell markers (not scaled);
# ravel() guarantees a 1-D column even when X.sum(1) returns an (n, 1) matrix.
cancer_stem_cell_genes = ['Cd34','Pdgfra']
adata.obs['cancer_stem_cell_score'] = np.asarray(adata[:,cancer_stem_cell_genes].X.sum(1)).ravel()

sc.pl.umap(adata, color=['cancer_cell_score','cancer_stem_cell_score'],palette='tab20',cmap='coolwarm', vmax=1)


In [None]:
# UMAP of all tumor cells coloured by genotype. The title names the plotted
# variable (it previously said "Leiden" although no clustering is shown here).
sc.pl.umap(adata, color=['WT/KO'], title=[str(tissue) + ' WT/KO'], legend_loc='right margin')


In [None]:
# Histogram of the first UMAP coordinate, used to pick the split threshold below
sns.set_style('ticks')
hist_fig, hist_ax = plt.subplots(figsize=(5, 4))
hist_ax.hist(adata.obsm['X_umap'][:, 0], bins=200)
hist_ax.set_xticks(np.arange(-5, 20))
plt.show()

## Let's split the data into immune cells and other cells

In [None]:
# Split on the first UMAP coordinate: <= 1.5 is treated as immune, > 1.5 as other
umap_x = adata.obsm['X_umap'][:, 0]
umap_x_threshold = 1.5
adata_immune = adata[umap_x <= umap_x_threshold].copy()
adata_other = adata[umap_x > umap_x_threshold].copy()

### Annotate immune cells

In [None]:
# Initial Leiden clustering of the immune compartment (stored in obs['leiden_initial'])
sc.tl.leiden(
    adata_immune,
    resolution=1.0,
    key_added='leiden_initial',
)

In [None]:
# # Create a new index by combining 'sample' column with the current index
# adata.obs['unique_cell_name'] = adata.obs['sample'].astype(str) + '_' + adata.obs.index.astype(str)

# # Set this new column as the index
# adata.obs.index = adata.obs['unique_cell_name']

# # Optionally, you might want to drop the 'unique_cell_name' column afterward if it's no longer needed
# adata.obs.drop('unique_cell_name', axis=1, inplace=True)

In [None]:
# Same figure defaults as before, but with square 9x9 inch figures
sc.set_figure_params(
    scanpy=True,
    dpi=300,
    dpi_save=1200,
    frameon=True,
    vector_friendly=True,
    fontsize=14,
    figsize=(9, 9),
    format='pdf',
    facecolor=None,
    transparent=False,
    ipython_format='png2x',
)

In [None]:
# UMAP of immune cells labelled with the initial Leiden clusters
sc.pl.umap(
    adata_immune,
    color=['leiden_initial'],
    title=[str(tissue) +' Leiden'],
    palette='tab20',
    legend_loc='on data',
)


In [None]:
## Refine the clustering of the clusters listed in `target_clusters`
# 2. Identify the cluster(s) containing DC2, monocytes, and macrophages
target_clusters = ['9','10']  # Replace with your actual cluster IDs

# 3. Subset the data
adata_subset = adata_immune[adata_immune.obs['leiden_initial'].isin(target_clusters)].copy()

# 4. Recompute the neighborhood graph on the subset
sc.pp.neighbors(adata_subset)

# 5. Re-cluster the subset on its own graph
sc.tl.leiden(adata_subset, resolution=0.3, key_added='leiden_refined')

# 6. Prepare categories for the combined clustering
initial_categories = list(adata_immune.obs['leiden_initial'].cat.categories)
refined_categories = list(adata_subset.obs['leiden_refined'].cat.categories)

# Remove target clusters from initial categories
initial_categories_filtered = [cat for cat in initial_categories if cat not in target_clusters]

# Refined clusters get an 'r' prefix so they cannot collide with initial IDs
refined_categories_renamed = [f'r{cat}' for cat in refined_categories]

# Combine filtered initial categories with renamed refined categories
combined_categories = initial_categories_filtered + refined_categories_renamed

# 7./8. Start from the initial labels and overwrite the refined cells in a single
# vectorised assignment (replaces a per-cell `.at` loop). Relies on obs names
# being unique, exactly as the per-cell version did.
combined_labels = adata_immune.obs['leiden_initial'].astype(str)
combined_labels.loc[adata_subset.obs_names] = (
    'r' + adata_subset.obs['leiden_refined'].astype(str)
).to_numpy()

# 9. Store as a categorical with sorted categories for better readability
adata_immune.obs['leiden'] = pd.Categorical(
    combined_labels,
    categories=sorted(combined_categories)
)

In [None]:
# UMAP of immune cells labelled with the combined (initial + refined) clusters
sc.pl.umap(
    adata_immune,
    color=['leiden'],
    title=[str(tissue) +' Leiden'],
    palette='tab20',
    legend_loc='on data',
)


## Use celltypist for immune cell annotation

In [None]:
import celltypist
from celltypist import models

# Take only immune cells
adata_human = adata_immune.copy()

# Mouse-human gene conversion table (MGI homology report, downloaded on every run)
mouse_human_genes = pd.read_csv("http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt",sep="\t", index_col=False)
mouse = mouse_human_genes[mouse_human_genes['Common Organism Name']=='mouse, laboratory']
human = mouse_human_genes[mouse_human_genes['Common Organism Name']=='human']

# Collect dataframes for conversion
mouse = mouse[['DB Class Key', 'Symbol']]
mouse.index = np.arange(mouse.shape[0])

human = human[['DB Class Key', 'Symbol']]
human.index = np.arange(human.shape[0])

# Lookup tables replace the per-gene table scan: the first homology class of each
# mouse symbol, and the first human symbol of each homology class.
mouse_symbol_to_class = mouse.drop_duplicates('Symbol').set_index('Symbol')['DB Class Key']
class_to_human_symbol = human.drop_duplicates('DB Class Key').set_index('DB Class Key')['Symbol']
mouse_to_human = mouse_symbol_to_class.map(class_to_human_symbol).dropna().to_dict()

# Convert from mouse to human symbols; genes without a human homolog keep their
# original (mouse) name.
# NOTE(review): several mouse genes may map to the same human symbol, so
# gene_list can contain duplicates — confirm this is acceptable downstream.
gene_list = np.array([mouse_to_human.get(symbol, symbol) for symbol in adata_human.var_names])


In [None]:
# Normalise every cell to 10,000 counts and log1p-transform before annotation.
# NOTE(review): presumably adata_human.X holds raw counts at this point — confirm.
sc.pp.normalize_per_cell(
    adata_human,
    counts_per_cell_after=1e4,
)
sc.pp.log1p(adata_human)
# Per-cell totals after undoing the log; the value is not stored or displayed
np.expm1(adata_human.X).sum(1)

# Swap in the human symbols and run celltypist with majority voting
adata_human.var_names = gene_list
predictions = celltypist.annotate(
    adata_human,
    model = 'Immune_All_Low.pkl',
    majority_voting = True,
)
print(predictions.predicted_labels)
adata_human = predictions.to_adata()

In [None]:
# Compare celltypist majority-voting labels with the Leiden clusters
sc.pl.umap(
    adata_human,
    color = ['majority_voting','leiden'],
    palette='tab20',
    legend_loc = 'on data',
    legend_fontsize=12,
    legend_fontweight='heavy',
)


In [None]:
# Wilcoxon marker-gene ranking for every cluster in obs['leiden']
sc.tl.rank_genes_groups(adata_immune, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata_immune, n_genes=20, sharey=False)

# Keep markers with adjusted p-value < 0.03 and log fold change > 1
immune_markers = sc.get.rank_genes_groups_df(adata_immune, None)
is_significant = immune_markers.pvals_adj < 0.03
is_upregulated = immune_markers.logfoldchanges > 1.0
immune_markers = immune_markers[is_significant & is_upregulated]
immune_markers

In [None]:
# Top 20 filtered markers of one cluster, ordered by decreasing log fold change
cluster ='0'
cluster_markers = immune_markers[immune_markers['group']==cluster]
cluster_markers.sort_values('logfoldchanges', ascending=False).head(20)


In [None]:
# Outlined UMAP of the combined clusters using the custom 'op_tab25' palette
sc.pl.umap(
    adata_immune,
    color=['leiden'],
    title=[str(tissue) +' Leiden'],
    add_outline=True,
    outline_width = (0.2,0.2),
    palette='op_tab25',
    alpha=0.7,
    s=36,
    legend_loc='on data',
    legend_fontsize=12,
    legend_fontweight='heavy',
)
# sc.pl.umap(adata_immune, color=['leiden'], title=[str(tissue) +' Leiden'],add_outline=True, outline_width = (0.2,0.2), palette='tab20', alpha=0.7, s=36,legend_loc='right margin', legend_fontsize=12, legend_fontweight='heavy')


In [None]:
#for i in np.arange(26):
#    print(f'"{i}":"",')

# Cluster ID (from obs['leiden']) -> cell-type label.
# NOTE(review): "9" is one of the clusters that the refinement cell replaces with
# r* clusters, and there is no entry for "10" — confirm "9" is still needed.
annotation_dict = {"0":"Tissue-resident macrophages",
"1":"M2 tumor-associated macrophages",
"2":"Activated/TRM CD8+", #effector/memory T cells with potential tissue-resident characteristics
"3":"NK cells",#
"4":"Immunosuppressive, M2-like macrophages", #
"5":"Monocytes and macrophages",
"6":"M-MDSCs", #Monocytic Myeloid-Derived Suppressor Cells: monocytic origin, immature phenotype, lack of antigen presentation capability, and likely role in the immunosuppressive tumor microenvironment
"7":"CD8+",
"8":"Tex",
"9":"DC",

"11":"T cells under regulated activation" ,#T cell-DC interactions: co-stimulatory (CD28-CD80/CD86) and inhibitory (PD-1 and CTLA-4 pathways) signals. 
"r0":"DC2",
"r1":"DC1",
"r2":"Migratory DC",
"r3":"Mature B cells",#activated, tumor-infiltrating B cells that have not yet differentiated into plasma cells
"r4":"pDC",
"r5":"Migratory DC",
           }

# Cell-type label -> RGB colour used in all annotation plots.
# "Activated B cells" used to be listed twice; only the later value
# (ann_colors[9]) ever took effect, so the dead first entry was removed and the
# effective colour is kept at the key's original position.
# NOTE(review): "Activated B cells" and "Migratory DC" share ann_colors[9], and
# "Th" and "Tex" are both 'lime' — confirm these collisions are intended.
ann_colors = plt.colormaps['tab20'].colors
ann_palette={"Germinal center B cells": ann_colors[18],
                       "B cells": ann_colors[0],
                       "CD8+":ann_colors[4],
                       "Activated B cells": ann_colors[9],
                       "Tissue-resident macrophages":matplotlib.colors.to_rgb('dodgerblue'),
                       "Activated/TRM CD8+":ann_colors[2],
                        "Tcm":matplotlib.colors.to_rgb('lightseagreen'),
                       "CD4+":matplotlib.colors.to_rgb('greenyellow'),
                       "T cells under regulated activation":ann_colors[16],
                         "Th":matplotlib.colors.to_rgb('lime'),
                        "M2 tumor-associated macrophages":ann_colors[3],
                       "CD8-CD4-CD3E- T cell-like":ann_colors[15],
                       "NK cells":ann_colors[17],
                       "Tex":matplotlib.colors.to_rgb('lime'),
                       "MZB and B-1 cells":ann_colors[1],
                       "Mature follicular B cells":matplotlib.colors.to_rgb('navy'),
                       "Immunosuppressive, M2-like macrophages":ann_colors[5],
                       "Activated DC1-like":ann_colors[12],
                       "pDC":ann_colors[6],
                       "M-MDSCs":ann_colors[19],
                       "Monocytes and macrophages":ann_colors[7], #Probably Classical monocytes
                       "DC1":ann_colors[8], #XCR1+ [Gurka et al]
                       "Migratory DC":ann_colors[9], #[Gurka et al]
                       "DC2":matplotlib.colors.to_rgb('darkorchid'),
                       "Proliferative":matplotlib.colors.to_rgb('b'),
                       "Mast cells":matplotlib.colors.to_rgb('cornflowerblue'),
                       "Neutrophils":matplotlib.colors.to_rgb('coral'),
                        "Mature B cells":matplotlib.colors.to_rgb('cyan'),
                }

In [None]:
# Translate cluster IDs into cell-type labels
adata_immune.obs['cell_type'] = adata_immune.obs['leiden'].map(annotation_dict)

In [None]:
# Annotated UMAP of the tumor immune compartment, saved through scanpy's `save=`
sc.pl.umap(
    adata_immune,
    color=['cell_type'],
    palette=ann_palette,
    alpha=0.75,
    add_outline=True,
    outline_width = (0.2,0.2),
    title='Tumor immune cell cluster',
    legend_loc='on data',
    legend_fontsize=12,
    legend_fontweight='heavy',
    save='_tumor_immune_annotations.pdf',
)


In [None]:
# Save data
adata_immune.write_h5ad("maranou_032024_tumor_immune_cells_annotated.h5ad")

In [None]:
# Annotated immune UMAPs, one panel per genotype (WT on the left, KO on the right)
ncols = 2
nrows = 1
figsize = 8
wspace = 0.1
total_width = ncols * figsize + figsize * wspace * (ncols - 1)
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(total_width, nrows * figsize))
plt.subplots_adjust(wspace=wspace)

for panel_ax, genotype, panel_title in zip(axs, ['wt', 'ko'], ['Tumor, WT', 'Tumor, KO']):
    sc.pl.umap(
        adata_immune[adata_immune.obs['WT/KO']==genotype],
        ax=panel_ax,
        show=False,
        color=['cell_type'],
        title=[panel_title],
        add_outline=True,
        outline_width = (0.2,0.5),
        palette=ann_palette,
        alpha=0.7,
        s=10,
        legend_loc='on data',
        legend_fontsize=12,
        legend_fontweight='medium',
    )

plt.tight_layout()
plt.savefig('umap_tumor_immune_annotations_samples.pdf',dpi=600)
plt.show()

In [None]:
# Single stacked bar showing the share of each annotated immune cell type.

# Get cell type counts
cell_counts = adata_immune.obs['cell_type'].value_counts()

# Calculate proportions
total_cells = cell_counts.sum()
cell_proportions = cell_counts / total_cells

# Sort cell types by proportion (ascending: smallest segment at the bottom)
cell_proportions_sorted = cell_proportions.sort_values(ascending=True)

# Create the plot
fig, ax = plt.subplots(figsize=(6, 8))

# Draw each segment and, in the same pass, its label
bottom = 0
for cell_type, proportion in cell_proportions_sorted.items():
    ax.bar(0, proportion, bottom=bottom, width=0.5,
           color=ann_palette[cell_type], label=cell_type)

    # Segments of 1.5% or less are too thin to hold a readable label
    if proportion>0.015:
        ax.text(0, bottom + proportion/2, f'{cell_type} ({proportion:.1%})',
                ha='center', va='center', fontsize=14)
    bottom += proportion

# Customize the plot
ax.set_ylabel('Proportion of Cells')
ax.set_title('Cell Type Proportions', fontsize=16)
ax.set_xlim(-0.3, 0.3)
ax.set_xticks([])  # Remove x-axis ticks

# Add a legend
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

# Finalise the layout first so the saved file matches what is displayed
plt.tight_layout()

# Optionally, save the figure
plt.savefig('tumor_cell_type_proportions_stacked.png', dpi=300, bbox_inches='tight')

plt.show()



In [None]:
# Seaborn 'ticks' theme for the following figures
sns.set_style(style="ticks")

In [None]:
def get_cell_proportions(adata):
    """Return the fraction of cells per `cell_type`.

    Reads `adata.obs['cell_type']`, counts each label with `value_counts()`
    and divides by the total count, so the result is ordered like the counts.
    """
    per_type = adata.obs['cell_type'].value_counts()
    return per_type.div(per_type.sum())

# Stacked bars comparing immune cell-type proportions between WT and KO.

# Get cell proportions for both genotypes
adata1 = adata_immune[adata_immune.obs['WT/KO']=='wt']
adata2 = adata_immune[adata_immune.obs['WT/KO']=='ko']

cell_proportions1 = get_cell_proportions(adata1)
cell_proportions2 = get_cell_proportions(adata2)

# Combine all cell types from both datasets
all_cell_types = sorted(set(cell_proportions1.index) | set(cell_proportions2.index))
# Create a DataFrame with proportions from all datasets (0 where a type is absent)
df = pd.DataFrame({
    'WT': cell_proportions1.reindex(all_cell_types).fillna(0),
    'KO': cell_proportions2.reindex(all_cell_types).fillna(0),

})

# Sort by WT proportion, ties broken by KO proportion (both descending)
df = df.sort_values(by=df.columns.tolist(), ascending=False)

# Bottom edge of every segment = sum of the segments stacked before it
stack_bottoms = df.cumsum() - df

# Create the plot
fig, ax = plt.subplots(figsize=(12, 10))

# Plot the stacked bars
x = [0.1, 0.9]  # x-coordinates for the two bars
width = 0.5  # width of the bars

for cell_type in df.index:
    # label is always set, so enabling the legend below works as expected
    # (the former `if x[0] == 0` guard was never true and blanked every label)
    ax.bar(x, df.loc[cell_type], bottom=stack_bottoms.loc[cell_type], width=width, alpha=0.6,
           color=ann_palette[cell_type], label=cell_type)

# Customize the plot
ax.set_ylabel('Proportion of Cells', fontsize=14)
# ax.set_title('Cell Type Proportions Comparison', fontsize=18)
ax.set_xticks(x)
ax.set_xticklabels(['WT', 'Cd74 KO'], fontsize=12, rotation=0, ha='center')
# ax.set_xlim(-0.5, 2.5)

# Add cell type labels at the vertical centre of each segment
for i, dataset in enumerate(df.columns):
    for cell_type in df.index:
        proportion = df.loc[cell_type, dataset]
        if proportion > 0.015:  # Only label if proportion > 1.5%
            ax.text(x[i], stack_bottoms.loc[cell_type, dataset] + proportion/2,
                    f'{cell_type} {proportion:.1%}',
                    ha='center', va='center', fontsize=12)

# Add a legend
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., fontsize=10)

# Remove top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.set_xlim(-0.4, 1.6)  # Leave room on both sides of the bars
# ax.text(0.25, 1.02, 'Naive', transform=ax.transAxes, ha='center', va='bottom', fontsize=16, fontweight='bold')
# ax.text(0.77, 1.02, 'Pathogenic', transform=ax.transAxes, ha='center', va='bottom', fontsize=16, fontweight='bold')

# Optionally, save the figure
plt.savefig('tumor_cell_type_proportions_WT_vs_KO.pdf', dpi=600, bbox_inches='tight')

plt.show()



## Check expression of Pdcd1, Tigit, Ctla4, Havcr2, and Eomes in T cells

In [None]:
def create_split_violin_plot(df_wt, df_ko, genes_to_plot, figsize=(10, 6)):
    """Draw split violins (WT vs KO) for each gene and return the axes and data.

    Parameters
    ----------
    df_wt, df_ko : pandas DataFrame
        Expression tables with one column per gene in `genes_to_plot`.
    genes_to_plot : list
        Column names to plot, one violin pair per entry.
    figsize : tuple
        Size passed to `plt.figure`.

    Returns
    -------
    (ax, df_combined) : the seaborn axes and the long-format frame with the
    columns 'condition', 'gene' and 'expression' that was plotted.
    """

    def _to_long(frame, label):
        # One row per (cell, gene), tagged with the condition label
        wide = frame[genes_to_plot].copy()
        wide['condition'] = label
        return wide.melt(id_vars=['condition'],
                         value_vars=genes_to_plot,
                         var_name='gene',
                         value_name='expression')

    # WT rows first, then KO rows
    df_combined = pd.concat([_to_long(df_wt, 'WT'), _to_long(df_ko, 'KO')], axis=0)
    # Drop metadata columns in case they were passed along as "genes"
    df_combined = df_combined[~df_combined['gene'].isin(['cell_type', 'WT/KO'])]

    plt.figure(figsize=figsize)
    violin_kwargs = dict(
        x='gene',
        y='expression',
        hue='condition',
        split=True,
        gap=0.1,
        density_norm = 'width',
        inner='stick',
        cut=0,
        palette=['dodgerblue','orchid'],
        alpha=0.7,
    )
    ax = sns.violinplot(data=df_combined, **violin_kwargs)

    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.xlim(-0.5, len(genes_to_plot)+0.25)

    return ax, df_combined
    

# Statistical testing
from scipy import stats
from statsmodels.stats.multitest import multipletests

def add_statistical_annotation(ax, df_long, genes):
    """Annotate each gene on `ax` with a WT-vs-KO Mann-Whitney U result.

    For every gene the WT and KO expression values are compared with a
    two-sided Mann-Whitney U test; p-values are corrected across genes with
    Benjamini-Hochberg FDR. Above each gene (x position = its index in
    `genes`) the function writes the direction of the mean difference
    ('>' if WT mean is larger, '<' if smaller, ' ' otherwise) followed by
    significance stars or 'ns', and below that the corrected p-value.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to annotate; its y-limits are reset to (0, 1.5 * highest value).
    df_long : pandas DataFrame
        Long-format data with columns 'gene', 'condition' ('WT'/'KO') and
        'expression'.
    genes : list
        Genes to test, in the order they appear on the x-axis.

    Returns
    -------
    None
    """
    p_values = []
    directions = []
    y_max = []

    for gene in genes:
        gene_data = df_long[df_long['gene'] == gene]
        # Coerce to numeric and drop anything that is not a number
        wt_expr = pd.to_numeric(
            gene_data.loc[gene_data['condition'] == 'WT', 'expression'], errors='coerce'
        ).dropna().to_numpy()
        ko_expr = pd.to_numeric(
            gene_data.loc[gene_data['condition'] == 'KO', 'expression'], errors='coerce'
        ).dropna().to_numpy()

        # Defaults are set for EVERY gene, so an untestable gene, a failed test
        # or equal means can no longer reuse the previous gene's direction (or
        # raise UnboundLocalError on the first gene).
        p_value = 1.0
        direction = ' '

        if len(wt_expr) > 0 and len(ko_expr) > 0:
            try:
                _, p_value = stats.mannwhitneyu(wt_expr, ko_expr, alternative='two-sided')
            except ValueError:
                # Test not computable for this gene: treat as not significant
                p_value = 1.0

            wt_mean = np.mean(wt_expr)
            ko_mean = np.mean(ko_expr)
            if wt_mean > ko_mean:
                direction = '>'
            elif wt_mean < ko_mean:
                direction = '<'

        p_values.append(p_value)
        directions.append(direction)

        # Highest observed value, used to place the annotation; 0 when no data
        observed = np.concatenate([wt_expr, ko_expr])
        y_max.append(observed.max() if observed.size > 0 else 0)

    # Nothing to correct or draw when no genes were given
    if not p_values:
        return

    _, p_values_corrected, _, _ = multipletests(p_values, method='fdr_bh')

    y_range = max(y_max) - min(y_max) if len(y_max) > 1 else 1

    for idx, (p, direction, ymax) in enumerate(zip(p_values_corrected, directions, y_max)):
        asterisk = ('****' if p < 0.0001 else
                    '***' if p < 0.001 else
                    '**' if p < 0.01 else
                    '*' if p < 0.05 else
                    'ns')

        p_text = "{:.2E}".format(p)

        # Stars above the violin, corrected p-value just beneath them
        # NOTE(review): ymax + y_range can exceed the y-limit set below when the
        # per-gene maxima differ a lot — confirm the labels stay inside the plot.
        y_position = ymax + y_range
        ax.text(idx, y_position, direction + asterisk, ha='center', va='bottom')
        ax.text(idx, y_position - 0.2, 'p=' + p_text, ha='center', va='bottom')

    # Adjust the limits of the axes that was passed in (not whichever axes
    # happens to be current) to make room for the annotations
    ax.set_ylim(0, max(y_max) * 1.5)



In [None]:
# Distinct cell-type labels present in the annotated immune data
adata_immune.obs['cell_type'].unique().tolist()

In [None]:
# Dysfunction / exhaustion markers to inspect
gene_list = ['Pdcd1', 'Tigit', 'Ctla4', 'Havcr2', 'Eomes']

# Annotation labels that are treated as T cells
Tcells = ['Tex','CD8+','Activated/TRM CD8+','T cells under regulated activation']

is_t_cell = adata_immune.obs['cell_type'].isin(Tcells)
adata_Tcells = adata_immune[is_t_cell].copy()


In [None]:

# Columns to pull from the AnnData: the marker genes plus genotype and cell type.
# NOTE(review): `vars` shadows the Python builtin of the same name; the name is
# kept here because other cells may read it.
vars = [*gene_list, 'WT/KO', 'cell_type']

df = sc.get.obs_df(adata_immune, vars)

# Consider only T-cell clusters
df_Tcells = df[df['cell_type'].isin(Tcells)]
# df_Tcells = df_Tcells[df_Tcells['Tigit']>0]

# Per-genotype expression tables restricted to the marker genes
df_Tcells_ko = df_Tcells.loc[df_Tcells['WT/KO']=='ko', gene_list]
df_Tcells_wt = df_Tcells.loc[df_Tcells['WT/KO']=='wt', gene_list]

# Split violins with Mann-Whitney annotations
ax, df_long = create_split_violin_plot(df_Tcells_wt, df_Tcells_ko, gene_list)
add_statistical_annotation(ax, df_long, gene_list)

plt.savefig('tumor_dysfunction_markers_in_T_cells.pdf', dpi=600, bbox_inches='tight')
plt.show()


In [None]:
def compare_gene_expression_single_celltype(df, features, condition_key, condition1, condition2):
    """
    Test every feature for a difference between two conditions.

    Each feature is compared with a two-sided Mann-Whitney U test (p = 1.0 when
    either condition has fewer than two rows); the p-values are then corrected
    together with Benjamini-Hochberg FDR.

    Parameters:
    -----------
    df : pandas DataFrame
        Expression data with one column per feature plus `condition_key`
    features : list
        Genes (column names) to analyze
    condition_key : str
        Column holding the condition label (e.g., 'WT/KO')
    condition1: str
        Label of the first condition (e.g., 'wt')
    condition2: str
        Label of the second condition (e.g., 'ko')

    Returns:
    --------
    pandas DataFrame with one row per feature and the columns 'feature',
    'direction' ('>', '<' or '_'), 'p_value', '<condition1>_mean',
    '<condition2>_mean', 'p_value_corrected' and 'significance'
    """

    def _stars(p):
        # Significance marker for a corrected p-value ('' when not significant)
        if p < 0.0001:
            return '****'
        if p < 0.001:
            return '***'
        if p < 0.01:
            return '**'
        if p < 0.05:
            return '*'
        return ''

    rows = []
    raw_p_values = []  # kept in feature order for the joint correction

    for feature in features:
        group1 = df.loc[df[condition_key] == condition1, feature]
        group2 = df.loc[df[condition_key] == condition2, feature]

        # The test needs at least two observations per condition
        if len(group1) > 1 and len(group2) > 1:
            statistic, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
        else:
            p_value = 1.0

        # '>' / '<' compares the condition1 mean with the condition2 mean
        direction = '_'
        if len(group1) > 0 and len(group2) > 0:
            if np.mean(group1) > np.mean(group2):
                direction = '>'
            if np.mean(group1) < np.mean(group2):
                direction = '<'

        rows.append({
            'feature': feature,
            'direction': direction,
            'p_value': p_value,
            f'{condition1}_mean': np.mean(group1),
            f'{condition2}_mean': np.mean(group2)
        })
        raw_p_values.append(p_value)

    results_df = pd.DataFrame(rows)

    # Benjamini-Hochberg correction across all features
    _, p_values_corrected, _, _ = multipletests(raw_p_values, method='fdr_bh')
    results_df['p_value_corrected'] = p_values_corrected

    results_df['significance'] = results_df['p_value_corrected'].apply(_stars)

    return results_df


In [None]:
# Marker panels used in the T-cell comparisons
dysfunction_markers = ['Pdcd1', 'Tigit', 'Ctla4', 'Havcr2', 'Eomes']
activation_markers = ['Mki67', 'Ifng', 'Gzmb', 'Prf1']

# Subset the annotated immune cells to the T-cell labels
T_cell_types =['Tex','CD8+','Activated/TRM CD8+','T cells under regulated activation']
is_T_cell_type = adata_immune.obs['cell_type'].isin(T_cell_types)
adata_T_cells = adata_immune[is_T_cell_type].copy()

# Drop categories that no longer occur in the subset
adata_T_cells.obs['cell_type']=pd.Categorical(adata_T_cells.obs['cell_type']).remove_unused_categories()

In [None]:
# Split violin plot (WT vs KO) of dysfunction and activation markers over all
# T cells pooled together, with FDR-corrected Mann-Whitney significance marks.
features = dysfunction_markers+activation_markers
cell_key = 'cell_type'
condition_key = 'WT/KO'

# Expression of the marker genes plus genotype and cell type per cell.
# NOTE(review): obs['pathogenicity'] is not created anywhere in the visible part
# of this notebook — confirm the column exists in adata_immune.obs.
df = sc.get.obs_df(adata_immune[adata_immune.obs['pathogenicity']=='pathogenic'], features+ [condition_key, cell_key])

# Keep T-cell types only
df = df[df['cell_type'].isin(T_cell_types)]

df[cell_key] = pd.Categorical(df[cell_key]).remove_unused_categories()
# Copy of the same cells in which every row is relabelled 'Combined T Cells'
df_all_cells = df.copy()
df_all_cells[cell_key] = 'Combined T Cells'

# Concatenate original and combined data
df = pd.concat([df, df_all_cells])
og_cell_list=list(df[cell_key].unique())

# Drop every cell type in which neither wt nor ko has a cell expressing the
# first feature; cell_ids records the positions (in og_cell_list) of the kept types.
# NOTE(review): df, cell_ids and cell_list are not read again in this cell.
cell_ids = []
for cell_id in np.arange(len(og_cell_list)):

    cell_type = og_cell_list[cell_id]
    
    if ((len(df[(df[features[0]]>0)&(df[cell_key]==cell_type)&(df[condition_key]=='wt')])<1)&
        (len(df[(df[features[0]]>0)&(df[cell_key]==cell_type)&(df[condition_key]=='ko')])<1)):
   
        df = df[df[cell_key]!=cell_type]
        

    else:

        cell_ids.append(cell_id)

cell_list=list(df[cell_key].unique())

# Every row of df_all_cells carries the 'Combined T Cells' label, so this
# selects all of them
df_T_cells = df_all_cells[df_all_cells[cell_key] == 'Combined T Cells'].copy()


plt.figure(figsize=(12,4))

# Long format: one row per (cell, gene)
df_melted = pd.melt(df_T_cells, 
                    id_vars=['WT/KO'], 
                    value_vars=features,
                    var_name='Gene',
                    value_name='Expression') 

# For analyzing the level of expression in expressing cells only
# (these two frames are prepared here but not used further in this cell)
df_T_cells_expressing = df_T_cells.copy()
df_melted_expressing = df_melted.copy()
df_melted_expressing = df_melted_expressing[df_melted_expressing['Expression']>0]

# ax = sns.violinplot(data=df_melted, x='Gene', y='Expression' , hue=condition_key, split=True, cut=0, inner='box', gap=.2, 
#                   density_norm='count', width=0.8, palette=['darkgreen','lightgreen'], legend=True, linewidth=1,
#                   inner_kws=dict(box_width=3, whis_width=0, color='k', marker='s', zorder=10), alpha=0.5)

ax = sns.violinplot(data=df_melted, x='Gene', y='Expression' , hue=condition_key, split=True, cut=0, inner='stick', gap=.2, 
                  density_norm='count', width=0.8, palette=['darkgreen','lightgreen'], legend=True, linewidth=1,
                   alpha=0.5)


# Customize the plot
plt.xticks(rotation=30,rotation_mode="anchor", fontsize=14, ha='right')
plt.ylabel('Expression Level')
plt.title('Gene Expression in Combined T Cells in Tumor Samples (WT vs KO)')


# Maximum expression per gene, used to position the significance marks
y_max = df_melted.groupby('Gene')['Expression'].max()

# Per-gene wt-vs-ko statistics on the pooled T cells
stat_res = compare_gene_expression_single_celltype(df_all_cells[df_all_cells[cell_key] == 'Combined T Cells'], features, condition_key, 'wt', 'ko')

# idx is the gene's position in `features`, which is also its x position on the plot
for idx, (gene, direction, asterisk) in enumerate(zip(stat_res['feature'], stat_res['direction'], stat_res['significance'])):
    
    if asterisk:  # Only add text if there is a significance marker
        # Add some padding above the maximum value
        y_position = y_max[gene] + 0.2 * (y_max.max() - y_max.min())
        ax.text(idx, y_position,  direction+asterisk, ha='center', va='bottom', fontsize=12)

plt.ylim([-0.25,np.max(y_max)+1])
plt.xlim([-1.0,len(features)+0.5])
plt.savefig('tumor_combined_Tcells_wt_vs_ko_split.pdf', dpi=600, bbox_inches = "tight")

plt.show()
# Display the statistics table as the cell output
stat_res

In [None]:
def compare_gene_expression_expressing_cells(df, features, condition_key, condition1, condition2):
    """
    Compare gene expression between two conditions, using expressing cells only.

    For every gene, only the cells with expression > 0 for that gene are kept.
    The two conditions are compared with a two-sided Mann-Whitney U test and
    the raw p-values of all genes are corrected together with
    ``multipletests(..., method='fdr_bh')``.

    Parameters:
    -----------
    df : pandas DataFrame
        Input data frame with one column per entry of `features` and the
        `condition_key` column
    features : list
        List of genes to analyze
    condition_key : str
        Column name for condition
    condition1: str
        Name of condition 1 (e.g., 'wt')
    condition2: str
        Name of condition 2 (e.g., 'ko')

    Returns:
    --------
    pandas DataFrame with one row per gene and the columns
    'feature', 'direction' ('>' / '<' / '_' for mean of condition1 versus
    condition2 among expressing cells), 'p_value', '<condition>_mean'
    (mean over expressing cells), '<condition>_pct' (percentage of all cells
    of that condition that express the gene), 'p_value_corrected' and
    'significance' ('' to '****'). An empty `features` list yields an empty
    frame with these columns.
    """

    def _stars(p):
        # Conventional asterisk notation; NaN compares False everywhere -> ''
        for cutoff, label in ((0.0001, '****'), (0.001, '***'), (0.01, '**'), (0.05, '*')):
            if p < cutoff:
                return label
        return ''

    # Nothing to test: return early instead of handing an empty p-value list
    # to the multiple-testing correction.
    if len(features) == 0:
        return pd.DataFrame(columns=[
            'feature', 'direction', 'p_value',
            f'{condition1}_mean', f'{condition2}_mean',
            f'{condition1}_pct', f'{condition2}_pct',
            'p_value_corrected', 'significance',
        ])

    # Condition membership and group sizes do not depend on the gene
    is_c1 = df[condition_key] == condition1
    is_c2 = df[condition_key] == condition2
    total_c1 = int(is_c1.sum())
    total_c2 = int(is_c2.sum())

    all_results = []
    all_p_values = []

    for feature in features:
        # Expressing cells only, for this specific gene
        expressing = df[feature] > 0
        c1_expr = df.loc[expressing & is_c1, feature]
        c2_expr = df.loc[expressing & is_c2, feature]
        n_c1 = len(c1_expr)
        n_c2 = len(c2_expr)

        # The test needs at least two expressing cells per condition
        if n_c1 > 1 and n_c2 > 1:
            _, p_value = stats.mannwhitneyu(c1_expr, c2_expr, alternative='two-sided')
        else:
            p_value = 1.0

        # Mean expression among expressing cells (0 when there are none)
        mean_c1 = np.mean(c1_expr) if n_c1 > 0 else 0
        mean_c2 = np.mean(c2_expr) if n_c2 > 0 else 0

        direction = '_'
        if n_c1 > 0 and n_c2 > 0:
            if mean_c1 > mean_c2:
                direction = '>'
            elif mean_c1 < mean_c2:
                direction = '<'

        # Percentage of expressing cells relative to all cells of the condition
        pct_c1 = (n_c1 / total_c1) * 100 if total_c1 > 0 else 0
        pct_c2 = (n_c2 / total_c2) * 100 if total_c2 > 0 else 0

        all_results.append({
            'feature': feature,
            'direction': direction,
            'p_value': p_value,
            f'{condition1}_mean': mean_c1,
            f'{condition2}_mean': mean_c2,
            f'{condition1}_pct': pct_c1,
            f'{condition2}_pct': pct_c2
        })
        all_p_values.append(p_value)

    # Convert to DataFrame
    results_df = pd.DataFrame(all_results)

    # Correct p-values across all tested genes
    _, p_values_corrected, _, _ = multipletests(all_p_values, method='fdr_bh')
    results_df['p_value_corrected'] = p_values_corrected

    # Add significance asterisks
    results_df['significance'] = results_df['p_value_corrected'].apply(_stars)

    return results_df

In [None]:
features = dysfunction_markers+activation_markers
cell_key = 'cell_type'
condition_key = 'WT/KO'

# Per-cell expression of the selected genes in pathogenic immune cells,
# together with the condition and cell-type columns
df = sc.get.obs_df(adata_immune[adata_immune.obs['pathogenicity']=='pathogenic'], features+ [condition_key, cell_key])

# Keep T cells only; copy so the column assignment below works on an
# independent frame rather than on a slice
df = df[df[cell_key].isin(T_cell_types)].copy()

df[cell_key] = pd.Categorical(df[cell_key]).remove_unused_categories()
df_all_cells = df.copy()
df_all_cells[cell_key] = 'Combined T Cells'

# Concatenate original and combined data
df = pd.concat([df, df_all_cells])
og_cell_list=list(df[cell_key].unique())

# Drop cell types in which the first gene is expressed in neither condition
cell_ids = []
for cell_id, cell_type in enumerate(og_cell_list):

    expressing_in_type = (df[features[0]]>0)&(df[cell_key]==cell_type)
    has_wt = (expressing_in_type&(df[condition_key]=='wt')).any()
    has_ko = (expressing_in_type&(df[condition_key]=='ko')).any()

    if not has_wt and not has_ko:
        df = df[df[cell_key]!=cell_type]
    else:
        cell_ids.append(cell_id)

cell_list=list(df[cell_key].unique())

df_T_cells = df_all_cells[df_all_cells[cell_key] == 'Combined T Cells'].copy()


plt.figure(figsize=(12,4))

df_melted = pd.melt(df_T_cells, 
                    id_vars=['WT/KO'], 
                    value_vars=features,
                    var_name='Gene',
                    value_name='Expression') 

# For analyzing the level of expression in expressing cells only
df_T_cells_expressing = df_T_cells.copy()
df_melted_expressing = df_melted[df_melted['Expression']>0].copy()

# Fixed x-axis order: a gene without any expressing cell would otherwise be
# missing from the axis and every following annotation would shift by one.
gene_order = list(dict.fromkeys(features))
x_position = {gene: pos for pos, gene in enumerate(gene_order)}

# ax = sns.violinplot(data=df_melted, x='Gene', y='Expression' , hue=condition_key, split=True, cut=0, inner='box', gap=.2, 
#                   density_norm='count', width=0.8, palette=['darkgreen','lightgreen'], legend=True, linewidth=1,
#                   inner_kws=dict(box_width=3, whis_width=0, color='k', marker='s', zorder=10), alpha=0.5)

ax = sns.violinplot(data=df_melted_expressing, x='Gene', y='Expression' , hue=condition_key, split=True, cut=0, inner='stick', gap=.2, 
                  density_norm='count', width=0.8, palette=['darkgreen','lightgreen'], legend=True, linewidth=1,
                   alpha=0.5, order=gene_order)


# Customize the plot
plt.xticks(rotation=30,rotation_mode="anchor", fontsize=14, ha='right')
plt.ylabel('Expression Level')
plt.title('Gene Expression in Expressing Combined T Cells in Tumor Samples (WT vs KO)')


# Get the maximum y value per gene for positioning asterisks
y_max = df_melted.groupby('Gene')['Expression'].max()

# Statistics on expressing cells only (WT vs KO) for the combined T cells
stat_res = compare_gene_expression_expressing_cells(df_T_cells, features, condition_key, 'wt', 'ko')

# Add direction + asterisks above each violin, addressed by gene name
for gene, direction, asterisk in zip(stat_res['feature'], stat_res['direction'], stat_res['significance']):
    
    if asterisk:  # Only add text if there is a significance marker
        # Add some padding above the maximum value
        y_position = y_max[gene] + 0.2 * (y_max.max() - y_max.min())
        ax.text(x_position[gene], y_position,  direction+asterisk, ha='center', va='bottom', fontsize=12)

plt.ylim([-0.25,np.max(y_max)+1])
plt.xlim([-1.0,len(gene_order)+0.5])
plt.savefig('tumor_combined_Tcells_expressing_wt_vs_ko_split.pdf', dpi=600, bbox_inches = "tight")

plt.show()
stat_res

## Markers used in annotation

### T cell markers

In [None]:
# Gene panels used for the T cell scores below
cd4t_act = ['Cd3e', 'Cd4', 'Gzmb']
cd8t_act = ['Cd3e', 'Cd8a', 'Gzmk', 'Clec4e', 'Gzmb', 'Pdcd1']
activated_cd8t_act = ['Cd3e', 'Cd8a', 'Cd28', 'Cd69']
effector_cd8t_act = ['Cd3e', 'Cd8a', 'Cd44', 'Cd69']

# One sc.tl.score_genes call per panel; each call writes one obs column
for _panel, _score_name in (
    (cd8t_act, 'cd8t_act_score'),
    (cd4t_act, 'cd4t_act_score'),
    (activated_cd8t_act, 'activated_cd8t_act_score'),
    (effector_cd8t_act, 'effector_cd8t_act_score'),
):
    sc.tl.score_genes(adata_immune, _panel, score_name=_score_name)

# UMAPs of the scores next to single genes, colour scale capped at 2
for _colors in (
    ['cd8t_act_score', 'cd4t_act_score', 'Cd4'],
    ['activated_cd8t_act_score', 'effector_cd8t_act_score', 'Pdcd1'],
):
    sc.pl.umap(adata_immune, color=_colors, palette='tab20', cmap='coolwarm', vmax=2)


In [None]:
# Itga1 (CD49a) is a tissue-resident cytotoxic CD8+ T cell marker (doi: 10.1016/j.immuni.2017.01.009)
sc.pl.umap(adata_immune, color=['Itga1','Ctla4','Cd8a'],palette='tab20',cmap='coolwarm')

In [None]:

# Gene panels for further T cell states
exhausted_cd8t_act = ['Gzmb', 'Havcr2', 'Lag3', 'Pdcd1', 'Prf1', 'Tigit']
dneg_thymocyte_act = ['Cd3e', 'Cd3g', 'Klrd1', 'Nkg7']
th_act = ['Cd4', 'Cxcl5']
treg_act = ['Cd4', 'Foxp3', 'Cd3e', 'Odc1']
t_naive_act = ['Ccr7', 'Lef1', 'Sell', 'Tcf7']

# Score each panel with sc.tl.score_genes; one obs column per panel
for _panel, _score_name in (
    (exhausted_cd8t_act, 'exhausted_cd8t_act_score'),
    (dneg_thymocyte_act, 'dneg_thymocyte_act_score'),
    (th_act, 'th_act_score'),
    (treg_act, 'treg_act_score'),
    (t_naive_act, 't_naive_act_score'),
):
    sc.tl.score_genes(adata_immune, _panel, score_name=_score_name)

# Two UMAP rows showing the new scores
for _colors in (
    ['exhausted_cd8t_act_score', 'dneg_thymocyte_act_score'],
    ['th_act_score', 'treg_act_score', 't_naive_act_score'],
):
    sc.pl.umap(adata_immune, color=_colors, palette='tab20', cmap='coolwarm')


In [None]:

# Gene panels whose per-cell expression is summed into a simple score
Treg_genes = ['Ikzf2', 'Foxp3', 'Ctla4', 'Il2ra']
Th_genes = ['Cxcr6', 'Bcl6', 'Prdm1', 'Tbx21']
cd8effector_genes = ['Cd8a', 'Cx3cr1', 'Gzmb', 'Lgals1', 'S1pr5']
TRM_genes = ['Itgae', 'Cd69']

# Score = row-wise sum of X over the genes of the panel
for _column, _panel in (
    ('Treg_score', Treg_genes),
    ('Th_score', Th_genes),
    ('cd8effector_score', cd8effector_genes),
    ('TRM_score', TRM_genes),
):
    adata_immune.obs[_column] = adata_immune[:, _panel].X.sum(1)

sc.pl.umap(
    adata_immune,
    color=['Treg_score', 'Th_score'],
    palette='tab20',
    cmap='coolwarm',
    vmax=5,
)

In [None]:
sc.pl.umap(adata_immune, color=['cd8effector_score','TRM_score'],palette='tab20',cmap='coolwarm')

In [None]:
# Co-expression score for CTLA4+ CD8+ T cells (if present):
# product of Ctla4 and Cd8a expression, divided by the product of their maxima
_ctla4_expr = np.asarray(adata_immune[:, 'Ctla4'].X.sum(1))
_cd8a_expr = np.asarray(adata_immune[:, 'Cd8a'].X.sum(1))
adata_immune.obs['ctla4_cd8_score'] = _ctla4_expr * _cd8a_expr / (np.max(_ctla4_expr) * np.max(_cd8a_expr))

sc.pl.umap(
    adata_immune,
    color=['ctla4_cd8_score'],
    palette='tab20',
    cmap='coolwarm',
    vmax=0.8,
)

In [None]:
def _max_scaled(gene):
    """Per-cell summed expression of `gene` divided by its maximum over all cells."""
    per_cell = adata_immune[:, gene].X.sum(1)
    return per_cell / np.max(per_cell)


_cd8a_frac = _max_scaled('Cd8a')
_cd4_frac = _max_scaled('Cd4')
_cd3e_frac = _max_scaled('Cd3e')

# Product of "absence" of Cd8a and Cd4, shared by both scores
_cd8a_cd4_absent = np.asarray(1 - _cd8a_frac) * np.asarray(1 - _cd4_frac)

# CD3E+ CD8A- CD4-
adata_immune.obs['DNTscore'] = _cd8a_cd4_absent * np.asarray(_cd3e_frac)

# CD3E- CD8A- CD4-
adata_immune.obs['TNTscore'] = _cd8a_cd4_absent * np.asarray(1 - _cd3e_frac)

# Both scores side by side in one figure
ncols = 2
nrows = 1
figsize = 8
wspace = 0.1
fig, axs = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(ncols * figsize + figsize * wspace * (ncols - 1), nrows * figsize),
)
# show=False on the first panel keeps the figure open for the second one
sc.pl.umap(adata_immune, color=['DNTscore'], palette='tab20', cmap='coolwarm', ax=axs[0], show=False)
sc.pl.umap(adata_immune, color=['TNTscore'], palette='tab20', cmap='coolwarm', ax=axs[1], vmax=1)

### NK cells

In [None]:
nk_genes = ['Klrb1c', 'Ncr1','Itgal']
adata_immune.obs['nk_score'] = adata_immune[:,nk_genes].X.sum(1)
sc.pl.umap(adata_immune, color=['nk_score','Fcgr3'],palette='tab20',cmap='coolwarm',vmax=4)

### Naive and memory B

In [None]:
naiveB_genes = ['Ighd', 'Ighm']
memoryB_genes = ['Itgam','Cd80','Cxcr3','Nt5e','Pdcd1lg2']
adata_immune.obs['naiveB_score'] = adata_immune[:,naiveB_genes].X.sum(1)
adata_immune.obs['memoryB_score'] = adata_immune[:,memoryB_genes].X.sum(1)
sc.pl.umap(adata_immune, color=['naiveB_score','memoryB_score'],palette='tab20',cmap='coolwarm', vmax=4)


In [None]:
# The panel has to be defined in this cell: the score below reads it
memoryB_genes2 = ['Aicda']
# Keep only genes present in the dataset so the indexing cannot raise a KeyError
memoryB_genes2 = [gene for gene in memoryB_genes2 if gene in adata_immune.var_names]
adata_immune.obs['memoryB_score2'] = adata_immune[:,memoryB_genes2].X.sum(1)

memoryB_genes3 = ['Cd27', 'Cd80', 'Cd86']
adata_immune.obs['memoryB_score3'] = adata_immune[:,memoryB_genes3].X.sum(1)
sc.pl.umap(adata_immune, color=['memoryB_score2','memoryB_score3'],palette='tab20',cmap='coolwarm', vmax=3)


### Age-associated B

In [None]:
# Gene panel for the age-associated B cell score.
# NOTE(review): 'Tbx2' may be a typo for 'Tbx21' — confirm against the marker source.
ageB_genes = ['Ighm','Il10', 'Ifng', 'Itgax', 'Tbx2', 'Itgam', 'Fas']

# Score = per-cell sum of X over the panel genes
adata_immune.obs['ageB_score'] = adata_immune[:,ageB_genes].X.sum(1)
# Colour scale capped at 5
sc.pl.umap(adata_immune, color=['ageB_score'],palette='tab20',cmap='coolwarm',vmax=5)


In [None]:
# Extended gene panel for the AAB score.
# NOTE(review): the list contains both 'Tbx2' and 'Tbx21' — confirm that 'Tbx2' is intended.
AAB_markers = ['Ighm', 'Il10', 'Ifng', 'Itgax', 'Tbx2', 'Itgam', 'Fas','Cd19', 'Cd38', 'Bcl6', 'Tbx21', 'Irf4', 'Cxcr5']

# Score = per-cell sum of X over the panel genes
adata_immune.obs['AAB_score'] = adata_immune[:,AAB_markers].X.sum(1)
# Colour scale capped at 10
sc.pl.umap(adata_immune, color=['AAB_score'],palette='tab20',cmap='coolwarm',vmax=10)

In [None]:
#B cell activation markers
sc.pl.umap(adata_immune, color=['Cd86','Cd69'],palette='tab20',cmap='coolwarm',vmax=3)


In [None]:
sc.pl.umap(adata_immune, color=['Prdm1','Xbp1'],palette='tab20',cmap='coolwarm')


In [None]:
# MZB and B-1 cells
mzB_genes = ['Cd9','Cr2','Spib']

adata_immune.obs['mzB_score'] = adata_immune[:,mzB_genes].X.sum(1)
sc.pl.umap(adata_immune, color=mzB_genes,palette='tab20',cmap='coolwarm',vmax=2)

B1_genes = ['Cd5','Spn','Ptpn22']
sc.pl.umap(adata_immune, color=B1_genes,palette='tab20',cmap='coolwarm',vmax=1)

In [None]:
#Follicular B cells
folB_genes = ['Cd19', 'Cd79a', 'Ms4a1']
folB_low_genes = ['Prdm1', 'Xbp1']
sc.pl.umap(adata_immune, color=folB_genes,palette='tab20',cmap='coolwarm', vmax=2)
sc.pl.umap(adata_immune, color=folB_low_genes,palette='tab20',cmap='coolwarm', vmax=2)

### DC

In [None]:
# Gene panels for the dendritic cell scores (shorter earlier variants kept as comments)
# dc1_genes = ['Cd8a','Irf8']
dc1_genes = ['Cd8a', 'Irf8', 'Batf3', 'Nfil3', 'Id2', 'Bcl6', 'Xcr1', 'Rab43', 'Itgax', 'Itgae', 'Cd24a']
# dc2_genes = ['Relb', 'Esam', 'Itgam','Irf4', 'Sirpa']
dc2_genes = ['H2-Ab1', 'Sirpa', 'Cd4', 'Notch2', 'Clec4a2', 'Esam', 'Irf4', 'Relb', 'Zeb2', 'Klf4']
pdc_genes = ['Cd209a', 'Lag3', 'Lifr', 'Tcf4', 'Zeb2', 'Pacsin1', 'Spib']
migratorydc_genes = ['Ccr7', 'Itgae', 'Cd207', 'Cx3cr1', 'Itgax', 'H2-Ab1', 'Cd40', 'Cd80', 'Cd86', 'Relb', 'Zbtb46']

# DC1: plain sum over the panel
adata_immune.obs['dc1_score'] = adata_immune[:, dc1_genes].X.sum(1)

# DC2: max-scaled panel sum, multiplied by (1 - max-scaled Cd8a)
_dc2_sum = adata_immune[:, dc2_genes].X.sum(1)
_cd8a_sum = adata_immune[:, 'Cd8a'].X.sum(1)
adata_immune.obs['dc2_score'] = np.asarray(_dc2_sum / np.max(_dc2_sum)) * np.asarray(1 - _cd8a_sum / np.max(_cd8a_sum))

# pDC and migratory DC: plain sums over their panels
for _column, _panel in (
    ('pdc_score', pdc_genes),
    ('migratorydc_score', migratorydc_genes),
):
    adata_immune.obs[_column] = adata_immune[:, _panel].X.sum(1)

sc.pl.umap(adata_immune, color=['dc1_score'], palette='tab20', cmap='coolwarm', vmax=15)
sc.pl.umap(adata_immune, color=['dc2_score'], palette='tab20', cmap='coolwarm')


In [None]:
sc.pl.umap(adata_immune, color=dc1_genes,palette='tab20',cmap='coolwarm')


In [None]:
# For testing DC2 hypothesis
dc2_genes2= ['Itgam', 'Sirpa', 'Zbtb46','Flt3']
sc.pl.umap(adata_immune, color=dc2_genes2,palette='tab20',cmap='coolwarm', vmax=0.9)


In [None]:
## Division into cross-presenting (Xcr1+) and Sirpa+ DC
## See Gurka et al, Front. Immunol., 04 February 2015 Sec. Antigen Presenting Cell Biology
## Volume 6 - 2015 | https://doi.org/10.3389/fimmu.2015.00035

# Sirpa and Xcr1 are not expressed in the tumor cells

cdc1_genes1 = ['Cd8a', 'Clec9a', 'Batf3', 'Xcr1']
cdc1_genes2 = ['Cd8a', 'Clec9a', 'Batf3', 'Sirpa']


def _max_scaled(gene):
    """Per-cell summed expression of `gene` divided by its maximum over all cells."""
    per_cell = adata_immune[:, gene].X.sum(1)
    return per_cell / np.max(per_cell)


_xcr1_frac = _max_scaled('Xcr1')
_sirpa_frac = _max_scaled('Sirpa')

# Factor common to both scores: Cd8a * Clec9a * Batf3 (each max-scaled)
_cdc1_core = (np.asarray(_max_scaled('Cd8a')) * np.asarray(_max_scaled('Clec9a'))) * np.asarray(_max_scaled('Batf3'))

# Xcr1 present, Sirpa absent
adata_immune.obs['dc_xcr_score'] = _cdc1_core * np.asarray(_xcr1_frac) * np.asarray(1. - _sirpa_frac)

# Xcr1 absent, Sirpa present
adata_immune.obs['dc_sirpa_score'] = _cdc1_core * np.asarray(1. - _xcr1_frac) * np.asarray(_sirpa_frac)

for _score in ('dc_xcr_score', 'dc_sirpa_score'):
    sc.pl.umap(adata_immune, color=[_score], palette='tab20', cmap='coolwarm')


In [None]:
sc.pl.umap(adata_immune, color=['pdc_score'],palette='tab20',cmap='coolwarm',vmax=10)
sc.pl.umap(adata_immune, color=['migratorydc_score'],palette='tab20',cmap='coolwarm',vmax=15)

In [None]:
#pDCs express Siglech
sc.pl.umap(adata_immune, color=['Siglech'],palette='tab20',cmap='coolwarm')


In [None]:
### Activated cDCs
acdc_genes = ['Itgax','Cd83' ,'H2-Aa', 'H2-Ab1', 'H2-Eb1']

adata_immune.obs['acdc_score'] = adata_immune[:,acdc_genes].X.sum(1)

sc.pl.umap(adata_immune, color=acdc_genes,palette='tab20',cmap='coolwarm')
sc.pl.umap(adata_immune, color=['acdc_score'],palette='tab20',cmap='coolwarm')

### Monocytes

In [None]:
# Gene panels for the two monocyte scores
clmono_genes = ['Ccl9', 'Ccr2', 'Cd68', 'Ly6c2']
nclmono_genes = ['Csf1r', 'Cx3cr1', 'Fabp4']

# Per-panel score = row-wise sum of X over the panel genes
for _column, _panel in (
    ('clmono_score', clmono_genes),
    ('nclmono_score', nclmono_genes),
):
    adata_immune.obs[_column] = adata_immune[:, _panel].X.sum(1)

# Combined monocyte score: mean of the two panel scores
_panel_score_sum = adata_immune.obs['clmono_score'] + adata_immune.obs['nclmono_score']
adata_immune.obs['mono_score'] = _panel_score_sum / 2

# Scores first (colour scale capped at 10), then the individual genes of each panel
sc.pl.umap(
    adata_immune,
    color=['clmono_score', 'nclmono_score', 'mono_score'],
    palette='tab20',
    cmap='coolwarm',
    vmax=10,
)
for _panel in (clmono_genes, nclmono_genes):
    sc.pl.umap(adata_immune, color=_panel, palette='tab20', cmap='coolwarm')

### Plasma cells

In [None]:
plasma_genes = ['Jchain','Sdc1','Prdm1','Xbp1']
adata_immune.obs['plasma_score'] = adata_immune[:,plasma_genes].X.sum(1)
sc.pl.umap(adata_immune, color=['plasma_score'], palette='tab20',cmap='coolwarm', vmax=4.)

### Macrophages

In [None]:
adata.var_names[adata.var_names.str.startswith('Sep')]

In [None]:
# macroph_genes = ['Adgre1','Itgam','Ly6g']

#PanglaoDB
macroph_genes = ['Cd68','Fcgr1','Naaa','Lyz2','Ccl12']
#['Adgre1','Itgam','Apoe','C1qa','Cx3cr1']
adata_immune.obs['macroph_score'] = adata_immune[:,macroph_genes].X.sum(1)

sc.pl.umap(adata_immune, color=['macroph_score'],palette='tab20',cmap='coolwarm', vmax=15)


### Erythrophagocytic macrophages

In [None]:
ephM_genes = ['Cd68', 'Adgre1', 'Mrc1']
adata_immune.obs['ephM_score'] = adata_immune[:,ephM_genes].X.sum(1)
sc.pl.umap(adata_immune, color=ephM_genes, palette='tab20',cmap='coolwarm') 

In [None]:
### Activated macrophages
# Gene symbols only: the protein names F4/80 and CD11b correspond to the
# genes Adgre1 and Itgam, so they are listed under their gene symbols.
actM_candidates = [
    'Cd80', 'Cd86', 'H2-Ab1', 'H2-DMb1', 'Itgam',
    'Tnf', 'Il1b', 'Il6', 'Ccl2', 
    'Adgre1', 'Cd68', 
    'Arg1', 'Nos2', 'Irf5', 'Cd274'
]

# Keep the genes that exist in the object that is actually indexed below,
# and report the ones that were dropped instead of losing them silently
actM_genes = [gene for gene in actM_candidates if gene in adata_immune.var_names]
actM_missing = [gene for gene in actM_candidates if gene not in adata_immune.var_names]
if actM_missing:
    print("Activated macrophage genes not found in the dataset:", actM_missing)

# Score = per-cell sum of X over the available genes
adata_immune.obs['actM_score'] = adata_immune[:,actM_genes].X.sum(1)

# Plot UMAP with the score
sc.pl.umap(adata_immune, color='actM_score',cmap='coolwarm', vmax=30)

# Show the plot
plt.show()

In [None]:
sc.pl.umap(adata_immune, color=['Cd68', 'Cd14', 'Adgre1'],palette='tab20',cmap='coolwarm',vmax=2)

In [None]:
# M1 markers
sc.pl.umap(adata_immune, color=['Nos2', 'Il1b', 'Tnf'],palette='tab20',cmap='coolwarm')

In [None]:
# M2 markers
sc.pl.umap(adata_immune, color=['Zeb2', 'Gab2','Mitf'],palette='tab20',cmap='coolwarm')
sc.pl.umap(adata_immune, color=['Arg1', 'Mrc1', 'Il10'],palette='tab20',cmap='coolwarm',vmax=1)


In [None]:
# Test antigen-presenting properties
sc.pl.umap(adata_immune, color=['H2-Aa', 'H2-Ab1','H2-Eb1'],palette='tab20',cmap='coolwarm')
sc.pl.umap(adata_immune, color=['Ciita', 'Cd80','Cd86'],palette='tab20',cmap='coolwarm')


### Neutrophils

In [None]:
neutrophil_genes1 = ['Csf3r','S100a8','Il1r2']
neutrophil_genes2 = ['S100a9','Elane']
sc.pl.umap(adata_immune, color=neutrophil_genes1, palette='tab20',cmap='coolwarm', vmax=2)
sc.pl.umap(adata_immune, color=neutrophil_genes2, palette='tab20',cmap='coolwarm', vmax=1)


### Progenitor cells

In [None]:
progenitor_genes = ['Cxcr5', 'Id3','Slamf6','Tcf7']
adata_immune.obs['progenitor_score'] = adata_immune[:,progenitor_genes].X.sum(1)
sc.pl.umap(adata_immune, color=['progenitor_score'], palette='tab20',cmap='coolwarm')

### NKT cells

In [None]:
NKT_genes = ['Ncam1', 'Gata3', 'Il2rb'] #From PanglaoDB
adata_immune.obs['NKT_score'] = adata_immune[:,NKT_genes].X.sum(1)
sc.pl.umap(adata_immune, color=['NKT_score'], palette='tab20',cmap='coolwarm', vmax=5)
#NK and NKT markers (necessary)
sc.pl.umap(adata_immune, color=['Klrb1c','Itga2'], palette='tab20',cmap='coolwarm')

In [None]:
# Further markers for annotating the high-Tox cluster (cluster 8).
# References: https://doi.org/10.1038/s41586-019-1325-x
#             https://doi.org/10.3389/fimmu.2023.990419

# Genes plotted below, one UMAP panel per gene
exhaustion_markers = [
    "Pdcd1",    # PD-1
    "Havcr2",   # Tim-3
    "Lag3",
    "Tigit",
    "Tox",
    "Eomes",
    "Gzmb",     # Granzyme B
    "Prf1",     # Perforin
]

# Defined here but not plotted in this cell
general_T_markers = [
    "Cd2",
    "Cd5",
    "Cd28",
    "Cd3e",     # Already mentioned as low, but good to confirm
    "Cd8a",     # Already mentioned as low, but good to confirm
]

sc.pl.umap(adata_immune, color=exhaustion_markers, palette='tab20',cmap='coolwarm')

In [None]:
# Final stage of exhaustion: TCF1neg T-betlo TOXhi Eomeshi

sc.pl.umap(adata_immune, color=['Tcf7','Tbx21','Tox','Eomes'], palette='tab20',cmap='coolwarm')

In [None]:
#additional exhaustion markers
#progenitor
sc.pl.umap(adata_immune, color=['Tcf7','Il7r'], palette='tab20',cmap='coolwarm')

#Tissue residency markers
sc.pl.umap(adata_immune, color=['Cd69', 'Itgae'], palette='tab20',cmap='coolwarm', vmax=2.5)


In [None]:
#Migratory properties
chemokine_receptors = ['Ccr7', 'Cxcr3', 'Cxcr4', 'Ccr5']
sc.pl.umap(adata_immune, color=chemokine_receptors, palette='tab20',cmap='coolwarm', vmax=4)

In [None]:
#Investigate T cell-DC interactions (cluster 11)
sc.pl.umap(adata_immune, color=['Cd28','Cd80'], palette='tab20',cmap='coolwarm',vmax=2)
sc.pl.umap(adata_immune, color=['Cd40lg','Cd40'], palette='tab20',cmap='coolwarm',vmax=2)
sc.pl.umap(adata_immune, color=['Pdcd1','Cd247', 'Pdcd1lg2'], palette='tab20',cmap='coolwarm',vmax=2)
sc.pl.umap(adata_immune, color=['Ctla4','Cd86'], palette='tab20',cmap='coolwarm',vmax=2)


### gamma delta T

In [None]:
# Candidate gamma-delta T cell genes under several naming schemes; most of
# them are not expected to exist in any single annotation
gamma_delta_genes = ['Tcrg-V4', 'Tcrg-V6', 'Tcrg-V1','Tcrg', 'Tcrd', 'Trgv1', 'Trgv2', 'Trgv3', 'Trgv4', 'Trgv5', 'Trgv6', 'Trgv7','Trdv1', 'Trdv3', 'Trdv4', 'Trdv5','Trg', 'Trd', 'Sox13', 'Id3', 'Blk', 'Il17a', 'Ifng']
# Check presence against the object that is indexed below (adata_immune),
# so a gene missing from it cannot raise a KeyError
present_gamma_delta_genes = [gene for gene in gamma_delta_genes if gene in adata_immune.var_names]
print("Present γδ T cell genes:", present_gamma_delta_genes)
# Score = per-cell sum of X over the genes that are present
adata_immune.obs['gamma_delta_score'] = adata_immune[:,present_gamma_delta_genes].X.sum(1)
sc.pl.umap(adata_immune, color=['gamma_delta_score'], palette='tab20',cmap='coolwarm', vmax=4)

## Tumor-associated suppressor cells

In [None]:
suppressor_markers =["Itgam",    # CD11b, myeloid marker
    "Cd33",     # myeloid marker
    "Arg1",     # often expressed in MDSCs
    "Nos2",     # iNOS, often in MDSCs
    "S100a8",   # often in MDSCs
    "S100a9",
]

sc.pl.umap(adata_immune, color=suppressor_markers, palette='tab20',cmap='coolwarm')

### DecoupleR

In [None]:
import decoupler as dc

In [None]:
markers = dc.get_resource('PanglaoDB')
markers

In [None]:
markers = dc.get_resource('PanglaoDB')

# Convert datatypes from string to float and bool.
# The flag columns are parsed explicitly: astype(bool) would turn the string
# 'False' into True, because every non-empty string is truthy.
markers['mouse_sensitivity'] = markers['mouse_sensitivity'].astype(float)
for flag_column in ('mouse', 'canonical_marker'):
    markers[flag_column] = (
        markers[flag_column].astype(str).str.strip().str.lower().isin(['true', '1', '1.0'])
    )

# Filter by canonical_marker and mouse
markers = markers[markers['mouse'] & markers['canonical_marker'] & (markers['mouse_sensitivity'] > 0.5)]

# Remove duplicated entries and renumber the rows (fresh, independent frame)
markers = markers[~markers.duplicated(['cell_type', 'genesymbol'])].reset_index(drop=True)

# PanglaoDB genes are in human nomenclature. Download conversion list
mouse_human_genes = pd.read_csv("http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt",sep="\t", index_col=False)
mouse = mouse_human_genes[mouse_human_genes['Common Organism Name']=='mouse, laboratory']
human = mouse_human_genes[mouse_human_genes['Common Organism Name']=='human']

# Collect dataframes for conversion
mouse = mouse[['DB Class Key', 'Symbol']].reset_index(drop=True)
human = human[['DB Class Key', 'Symbol']].reset_index(drop=True)

# Convert from human to mouse: human symbol -> 'DB Class Key' -> mouse symbol.
# The first match is used on both sides; symbols without a human entry or
# without a mouse ortholog are left unchanged.
human_symbol_to_key = human.drop_duplicates('Symbol').set_index('Symbol')['DB Class Key']
key_to_mouse_symbol = mouse.drop_duplicates('DB Class Key').set_index('DB Class Key')['Symbol']
human_to_mouse = human_symbol_to_key.map(key_to_mouse_symbol).dropna()
markers['genesymbol'] = markers['genesymbol'].map(human_to_mouse).fillna(markers['genesymbol'])

# Remove entries that became duplicates after the conversion
markers = markers[~markers.duplicated(['cell_type', 'genesymbol'])]

markers

In [None]:
# Run over-representation analysis (ORA) of the marker sets on the immune cells
tissue = 'tumor'

# `markers` supplies the gene sets: 'cell_type' is the set name, 'genesymbol' the member gene
dc.run_ora(
    mat=adata_immune,
    net=markers,
    source='cell_type',
    target='genesymbol',
    min_n=3,
    verbose=True,
    use_raw=False
)

# New anndata object with ora estimates (read from the 'ora_estimate' obsm entry)
acts = dc.get_acts(adata_immune, obsm_key='ora_estimate')

# We need to remove inf and set them to the maximum value observed for pvals=0:
# every non-finite entry is replaced by the largest finite estimate
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e

acts

# Rank the gene sets per Leiden cluster against all remaining cells
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')

# List most prominent predicted cell types in each Leiden cluster
# (first n_ctypes rows per group — presumably already ranked by the call above; verify)
n_ctypes = 3
ctypes_dict = df.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
ctypes_dict

# Dictionary for annotation: cluster -> name in the first row of that cluster
annotation_dict_PanglaoDB = df.groupby('group').head(1).set_index('group')['names'].to_dict()

In [None]:
# Add cell type column based on annotation
adata_immune.obs['cell_type_PanglaoDB'] = [annotation_dict_PanglaoDB[clust] for clust in (adata_immune.obs['leiden'])]

# Plot results
sc.pl.umap(adata_immune, color=['leiden','cell_type_PanglaoDB'], title=['Leiden','PanglaoDB'], palette='tab20')


### Non-immune cluster

In [None]:
# Fine-resolution clustering
sc.tl.leiden(adata_other, resolution=0.5)

In [None]:
sc.pl.umap(adata_other, color=['leiden','WT/KO'], title=[str(tissue) +' Leiden',str(tissue) +' WT/KO'], palette='tab20', legend_loc='on data')


In [None]:
# Run over-representation analysis (ORA) of the marker sets on the non-immune cells
tissue = 'tumor'

# `markers` supplies the gene sets: 'cell_type' is the set name, 'genesymbol' the member gene
dc.run_ora(
    mat=adata_other,
    net=markers,
    source='cell_type',
    target='genesymbol',
    min_n=3,
    verbose=True,
    use_raw=False
)

# New anndata object with ora estimates (read from the 'ora_estimate' obsm entry)
acts = dc.get_acts(adata_other, obsm_key='ora_estimate')

# We need to remove inf and set them to the maximum value observed for pvals=0:
# every non-finite entry is replaced by the largest finite estimate
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e

acts

# Rank the gene sets per Leiden cluster against all remaining cells.
# Note: `df` and `annotation_dict_PanglaoDB` are reassigned here and now refer to adata_other.
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')

# List most prominent predicted cell types in each Leiden cluster
# (first n_ctypes rows per group — presumably already ranked by the call above; verify)
n_ctypes = 3
ctypes_dict = df.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
ctypes_dict

# Dictionary for annotation: cluster -> name in the first row of that cluster
annotation_dict_PanglaoDB = df.groupby('group').head(1).set_index('group')['names'].to_dict()

In [None]:
# Add cell type column based on annotation
adata_other.obs['cell_type_PanglaoDB'] = [annotation_dict_PanglaoDB[clust] for clust in (adata_other.obs['leiden'])]

# Plot results
sc.pl.umap(adata_other, color=['leiden','cell_type_PanglaoDB'], title=['Leiden','PanglaoDB'], palette='tab20')
