# Evaluate gene expression in spleen samples

In [None]:
import numpy as np
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.legend import Legend
import matplotlib.colors as colors
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
import pandas as pd
import scipy
import scanpy as sc
import anndata as ad
    
from sklearn import datasets
from sklearn.decomposition import PCA

from numba import jit

import celltypist
from celltypist import models

from collections import defaultdict

from matplotlib.cm import ScalarMappable

In [None]:
# Custom colormap 'op_tab25': 9 samples from the upper part of 'tab20b'
# followed by 16 samples from the lower part of 'tab20c' (25 colours total).

try:
    # Not used below; kept only so the name stays available where it exists.
    # `register_cmap` was removed in Matplotlib 3.9, so an unconditional
    # import would make this cell fail there.
    from matplotlib.cm import register_cmap
except ImportError:
    register_cmap = None
from matplotlib.colors import ListedColormap

tab20b = matplotlib.colormaps['tab20b']
tab20c = matplotlib.colormaps['tab20c']
colors1 = tab20b(np.linspace(3.001/5., 1, 9))
colors2 = tab20c(np.linspace(0, 3.999/5., 16))

# Named `op_colors` (not `colors`) so the `matplotlib.colors as colors`
# module alias imported at the top of the notebook is not shadowed.
op_colors = np.concatenate([colors1, colors2])

map_name = 'op_tab25'
op_cmap = ListedColormap(op_colors, name=map_name)
# force=True lets the cell be re-executed without a "already registered" error.
matplotlib.colormaps.register(name=map_name, cmap=op_cmap, force=True)

In [None]:
def lighten(rgb_tuple, factor=0.55):
    """
    Blend an RGB colour towards white.

    Every channel c becomes 1 - (1 - c) * factor: factor=1 returns the colour
    unchanged, smaller factors give a paler colour (intended range (0, 1]).
    """
    red, green, blue = rgb_tuple
    return tuple(1 - (1 - channel) * factor for channel in (red, green, blue))

In [None]:
# Global scanpy figure defaults: 300 dpi on screen, 1200 dpi for saved PDFs.
sc.set_figure_params(
    scanpy=True,
    dpi=300,
    dpi_save=1200,
    frameon=True,
    vector_friendly=False,
    fontsize=14,
    figsize=(9, 8),
    format='pdf',
    facecolor=None,
    transparent=False,
    ipython_format='png2x',
)

In [None]:
from scipy import stats
from statsmodels.stats.multitest import multipletests
from matplotlib.colors import to_rgb
from matplotlib.collections import PolyCollection
from matplotlib.legend_handler import HandlerTuple
import textwrap

def compare_gene_expression(df, ordered_cell_list, features, condition_key, condition1, condition2, cell_type_key, correction_method):
    """
    Compare gene expression between conditions across multiple cell types and genes,
    including an analysis of all cells combined.

    A two-sided Mann-Whitney U test is run for every (cell type, feature) pair and
    all resulting p-values are corrected together with `correction_method`.

    Parameters:
    -----------
    df : pandas DataFrame
        Input data frame containing expression data
    ordered_cell_list : list
        List of cell types in desired order. The label 'Combined Cells' is special:
        if `df` has no rows carrying that label, the complete `df` is used for it;
        if such rows exist (e.g. the caller appended relabelled copies), only those
        rows are used so that no cell is counted twice.
    features : list
        List of genes or signature scores to analyze
    condition_key : str
        Column name for condition
    condition1: str
        Name of condition 1 (e.g. 'wt')
    condition2: str
        Name of condition 2 (e.g. 'ko')
    cell_type_key : str
        Column name for cell type (e.g. 'leiden' or 'cell_type')
    correction_method: str
        e.g. 'bonferroni' or 'fdr_bh'

    Returns:
    --------
    tuple: (pandas DataFrame with detailed results, pandas DataFrame with pivoted results)
        The detailed frame has the columns 'cell_type', 'feature', 'direction'
        ('>' if mean(condition1) > mean(condition2), '<' for the opposite, '_'
        otherwise), 'p_value', 'p_value_corrected' and 'significance'.
    """

    def stars(p):
        # Conventional asterisk notation for the corrected p-value.
        if p < 0.0001:
            return '****'
        if p < 0.001:
            return '***'
        if p < 0.01:
            return '**'
        if p < 0.05:
            return '*'
        return ''

    cell_types = list(ordered_cell_list)

    # If the frame already holds rows labelled 'Combined Cells', falling back to
    # the complete frame would count every cell twice and deflate the p-value.
    has_combined_rows = bool((df[cell_type_key] == 'Combined Cells').any())

    all_results = []

    # First, calculate all raw p-values
    for cell_type in cell_types:

        if cell_type == 'Combined Cells' and not has_combined_rows:
            cell_type_data = df
        else:
            cell_type_data = df[df[cell_type_key] == cell_type]

        for feature in features:

            c1_expr = cell_type_data.loc[cell_type_data[condition_key] == condition1, feature]
            c2_expr = cell_type_data.loc[cell_type_data[condition_key] == condition2, feature]

            # The test needs at least two observations per group; otherwise
            # report a neutral p-value.
            if len(c1_expr) > 1 and len(c2_expr) > 1:
                _, p_value = stats.mannwhitneyu(c1_expr, c2_expr, alternative='two-sided')
            else:
                p_value = 1.0

            # Reset for every feature so that a previous feature's direction
            # cannot leak into a comparison with equal means or an empty group.
            direction = '_'
            if len(c1_expr) > 0 and len(c2_expr) > 0:
                mean_c1 = np.mean(c1_expr)
                mean_c2 = np.mean(c2_expr)
                if mean_c1 > mean_c2:
                    direction = '>'
                elif mean_c1 < mean_c2:
                    direction = '<'

            all_results.append({
                'cell_type': cell_type,
                'feature': feature,
                'direction': direction,
                'p_value': p_value
            })

    # Convert to DataFrame
    results_df = pd.DataFrame(all_results)

    # Correct all p-values together (one family across cell types and features)
    all_p_values = [row['p_value'] for row in all_results]
    _, p_values_corrected, _, _ = multipletests(all_p_values, method=correction_method)
    results_df['p_value_corrected'] = p_values_corrected

    # Add significance asterisks based on corrected p-values
    results_df['significance'] = results_df['p_value_corrected'].apply(stars)

    # Reshape the results to a more readable format
    pivot_df = results_df.pivot(
        index='cell_type',
        columns='feature',
        values=['direction', 'p_value', 'p_value_corrected', 'significance']
    )

    return results_df, pivot_df


In [None]:
def compare_gene_expression_single_celltype(df, features, condition_key, condition1, condition2, correction_method='bonferroni'):
    """
    Compare gene expression between conditions for multiple genes in a single cell type.

    A two-sided Mann-Whitney U test is run per feature and the p-values are
    corrected together with `correction_method`.

    Parameters:
    -----------
    df : pandas DataFrame
        Input data frame containing expression data
    features : list
        List of genes to analyze
    condition_key : str
        Column name for condition (e.g., 'WT/KO')
    condition1: str
        Name of condition 1 (e.g., 'wt')
    condition2: str
        Name of condition 2 (e.g., 'ko')
    correction_method: str
        Multiple-testing correction passed to `multipletests`
        (e.g. 'bonferroni' or 'fdr_bh'). Defaults to 'bonferroni'.

    Returns:
    --------
    pandas DataFrame with results including statistics and significance
        Columns: 'feature', 'direction', 'p_value', '<condition1>_mean',
        '<condition2>_mean', 'p_value_corrected', 'significance'.
    """

    def stars(p):
        # Conventional asterisk notation for the corrected p-value.
        if p < 0.0001:
            return '****'
        if p < 0.001:
            return '***'
        if p < 0.01:
            return '**'
        if p < 0.05:
            return '*'
        return ''

    all_results = []

    # Calculate the raw p-value for each gene
    for feature in features:
        c1_expr = df.loc[df[condition_key] == condition1, feature]
        c2_expr = df.loc[df[condition_key] == condition2, feature]

        # The test needs at least two observations per group
        if len(c1_expr) > 1 and len(c2_expr) > 1:
            _, p_value = stats.mannwhitneyu(c1_expr, c2_expr, alternative='two-sided')
        else:
            p_value = 1.0

        # Mean expression for each condition (NaN for an empty group)
        mean_c1 = np.mean(c1_expr)
        mean_c2 = np.mean(c2_expr)

        direction = '_'
        if len(c1_expr) > 0 and len(c2_expr) > 0:
            if mean_c1 > mean_c2:
                direction = '>'
            elif mean_c1 < mean_c2:
                direction = '<'

        all_results.append({
            'feature': feature,
            'direction': direction,
            'p_value': p_value,
            f'{condition1}_mean': mean_c1,
            f'{condition2}_mean': mean_c2
        })

    # Convert to DataFrame
    results_df = pd.DataFrame(all_results)

    # Correct p-values across all tested features
    all_p_values = [row['p_value'] for row in all_results]
    _, p_values_corrected, _, _ = multipletests(all_p_values, method=correction_method)
    results_df['p_value_corrected'] = p_values_corrected

    # Add significance asterisks based on corrected p-values
    results_df['significance'] = results_df['p_value_corrected'].apply(stars)

    return results_df


## Load annotated data

In [None]:

# Read the annotated AnnData object from an .h5ad file in the working directory.
adata = sc.read("maranou_032024_spleen_annotated.h5ad")
# Tissue label for this notebook.
tissue = 'spleen'

In [None]:
# High-resolution annotation: maps each leiden cluster label to a cell-type name.
# Several labels map to the same cell type (e.g. "0", "1", "3" and "9" -> "Naive B cells").
# Keys such as "dc2r0", "dc1r0" or "r0" are presumably sub-cluster labels from a
# re-clustering step not shown here -- TODO confirm.
# Based on DEA and marker genes from PanglaoDB, Tabula Muris and Cell Marker 2.0 (currently dysfunctional), Annotation of Cell Types: ACT
annotation_dict_high_res = {"0":"Naive B cells",#
                   "1":"Naive B cells",#
                   "2":"MZB and B-1 cells",#
                   "3":"Naive B cells",#
                   "4":"Activated B cells",#
                   "5":"Immature B cells",#
                   "6":"NK cells",#
                   "7":"CD8+",#
                   "8":"Treg",#
                   "9":"Naive B cells",#
                   "10":"CD4+",#
                   "11":"Monocytes and macrophages", #Classical monocytes here
                   "12":"MZB and B-1 cells",#
                   # "13":"DC2",#  (disabled; the "dc2r*" entries below are used instead)
                    "dc2r0":"WDFY4+ cDC2",#
                   "dc2r1":"Relb(low) cDC2", #
                   "dc2r2":"Migratory cDC2",#


                   "14":"Tcm",#
                   "15":"Mature follicular B cells", #
                   "17":"Th",#
                   "18":"Monocytes and macrophages", #Nonclassical monocytes
                   "19":"Proliferative B cells",# Activated follicular B cells?
                   "20":"Lymphoid-resident cDC1",#
                   "21":"Heterogenous T cells",#
                   "22":"Neutrophils", #Activated neutrophils or myeloid-derived suppressor cells (MDSCs)
                   "24":"Mast cells",#

                   # The DC1 clusters are separated more clearly by CyC than by Cd8a (which has varied expression in CyC(hi))
                   # Ref: The protease inhibitor cystatin C is differentially expressed among dendritic cell populations, but does not control antigen presentation
                    #El-Sukkari et al. J Immunol. 2003 Nov 15;171(10):5003-11.  doi: 10.4049/jimmunol.171.10.5003.
                   "dc1r0":"CD8- CCR2+ cDC1",#
                   "dc1r1":"CD8(low) cDC1", #
                   "dc1r2":"CD8- CCR2- cDC1",#

                   "26":"pDC",#
                   "27":"Activated T cells", #With B cell characteristics Cd79a, Pax5, and Ighd
                   "28":"Plasma cells",#
                   "r0":"Undefined DC",#
                   "r1":"Red pulp macrophages", #
                   "r2":"Relb(int.) cDC2",#
                   "r3":"Germinal center B cells",
                   "r4":"Plasma cells",#
                   "r5":"Treg"
                  }

# Colour palette keyed by cell-type name: entries are either taken from the
# 'tab20' colour list or converted from named Matplotlib colours with to_rgb.
# NOTE(review): "Activated B cells" and "CD8(low) cDC1" both use ann_colors[13],
# so the two populations are drawn in the same colour -- confirm this is intended.
# NOTE(review): "Th cells", "CCR7+ DC1" and "Proliferative" are not produced by
# annotation_dict_high_res; presumably kept for other annotation sets.
ann_colors = plt.colormaps['tab20'].colors
ann_palette_all={"Germinal center B cells": ann_colors[18],
                       "Naive B cells": ann_colors[0],
                       "CD8+":ann_colors[4],
                       "Activated B cells": ann_colors[13],
                       "Immature B cells":plt.matplotlib.colors.to_rgb('dodgerblue'),
                       "Activated T cells":ann_colors[2],
                        "Tcm":plt.matplotlib.colors.to_rgb('lightseagreen'),
                       "CD4+":plt.matplotlib.colors.to_rgb('greenyellow'),
                       "Treg":ann_colors[16],
                         "Th":plt.matplotlib.colors.to_rgb('lime'),
                        "Red pulp macrophages":ann_colors[3],
                       "Heterogenous T cells":ann_colors[15],
                       "NK cells":ann_colors[17],
                       "Th cells":plt.matplotlib.colors.to_rgb('mediumseagreen'),
                       "MZB and B-1 cells":ann_colors[1],
                       "Mature follicular B cells":plt.matplotlib.colors.to_rgb('navy'),
                       "Undefined DC":ann_colors[5],
                       "CCR7+ DC1":ann_colors[12],
                       "pDC":ann_colors[6],
                       "Plasma cells":ann_colors[19],
                       "Monocytes and macrophages":ann_colors[7], #Probably Classical monocytes
                       "Lymphoid-resident cDC1":plt.matplotlib.colors.to_rgb('palevioletred'), #XCR1+ [Gurka et al]
                        "CD8- CCR2+ cDC1":plt.matplotlib.colors.to_rgb('mediumvioletred'),#
                       "CD8(low) cDC1":ann_colors[13], #
                       "CD8- CCR2- cDC1":plt.matplotlib.colors.to_rgb('deeppink'),#
                        "Relb(int.) cDC2":plt.matplotlib.colors.to_rgb('darkmagenta'),#
                        "WDFY4+ cDC2":plt.matplotlib.colors.to_rgb('darkorchid'), #
                        "Relb(low) cDC2":plt.matplotlib.colors.to_rgb('indianred'),#
                       "Migratory cDC2":plt.matplotlib.colors.to_rgb('darkred'), 
                       "Proliferative":plt.matplotlib.colors.to_rgb('b'),
                       "Mast cells":plt.matplotlib.colors.to_rgb('cornflowerblue'),
                       "Neutrophils":plt.matplotlib.colors.to_rgb('coral'),
                        "Proliferative B cells":plt.matplotlib.colors.to_rgb('cyan'),
                }

# Translate leiden cluster labels into cell-type names.
# NOTE(review): labels that are not keys of annotation_dict_high_res (e.g. the
# commented-out "13") map to NaN with Series.map -- confirm none remain in adata.
adata.obs['cell_type'] = adata.obs.leiden.map(annotation_dict_high_res)

In [None]:
## Alternative colors for cell types

# ann_colors = plt.colormaps['tab20'].colors
# ann_palette_all={"Germinal center B cells": ann_colors[18], 
#                        "Naive B cells": ann_colors[0],
#                        "CD8+":ann_colors[4],
#                        "Activated B cells": ann_colors[13],
#                        "Immature B cells":plt.matplotlib.colors.to_rgb('dodgerblue'),
#                        "Activated T cells":ann_colors[2],
#                         "Tcm":plt.matplotlib.colors.to_rgb('lightseagreen'),
#                        "CD4+":plt.matplotlib.colors.to_rgb('greenyellow'),
#                        "Treg":ann_colors[16],
#                         "Th":plt.matplotlib.colors.to_rgb('lime'),
#                         "Red pulp macrophages":ann_colors[3],
#                        "Heterogenous T cells":ann_colors[15],
#                        "NK cells":ann_colors[17],
#                        "Th cells":plt.matplotlib.colors.to_rgb('mediumseagreen'),
#                        "MZB and B-1 cells":ann_colors[1],
#                        "Mature follicular B cells":plt.matplotlib.colors.to_rgb('navy'),
#                        "Undefined DC":ann_colors[5],
#                        "CCR7+ DC1":ann_colors[12],
#                        "pDC":ann_colors[6],
#                        "Plasma cells":ann_colors[19],
#                        "Monocytes and macrophages":ann_colors[7], #Probably Classical monocytes
#                        "Lymphoid-resident cDC1":ann_colors[8], #XCR1+ [Gurka et al]
#                         "CD8- CCR2+ cDC1":plt.matplotlib.colors.to_rgb('b'),#
#                        "CD8(low) cDC1":ann_colors[0], #
#                        "CD8- CCR2- cDC1":plt.matplotlib.colors.to_rgb('navy'),#
#                         "Relb(int.) cDC2":ann_colors[13],#
#                         "WDFY4+ cDC2":plt.matplotlib.colors.to_rgb('darkorchid'), #
#                         "Relb(low) cDC2":plt.matplotlib.colors.to_rgb('firebrick'),#
#                         "Relb(int.) cDC2":plt.matplotlib.colors.to_rgb('darkorchid'), 
#                        "Migratory cDC2":plt.matplotlib.colors.to_rgb('cornflowerblue'), 
#                        "Proliferative":plt.matplotlib.colors.to_rgb('b'),
#                        "Mast cells":plt.matplotlib.colors.to_rgb('cornflowerblue'),
#                        "Neutrophils":plt.matplotlib.colors.to_rgb('coral'),
#                         "Proliferative B cells":plt.matplotlib.colors.to_rgb('cyan'),
#                 }



In [None]:
# Reset the scanpy figure defaults to 600 dpi for both display and saved files.
sc.set_figure_params(
    scanpy=True,
    dpi=600,
    dpi_save=600,
    frameon=True,
    vector_friendly=False,
    fontsize=14,
    figsize=(9, 8),
    format='pdf',
    facecolor=None,
    transparent=False,
    ipython_format='png2x',
)

In [None]:
# Annotated UMAP with the cell-type labels drawn on top of the clusters.
sc.pl.umap(
    adata,
    color=['cell_type'],
    title='Spleen',
    palette=ann_palette_all,
    legend_loc='on data',
    add_outline=True,
    outline_width=(0.3, 0.8),
    legend_fontsize=8,
    legend_fontweight='heavy',
    save='_spleen_annotations_high_res.pdf',
)


In [None]:
# Same annotated UMAP, saved without any legend.
sc.pl.umap(
    adata,
    color=['cell_type'],
    title='Spleen',
    palette=ann_palette_all,
    legend_loc=None,
    add_outline=True,
    outline_width=(0.3, 0.8),
    legend_fontsize=8,
    legend_fontweight='heavy',
    save='_spleen_annotations_no_legend.pdf',
)


## Define a helper function for plotting gene expression data in cell types under two conditions

In [None]:
# Notebook display only: inspect the low-resolution annotation column.
adata.obs['cell_type_low_res']

In [None]:
# List of DC types in spleen data for plotting. 'Undefined DC' omitted.

DC_list = ['Relb(low) cDC2','WDFY4+ cDC2','Lymphoid-resident cDC1','pDC','CD8- CCR2+ cDC1','CD8- CCR2- cDC1','Relb(int.) cDC2','Migratory cDC2',
 'CD8(low) cDC1']

# All low-resolution cell-type labels present in the data.
all_cells_list =list(adata.obs['cell_type_low_res'].unique())

# Annotation name -> short tick label (mathtext, with a line break before the
# DC class). Backslashes of the mathtext commands are escaped explicitly:
# '\i' and '\m' are invalid escape sequences and trigger a SyntaxWarning on
# Python 3.12+. The resulting strings are unchanged.
DC_to_simple_dict = {'Relb(low) cDC2':'${\\it\\mathrm{Relb}}^{\\mathrm{low}}$\n cDC2',
                     'Relb(int.) cDC2': '${\\it\\mathrm{Relb}}^{\\mathrm{int}}$\n cDC2',
                     'WDFY4+ cDC2':'cDC2', 
                     'Lymphoid-resident cDC1':'Resident\n cDC1',
                     'pDC':'pDC',
                     'CD8(low) cDC1':'Migratory\n cDC1', 
                     'CD8- CCR2+ cDC1':'${\\it \\mathrm{Ccr2}}^{\\mathrm{int}}$\n cDC1',
                     'CD8- CCR2- cDC1':'${\\it\\mathrm{Ccr2}}^{\\mathrm{low}}$\n cDC1',
                     'Migratory cDC2':'Migratory\n cDC2',
                     'Combined DC':'Combined\n DC'}


# Short labels in the same order as DC_list.
DC_simple_names = [DC_to_simple_dict[cell] for cell in DC_list] 

In [None]:
# Notebook display only: check the short DC tick labels.
DC_simple_names

In [None]:
#Evaluate significance

from scipy import stats
from statsmodels.stats.multitest import multipletests

def compare_gene_expression_pathogenicity(df, ordered_cell_list, gene='Cd74', condition_key='pathogenicity', cell_type_key='cell_type'):
    """
    Test, per cell type, whether `gene` differs between 'naive' and 'pathogenic' cells.

    For every entry of `ordered_cell_list` a two-sided Mann-Whitney U test compares
    the rows with `condition_key` equal to 'naive' against those equal to
    'pathogenic'. Cell types lacking cells in either group get p = 1. The p-values
    are corrected with Benjamini-Hochberg ('fdr_bh').

    Returns a DataFrame with the columns 'cell_type', 'p_value',
    'p_value_corrected' and 'significance' (asterisk notation).
    """

    def _raw_p_value(label):
        in_type = df[df[cell_type_key] == label]
        naive_expr = in_type[in_type[condition_key] == 'naive'][gene]
        pathogenic_expr = in_type[in_type[condition_key] == 'pathogenic'][gene]

        # Only cell types with cells in both groups are tested
        if len(naive_expr) == 0 or len(pathogenic_expr) == 0:
            return 1.

        _, p_val = stats.mannwhitneyu(naive_expr, pathogenic_expr, alternative='two-sided')
        return p_val

    def _stars(p):
        if p < 0.0001:
            return '****'
        if p < 0.001:
            return '***'
        if p < 0.01:
            return '**'
        if p < 0.05:
            return '*'
        return ''

    cell_types = pd.Categorical(list(ordered_cell_list))
    p_values = [_raw_p_value(label) for label in cell_types]

    # Correct for multiple testing
    p_values_corrected = multipletests(p_values, method='fdr_bh')[1]

    results = pd.DataFrame({
        'cell_type': cell_types,
        'p_value': p_values,
        'p_value_corrected': p_values_corrected
    })

    # Add significance asterisks
    results['significance'] = results['p_value_corrected'].apply(_stars)

    return results


def visualize_results(results):
    """
    Show a two-column heatmap of -log10(raw p-value) and -log10(corrected p-value).

    `results` needs the columns 'cell_type', 'p_value', 'p_value_corrected' and
    'significance'; one heatmap row is drawn per result row and the significance
    marker is written at x = 1.0 in that row.
    """
    plt.figure(figsize=(10, 8))

    # Heatmap of -log10(p-values)
    neg_log_p = -np.log10(results[['p_value', 'p_value_corrected']])
    ax = sns.heatmap(
        neg_log_p,
        annot=False,
        cmap='YlOrRd',
        yticklabels=results['cell_type'],
        xticklabels=False,
    )

    # Add significance asterisks, one per heatmap row
    for row_pos, marker in enumerate(results['significance']):
        ax.text(
            1.0,
            row_pos + 0.7,
            marker,
            horizontalalignment='center',
            verticalalignment='center',
            color='black',
        )

    plt.title('Significance of Cd74 expression difference\n-log10(p-value) with significance levels')
    plt.xlabel('Raw p-value                 Corrected p-value')

    plt.tight_layout()
    plt.show()

In [None]:


def _recolor_split_violins(ax, df, cell_list, cell_key, condition_key, hue_levels, color_map):
    """Colour each violin half by cell type: base colour for hue_levels[0], lightened for hue_levels[1]."""
    # Group the violin-half polygons by the category index nearest to their x centroid.
    halves_by_category = defaultdict(list)
    for poly in ax.findobj(PolyCollection):
        paths = poly.get_paths()
        # Skip empty collections and the small inner='box' artists
        # (heuristic vertex threshold; adjust if needed).
        if not paths or paths[0].vertices.shape[0] < 25:
            continue
        x_center = paths[0].vertices[:, 0].mean()
        halves_by_category[int(round(x_center))].append((poly, x_center))

    for i, ct in enumerate(cell_list):
        # Left-to-right order identifies the hue when both halves were drawn.
        halves = sorted(halves_by_category.get(i, []), key=lambda t: t[1])
        if not halves:
            continue  # nothing drawn for this cell type

        base_rgb = to_rgb(color_map[ct])
        light_rgb = lighten(base_rgb, factor=0.55)

        if len(halves) == 2:
            face_colors = [base_rgb, light_rgb]
        elif len(halves) == 1:
            # A single half may be centred on the category, so decide the hue
            # from which condition actually has data, not from the position.
            in_ct = df[cell_key] == ct
            has_first = bool((in_ct & (df[condition_key] == hue_levels[0])).any())
            has_second = bool((in_ct & (df[condition_key] == hue_levels[1])).any())
            only_second = has_second and not has_first
            face_colors = [light_rgb if only_second else base_rgb]
        else:
            continue  # unexpected number of polygons: leave seaborn's colours

        for (poly, _), face in zip(halves, face_colors):
            poly.set_facecolor(face)
            poly.set_edgecolor('black')
            poly.set_alpha(0.7)


def analyze_expression_under_conditions(adata, cell_cat = 'DC', cell_types=DC_list, features = ['Cd74'], cell_key = 'cell_type', condition_key = 'WT/KO', conditions = ['wt','ko'],tissue='pathogenic'):

    ''' 
    Plot split violins of features[0] per cell type for two conditions, annotate
    Bonferroni-corrected Mann-Whitney p-values, and save the figure and a CSV.

    adata: AnnData object; cells are first restricted to adata.obs['pathogenicity'] == tissue
    cell_cat: either 'all_cells' or 'DC' # plotting is optimized for DC
    cell_types: list of cell types to be included in the analysis and plotting. Combined DC includes only these.
    features: only features[0] is fetched, plotted and tested
    cell_key: annotations to be used
    condition_key: obs column holding the condition, 'WT/KO' # 'pathogenicity' not yet implemented
    conditions: [left half, right half] of each violin, ['wt','ko'] # ['naive','pathogenic'] not yet implemented
    tissue: value of adata.obs['pathogenicity'] to analyse, 'pathogenic' or 'naive'; also used as file-name prefix

    Outputs: ./expression_figures/<tissue>_spleen_<cell_cat>_<c1>_vs_<c2>_<feature>.pdf
             ./expression_csv/<tissue>_spleen_<cell_cat>_<c1>_vs_<c2>_<feature>.csv
    Returns None.
    '''
    import os

    feature = features[0]
    hue_levels = list(conditions)
    combined_label = 'Combined DC' if cell_cat == 'DC' else 'Combined Cells'

    df = sc.get.obs_df(adata[adata.obs['pathogenicity']==tissue], [feature, condition_key, cell_key])

    # .copy() so the column assignment below works on an independent frame
    df = df[df[cell_key].isin(cell_types)].copy()
    df[cell_key] = pd.Categorical(df[cell_key]).remove_unused_categories()

    # Append a relabelled copy of all selected cells as the "combined" group
    df_all_cells = df.copy()
    df_all_cells[cell_key] = combined_label
    df = pd.concat([df, df_all_cells])
    cell_list = list(df[cell_key].unique())

    plt.figure(figsize=(9,4))

    # hue_order pins conditions[0] to the left half and conditions[1] to the
    # right half; the legend labels and the recolouring below rely on it.
    ax = sns.violinplot(data=df, x=cell_key, y=feature, hue=condition_key, hue_order=hue_levels,
                      order=cell_list, split=True, cut=0, inner='box', gap=.2,
                      density_norm='width', width=0.8, palette=['.4', '.7'], legend=True, linewidth=1,
                      inner_kws=dict(box_width=3, whis_width=0, color='k', marker='s', zorder=10), alpha=0.5)

    plt.xlabel('')
    plt.ylabel(feature, style='italic', weight='book', fontsize=16)

    ### Add legend manually
    ax.legend(handles=ax.legend_.legend_handles, labels=['$+/+$', '$-/-$'], title='$\\it{Cd74}$', fontsize=10, title_fontsize=11)

    if cell_cat == 'DC':

        # Short mathtext tick labels (backslashes escaped explicitly).
        DC_to_simple_dict = {'Relb(low) cDC2':'$Relb^{\\mathrm{low}}$\n cDC2',
                     'Relb(int.) cDC2': '$Relb^{\\mathrm{int}}$\n cDC2',
                     'WDFY4+ cDC2':'cDC2', 
                     'Lymphoid-resident cDC1':'Resident\n cDC1',
                     'pDC':'pDC',
                     'CD8(low) cDC1':'Migratory\n cDC1', 
                     'CD8- CCR2+ cDC1':'$Ccr2^{\\mathrm{int}}$\n cDC1',
                     'CD8- CCR2- cDC1':'$Ccr2^{\\mathrm{low}}$\n cDC1',
                     'Migratory cDC2':'Migratory\n cDC2',
                     'Combined DC':'Combined\n DC'}

        # Cell types without a short label keep their annotation name
        x_labels = [DC_to_simple_dict.get(cell, cell) for cell in cell_list]

        # NOTE(review): this puts the y ticks at 0..len(cell_list)-1; confirm
        # that integer y ticks (and not a second xticks call) were intended.
        plt.yticks(np.arange(len(cell_list)))
        plt.xticks(np.arange(len(cell_list)),x_labels,rotation=0, rotation_mode="anchor", fontsize=10,  ha='center' )

    else:

        plt.xticks(np.arange(len(cell_list)),cell_list,rotation=30, rotation_mode="anchor", fontsize=10, ha='right' )

    # Work on a copy so the module-level palette is not modified
    base_color_map = dict(ann_palette_all)
    base_color_map['Combined Cells'] = (0.6, 0.6, 0.6)
    base_color_map['Combined DC'] = (0.6, 0.6, 0.6)

    # Colour each violin half separately; this avoids colours shifting when
    # one half of a violin is empty.
    _recolor_split_violins(ax, df, cell_list, cell_key, condition_key, hue_levels, base_color_map)

    # Maximum y value per cell type, used to position the annotations
    y_max = df.groupby(cell_key)[feature].max()

    # Only the plotted feature is tested, so there is one result row per violin
    results_long, _ = compare_gene_expression(df, cell_list, [feature], condition_key, hue_levels[0], hue_levels[1], cell_key, 'bonferroni' )

    x_position = {ct: i for i, ct in enumerate(cell_list)}

    # The results frame always names its cell-type column 'cell_type'
    for cell_type, direction, p_value_corrected, asterisk in zip(results_long['cell_type'], results_long['direction'], results_long['p_value_corrected'], results_long['significance']):

        if not asterisk:  # Only annotate significant comparisons
            continue

        # Add some padding above the maximum value
        y_position = y_max[cell_type] + 0.25

        # Print the adjusted p-value itself, with the direction underneath
        if p_value_corrected > 0.001:
            label = f"{p_value_corrected:.3f}\n{direction}"
        else:
            label = f"{p_value_corrected:.2E}\n{direction}"
        ax.text(x_position[cell_type], y_position, label, ha='center', va='bottom', fontsize=12)

    plt.ylim([-0.25,np.max(y_max)+1])
    plt.xlim([-0.75,len(cell_list)+ 0.75])

    # One naming scheme for every tissue value (identical to the previous
    # names for 'pathogenic' and 'naive')
    stem = f'{tissue}_spleen_{cell_cat}_{conditions[0]}_vs_{conditions[1]}_{feature}'
    figname = f'./expression_figures/{stem}.pdf'
    docname = f'./expression_csv/{stem}.csv'

    # Make sure the output folders exist before writing
    os.makedirs('./expression_figures', exist_ok=True)
    os.makedirs('./expression_csv', exist_ok=True)

    plt.savefig(figname, dpi=600, bbox_inches = "tight")

    plt.show()
    print(results_long)
    results_long.to_csv(docname)

## Cd74

In [None]:
# Cd74 expression on the UMAP for all cells and for four subsets.
# Each entry: (obs column / value selecting the cells, or None for all cells,
#              plot title, suffix of the saved file).
cd74_umap_panels = [
    (None, 'Cd74, spleen', '_spleen_Cd74_all.pdf'),
    (('WT/KO', 'wt'), 'Cd74, WT spleen', '_spleen_Cd74_wt.pdf'),
    (('sample', 'wt_naive'), 'Cd74, WT Naive spleen', '_spleen_Cd74_wt_naive.pdf'),
    (('sample', 'wt_pathogenic'), 'Cd74, WT Pathogenic spleen', '_spleen_Cd74_wt_pathogenic.pdf'),
    (('WT/KO', 'ko'), 'Cd74, KO spleen', '_spleen_Cd74_KO.pdf'),
]

for selector, panel_title, save_suffix in cd74_umap_panels:
    if selector is None:
        panel_data = adata
    else:
        obs_column, obs_value = selector
        panel_data = adata[adata.obs[obs_column]==obs_value]

    sc.pl.umap(
        panel_data,
        add_outline=True,
        outline_width=(0.2, 0.5),
        color=['Cd74'],
        cmap='coolwarm',
        s=10,
        title=panel_title,
        vmax=4,
        save=save_suffix,
    )


### Violin plots of Cd74 expression

In [None]:
# Use the high-resolution annotations stored in the loaded object.
# This overwrites the 'cell_type' column derived earlier from annotation_dict_high_res.
adata.obs['cell_type'] = adata.obs['cell_type_high_res']

In [None]:
# Order the violin plots by mean Cd74 expression in WT cells (highest first).
# (The same could be done for a single sample, e.g. adata.obs['sample']=='wt_pathogenic'.)
wt_cells = adata[adata.obs['WT/KO']=='wt']

# Dense cells x genes table of the raw matrix, indexed by each cell's type
gene_ids = wt_cells.raw.var.index.values
obs_data = wt_cells.raw[:,gene_ids].X.toarray()
obsDF = pd.DataFrame(obs_data,columns=gene_ids,index=wt_cells.obs['cell_type'].values)

all_cell_types = adata.obs['cell_type'].values.unique()

# NOTE: obsDF.loc[ct] raises KeyError for a cell type without any WT cell
# (the original notes mention Mast cells being absent from some samples).
meanCd74 = np.array(
    [np.mean(obsDF.loc[ct]['Cd74']) for ct in all_cell_types],
    dtype=float,
)

# Descending order of the mean
Cd74_order = all_cell_types[np.argsort(meanCd74)[::-1]]

In [None]:
# Notebook display only: cell types sorted by mean Cd74 expression in WT cells.
Cd74_order

In [None]:
# RGB matrix with one row per cell type, in Cd74_order.
# Stacking onto an empty (0, 3) array replaces the former dummy first row that
# had to be sliced off again; it also yields an empty (0, 3) array instead of
# failing when Cd74_order is empty.
colors = np.vstack(
    [np.empty((0, 3)), *(ann_palette_all[ct] for ct in Cd74_order.tolist())]
)

In [None]:
# Use seaborn's "ticks" style for the following plots.
sns.set_style("ticks")

In [None]:
from matplotlib import rcParams

# Cd74 expression per cell type in all WT cells: scanpy violins with a narrow
# seaborn boxplot drawn on the same axes, both ordered by Cd74_order.
wt_cells_view = adata[adata.obs['WT/KO'] == 'wt']

# NOTE(review): obs_df is called without use_raw while the violins use
# use_raw=True -- confirm that boxes and violins are meant to share a layer.
df = sc.get.obs_df(wt_cells_view, ['Cd74', 'pathogenicity', 'cell_type'])

rcParams["figure.figsize"] = (9, 6)

ax = sc.pl.violin(
    wt_cells_view,
    keys='Cd74',
    use_raw=True,
    groupby='cell_type',
    inner=None,
    linewidth=1,
    stripplot=False,
    jitter=True,
    scale='count',
    palette=ann_palette_all,
    order=Cd74_order,
    rotation=90,
    inner_kws=dict(markeredgewidth=5, box_width=6, whis_width=0,
                   color='darkorchid', zorder=10, alpha=0.9),
    alpha=0.7,
    size=3,
    show=False,
)

# Boxes sit above the violins (zorder 10) with a white median line on top
sns.boxplot(
    data=df,
    x="cell_type",
    y='Cd74',
    saturation=0.5,
    width=0.2,
    fliersize=0,
    palette=ann_palette_all,
    boxprops={'zorder': 10},
    medianprops=dict(color="white", alpha=1, zorder=15, linewidth=2),
    order=Cd74_order,
    ax=ax,
)

plt.savefig('./expression_figures/Cd74/spleen_WT_Cd74_expression.pdf', dpi=600, bbox_inches="tight")

plt.show()

In [None]:
# Same figure as for all WT cells, restricted to the 'wt_pathogenic' sample.
# Relies on rcParams having been imported by an earlier cell.
wt_pathogenic_view = adata[adata.obs['sample'] == 'wt_pathogenic']

df = sc.get.obs_df(wt_pathogenic_view, ['Cd74', 'pathogenicity', 'cell_type'])

rcParams["figure.figsize"] = (9, 6)

ax = sc.pl.violin(
    wt_pathogenic_view,
    keys='Cd74',
    use_raw=True,
    groupby='cell_type',
    inner=None,
    linewidth=1,
    stripplot=False,
    jitter=True,
    scale='count',
    palette=ann_palette_all,
    order=Cd74_order,
    rotation=90,
    inner_kws=dict(markeredgewidth=5, box_width=6, whis_width=0,
                   color='darkorchid', zorder=10, alpha=0.9),
    alpha=0.7,
    size=3,
    show=False,
)

# Narrow boxplot overlay on the violin axes
sns.boxplot(
    data=df,
    x="cell_type",
    y='Cd74',
    saturation=0.5,
    width=0.2,
    fliersize=0,
    palette=ann_palette_all,
    boxprops={'zorder': 10},
    medianprops=dict(color="white", alpha=1, zorder=15, linewidth=2),
    order=Cd74_order,
    ax=ax,
)

plt.savefig('./expression_figures/Cd74/Spleen_pathogenic_WT_expression_Cd74.pdf', dpi=600, bbox_inches="tight")

plt.show()

In [None]:
from matplotlib.colors import to_rgb
from matplotlib.collections import PolyCollection
from matplotlib.legend_handler import HandlerTuple

# Split violins of Cd74 (naive vs pathogenic) per cell type, WT cells only,
# with significance markers from compare_gene_expression.
features = ['Cd74']
cell_key = 'cell_type'
condition_key = 'pathogenicity'

cell_order = Cd74_order
cell_list_by_Cd74 = Cd74_order.tolist()

df = sc.get.obs_df(adata[adata.obs['WT/KO'] == 'wt'], [features[0], condition_key, cell_key])
df[cell_key] = pd.Categorical(df[cell_key]).remove_unused_categories()

# Append a copy of every cell relabelled 'Combined Cells' (pooled group)
df_all_cells = df.copy()
df_all_cells[cell_key] = 'Combined Cells'
df = pd.concat([df, df_all_cells])
og_cell_list = list(df[cell_key].unique())

# Keep a group only if both conditions have at least two cells with Cd74 > 0;
# rows of the other groups are removed from df.
cell_ids = []
for cell_id, cell_type in enumerate(og_cell_list):
    expressing = df[(df[features[0]] > 0) & (df[cell_key] == cell_type)]
    n_naive = (expressing[condition_key] == 'naive').sum()
    n_pathogenic = (expressing[condition_key] == 'pathogenic').sum()
    if n_naive < 2 or n_pathogenic < 2:
        df = df[df[cell_key] != cell_type]
    else:
        cell_ids.append(cell_id)

cell_list = list(df[cell_key].unique())

plt.figure(figsize=(9, 6))

# NOTE(review): order/xticks use Cd74_order, which has no 'Combined Cells'
# entry and still lists any group dropped above -- confirm that the plotted
# categories, the colours and the annotation positions line up.
ax = sns.violinplot(
    data=df, x=cell_key, y=features[0], hue=condition_key,
    split=True, cut=0, inner='box', gap=.2,
    density_norm='width', width=0.8, palette=['.4', '.7'],
    order=cell_order, legend=True, linewidth=1,
    inner_kws=dict(box_width=3, whis_width=0, color='k', marker='s', zorder=10),
    alpha=0.5,
)

plt.yticks([0.0, 1.0, 2.0, 3.0, 4.0])
plt.xticks(np.arange(len(cell_list_by_Cd74)), cell_list_by_Cd74,
           rotation=30, rotation_mode="anchor", fontsize=8, ha='right')

# Recolour the violin bodies: PolyCollections 2k and 2k+1 both take colors[k];
# the odd one is blended halfway towards white.
handles = []
for ind, violin in enumerate(ax.findobj(PolyCollection)):
    base_rgb = to_rgb(colors[ind // 2])
    rgb = 0.5 + 0.5 * np.array(base_rgb) if ind % 2 else base_rgb
    violin.set_facecolor(rgb)
    handles.append(plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor='black'))

# Per-group maximum, used to place the significance markers
y_max = df.groupby(cell_key)[features[0]].max()
y_span = y_max.max() - y_max.min()

results_long, results_pivot = compare_gene_expression(
    df, cell_order, features, condition_key, 'naive', 'pathogenic', cell_key, 'bonferroni')

# Write direction + asterisks above each group that has a significance marker
annotation_rows = zip(results_long['cell_type'], results_long['direction'], results_long['significance'])
for idx, (cell_type, direction, asterisk) in enumerate(annotation_rows):
    if not asterisk:
        continue
    ax.text(idx, y_max[cell_type] + 0.25 * y_span, direction + asterisk,
            ha='center', va='bottom', fontsize=12)

plt.ylim([-0.25, np.max(y_max) + 1])
plt.xlim([-1.0, len(cell_ids) + 1])

plt.savefig('./expression_figures/Cd74/spleen_WT_Cd74_expression_naive_vs_pathogenic_split.pdf',
            dpi=600, bbox_inches="tight")

plt.show()

results_long

### Wdfy4

In [None]:

# Wdfy4, WT vs KO, in the pathogenic condition for the cell types in all_cells_list
feature = 'Wdfy4'
# NOTE(review): cell_types is assigned but never passed on; the call below
# uses cell_key='cell_type' -- confirm which annotation level is intended.
cell_types = 'cell_type_low_res'
cell_category = 'all_cells'
pathogenicity = 'pathogenic'
cell_type_list = all_cells_list

analyze_expression_under_conditions(
    adata,
    cell_cat=cell_category,
    cell_types=cell_type_list,
    features=[feature],
    cell_key='cell_type',
    condition_key='WT/KO',
    conditions=['wt', 'ko'],
    tissue=pathogenicity,
)


In [None]:
# Split violins of Wdfy4 (WT vs KO) per low-resolution cell type, restricted
# to cells of the pathogenic condition.
features = ['Wdfy4']
cell_key = 'cell_type_low_res'
condition_key = 'WT/KO'

df = sc.get.obs_df(adata[adata.obs['pathogenicity'] == 'pathogenic'],
                   [features[0], condition_key, cell_key])
df[cell_key] = pd.Categorical(df[cell_key]).remove_unused_categories()

# Append a copy of every cell relabelled 'Combined Cells' (pooled group)
df_all_cells = df.copy()
df_all_cells[cell_key] = 'Combined Cells'
df = pd.concat([df, df_all_cells])
og_cell_list = list(df[cell_key].unique())

# Filter on the number of expressing cells per condition.
# NOTE(review): the threshold is 0 and a count is never negative, so no group
# is ever dropped here; the Cd74 and Jun cells use 2 -- confirm the intent.
cell_ids = []
for cell_id, cell_type in enumerate(og_cell_list):
    expressing = df[(df[features[0]] > 0) & (df[cell_key] == cell_type)]
    n_wt = (expressing[condition_key] == 'wt').sum()
    n_ko = (expressing[condition_key] == 'ko').sum()
    if n_wt < 0 or n_ko < 0:
        df = df[df[cell_key] != cell_type]
    else:
        cell_ids.append(cell_id)

cell_list = list(df[cell_key].unique())

plt.figure(figsize=(12, 5))

ax = sns.violinplot(
    data=df, x=cell_key, y=features[0], hue=condition_key,
    split=True, cut=0, inner='box', gap=.2,
    density_norm='width', width=0.8, palette=['.4', '.7'],
    legend='brief', linewidth=1,
    inner_kws=dict(box_width=3, whis_width=0, color='k', marker='s', zorder=10),
    alpha=0.7,
)

plt.yticks([0.0, 1.0, 2.0, 3.0, 4.0])
plt.xticks(np.arange(len(cell_list)), cell_list,
           rotation=30, rotation_mode="anchor", fontsize=10, ha='right')

# Recolour the violin bodies: PolyCollections 2k and 2k+1 both take colors[k];
# the odd one is blended halfway towards white.
# NOTE(review): `colors` is whatever an earlier cell left in scope -- confirm
# its rows correspond to the groups plotted here.
handles = []
for ind, violin in enumerate(ax.findobj(PolyCollection)):
    base_rgb = to_rgb(colors[ind // 2])
    rgb = 0.5 + 0.5 * np.array(base_rgb) if ind % 2 else base_rgb
    violin.set_facecolor(rgb)
    handles.append(plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor='black'))

# Per-group maximum, used to place the significance markers
y_max = df.groupby(cell_key)[features[0]].max()
y_span = y_max.max() - y_max.min()

results_long, results_pivot = compare_gene_expression(
    df, cell_list, features, condition_key, 'wt', 'ko', cell_key, 'bonferroni')

# Write direction + asterisks above each group that has a significance marker
annotation_rows = zip(results_long['cell_type'], results_long['direction'], results_long['significance'])
for idx, (cell_type, direction, asterisk) in enumerate(annotation_rows):
    if not asterisk:
        continue
    ax.text(idx, y_max[cell_type] + 0.25 * y_span, direction + asterisk,
            ha='center', va='bottom', fontsize=12)

plt.ylim([-0.25, np.max(y_max) + 1])
plt.xlim([-1.0, len(cell_ids) + 3])
plt.savefig('pathogenic_spleen_Wdfy4_wt_vs_ko_all_cells.pdf', dpi=600, bbox_inches="tight")

plt.show()
results_long

In [None]:
# Wdfy4, WT vs KO, for the DC_list cell types: pathogenic condition first,
# then naive, with otherwise identical arguments.
feature = 'Wdfy4'

pathogenicity = 'pathogenic'
analyze_expression_under_conditions(
    adata, cell_cat='DC', cell_types=DC_list, features=[feature],
    cell_key='cell_type', condition_key='WT/KO', conditions=['wt', 'ko'],
    tissue=pathogenicity,
)

pathogenicity = 'naive'
analyze_expression_under_conditions(
    adata, cell_cat='DC', cell_types=DC_list, features=[feature],
    cell_key='cell_type', condition_key='WT/KO', conditions=['wt', 'ko'],
    tissue=pathogenicity,
)


### c-Jun

In [None]:
# List every gene name in adata.var_names that contains the substring "Jun"
[gene_name for gene_name in adata.var_names if "Jun" in gene_name]

In [None]:
# Split violins of Jun (WT vs KO) per low-resolution cell type, restricted to
# cells of the pathogenic condition.
features = ['Jun']
cell_key = 'cell_type_low_res'
condition_key = 'WT/KO'

df = sc.get.obs_df(adata[adata.obs['pathogenicity'] == 'pathogenic'],
                   [features[0], condition_key, cell_key])
df[cell_key] = pd.Categorical(df[cell_key]).remove_unused_categories()

# Append a copy of every cell relabelled 'Combined Cells' (pooled group)
df_all_cells = df.copy()
df_all_cells[cell_key] = 'Combined Cells'
df = pd.concat([df, df_all_cells])
og_cell_list = list(df[cell_key].unique())

# Keep a group only if both WT and KO have at least two cells with Jun > 0;
# rows of the other groups are removed from df.
cell_ids = []
for cell_id, cell_type in enumerate(og_cell_list):
    expressing = df[(df[features[0]] > 0) & (df[cell_key] == cell_type)]
    n_wt = (expressing[condition_key] == 'wt').sum()
    n_ko = (expressing[condition_key] == 'ko').sum()
    if n_wt < 2 or n_ko < 2:
        df = df[df[cell_key] != cell_type]
    else:
        cell_ids.append(cell_id)

cell_list = list(df[cell_key].unique())

plt.figure(figsize=(12, 5))

ax = sns.violinplot(
    data=df, x=cell_key, y=features[0], hue=condition_key,
    split=True, cut=0, inner='box', gap=.2,
    density_norm='width', width=0.8, palette=['.4', '.7'],
    legend='brief', linewidth=1,
    inner_kws=dict(box_width=3, whis_width=0, color='k', marker='s', zorder=10),
    alpha=0.5,
)

plt.yticks([0.0, 1.0, 2.0, 3.0, 4.0])
plt.xticks(np.arange(len(cell_list)), cell_list,
           rotation=30, rotation_mode="anchor", fontsize=10, ha='right')

# Recolour the violin bodies: PolyCollections 2k and 2k+1 both take colors[k];
# the odd one is blended halfway towards white.
# NOTE(review): `colors` is whatever an earlier cell left in scope -- confirm
# its rows correspond to the groups plotted here.
handles = []
for ind, violin in enumerate(ax.findobj(PolyCollection)):
    base_rgb = to_rgb(colors[ind // 2])
    rgb = 0.5 + 0.5 * np.array(base_rgb) if ind % 2 else base_rgb
    violin.set_facecolor(rgb)
    handles.append(plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor='black'))

# Per-group maximum, used to place the significance markers
y_max = df.groupby(cell_key)[features[0]].max()
y_span = y_max.max() - y_max.min()

results_long, results_pivot = compare_gene_expression(
    df, cell_list, features, condition_key, 'wt', 'ko', cell_key, 'bonferroni')

# Write direction + asterisks above each group that has a significance marker
annotation_rows = zip(results_long['cell_type'], results_long['direction'], results_long['significance'])
for idx, (cell_type, direction, asterisk) in enumerate(annotation_rows):
    if not asterisk:
        continue
    ax.text(idx, y_max[cell_type] + 0.25 * y_span, direction + asterisk,
            ha='center', va='bottom', fontsize=12)

plt.ylim([-0.25, np.max(y_max) + 1])
plt.xlim([-1.0, len(cell_ids) + 3])
plt.savefig('pathogenic_spleen_Jun_wt_vs_ko_all_cells.pdf', dpi=600, bbox_inches="tight")

plt.show()
results_long

In [None]:
# Jun, WT vs KO, for the DC_list cell types: pathogenic condition first, then
# naive, with otherwise identical arguments.
feature = 'Jun'

pathogenicity = 'pathogenic'
analyze_expression_under_conditions(
    adata, cell_cat='DC', cell_types=DC_list, features=[feature],
    cell_key='cell_type', condition_key='WT/KO', conditions=['wt', 'ko'],
    tissue=pathogenicity,
)

pathogenicity = 'naive'
analyze_expression_under_conditions(
    adata, cell_cat='DC', cell_types=DC_list, features=[feature],
    cell_key='cell_type', condition_key='WT/KO', conditions=['wt', 'ko'],
    tissue=pathogenicity,
)


### JunB

In [None]:
# Split violins of Junb (WT vs KO) per low-resolution cell type, restricted to
# cells of the pathogenic condition.
features = ['Junb']
cell_key = 'cell_type_low_res'
condition_key = 'WT/KO'

df = sc.get.obs_df(adata[adata.obs['pathogenicity'] == 'pathogenic'],
                   [features[0], condition_key, cell_key])
df[cell_key] = pd.Categorical(df[cell_key]).remove_unused_categories()

# Append a copy of every cell relabelled 'Combined Cells' (pooled group)
df_all_cells = df.copy()
df_all_cells[cell_key] = 'Combined Cells'
df = pd.concat([df, df_all_cells])
og_cell_list = list(df[cell_key].unique())

# Filter on the number of expressing cells per condition.
# NOTE(review): the threshold is 0 and a count is never negative, so no group
# is ever dropped here; the Cd74 and Jun cells use 2 -- confirm the intent.
cell_ids = []
for cell_id, cell_type in enumerate(og_cell_list):
    expressing = df[(df[features[0]] > 0) & (df[cell_key] == cell_type)]
    n_wt = (expressing[condition_key] == 'wt').sum()
    n_ko = (expressing[condition_key] == 'ko').sum()
    if n_wt < 0 or n_ko < 0:
        df = df[df[cell_key] != cell_type]
    else:
        cell_ids.append(cell_id)

cell_list = list(df[cell_key].unique())

plt.figure(figsize=(12, 5))

ax = sns.violinplot(
    data=df, x=cell_key, y=features[0], hue=condition_key,
    split=True, cut=0, inner='box', gap=.2,
    density_norm='width', width=0.8, palette=['.4', '.7'],
    legend='brief', linewidth=1,
    inner_kws=dict(box_width=3, whis_width=0, color='k', marker='s', zorder=10),
    alpha=0.5,
)

plt.yticks([0.0, 1.0, 2.0, 3.0, 4.0])
plt.xticks(np.arange(len(cell_list)), cell_list,
           rotation=30, rotation_mode="anchor", fontsize=10, ha='right')

# Recolour the violin bodies: PolyCollections 2k and 2k+1 both take colors[k];
# the odd one is blended halfway towards white.
# NOTE(review): `colors` is whatever an earlier cell left in scope -- confirm
# its rows correspond to the groups plotted here.
handles = []
for ind, violin in enumerate(ax.findobj(PolyCollection)):
    base_rgb = to_rgb(colors[ind // 2])
    rgb = 0.5 + 0.5 * np.array(base_rgb) if ind % 2 else base_rgb
    violin.set_facecolor(rgb)
    handles.append(plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor='black'))

# Per-group maximum, used to place the significance markers
y_max = df.groupby(cell_key)[features[0]].max()
y_span = y_max.max() - y_max.min()

results_long, results_pivot = compare_gene_expression(
    df, cell_list, features, condition_key, 'wt', 'ko', cell_key, 'bonferroni')

# Write direction + asterisks above each group that has a significance marker
annotation_rows = zip(results_long['cell_type'], results_long['direction'], results_long['significance'])
for idx, (cell_type, direction, asterisk) in enumerate(annotation_rows):
    if not asterisk:
        continue
    ax.text(idx, y_max[cell_type] + 0.25 * y_span, direction + asterisk,
            ha='center', va='bottom', fontsize=12)

plt.ylim([-0.25, np.max(y_max) + 1])
plt.xlim([-1.0, len(cell_ids) + 3])
plt.savefig('pathogenic_spleen_JunB_wt_vs_ko_all_cells.pdf', dpi=600, bbox_inches="tight")

plt.show()
results_long

In [None]:
# Junb, WT vs KO, for the DC_list cell types: pathogenic condition first, then
# naive, with otherwise identical arguments.
feature = 'Junb'

pathogenicity = 'pathogenic'
analyze_expression_under_conditions(
    adata, cell_cat='DC', cell_types=DC_list, features=[feature],
    cell_key='cell_type', condition_key='WT/KO', conditions=['wt', 'ko'],
    tissue=pathogenicity,
)

pathogenicity = 'naive'
analyze_expression_under_conditions(
    adata, cell_cat='DC', cell_types=DC_list, features=[feature],
    cell_key='cell_type', condition_key='WT/KO', conditions=['wt', 'ko'],
    tissue=pathogenicity,
)


## Ccr7 in DC

In [None]:
# Ccr7, WT vs KO, for the DC_list cell types: pathogenic condition first, then
# naive, with otherwise identical arguments.
feature = 'Ccr7'

pathogenicity = 'pathogenic'
analyze_expression_under_conditions(
    adata, cell_cat='DC', cell_types=DC_list, features=[feature],
    cell_key='cell_type', condition_key='WT/KO', conditions=['wt', 'ko'],
    tissue=pathogenicity,
)

pathogenicity = 'naive'
analyze_expression_under_conditions(
    adata, cell_cat='DC', cell_types=DC_list, features=[feature],
    cell_key='cell_type', condition_key='WT/KO', conditions=['wt', 'ko'],
    tissue=pathogenicity,
)


## Cd8a

In [None]:
# Cd8a on the UMAP of the naive spleen samples: Cd74 KO first, then WT.
# Both panels use the same styling and the same colour-scale cap (vmax).
umap_style = dict(add_outline=True, outline_width=(0.2, 0.5), cmap='coolwarm', s=10, vmax=2.5)

sc.pl.umap(adata[adata.obs['sample'] == 'ko_naive'], color=['Cd8a'],
           title='Cd8a in Cd74 KO, naive spleen', save='_Cd8a_spleen_ko_naive.pdf', **umap_style)
sc.pl.umap(adata[adata.obs['sample'] == 'wt_naive'], color=['Cd8a'],
           title='Cd8a in WT, naive spleen', save='_Cd8a_spleen_wt_naive.pdf', **umap_style)



In [None]:
# Cd8a on the UMAP of the pathogenic spleen samples: Cd74 KO first, then WT.
# Both panels use the same styling and the same colour-scale cap (vmax).
umap_style = dict(add_outline=True, outline_width=(0.2, 0.5), cmap='coolwarm', s=10, vmax=2.5)

sc.pl.umap(adata[adata.obs['sample'] == 'ko_pathogenic'], color=['Cd8a'],
           title='Cd8a in Cd74 KO, pathogenic spleen', save='_Cd8a_pathogenic_spleen_ko.pdf', **umap_style)
sc.pl.umap(adata[adata.obs['sample'] == 'wt_pathogenic'], color=['Cd8a'],
           title='Cd8a in WT, pathogenic spleen', save='_Cd8a_pathogenic_spleen_wt.pdf', **umap_style)


## Cd4

In [None]:
# Cd4 on the UMAP of the pathogenic spleen samples: Cd74 KO first, then WT.
# Both panels use the same styling and the same colour-scale cap (vmax).
umap_style = dict(add_outline=True, outline_width=(0.2, 0.5), cmap='coolwarm', s=10, vmax=2)

sc.pl.umap(adata[adata.obs['sample'] == 'ko_pathogenic'], color=['Cd4'],
           title='Cd4 in Cd74 KO, pathogenic_spleen', save='_Cd4_pathogenic_spleen_ko.pdf', **umap_style)
sc.pl.umap(adata[adata.obs['sample'] == 'wt_pathogenic'], color=['Cd4'],
           title='Cd4 in WT, pathogenic_spleen', save='_Cd4_pathogenic_spleen_wt.pdf', **umap_style)


## Ctla4

In [None]:
# Ctla4 on the UMAP of the naive spleen samples: Cd74 KO first, then WT.
# Both panels use the same styling and the same colour-scale cap (vmax).
umap_style = dict(add_outline=True, outline_width=(0.2, 0.5), cmap='coolwarm', s=10, vmax=2.5)

sc.pl.umap(adata[adata.obs['sample'] == 'ko_naive'], color=['Ctla4'],
           title='Ctla4 in Cd74 KO, naive spleen', save='_Ctla4_spleen_ko_naive.pdf', **umap_style)
sc.pl.umap(adata[adata.obs['sample'] == 'wt_naive'], color=['Ctla4'],
           title='Ctla4 in WT, naive spleen', save='_Ctla4_spleen_wt_naive.pdf', **umap_style)


In [None]:
# Ctla4 on the UMAP of the pathogenic spleen samples: Cd74 KO first, then WT.
# Both panels use the same styling and the same colour-scale cap (vmax).
umap_style = dict(add_outline=True, outline_width=(0.2, 0.5), cmap='coolwarm', s=10, vmax=2.5)

sc.pl.umap(adata[adata.obs['sample'] == 'ko_pathogenic'], color=['Ctla4'],
           title='Ctla4 in Cd74 KO, pathogenic spleen', save='_Ctla4_pathogenic_spleen_ko.pdf', **umap_style)
sc.pl.umap(adata[adata.obs['sample'] == 'wt_pathogenic'], color=['Ctla4'],
           title='Ctla4 in WT, pathogenic spleen', save='_Ctla4_pathogenic_spleen_wt.pdf', **umap_style)


## Foxp3 

In [None]:
# Foxp3 on the UMAP of the pathogenic spleen samples: Cd74 KO first, then WT.
# Both panels use the same styling and the same colour-scale cap (vmax).
# The KO file suffix previously contained a stray space ('_Foxp3_ pathogenic...');
# it now follows the same pattern as the WT file.
sc.pl.umap(adata[adata.obs['sample'] == 'ko_pathogenic'], add_outline=True, outline_width=(0.2, 0.5),
           color=['Foxp3'], cmap='coolwarm', s=10, title='Foxp3 in Cd74 KO, pathogenic spleen',
           vmax=2., save='_Foxp3_pathogenic_spleen_ko.pdf')
sc.pl.umap(adata[adata.obs['sample'] == 'wt_pathogenic'], add_outline=True, outline_width=(0.2, 0.5),
           color=['Foxp3'], cmap='coolwarm', s=10, title='Foxp3 in WT, pathogenic spleen',
           vmax=2., save='_Foxp3_pathogenic_spleen_wt.pdf')


### Calr

In [None]:
# Calr on the UMAP of the pathogenic spleen samples: Cd74 KO first, then WT.
# Both panels use the same styling and the same colour-scale cap (vmax).
# The KO file suffix previously contained a stray space ('_Calr_ pathogenic...');
# it now follows the same pattern as the WT file.
sc.pl.umap(adata[adata.obs['sample'] == 'ko_pathogenic'], add_outline=True, outline_width=(0.2, 0.5),
           color=['Calr'], cmap='coolwarm', s=10, title='Calr in Cd74 KO, pathogenic spleen',
           vmax=2., save='_Calr_pathogenic_spleen_ko.pdf')
sc.pl.umap(adata[adata.obs['sample'] == 'wt_pathogenic'], add_outline=True, outline_width=(0.2, 0.5),
           color=['Calr'], cmap='coolwarm', s=10, title='Calr in WT, pathogenic spleen',
           vmax=2., save='_Calr_pathogenic_spleen_wt.pdf')


### Violin plots

In [None]:
#Evaluate significance

from scipy import stats
from statsmodels.stats.multitest import multipletests

from matplotlib.colors import to_rgb
from matplotlib.collections import PolyCollection
from matplotlib.legend_handler import HandlerTuple
import textwrap

def compare_gene_expression(df, ordered_cell_list, gene='Cd74', condition_key='pathogenicity', cell_type_key='cell_type',
                            condition1='wt', condition2='ko'):
    """
    Test one gene for a difference between two conditions within each cell type.

    For every entry of `ordered_cell_list`, the rows of `df` with that cell type
    are split by `condition_key` into `condition1` and `condition2`, and the
    values of `gene` are compared with a two-sided Mann-Whitney U test. The
    p-values are then corrected across cell types with Benjamini-Hochberg FDR.

    NOTE: this definition replaces the `compare_gene_expression` defined earlier
    in the notebook, which takes different arguments and returns two tables.
    Cells that call the function with eight positional arguments (for example
    the T-cell dysfunction-marker loop) need that earlier definition in scope.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `gene`, `condition_key` and `cell_type_key`.
    ordered_cell_list : iterable
        Cell types to test; the result rows follow this order.
    gene : str
        Column holding the expression values.
    condition_key : str
        Column holding the condition label of each row.
    cell_type_key : str
        Column holding the cell-type label of each row.
    condition1, condition2 : str
        The two condition labels to compare (defaults: 'wt' and 'ko').

    Returns
    -------
    pandas.DataFrame
        Columns 'cell_type', 'p_value', 'p_value_corrected' and 'significance'
        ('****' < 0.0001, '***' < 0.001, '**' < 0.01, '*' < 0.05, else '').
        A cell type with no rows in one of the conditions gets NaN p-values and
        an empty significance string, and is left out of the correction.
    """
    cell_types = pd.Categorical(list(ordered_cell_list))
    p_values = np.full(len(cell_types), np.nan)

    for i, cell_type in enumerate(cell_types):
        cell_type_data = df[df[cell_type_key] == cell_type]
        expr1 = cell_type_data.loc[cell_type_data[condition_key] == condition1, gene]
        expr2 = cell_type_data.loc[cell_type_data[condition_key] == condition2, gene]

        # The test cannot be run on an empty group: leave the p-value as NaN
        # instead of aborting the comparison for every other cell type.
        if len(expr1) == 0 or len(expr2) == 0:
            continue

        _, p_values[i] = stats.mannwhitneyu(expr1, expr2, alternative='two-sided')

    # Correct for multiple testing, using only the cell types actually tested
    p_values_corrected = np.full(len(cell_types), np.nan)
    tested = ~np.isnan(p_values)
    if tested.any():
        p_values_corrected[tested] = multipletests(p_values[tested], method='fdr_bh')[1]

    results = pd.DataFrame({
        'cell_type': cell_types,
        'p_value': p_values,
        'p_value_corrected': p_values_corrected
    })

    def _stars(p):
        # Every comparison with NaN is False, so untested cell types get ''
        if p < 0.0001:
            return '****'
        if p < 0.001:
            return '***'
        if p < 0.01:
            return '**'
        if p < 0.05:
            return '*'
        return ''

    results['significance'] = results['p_value_corrected'].apply(_stars)

    return results

In [None]:
test_genes = ['Cd74','Cd8a','Cd4','Ctla4','Pdcd1','Foxp3', 'Calr']

# Alternative panel:
# test_genes = ['Pdcd1', 'Ctla4' , 'Mki67', 'Top2a', 'Pcna']

# Everything that does not depend on the gene is computed once, before the
# loop: the sample subsets and the dense raw-expression table. Building the
# table densifies the whole raw matrix, so it must not be repeated per gene.
wt_pathogenic_cells = adata[adata.obs['sample'] == 'wt_pathogenic']
ko_pathogenic_cells = adata[adata.obs['sample'] == 'ko_pathogenic']
pathogenic_cells = adata[adata.obs['pathogenicity'] == 'pathogenic']

# One row per WT pathogenic cell, indexed by cell type, one column per raw gene
gene_ids = wt_pathogenic_cells.raw.var.index.values
obs_data = wt_pathogenic_cells.raw[:, gene_ids].X.toarray()
obsDF = pd.DataFrame(obs_data, columns=gene_ids, index=wt_pathogenic_cells.obs['cell_type'].values)

all_cell_types = adata.obs['cell_type'].values.unique()

for gene in test_genes:

    # Mean expression per cell type in WT pathogenic cells. 'Mast cells' are
    # not looked up in obsDF (no mast cells in this sample); they get a mean
    # of 0 and are removed from the ordering below.
    mean_xpr = np.array(
        [0.0 if ct == 'Mast cells' else np.mean(obsDF.loc[ct][gene]) for ct in all_cell_types],
        dtype=float,
    )

    # Highest mean first; removing the category turns 'Mast cells' into NaN,
    # which is then filtered out.
    cell_order = all_cell_types[np.argsort(mean_xpr)[::-1]]
    cell_order = cell_order.remove_categories(['Mast cells'])
    cell_order = cell_order[~pd.isna(cell_order)]

    # Colour matrix aligned with cell_order (one palette row per cell type)
    colors = np.vstack([ann_palette_all[ct] for ct in cell_order.tolist()])

    ############################################
    # WT pathogenic: violins + boxplot overlay
    ############################################
    df = sc.get.obs_df(wt_pathogenic_cells, [gene, 'pathogenicity', 'cell_type'])

    ax = sc.pl.violin(wt_pathogenic_cells, keys=gene, use_raw=True, groupby='cell_type',
                      inner=None, linewidth=1, stripplot=False, jitter=True, scale='count',
                      palette=ann_palette_all, order=cell_order, rotation=90,
                      inner_kws=dict(markeredgewidth=5, box_width=6, whis_width=0,
                                     color='darkorchid', zorder=10, alpha=0.7),
                      alpha=0.5, size=3, show=False)

    sns.boxplot(data=df, x="cell_type", y=gene, saturation=0.5, width=0.2, fliersize=0,
                palette=ann_palette_all, boxprops={'zorder': 10},
                medianprops=dict(color="white", alpha=1, zorder=15, linewidth=2),
                order=cell_order, ax=ax).set(title='Spleen pathogenic WT')

    plt.xlabel('')
    plt.savefig('Spleen_WT_' + str(gene) + '.pdf', dpi=600, bbox_inches="tight")

    plt.show()

    ############################################
    # KO pathogenic: violins + boxplot overlay
    ############################################
    df = sc.get.obs_df(ko_pathogenic_cells, [gene, 'pathogenicity', 'cell_type'])

    ax = sc.pl.violin(ko_pathogenic_cells, keys=gene, use_raw=True, groupby='cell_type',
                      inner=None, linewidth=1, stripplot=False, jitter=True, scale='count',
                      palette=ann_palette_all, order=cell_order, rotation=90,
                      inner_kws=dict(markeredgewidth=5, box_width=6, whis_width=0,
                                     color='darkorchid', zorder=10, alpha=0.7),
                      alpha=0.5, size=3, show=False)

    sns.boxplot(data=df, x="cell_type", y=gene, saturation=0.5, width=0.2, fliersize=0,
                palette=ann_palette_all, boxprops={'zorder': 10},
                medianprops=dict(color="white", alpha=1, zorder=15, linewidth=2),
                order=cell_order, ax=ax).set(title='Spleen pathogenic Cd74 KO')

    plt.xlabel('')
    plt.savefig('Spleen_KO_' + str(gene) + '.pdf', dpi=600, bbox_inches="tight")

    plt.show()

    ############################################
    # Split violin plot
    ############################################
    cell_list = cell_order.tolist()

    df = sc.get.obs_df(pathogenic_cells, [gene, 'WT/KO', 'cell_type'])

    ax = sns.violinplot(data=df, x="cell_type", y=gene, hue='WT/KO', split=True, cut=0, inner='box', gap=.1,
                        density_norm='width', width=0.9, palette=['.4', '.7'], order=cell_order, legend=None,
                        linewidth=1,
                        inner_kws=dict(box_width=3, whis_width=0, color='k', marker='s', zorder=10), alpha=0.5)

    # Uses the single-gene compare_gene_expression that returns one table
    results = compare_gene_expression(df, cell_order, gene=gene, condition_key='WT/KO')

    # Per-group maximum, used to place the significance markers
    y_max = df.groupby('cell_type')[gene].max()

    # Add asterisks slightly above each group that has a significance marker
    for idx, (cell_type, asterisk) in enumerate(zip(results['cell_type'], results['significance'])):
        if asterisk:
            y_position = y_max[cell_type] + 0.03 * (y_max.max() - y_max.min())
            ax.text(idx, y_position, asterisk, ha='center', va='bottom')

    # Recolour the violin bodies: PolyCollections 2k and 2k+1 both take
    # colors[k]; the odd one is blended halfway towards white.
    handles = []
    for ind, violin in enumerate(ax.findobj(PolyCollection)):
        rgb = to_rgb(colors[ind // 2])
        if ind % 2 != 0:
            rgb = 0.5 + 0.5 * np.array(rgb)  # make whiter
        violin.set_facecolor(rgb)
        handles.append(plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor='black'))

    plt.xticks(np.arange(len(cell_list)), cell_list, rotation=90, rotation_mode="anchor", fontsize=10, ha='right')

    # Adjust the top margin to make room for asterisks
    plt.tight_layout()
    plt.subplots_adjust(top=1.2)  # Adjust this value if needed

    plt.xlabel('')
    plt.savefig('Spleen_wt_vs_ko_split_' + str(gene) + '.pdf', dpi=600, bbox_inches="tight")

    plt.show()

In [None]:
# Display the distinct cell-type labels present in adata.obs['cell_type']
list(adata.obs['cell_type'].values.unique())

## T cell dysfunction markers

In [None]:
# Independent copy of the data restricted to the cells annotated with one of
# the T-cell labels below.
T_cell_types = ['Treg','CD8+','Activated T cells','CD4+','Heterogenous T cells','Tcm','Th']
is_T_cell = adata.obs['cell_type'].isin(T_cell_types)
adata_T_cells = adata[is_T_cell].copy()

# Drop the cell-type categories that no longer occur in the subset
T_cell_labels = pd.Categorical(adata_T_cells.obs['cell_type'])
adata_T_cells.obs['cell_type'] = T_cell_labels.remove_unused_categories()

# Genes examined as T-cell dysfunction markers in the cells below
dysfunction_markers = ['Pdcd1', 'Tigit', 'Ctla4', 'Havcr2', 'Eomes']

In [None]:
# Number of rows in whatever `df` the most recently executed cell left behind
df.shape[0]

In [None]:


for feature in dysfunction_markers:

    # One split-violin figure (WT vs KO) per dysfunction marker, over the
    # T-cell types of the pathogenic condition plus a pooled group.
    features = [feature]
    cell_key = 'cell_type'
    condition_key = 'WT/KO'

    df = sc.get.obs_df(adata[adata.obs['pathogenicity'] == 'pathogenic'],
                       [features[0], condition_key, cell_key])

    df = df[df['cell_type'].isin(T_cell_types)]
    df[cell_key] = pd.Categorical(df[cell_key]).remove_unused_categories()

    # Append a copy of every cell relabelled 'Combined T Cells' (pooled group)
    df_all_cells = df.copy()
    df_all_cells[cell_key] = 'Combined T Cells'
    df = pd.concat([df, df_all_cells])
    og_cell_list = list(df[cell_key].unique())

    # Filter on the number of expressing cells per condition.
    # NOTE(review): a count is never negative, so this condition is never met
    # and no group is dropped -- confirm whether a real threshold was intended.
    cell_ids = []
    for cell_id, cell_type in enumerate(og_cell_list):
        expressing = df[(df[features[0]] > 0) & (df[cell_key] == cell_type)]
        n_wt = (expressing[condition_key] == 'wt').sum()
        n_ko = (expressing[condition_key] == 'ko').sum()
        if n_wt < 0 and n_ko < 0:
            df = df[df[cell_key] != cell_type]
        else:
            cell_ids.append(cell_id)

    # Plotting copy with a small positive exponential offset added to every
    # value; the statistics below still use the unmodified df.
    # NOTE(review): presumably this breaks ties between identical values for
    # the density estimate -- confirm. The generator is unseeded, so the
    # figure differs slightly from run to run.
    df_eps = df.copy()

    scale, size = 0.001, df.shape[0]
    rng = np.random.default_rng()
    random_eps = rng.exponential(scale=scale, size=size)
    df_eps[feature] = df_eps[feature] + random_eps

    cell_list = list(df[cell_key].unique())

    plt.figure(figsize=(12, 4))

    ax = sns.violinplot(
        data=df_eps, x=cell_key, y=features[0], hue=condition_key,
        split=True, cut=0, inner='stick', gap=.1,
        density_norm='count', width=1.5, palette=['.4', '.7'],
        legend=True, linewidth=1, alpha=0.5,
    )

    plt.yticks([0.0, 1.0, 2.0, 3.0, 4.0])
    plt.xticks(np.arange(len(cell_list)), cell_list,
               rotation=30, rotation_mode="anchor", fontsize=10, ha='right')

    # Palette in T_cell_types order plus one extra colour for the pooled group.
    # NOTE(review): the violins follow the group order of df, not necessarily
    # the order of T_cell_types -- confirm the colours match.
    colors = [ann_palette_all[k] for k in T_cell_types] + [(0.0, 0.5, 0.5)]

    # PolyCollections 2k and 2k+1 both take colors[k]; the odd one is blended
    # halfway towards white.
    handles = []
    for ind, violin in enumerate(ax.findobj(PolyCollection)):
        base_rgb = to_rgb(colors[ind // 2])
        rgb = 0.5 + 0.5 * np.array(base_rgb) if ind % 2 else base_rgb
        violin.set_facecolor(rgb)

    # Per-group maximum, used to place the significance markers
    y_max = df.groupby(cell_key)[features[0]].max()
    y_span = y_max.max() - y_max.min()

    # NOTE(review): this call uses the eight-argument compare_gene_expression
    # that returns two tables; the 'Violin plots' section redefines the name
    # with a different signature, so the earlier definition must be in scope.
    results_long, results_pivot = compare_gene_expression(
        df, cell_list, features, condition_key, 'wt', 'ko', cell_key, 'bonferroni')

    # Write direction + asterisks above each group that has a significance marker
    annotation_rows = zip(results_long['cell_type'], results_long['direction'], results_long['significance'])
    for idx, (cell_type, direction, asterisk) in enumerate(annotation_rows):
        if not asterisk:
            continue
        ax.text(idx, y_max[cell_type] + 0.2 * y_span, direction + asterisk,
                ha='center', va='bottom', fontsize=12)

    plt.ylim([-0.25, np.max(y_max) + 1])
    plt.xlim([-1.0, len(cell_ids) + 1])
    plt.savefig('Spleen_Tcells_wt_vs_ko_split_' + str(feature) + '.pdf', dpi=600, bbox_inches="tight")

    plt.show()
    # A bare expression inside a loop is not displayed by the notebook
    results_long

In [None]:
test_genes = dysfunction_markers


for gene in test_genes:

    # Order violin plots according to mean expression in wt cells
    gene_ids = adata_T_cells[adata_T_cells.obs['sample']=='wt_pathogenic'].raw.var.index.values
    obs_data = adata_T_cells[adata_T_cells.obs['sample']=='wt_pathogenic'].raw[:,gene_ids].X.toarray()
    obsDF = pd.DataFrame(obs_data,columns=gene_ids,index=adata_T_cells[adata_T_cells.obs['sample']=='wt_pathogenic'].obs['cell_type'].values)
    
    
    mean_xpr=np.array([])
    
    for ct in T_cell_types:
    
        ##No Mast cells in this sample
        if (ct!='Mast cells'):
            
            xpr_vals= obsDF.loc[ct][gene]
    
        if (ct=='Mast cells'):
    
            xpr_vals=0
        
        mean_xpr = np.append(mean_xpr,np.mean(xpr_vals))
    
    cell_order = adata.obs['cell_type'].values.unique()[np.argsort(mean_xpr)[::-1]]

    # No mast cells in these samples
    cell_order = cell_order.remove_categories(['Mast cells'])
    cell_order =cell_order[~pd.isna(cell_order)]
    
    colors = [0,0,0]

    for ct in np.asarray(cell_order.tolist()):
    
        colors = np.vstack((colors,ann_palette_all[ct]))

    colors = colors[1:colors.shape[0],:]

    
    df = sc.get.obs_df(adata_T_cells[adata_T_cells.obs['sample']=='wt_pathogenic'], [gene, 'pathogenicity', 'cell_type'])


    ax=sc.pl.violin(adata_T_cells[adata_T_cells.obs['sample']=='wt_pathogenic'], keys=gene, use_raw=True, groupby='cell_type', 
                 inner=None, linewidth=1, stripplot=False, jitter=True, scale='count',palette =ann_palette_all, 
                    order=cell_order,rotation=90, inner_kws=dict(markeredgewidth=5, box_width=6, whis_width=0, 
                                                                 color='darkorchid',zorder=10,alpha=0.7), alpha=0.5, size=3, show=False)
    
    sns.boxplot(data=df,  x="cell_type", y=gene, saturation=0.5, width=0.2, fliersize=0,
                palette=ann_palette_all, boxprops={'zorder': 10},
                medianprops=dict(color="white", alpha=1, zorder=15, linewidth=2), order=cell_order, ax=ax).set(title='Spleen pathogenic WT')
    
    plt.xlabel('')
    plt.savefig('Spleen_WT_'+str(gene)+'.pdf', dpi=600,bbox_inches = "tight")

    plt.show()
    df = sc.get.obs_df(adata_T_cells[adata_T_cells.obs['sample']=='ko_pathogenic'], [gene, 'pathogenicity', 'cell_type'])

    
    ax=sc.pl.violin(adata_T_cells[adata_T_cells.obs['sample']=='ko_pathogenic'], keys=gene, use_raw=True, groupby='cell_type', 
                 inner=None, linewidth=1, stripplot=False, jitter=True, scale='count',palette =ann_palette_all, 
                    order=cell_order,rotation=90, inner_kws=dict(markeredgewidth=5, box_width=6, whis_width=0, 
                                                                 color='darkorchid',zorder=10,alpha=0.7), alpha=0.5, size=3, show=False)
    
    sns.boxplot(data=df,  x="cell_type", y=gene, saturation=0.5, width=0.2, fliersize=0, 
                palette=ann_palette_all, boxprops={'zorder': 10},
                medianprops=dict(color="white", alpha=1, zorder=15, linewidth=2), order=cell_order, ax=ax).set(title='Spleen pathogenic Cd74 KO')

    plt.xlabel('')
    plt.savefig('Spleen_KO_'+str(gene)+'.pdf', dpi=600, bbox_inches = "tight")

    plt.show()

    ############################################
    # Split violin plot
    ############################################
    cell_list = cell_order.tolist()    
    
    df = sc.get.obs_df(adata_T_cells[adata_T_cells.obs['pathogenicity']=='pathogenic'], [gene, 'WT/KO', 'cell_type'])

    ax = sns.violinplot(data=df, x="cell_type", y=gene, hue='WT/KO', split=True, cut=0, inner='box', gap=.1, 
                      density_norm='width', width=0.9, palette=['.4', '.7'], order=cell_order, legend=None, linewidth=1,
                      inner_kws=dict(box_width=3, whis_width=0, color='k', marker='s', zorder=10), alpha=0.5)

    results = compare_gene_expression(df, cell_order, gene=gene, condition_key='WT/KO')

    # Get the maximum y value for positioning asterisks
    y_max = df.groupby('cell_type')[gene].max()

    # Add asterisks above each violin plot
    for idx, (cell_type, asterisk) in enumerate(zip(results['cell_type'], results['significance'])):
        if asterisk:  # Only add text if there is a significance marker
            # Add some padding above the maximum value
            y_position = y_max[cell_type] + 0.03 * (y_max.max() - y_max.min())
            ax.text(idx, y_position, asterisk, ha='center', va='bottom')
            
    handles = []
    for ind, violin in enumerate(ax.findobj(PolyCollection)):
        rgb = to_rgb(colors[ind // 2])
        if ind % 2 != 0:
            rgb = 0.5 + 0.5 * np.array(rgb)  # make whiter
        violin.set_facecolor(rgb)
        handles.append(plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor='black'))

    plt.xticks(np.arange(len(cell_list)),cell_list,rotation=90, rotation_mode="anchor", fontsize=10, ha='right' )
    
    # Adjust the top margin to make room for asterisks
    plt.tight_layout()
    plt.subplots_adjust(top=1.2)  # Adjust this value if needed
    
    plt.xlabel('')
    plt.savefig('Spleen_wt_vs_ko_split_'+str(gene)+'.pdf', dpi=600, bbox_inches = "tight")
    
    plt.show()

## Transcription factor AP-1 complex and Ccr7 genes in DCs 

In [None]:
# Gene set that is scored per cell; the score is stored under the name 'TF_AP1_score'.
tf_ap1_genes = [
    'Fos',
    'Jun',
    'Junb',
    'Jund',
    'Nfatc2',
]
sc.tl.score_genes(adata, tf_ap1_genes, score_name='TF_AP1_score')

In [None]:
# Plot defaults for the figures below (vector_friendly is switched on here).
sc.set_figure_params(
    scanpy=True,
    dpi=300,
    dpi_save=1200,
    frameon=True,
    vector_friendly=True,
    fontsize=14,
    figsize=(9, 8),
    format='pdf',
    facecolor=None,
    transparent=False,
    ipython_format='png2x',
)

In [None]:
# Dendritic-cell labels; matched against adata.obs['cell_type'] below.
dc_types = [
    "Lymphoid-resident cDC1", "CD8(low) cDC1", "CD8- CCR2+ cDC1", "CD8- CCR2- cDC1", "CCR7+ DC1",
    "Relb(int.) cDC2", "Migratory cDC2", "WDFY4+ cDC2", "Relb(low) cDC2", "pDC",
]

# Independent copy holding only the DC cells; categories left without cells are dropped.
adata_dc = adata[adata.obs['cell_type'].isin(dc_types)].copy()
adata_dc.obs['cell_type'] = pd.Categorical(adata_dc.obs['cell_type']).remove_unused_categories()

# Draw on an axis created here so that its limits can be changed after plotting.
fig, axes = plt.subplots(1, 1)
sc.pl.umap(
    adata_dc,
    color=['cell_type'],
    palette=ann_palette_all,
    title='Spleen DCs',
    s=10,
    alpha=0.7,
    add_outline=True,
    outline_width=(0.2, 0.5),
    legend_loc='on data',
    legend_fontsize=12,
    legend_fontweight='medium',
    ax=axes,
    show=False,  # keep the figure open for the limit changes and savefig below
)

# Hand-tuned view window for this embedding.
axes.set_xlim(-3.5, 8.)
axes.set_ylim(-5, 4)

plt.savefig('umap_spleen_dc_annotations.pdf', bbox_inches='tight')
plt.show()

In [None]:
# from scipy.stats import mode
# from adjustText import adjust_text

In [None]:
# Dendritic-cell labels; matched against adata.obs['cell_type'] below.
dc_types = [
    "Lymphoid-resident cDC1", "CD8(low) cDC1", "CD8- CCR2+ cDC1", "CD8- CCR2- cDC1", "CCR7+ DC1",
    "Relb(int.) cDC2", "Migratory cDC2", "WDFY4+ cDC2", "Relb(low) cDC2", "pDC",
]

# Independent copy holding only the DC cells; categories left without cells are dropped.
adata_dc = adata[adata.obs['cell_type'].isin(dc_types)].copy()
adata_dc.obs['cell_type'] = pd.Categorical(adata_dc.obs['cell_type']).remove_unused_categories()

# Restrict the DC subset to cells annotated as 'pathogenic'.
adata_dc_pat = adata_dc[adata_dc.obs['pathogenicity'] == 'pathogenic']

# UMAP of the pathogenic DCs coloured by Ccr7, drawn on an explicitly created axis.
fig, axes = plt.subplots(1, 1)
sc.pl.umap(
    adata_dc_pat,
    color=['Ccr7'],
    cmap='coolwarm',
    title='Spleen DCs, Ccr7',
    s=16,
    alpha=0.8,
    add_outline=True,
    outline_width=(0.2, 0.5),
    legend_loc='on data',
    legend_fontsize=12,
    legend_fontweight='medium',
    ax=axes,
    show=False,  # keep the figure open so it can be saved below
)

plt.savefig('umap_pathogenic_spleen_dc_Ccr7.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Dendritic-cell labels; matched against adata.obs['cell_type'] below.
dc_types = [
    "Lymphoid-resident cDC1", "CD8(low) cDC1", "CD8- CCR2+ cDC1", "CD8- CCR2- cDC1", "CCR7+ DC1",
    "Relb(int.) cDC2", "Migratory cDC2", "WDFY4+ cDC2", "Relb(low) cDC2", "pDC",
]

# Independent copy holding only the DC cells; categories left without cells are dropped.
adata_dc = adata[adata.obs['cell_type'].isin(dc_types)].copy()
adata_dc.obs['cell_type'] = pd.Categorical(adata_dc.obs['cell_type']).remove_unused_categories()

# Restrict the DC subset to cells annotated as 'pathogenic'.
adata_dc_pat = adata_dc[adata_dc.obs['pathogenicity'] == 'pathogenic']

# UMAP of the pathogenic DCs coloured by the TF AP-1 score; the colour scale is capped at 1.5.
fig, axes = plt.subplots(1, 1)
sc.pl.umap(
    adata_dc_pat,
    color=['TF_AP1_score'],
    cmap='coolwarm',
    vmax=1.5,
    title='Spleen DCs, TF AP-1 complex score',
    s=16,
    alpha=0.8,
    add_outline=True,
    outline_width=(0.2, 0.5),
    legend_loc='on data',
    legend_fontsize=12,
    legend_fontweight='medium',
    ax=axes,
    show=False,  # keep the figure open so it can be saved below
)

# Axis limits are left at the values chosen by the plotting call for this figure.

plt.savefig('umap_pathogenic_spleen_dc_tf_ap1.pdf', bbox_inches='tight')
plt.show()

## Violin plots of Ccr7 and TF AP-1 complex score in DCs

In [None]:
from scipy import stats
from statsmodels.stats.multitest import multipletests
from matplotlib.colors import to_rgb
from matplotlib.collections import PolyCollection
from matplotlib.legend_handler import HandlerTuple
import textwrap

In [None]:
def compare_gene_expression_wtko(df, ordered_cell_list, genes, condition_key='WT/KO', cell_type_key='cell_type'):
    """
    Compare gene expression between WT and KO conditions across multiple cell types and genes,
    including an analysis of all cells combined.

    For every (cell type, gene) pair the rows labelled 'wt' are compared with the
    rows labelled 'ko' in ``condition_key`` using a two-sided Mann-Whitney U test.
    An extra group named 'Combined Cells' runs the same test on all rows of ``df``.
    All p-values (every group and every gene) are corrected together with the
    'fdr_bh' method of ``multipletests``.

    Parameters:
    -----------
    df : pandas DataFrame
        Input data frame containing expression data
    ordered_cell_list : sequence of str
        Cell types in desired order (any sequence is accepted; it is not modified)
    genes : list
        List of genes to analyze
    condition_key : str
        Column name for condition (default: 'WT/KO')
    cell_type_key : str
        Column name for cell type (default: 'cell_type')

    Returns:
    --------
    tuple: (pandas DataFrame with detailed results, pandas DataFrame with pivoted results)
        The detailed frame has one row per (cell type, gene) with the columns
        'cell_type', 'gene', 'p_value', 'p_value_corrected' and 'significance'.
        Pairs with fewer than two non-missing values in either condition, or for
        which the test cannot be computed, are reported with p_value 1.0.

    Raises:
    -------
    ValueError
        If ``genes`` is empty, because there is then nothing to test or correct.
    """
    combined_label = 'Combined Cells'

    genes = list(genes)
    if not genes:
        raise ValueError("genes must contain at least one gene to compare")

    # list(...) accepts tuples / Index / Categorical input and leaves the caller's object untouched
    all_cell_types = list(ordered_cell_list) + [combined_label]

    def _stars(p):
        """Map a corrected p-value to its asterisk label ('' when not significant)."""
        for threshold, label in ((0.0001, '****'), (0.001, '***'), (0.01, '**'), (0.05, '*')):
            if p < threshold:
                return label
        return ''

    all_results = []

    # First, calculate all raw p-values
    for cell_type in all_cell_types:
        # The combined group uses the complete dataset; otherwise filter by cell type
        if cell_type == combined_label:
            cell_type_data = df
        else:
            cell_type_data = df[df[cell_type_key] == cell_type]

        # The condition masks do not depend on the gene, so build them once per group
        is_wt = cell_type_data[condition_key] == 'wt'
        is_ko = cell_type_data[condition_key] == 'ko'

        for gene in genes:
            # Missing values would produce a NaN p-value, and a single NaN turns
            # every corrected p-value into NaN as well.
            wt_expr = cell_type_data.loc[is_wt, gene].dropna()
            ko_expr = cell_type_data.loc[is_ko, gene].dropna()

            p_value = 1.0
            if len(wt_expr) > 1 and len(ko_expr) > 1:
                try:
                    _, p_value = stats.mannwhitneyu(wt_expr, ko_expr, alternative='two-sided')
                except ValueError:
                    # Same fallback as add_statistical_annotation: a test that
                    # cannot be computed is treated as "no evidence".
                    p_value = 1.0

            all_results.append({
                'cell_type': cell_type,
                'gene': gene,
                'p_value': p_value
            })

    # Convert to DataFrame
    results_df = pd.DataFrame(all_results)

    # Correct all p-values together
    _, p_values_corrected, _, _ = multipletests(results_df['p_value'].to_numpy(), method='fdr_bh')
    results_df['p_value_corrected'] = p_values_corrected

    # Add significance asterisks based on corrected p-values
    results_df['significance'] = results_df['p_value_corrected'].apply(_stars)

    # Reshape the results to a more readable format
    pivot_df = results_df.pivot(
        index='cell_type',
        columns='gene',
        values=['p_value', 'p_value_corrected', 'significance']
    )

    return results_df, pivot_df


In [None]:

test_genes = ['Ccr7','TF_AP1_score']

# Per-cell values of the tested features plus the grouping columns, pathogenic cells only.
# The column list is derived from test_genes so the two cannot drift apart.
df = sc.get.obs_df(adata[adata.obs['pathogenicity']=='pathogenic'], test_genes + ['WT/KO', 'cell_type'])

cell_type_list = dc_types

# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the obs_df result (avoids SettingWithCopyWarning).
df = df[df['cell_type'].isin(cell_type_list)].copy()
df['cell_type'] = pd.Categorical(df['cell_type']).remove_unused_categories()


results_long, results_pivot = compare_gene_expression_wtko(df, cell_type_list, test_genes)
results_pivot

In [None]:
results_long

In [None]:

import warnings

cell_list = dc_types

test_genes = ['Ccr7','TF_AP1_score','Cd86']

# Loop invariants: the DC populations plus one pooled pseudo-group, in plotting order.
cell_type_list = dc_types
augmented_cell_type_list = dc_types +['Combined Cells']
# x coordinate of every violin, looked up by label instead of relying on the row order of results_long
x_positions = {name: pos for pos, name in enumerate(augmented_cell_type_list)}

sns.set_style("ticks")
for gene in test_genes:

    df = sc.get.obs_df(adata[adata.obs['pathogenicity']=='pathogenic'], [gene, 'WT/KO', 'cell_type'])

    # .copy() makes the filtered frame independent, so the column assignment below
    # does not write into a slice of the obs_df result (avoids SettingWithCopyWarning).
    df = df[df['cell_type'].isin(cell_type_list)].copy()
    df['cell_type'] = pd.Categorical(df['cell_type']).remove_unused_categories()

    # Duplicate every cell under a pooled label so one extra violin shows all DCs together
    df_combined = df.copy()
    df_combined['cell_type'] = 'Combined Cells'

    # Concatenate original and combined data
    df_all = pd.concat([df, df_combined])

    plt.figure(figsize=(9,3))

    ax = sns.violinplot(data=df_all, x="cell_type", y=gene, hue='WT/KO', split=True, inner='point', gap=.2, cut=0.,
                          density_norm='width', width=0.8, palette=['.4', '.7'], order=augmented_cell_type_list, legend=None, linewidth=1,
                          alpha=0.5)

    ax.set(xlabel = None)

    # Maximum y value per group, used to place the asterisks just above each violin
    y_max = df_all.groupby('cell_type')[gene].max()
    y_pad = 0.03 * (y_max.max() - y_max.min())

    # Significance markers come from results_long, which must already contain this gene
    results_long_gene = results_long[results_long['gene']==gene]
    if results_long_gene.empty:
        # Without this, a gene that was never tested would silently look "not significant"
        warnings.warn(f"No statistics for {gene!r} in results_long; plotting without significance markers.")

    for cell_type, asterisk in zip(results_long_gene['cell_type'], results_long_gene['significance']):
        if not asterisk:  # Only add text if there is a significance marker
            continue
        if cell_type not in x_positions or cell_type not in y_max.index:
            continue  # group is not part of this plot
        ax.text(x_positions[cell_type], y_max[cell_type] + y_pad, asterisk, ha='center', va='bottom')

    # Fixed ticks 0..4 are applied to every feature in test_genes
    plt.yticks([0.0,1.0,2.0,3.0,4.0])
    plt.xticks(np.arange(len(augmented_cell_type_list)),augmented_cell_type_list,rotation=30,  fontsize=10, ha='center' )

    # Recolour the violin halves: every second PolyCollection gets a lightened shade.
    # NOTE(review): augmented_cell_type_list has 11 groups, so `4 - ind // 2` turns negative
    # after the fifth group and wraps to the end of ann_colors -- confirm this mapping is intended.
    handles = []
    for ind, violin in enumerate(ax.findobj(PolyCollection)):
        rgb = to_rgb(ann_colors[4-ind // 2])

        if ind % 2 != 0:
            rgb = 0.5 + 0.5 * np.array(rgb)  # make whiter
        violin.set_facecolor(rgb)
        handles.append(plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor='black'))

    plt.tight_layout()

    plt.savefig('Spleen_pathogenic_'+str(gene)+'_expression_WT_vs_KO_split.pdf', dpi=600)

    plt.show()

In [None]:
def create_split_violin_plot(df_wt, df_ko, genes_to_plot, figsize=(10, 6)):
    """
    Draw WT and KO expression violins for each gene on a new figure.

    Both inputs are reshaped to long format (columns 'condition', 'gene',
    'expression'), stacked WT first, and plotted with one pair of violins per
    gene. Despite the function name the violins are drawn side by side
    (``split=False``).

    Parameters:
    -----------
    df_wt, df_ko : pandas DataFrame
        Wide tables with one column per entry of ``genes_to_plot``
    genes_to_plot : list
        Column names to plot; 'cell_type' and 'WT/KO' entries are ignored
    figsize : tuple
        Size passed to ``plt.figure``

    Returns:
    --------
    tuple: (axes returned by ``sns.violinplot``, long-format DataFrame that was plotted)
    """
    # Metadata columns are dropped before reshaping so that 'expression' stays
    # numeric and the x-limit below counts only the genes that are drawn.
    metadata_columns = ('cell_type', 'WT/KO')
    plotted_genes = [gene for gene in genes_to_plot if gene not in metadata_columns]

    def _to_long(frame, label):
        """Return the selected gene columns of *frame* in long format, tagged with *label*."""
        wide = frame[plotted_genes].copy()
        wide['condition'] = label
        return wide.melt(id_vars=['condition'],
                         value_vars=plotted_genes,
                         var_name='gene',
                         value_name='expression')

    # ignore_index gives the stacked frame unique row labels; both melted parts
    # start counting at 0 and would otherwise share the same labels.
    df_combined = pd.concat([_to_long(df_wt, 'WT'), _to_long(df_ko, 'KO')],
                            axis=0, ignore_index=True)

    # Create the plot
    plt.figure(figsize=figsize)
    ax = sns.violinplot(data=df_combined,
                        x='gene',
                        y='expression',
                        hue='condition',
                        split=False,
                        inner='box',
                        cut=1,
                        palette=['dodgerblue', 'darkorchid'],
                        alpha=0.5)

    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # Leaves free space to the right of the last gene
    plt.xlim(-0.5, len(plotted_genes) + 0.25)

    return ax, df_combined
    

# Statistical testing
from scipy import stats
from statsmodels.stats.multitest import multipletests

def add_statistical_annotation(ax, df_long, genes):
    """
    Annotate each gene on ``ax`` with its WT-vs-KO significance.

    For every gene a two-sided Mann-Whitney U test compares the 'WT' and 'KO'
    rows of ``df_long``; the p-values are corrected together with the 'fdr_bh'
    method of ``multipletests``. An asterisk label ('ns' when p >= 0.05) and the
    corrected p-value are written above x position ``i`` for the i-th gene, and
    the y-axis of ``ax`` is rescaled to make room for the text.

    Parameters:
    -----------
    ax : matplotlib Axes
        Axes that receives the text and the new y-limits
    df_long : pandas DataFrame
        Long-format table with the columns 'gene', 'condition' and 'expression'
    genes : list
        Genes to annotate, in the order of their x positions

    Returns:
    --------
    None
    """
    p_values = []
    y_max = []

    for gene in genes:
        gene_data = df_long[df_long['gene'] == gene]

        # Coerce to numbers and drop everything that is not a valid value
        wt_expr = pd.to_numeric(gene_data.loc[gene_data['condition'] == 'WT', 'expression'],
                                errors='coerce').dropna().to_numpy()
        ko_expr = pd.to_numeric(gene_data.loc[gene_data['condition'] == 'KO', 'expression'],
                                errors='coerce').dropna().to_numpy()

        # Mann-Whitney U test; a test that cannot be computed counts as p = 1
        p_value = 1.0
        if wt_expr.size and ko_expr.size:
            try:
                _, p_value = stats.mannwhitneyu(wt_expr, ko_expr, alternative='two-sided')
            except ValueError:
                p_value = 1.0
        p_values.append(p_value)

        # Highest observed value of this gene; 0 when neither group has data
        observed_maxima = [values.max() for values in (wt_expr, ko_expr) if values.size]
        y_max.append(max(observed_maxima) if observed_maxima else 0)

    if not p_values:
        return  # nothing to annotate

    _, p_values_corrected, _, _ = multipletests(p_values, method='fdr_bh')

    # Vertical offset scales with the spread of the maxima; fall back to 1 when
    # there is no spread (single gene or identical maxima) so the text still
    # clears the violins.
    spread = max(y_max) - min(y_max)
    y_range = spread if spread > 0 else 1

    for idx, (p, ymax) in enumerate(zip(p_values_corrected, y_max)):
        asterisk = ('****' if p < 0.0001 else
                    '***' if p < 0.001 else
                    '**' if p < 0.01 else
                    '*' if p < 0.05 else
                    'ns')

        # Asterisk above the violin, corrected p-value 0.2 data units below it
        y_position = ymax + 0.2 * y_range
        ax.text(idx, y_position, asterisk, ha='center', va='bottom')
        ax.text(idx, y_position - 0.2, 'p=' + "{:.2E}".format(p), ha='center', va='bottom')

    # Rescale the axes that were passed in (not whatever axes happen to be
    # current) so the annotations stay visible.
    upper = max(y_max) * 1.5
    if upper > 0:
        ax.set_ylim(0, upper)



In [None]:
test_genes = ['Cd74','Cd8a','Cd4','Ctla4','Pdcd1','Foxp3', 'Calr']

# Columns to extract: the tested genes plus the two grouping columns.
# Named obs_keys rather than `vars` so the builtin vars() stays usable in later cells.
obs_keys = test_genes + ['WT/KO', 'cell_type']

df = sc.get.obs_df(adata[adata.obs['pathogenicity']=='pathogenic'], obs_keys)

# Consider only DC clusters
# NOTE(review): these labels differ from the `dc_types` names used for the DC UMAPs and
# violin plots in this notebook (only 'pDC' is shared) -- confirm they match adata.obs['cell_type'].
df_dc = df[df['cell_type'].isin(['CD8+ DC1','CD8- DC1','Activated DC1-like','DC2','pDC'])]

# Split by genotype and keep only the gene columns
df_dc_ko = df_dc.loc[df_dc['WT/KO']=='ko', test_genes]
df_dc_wt = df_dc.loc[df_dc['WT/KO']=='wt', test_genes]

ax, df_long = create_split_violin_plot(df_dc_wt, df_dc_ko, test_genes)
add_statistical_annotation(ax, df_long, test_genes)
plt.show()
