# Play around with decoupleR

In [None]:

import decoupler as dc
import scanpy as sc

# Only needed for processing
import numpy as np
import pandas as pd

# Needed for some plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors as mcolors
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import cm

# Plotting options, change to your liking.
# set_figure_params resets every argument that is not passed back to its
# default, so consecutive calls undo each other (a later call without `dpi`
# or `frameon` would revert them). All options therefore go into one call.
sc.set_figure_params(dpi=200, frameon=False, figsize=(4, 4))



In [None]:
def load_RdOrYl_cmap_settings(fig_height=6, fig_width=6, dpi=150, save_dpi=300, transparent=True):
    """Apply the notebook-wide plotting style and return a custom colormap.

    Side effects: updates scanpy's figure settings and matplotlib's global
    rcParams (figure size, grid, ticks, serif font, PDF font embedding).

    Parameters
    ----------
    fig_height, fig_width
        Default figure height and width, stored in ``figure.figsize``.
    dpi
        Passed to ``sc.set_figure_params`` as ``dpi``.
    save_dpi
        Passed to ``sc.set_figure_params`` as ``dpi_save``.
    transparent
        Passed to ``sc.set_figure_params`` as ``transparent``.

    Returns
    -------
    A ``LinearSegmentedColormap`` named ``'my_colormap'`` that starts with one
    grey (``Greys_r`` sampled at 0.8) followed by 150 ``YlOrRd`` colours
    sampled from 0.05 to 1.
    """
    from matplotlib import colors

    # Apply the scanpy defaults first: with scanpy=True this call also sets
    # figure.figsize, so any size set before it would be overwritten.
    sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False,
                         transparent=transparent, dpi=dpi, dpi_save=save_dpi)

    ## Plotting parameters
    # matplotlib expects figure.figsize as (width, height)
    rcParams['figure.figsize'] = (fig_width, fig_height)

    ## Grid & Ticks
    rcParams['grid.alpha'] = 0
    rcParams['xtick.bottom'] = True
    rcParams['ytick.left'] = True

    ## Fonts
    plt.rcParams.update({
        "text.usetex": False,
        "font.family": "serif",
        "font.serif": "NewComputerModern10",  # Computer Modern Roman fontsize 10
    })

    ## Embed font
    plt.rc('pdf', fonttype=42)

    ## Define new default settings
    # This aliases the live rcParams object, so a single assignment after all
    # changes is equivalent to the two assignments made previously.
    plt.rcParamsDefault = plt.rcParams

    # Color maps: one grey entry stacked in front of the YlOrRd ramp
    ramp_colors = plt.cm.YlOrRd(np.linspace(0.05, 1, 150))
    grey_colors = plt.cm.Greys_r(np.linspace(0.8, 0.9, 1))
    combined_colors = np.vstack([grey_colors, ramp_colors])
    mymap = colors.LinearSegmentedColormap.from_list('my_colormap', combined_colors)
    return mymap

In [None]:
mymap = load_RdOrYl_cmap_settings(transparent=False)

# Pseudo-bulk functional analysis

From decoupleR tutorial on https://github.com/saezlab/decoupler-py/blob/main/docs/source/notebooks/pseudobulk.ipynb

When cell lineage is clear (there are clear cell identity clusters), it might be beneficial to perform functional analyses at the pseudo-bulk level instead of the single-cell level.
By doing so, we recover lowly expressed genes that were previously affected by the "drop-out" effect of single-cell data. 
Additionally, if there is more than one condition in our data, we can perform differential expression analysis (DEA) and use the gene statistics as input for enrichment analysis.

## Loading the data

In [None]:
adata_healthy= sc.read_h5ad('adata_ref_latent_with_dca.h5ad')

In [None]:
import h5py

In [None]:
# Read only the `cell_type_annotation_lv1` column of `obs` from the annotated
# .h5ad file via h5py, instead of loading the whole AnnData object
from anndata._io.specs import read_elem
with h5py.File('adata_markedDoublets_mergedPeaks_normalized_initialAnno_rmDoublets_integrated_imputed_annotated.h5ad', 'r') as f:
    # Read specific columns from `obs`
    #sample_column = f['obs/sample'][:]
    #n_counts_column = f['obs/n_counts'][:]
    # Approach taken from https://github.com/scverse/anndata/issues/436:
    anno = read_elem(f["obs/cell_type_annotation_lv1"])

In [None]:
adata_healthy.obs['cell_type_annotation_lv1'] = anno

In [None]:
adata_healthy

In [None]:
adata_healthy.obs['atlas']= 'reference'

In [None]:
adata_diseased = sc.read_h5ad('adata_diseased_normalized_integrated_annotated.h5ad')

In [None]:
adata_diseased

In [None]:
adata_diseased.obs['cell_type_annotation_lv1'] = adata_diseased.obs['cell_type_annotation_lv1_transferred_label_unfiltered']

In [None]:
adata_diseased

In [None]:
adata_diseased.obs['atlas']= 'query'

In [None]:
# Stack the reference (healthy) and query (diseased) objects into one AnnData.
# NOTE(review): concat runs with its default `join`; presumably only the
# variables shared by both objects are kept — confirm this is intended.
adata = sc.concat(
    (adata_healthy, adata_diseased))#, join="outer") #,index_unique="_")

In [None]:
import gc
del adata_diseased
del adata_healthy
gc.collect()

In [None]:
adata

In [None]:
adata.write('joint_diseased_healthy_with_layers.h5ad')

### updated anndata

In [None]:
adata = sc.read_h5ad('joint_diseased_healthy_with_layers_metadata_corrected_anno_updated.h5ad')

In [None]:
adata

In [None]:
annotation_key = 'cell_type_annotation_lv1'

## Processing

To be able to use `decoupler` we need to transform them into gene symbols:

Since the meta-data of this data-set is available, we can filter cells that were not annotated:

We will store the raw counts in the `.layers` attribute so that we can use them
afterwards to generate pseudo-bulk profiles.

We can also look how cells cluster by cell identity:

In [None]:
# Single-cell preprocessing for visualisation. Note that the scaling step
# overwrites adata.X in place.

# Identify highly variable genes (computed with `sample` as batch key)
sc.pp.highly_variable_genes(adata, batch_key='sample')

# Scale the data, clipping values at 10
sc.pp.scale(adata, max_value=10)

# Generate PCA features from the highly variable genes only
sc.tl.pca(adata, svd_solver='arpack', use_highly_variable=True)

# Compute distances in the PCA space, and find cell neighbors
sc.pp.neighbors(adata)

# Generate UMAP features
sc.tl.umap(adata)

# Visualize disease status, kit and the cell type annotation
sc.pl.umap(adata, color=['Status','kit',annotation_key], frameon=False, wspace =0.65)

In this data-set we have two condition, `COVID-19` and `healthy`, across 6 different cell types.

## Generation of pseudo-bulk profiles

After the annotation of clusters into cell identities, we often would like to perform differential expression analysis (DEA) between conditions within particular cell types to further characterize them. DEA can be performed at the single-cell level, but the obtained p-values are often inflated as each cell is treated as a sample. We know that single cells within a sample are not independent of each other, since they were isolated from the same environment. If we treat cells as samples, we are not testing the variation across a population of samples, rather the variation inside an individual one. Moreover, if a sample has more cells than another it might bias the results. 

The current best practice to correct for this is using a pseudo-bulk approach ([Squair J.W., et al 2021](https://doi.org/10.1038/s41467-021-25960-2)), which involves the following steps:

1. Subsetting the cell type(s) of interest to perform DEA.
2. Extracting their raw integer counts.
3. Summing their counts per gene into a single profile if they pass quality control.
4. Performing DEA if at least two biological replicates per condition are available (more replicates are recommended).

We can pseudobulk using the function `decoupler.get_pseudobulk`. In this example, we are interested in summing the counts but other
modes are available, for more information check its argument `mode`.

In [None]:
adata

In [None]:
# Keep only the meta-data columns that are used downstream.
# Every entry needs its own comma: adjacent string literals are silently
# concatenated ('pretty name' 'phase' -> 'pretty namephase'), which makes the
# column selection fail with a KeyError.
columns = ['sample', 'pretty name', 'phase', 'proliferation', 'initial_cell_type', 'leiden', 'Project', 'Status', 'sequencing', 'condition', 'kit', 'strain', 'enriched', 'treatment', 'diet', 'sequencing machine', 'cell_type_annotation_lv1', 'atlas']
adata.obs = adata.obs[columns]

In [None]:
# Get pseudo-bulk profile
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',
    groups_col=annotation_key,
    layer='raw_counts',
    mode='sum',
    min_cells=0,
    min_counts=0
)

In [None]:
pdata

It has generated a profile for each sample and cell type. We can plot their quality control metrics:

In [None]:
dc.plot_psbulk_samples(pdata, groupby=annotation_key, figsize=(12, 4))

In [None]:
adata

In [None]:
sc.pp.neighbors(adata, n_neighbors=30, use_rep='X_scarches_emb')
sc.tl.umap(adata)

In [None]:
sc.pl.umap(adata,color=['atlas'])

In [None]:
# Get pseudo-bulk profile
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',
    groups_col=annotation_key,
    layer='raw_counts',
    mode='sum',
    min_cells=0,
    min_counts=0
)

In [None]:
dc.plot_psbulk_samples(pdata, groupby='atlas', figsize=(12, 4))

In [None]:
pdata

There are two criteria to filter low quality samples: their number of cells (`psbulk_n_cells`) and their total sum of counts (`psbulk_counts`).
In these plots it can be seen that there are some samples of platelet cells that contain fewer than 10 cells; we might want to remove
them by using the arguments `min_cells` and `min_counts`. Note that these thresholds are arbitrary and will change depending on the
dataset, but a good rule of thumb is to have at least 10 cells with 1000 accumulated counts.

In [None]:
# Get filtered pseudo-bulk profile
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',
    groups_col=annotation_key,
    layer='raw_counts',
    mode='sum',
    min_cells=10,
    min_counts=100
)
pdata

### Exploration of pseudobulk profiles
Now that we have generated the pseudobulk profiles for each patient and each cell type, let's explore the variability between them. For that, we will first do some simple preprocessing and then do a PCA

In [None]:
# Compute a PCA of the pseudobulk profiles on normalised data while keeping
# the summed counts available in pdata.X afterwards.

# Store raw counts in layers
pdata.layers['counts'] = pdata.X.copy()

# Normalize, scale and compute pca (all of these modify pdata in place)
sc.pp.normalize_total(pdata, target_sum=1e4)
sc.pp.log1p(pdata)
sc.pp.scale(pdata, max_value=10)
sc.tl.pca(pdata)

# Return raw counts to X
dc.swap_layer(pdata, 'counts', X_layer_key=None, inplace=True)

In [None]:
sc.pl.pca(pdata, color=['atlas', 'Status',annotation_key, 'condition', 'kit', 'sequencing machine', 'diet'], ncols=1, size=100, frameon=True, legend_fontsize=9)
sc.pl.pca_variance_ratio(pdata)

In [None]:
sc.pl.pca(pdata, color=[annotation_key], ncols=1, size=50, frameon=True, legend_fontsize=4, legend_loc='on data')


In [None]:
sc.pl.pca(pdata, color=[annotation_key],components=['2,3'], ncols=1, size=100, frameon=True, legend_fontsize=6, legend_loc='on data')

In [None]:
sc.pl.pca(pdata, color=['atlas', annotation_key, 'condition', 'kit', 'sequencing machine', 'diet'],components=['2,3'], ncols=1, size=100, frameon=True, legend_fontsize=9)


When looking at the PCA, it seems like the first two components explain most of the variance and they easily separate cell types from one another. In contrast, the principal components do not seem to be associated with disease status as such.

In order to have a better overview of the association of PCs with sample metadata, let's perform ANOVA on each PC and see whether they are significantly associated with any technical or biological annotations of our samples

In [None]:
pdata

In [None]:
dc.get_metadata_associations(
    pdata,
    obs_keys = ['Status','condition', 'treatment', 'diet', 'atlas', annotation_key, 'psbulk_n_cells', 'psbulk_counts'],  # Metadata columns to associate to PCs
    obsm_key='X_pca',  # Where the PCs are stored
    uns_key='pca_anova',  # Where the results are stored
    inplace=True,
)

In [None]:
pdata.uns['pca_anova']

In [None]:
print(pdata.uns['pca_anova'].describe())


In [None]:
# Replace infinite values in the sample meta-data with NaN (in place)
pdata.obs.replace([np.inf, -np.inf], np.nan, inplace=True)
# NOTE: dropna() returns a new DataFrame and the result is not assigned, so
# pdata.obs itself is left unchanged; as the last expression of the cell this
# only displays the rows that have no NaN.
pdata.obs.dropna()

In [None]:
pdata

In [None]:
pca_anova_numeric = pdata.uns['pca_anova'].select_dtypes(include=[np.number])
print("Numeric columns in pca_anova:\n", pca_anova_numeric.head())


In [None]:
nan_count = np.isnan(pca_anova_numeric).sum().sum()
inf_count = np.isinf(pca_anova_numeric).sum().sum()

print("NaN values in numeric columns:", nan_count)
print("Infinite values in numeric columns:", inf_count)


In [None]:
dc.plot_associations(
    pdata,
    uns_key='pca_anova',  # Summary statistics from the anova tests
    obsm_key='X_pca',  # where the PCs are stored
    stat_col='p_adj',  # Which summary statistic to plot
    obs_annotation_cols = [annotation_key], # which sample annotations to plot
    titles=['Principle component scores', 'Adjusted p-values from ANOVA'],
    figsize=(7, 5),
    n_factors=10,
)

### exclude multiome

In [None]:
adata.obs['kit'].value_counts()

### Exploration of pseudobulk profiles
Now that we have generated the pseudobulk profiles for each patient and each cell type, let's explore the variability between them. For that, we will first do some simple preprocessing and then do a PCA

In [None]:
# Store raw counts in layers
pdata.layers['counts'] = pdata.X.copy()

# Normalize, scale and compute pca
sc.pp.normalize_total(pdata, target_sum=1e4)
sc.pp.log1p(pdata)
sc.pp.scale(pdata, max_value=10)
sc.tl.pca(pdata)

# Return raw counts to X
dc.swap_layer(pdata, 'counts', X_layer_key=None, inplace=True)

In [None]:
sc.pl.pca(pdata, color=['atlas', annotation_key, 'condition', 'kit', 'sequencing machine', 'diet'], ncols=1, size=100, frameon=True, legend_fontsize=9)
sc.pl.pca_variance_ratio(pdata)

On the PCA plots above, T and B cells seemed not to be that well separated. However when looking at the hierarchical clustering in the heatmap, one can see that the inclusion of more PCs helps to distinguish them.

When looking at the p-values from the ANOVA models, it becomes clear that the top PCs, which explain most of the observed variability, are statistically associated with the `cell_type` category.

### Pseudo-bulk profile gene filtering
Additionally to filtering low quality samples, we can also filter noisy expressed genes in case we want to perform downstream analyses such as DEA afterwards. Note that this step should be done at the cell type level, since each cell type may express different collection of genes.

For this vignette, we will explore the effects of COVID on T cells. Let's first select our samples of interest:

To filter genes, we will follow the strategy implemented in the function `filterByExpr` from [edgeR](https://rdrr.io/bioc/edgeR/man/filterByExpr.html).
It keeps genes that have a minimum total number of reads across samples (`min_total_count`), and that have a minimum number of counts in a number of samples (`min_count`).

We can plot how many genes we would keep; you can play with `min_count` and `min_total_count` to check how many genes would be kept when they are changed:

In [None]:
adata.obs['cell_type_annotation_lv1'].cat.categories


In [None]:
# Coarse group name -> list of `cell_type_annotation_lv1` labels that are
# merged into that group.
# NOTE(review): 'Goblet/EEC prog. (early)' is listed under both 'EEC' and
# 'secr_progenitor'; when this dict is inverted the later group wins, so the
# label ends up in 'secr_progenitor' — confirm this is intended.
# NOTE(review): 'TA (prox.))' ends with two closing parentheses; verify that
# it matches the category name in the data exactly.
renaming_dict ={ 'EEC' :['Goblet/EEC prog. (early)',
'K-cell (Gip+)',
'EC (mature)',
'EC (immature)',
'EEC (Peptide/immature)' ,
'L/I-cell (Glp1+/Cck+)' ,
'EEC prog. (mid)',
'EC prog. (late)',
'D-cell (Sst+)',
'EEC prog. (late/Peptide)',
'EC 2' ,
'X-cell (Ghrl+)'], 'secr_progenitor' : ['Goblet/EEC prog. (early)','Tuft prog.','Tuft prog. 2','Goblet-Paneth-like(cycling)','Paneth prog.','Goblet prog. (late)'],
'TA' : ['TA (prox.))','TA'],
'Enterocyte':['early Enterocyte','Enterocyte']}

In [None]:
adata.obs['cell_type_annotation_lv0'] = adata.obs['cell_type_annotation_lv1'].copy()

In [None]:
# Reverse dictionary: fine (lv1) label -> coarse (lv0) group.
# A label that is listed under several groups in renaming_dict ends up in the
# last group that lists it.
reversed_dict = {v: k for k, values in renaming_dict.items() for v in values}

# Map the labels on plain Python objects and rebuild the categorical
# afterwards: calling Series.replace directly on a categorical column is
# deprecated in recent pandas versions. Labels without an entry in
# reversed_dict (including missing values) are kept unchanged.
coarse_labels = adata.obs['cell_type_annotation_lv0'].astype(object).map(
    lambda label: reversed_dict.get(label, label)
)
adata.obs['cell_type_annotation_lv0'] = coarse_labels.astype('category')



In [None]:
sc.pl.umap(adata,color=['cell_type_annotation_lv0'])

In [None]:
# Get filtered pseudo-bulk profile
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',
    groups_col=annotation_key,
    layer='raw_counts',
    mode='sum',
    min_cells=10,
    min_counts=100
)
pdata

In [None]:
# Select EECs
eecells = pdata[pdata.obs['cell_type_annotation_lv0'] == 'EEC'].copy()

In [None]:
dc.plot_filter_by_expr(eecells, group='atlas', min_count=10, min_total_count=15)

Here we can observe the frequency of genes with different total sums of counts and numbers of samples. The dashed lines indicate the current thresholds, meaning that only the genes in the upper-right corner are going to be kept. The choice of filtering parameters is arbitrary, but a good rule of thumb is to identify bimodal distributions and split them by modifying the available thresholds.
In this example, with the chosen values we would keep a good quantity of genes while filtering out potentially noisy genes.

<div class="alert alert-info">

**Note**
    
Changing the value of `min_count` will drastically change the distribution of "Number of samples", not change its threshold.
In case you want to lower or increase it, you need to play with the `group`, `large_n` and `min_prop` parameters. 


</div>

Once we are content with the threshold parameters, we can perform the actual filtering:

In [None]:
# Obtain genes that pass the thresholds (grouping the samples by `atlas`)
genes = dc.filter_by_expr(eecells, group='atlas', min_count=10, min_total_count=15)

# Filter by these genes
# NOTE: despite its name, `tcells` holds the gene-filtered EEC pseudobulk
# profiles taken from `eecells`
tcells = eecells[:, genes].copy()
tcells

Another filtering strategy is to filter out genes that are not expressed in a percentage of cells and samples, as implemented
in `decoupler.filter_by_prop`.

## Contrast between conditions
Once we have generated robust pseudo-bulk profiles, we can compute DEA. For this example, we will perform a simple
experimental design where we compare the gene expression of T cells from diseased patients against controls. We will use the
python implementation of the framework DESeq2, but we could have used any other one (`limma` or `edgeR` for example).
For a better understanding how it works, check [DESeq2's documentation](https://pydeseq2.readthedocs.io/en/latest/). Note that
more complex experimental designs can be used by adding more factors to the `design_factors` argument.

In [None]:
# Import DESeq2
from pydeseq2.dds import DeseqDataSet, DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
adata.obs['atlas'].value_counts()

In [None]:
# Build DESeq2 object from the filtered pseudobulk profiles in `tcells`.
# The design contains `kit` and `Status`; `healthy` is set as the reference
# level of `Status`.
inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
    adata=tcells,
    design_factors=['kit','Status'],
    ref_level=['Status', 'healthy'],
    refit_cooks=True,
    inference=inference,
)

In [None]:
# Compute LFCs
dds.deseq2()

In [None]:
# Extract contrast of `Status`: diseased vs healthy
stat_res = DeseqStats(
    dds,
    contrast=["Status", 'diseased', 'healthy'],
    inference=inference,
)

In [None]:
# Compute Wald test
stat_res.summary()

In [None]:
# Extract results
results_df = stat_res.results_df
results_df

We can plot the obtained results in a volcano plot:

In [None]:
sc.pl.violin(adata,groupby='atlas',keys=['Fabp1', 'Defa17'])

In [None]:
dc.plot_volcano_df(
    results_df,
    x='log2FoldChange',
    y='padj',
    top=20,
    figsize=(8, 4)
)

After performing DEA, we can use the obtained gene level statistics to perform enrichment analysis. Any statistic can be used,
but we recommend using the t-values instead of logFCs since t-values incorporate the significance of change in their value.
We will transform the obtained t-values stored in the `stat` column to a wide matrix so that it can be used by `decoupler`:

In [None]:
mat = results_df[['stat']].T.rename(index={'stat': 'EECs'})
mat

## Transcription factor activity inference

The first functional analysis we can perform is to infer transcription factor (TF) activities from our transcriptomics data. We will need a gene regulatory network (GRN) and a statistical method.

### CollecTRI network
[CollecTRI](https://github.com/saezlab/CollecTRI) is a comprehensive resource
containing a curated collection of TFs and their transcriptional targets
compiled from 12 different resources. This collection provides an increased
coverage of transcription factors and a superior performance in identifying
perturbed TFs compared to our previous
[DoRothEA](https://saezlab.github.io/dorothea/) network and other literature
based GRNs. Similar to DoRothEA, interactions are weighted by their mode of
regulation (activation or inhibition).

For this example we will use the mouse version (human and rat are also
available). We can use `decoupler` to retrieve it from `omnipath`. The argument
`split_complexes` keeps complexes or splits them into subunits; by default we
recommend keeping complexes together.

<div class="alert alert-info">

**Note**

In this tutorial we use the network CollecTRI, but we could use any other GRN coming from an inference method such as [CellOracle](https://morris-lab.github.io/CellOracle.documentation/), [pySCENIC](https://pyscenic.readthedocs.io/en/latest/) or [SCENIC+](https://scenicplus.readthedocs.io/en/latest/). 

</div> 

In [None]:
# Retrieve CollecTRI gene regulatory network
collectri = dc.get_collectri(organism='mouse', split_complexes=False)
collectri

### Activity inference with Univariate Linear Model (ULM)

To infer TF enrichment scores we will run the Univariate Linear Model (`ulm`) method. For each sample in our dataset (`mat`) and each TF in our network (`net`), it fits a linear model that predicts the observed gene expression
based solely on the TF's TF-Gene interaction weights. Once fitted, the obtained t-value of the slope is the score. If it is positive, we interpret that the TF is active and if it is negative we interpret that it is inactive.

<img src="../ulm.png" />

We can run `ulm` with a one-liner:

### only kit and atlas

In [None]:
# Infer transcription factor activities with ulm, using the CollecTRI network
# on the contrast statistics in `mat`
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri)
tf_acts

Let us plot the obtained scores for the top active/inactive transcription factors:

In [None]:
dc.plot_barplot(
    acts=tf_acts,
    contrast='EECs',
    top=25,
    vertical=True,
    figsize=(3, 6)
)

In accordance to the previous pathway results, T cells seem to activate for TFs responsible for cell growth (E2F4, TFDP1, E2F1).

Like with pathways, we can explore how the target genes look like:

In [None]:
# Extract logFCs and pvals
logFCs = results_df[['log2FoldChange']].T.rename(index={'log2FoldChange': 'EECs'})
pvals = results_df[['padj']].T.rename(index={'padj': 'EECs'})

# Plot
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='EECs',
    name='Myc',
    net=collectri,
    top=100,
    sign_thr=0.05,
    lFCs_thr=0.5
)

We can also plot the network of interesting TFs (top and bottom by activity) and color the nodes by activity and target gene expression:

In [None]:
dc.plot_network(
    net=collectri,
    obs=mat,
    act=tf_acts,
    n_sources=['Myc', 'Foxa2', 'Spdef', 'Rel','Rela','Bcl6'],
    n_targets=15,
    node_size=100,
    figsize=(7, 7),
    c_pos_w='darkgreen',
    c_neg_w='darkred',
    vcenter=True
)

Green edges are positive regulation (activation), red edges are negative regulation (inactivation):

In [None]:
dc.run_ulm(
    mat=adata,
    net=collectri,
    source='source',
    target='target',
    weight='weight',
    verbose=True,
    use_raw=False
)

In [None]:
acts = dc.get_acts(adata, obsm_key='ulm_estimate')
acts

In [None]:
sc.pl.umap(acts, color=['Egr1', 'cell_type_annotation_lv0','atlas'], cmap='RdBu_r', vcenter=0, size=4)
sc.pl.violin(acts, keys=['Egr1'], groupby='cell_type_annotation_lv0', rotation=90)

In [None]:
sc.pl.umap(acts, color=['Nfkb1','Stat3','Egr1','Pax6','atlas', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0, size =4)
sc.pl.violin(acts, keys=['Nfkb1','Stat3','Egr1','Pax6'], groupby='cell_type_annotation_lv0', rotation=90)

In [None]:
sc.pl.umap(acts, color=['Pax6', 'cell_type_annotation_lv0'], cmap='RdBu_r', vcenter=0)
sc.pl.violin(acts, keys=['Pax6'], groupby='cell_type_annotation_lv0', rotation=90)

### with diet in design factors

In [None]:
# Infer transcription factor activities with ulm, using the CollecTRI network
# on the contrast statistics in `mat`
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri)
tf_acts

Let us plot the obtained scores for the top active/inactive transcription factors:

In [None]:
dc.plot_barplot(
    acts=tf_acts,
    contrast='EECs',
    top=25,
    vertical=True,
    figsize=(3, 6)
)

In accordance to the previous pathway results, T cells seem to activate for TFs responsible for cell growth (E2F4, TFDP1, E2F1).

Like with pathways, we can explore how the target genes look like:

In [None]:
# Extract logFCs and pvals
logFCs = results_df[['log2FoldChange']].T.rename(index={'log2FoldChange': 'EECs'})
pvals = results_df[['padj']].T.rename(index={'padj': 'EECs'})

# Plot
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='EECs',
    name='Irf1',
    net=collectri,
    top=100,
    sign_thr=0.05,
    lFCs_thr=0.5
)

We can also plot the network of interesting TFs (top and bottom by activity) and color the nodes by activity and target gene expression:

In [None]:
dc.plot_network(
    net=collectri,
    obs=mat,
    act=tf_acts,
    n_sources=['Irf1', 'Bcl6', 'Ciita', 'Rela'],
    n_targets=15,
    node_size=100,
    figsize=(7, 7),
    c_pos_w='darkgreen',
    c_neg_w='darkred',
    vcenter=True
)

Green edges are positive regulation (activation), red edges are negative regulation (inactivation):

In [None]:
dc.run_ulm(
    mat=adata,
    net=collectri,
    source='source',
    target='target',
    weight='weight',
    verbose=True,
    use_raw=False
)

In [None]:
acts = dc.get_acts(adata, obsm_key='ulm_estimate')
acts

In [None]:
# Collect the names of all inferred activities (the variables of `acts`) in a plain list
acts_vars = list(acts.var_names)

In [None]:
Lyz_vars = [name for name in acts.var_names if str(name).startswith('B')]

In [None]:
Lyz_vars

In [None]:
sc.pl.stacked_violin(acts, ['Sox2'], groupby='atlas', dendrogram=True)

In [None]:
acts.obs['atlas_celltype'] = acts.obs['cell_type_annotation_lv0'].astype(str) + '_' + acts.obs['atlas'].astype(str)

In [None]:
sc.pl.umap(acts, color=['Sox2', 'atlas'], cmap='RdBu_r', vcenter=0) #Socs2 non ecxistent, Akt1 neither
sc.pl.violin(acts, keys=['Sox2'], groupby='atlas_celltype', rotation=90)

In [None]:
adata.obs['atlas_celltype'] = adata.obs['cell_type_annotation_lv0'].astype(str) + '_' + adata.obs['atlas'].astype(str)

In [None]:
sc.pl.umap(adata, color=['Socs2', 'cell_type_annotation_lv0'], cmap='RdBu_r', vcenter=0)
sc.pl.violin(adata, keys=['Socs2'], groupby='atlas_celltype', rotation=90)

## Pathway activity inference

Another analysis we can perform is to infer pathway activities from our transcriptomics data.

### PROGENy model

[PROGENy](https://saezlab.github.io/progeny/) is a comprehensive resource containing a curated collection of pathways and their target genes, with weights for each interaction.
For this example we will use the mouse weights (other organisms are available) and we will use the top 500 responsive genes ranked by p-value. Here is a brief description of each pathway:

- **Androgen**: involved in the growth and development of the male reproductive organs.
- **EGFR**: regulates growth, survival, migration, apoptosis, proliferation, and differentiation in mammalian cells
- **Estrogen**: promotes the growth and development of the female reproductive organs.
- **Hypoxia**: promotes angiogenesis and metabolic reprogramming when O2 levels are low.
- **JAK-STAT**: involved in immunity, cell division, cell death, and tumor formation.
- **MAPK**: integrates external signals and promotes cell growth and proliferation.
- **NFkB**: regulates immune response, cytokine production and cell survival.
- **p53**: regulates cell cycle, apoptosis, DNA repair and tumor suppression.
- **PI3K**: promotes growth and proliferation.
- **TGFb**: involved in development, homeostasis, and repair of most tissues.
- **TNFa**: mediates haematopoiesis, immune surveillance, tumour regression and protection from infection.
- **Trail**: induces apoptosis.
- **VEGF**: mediates angiogenesis, vascular permeability, and cell migration.
- **WNT**: regulates organ morphogenesis during development and tissue repair.

To access it we can use `decoupler`.

In [None]:
# Retrieve PROGENy model weights
progeny = dc.get_progeny(organism='mouse',top=500)
progeny

### Activity inference with Multivariate Linear Model (MLM)

To infer pathway enrichment scores we will run the Multivariate Linear Model (`mlm`) method. For each sample in our dataset (`adata`), it fits a linear model that predicts the observed gene expression based on all pathways' Pathway-Gene interactions weights.
Once fitted, the obtained t-values of the slopes are the scores. If it is positive, we interpret that the pathway is active and if it is negative we interpret that it is inactive.

<img src="../mlm.png" />
     
We can run `mlm` with a one-liner:

In [None]:
# Infer pathway activities with mlm
pathway_acts, pathway_pvals = dc.run_mlm(mat=mat, net=progeny)
pathway_acts

Let us plot the obtained scores:

In [None]:
dc.plot_barplot(
    acts=pathway_acts,
    contrast='EECs',
    top=25,
    vertical=False,
    figsize=(6, 3)
)

It looks like JAK-STAT, a known immunity pathway is more active in T cells from COVID-19 patients than in controls. To further explore how the target genes of a pathway of interest behave, we can plot them in scatter plot:

In [None]:
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='WNT',
    net=progeny,
    top=15
)

The observed activation of JAK-STAT is due to the fact that the majority of its target genes with positive weights have positive
t-values (1st quadrant), and the majority of the ones with negative weights have negative t-values (3rd quadrant).

In [None]:
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='Trail',
    net=progeny,
    top=15
)

In [None]:
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='Hypoxia',
    net=progeny,
    top=15
)

In [None]:
dc.run_mlm(
    mat=adata,
    net=progeny,
    source='source',
    target='target',
    weight='weight',
    verbose=True,
    use_raw=False
)

In [None]:
adata.obsm['progeny_mlm_estimate'] = adata.obsm['mlm_estimate'].copy()
adata.obsm['progeny_mlm_pvals'] = adata.obsm['mlm_pvals'].copy()
adata

In [None]:
acts = dc.get_acts(adata, obsm_key='mlm_estimate')
acts

In [None]:
sc.pl.umap(acts, color=['WNT','atlas', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0)
sc.pl.violin(acts, keys=['WNT'], groupby='cell_type_annotation_lv0', rotation=90)

In [None]:
sc.pl.umap(acts, color=['Trail', 'cell_type_annotation_lv0','atlas'], cmap='RdBu_r', vcenter=0)
sc.pl.violin(acts, keys=['Trail'], groupby='cell_type_annotation_lv0', rotation=90)

In [None]:
sc.pl.umap(acts, color=['Hypoxia', 'cell_type_annotation_lv0'], cmap='RdBu_r', vcenter=0)
sc.pl.violin(acts, keys=['Hypoxia'], groupby='cell_type_annotation_lv0', rotation=90)

In [None]:
sc.pl.matrixplot(acts, var_names=acts.var_names, groupby='cell_type_annotation_lv1', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')

# only EEC

In [None]:
EECs = ['Goblet/EEC prog. (early)',
'K-cell (Gip+)',
'EC (mature)',
'EC (immature)',
'EEC (Peptide/immature)' ,
'L/I-cell (Glp1+/Cck+)' ,
'EEC prog. (mid)',
'EC prog. (late)',
'D-cell (Sst+)',
'EEC prog. (late/Peptide)',
'EC 2' ,
'X-cell (Ghrl+)']

In [None]:
adata = adata[adata.obs['cell_type_annotation_lv1'].isin(EECs)].copy()

We can also look how cells cluster by cell identity:

In [None]:
# Repeat the preprocessing and embedding on the EEC-only subset.
# NOTE(review): if this runs in the same session as the earlier preprocessing
# cell, adata.X has already been scaled there, so the highly variable gene
# selection and the scaling below operate on already-scaled values — confirm
# this is intended.

# Identify highly variable genes (computed with `sample` as batch key)
sc.pp.highly_variable_genes(adata, batch_key='sample')

# Scale the data, clipping values at 10
sc.pp.scale(adata, max_value=10)

# Generate PCA features from the highly variable genes only
sc.tl.pca(adata, svd_solver='arpack', use_highly_variable=True)

# Compute distances in the PCA space, and find cell neighbors
sc.pp.neighbors(adata)

# Generate UMAP features
sc.tl.umap(adata)

# Visualize disease status, kit and the cell type annotation
sc.pl.umap(adata, color=['Status','kit',annotation_key], frameon=False, wspace =0.65)

In [None]:
import gc
gc.collect()

In this data-set we have two condition, `COVID-19` and `healthy`, across 6 different cell types.

## Generation of pseudo-bulk profiles

After the annotation of clusters into cell identities, we often would like to perform differential expression analysis (DEA) between conditions within particular cell types to further characterize them. DEA can be performed at the single-cell level, but the obtained p-values are often inflated as each cell is treated as a sample. We know that single cells within a sample are not independent of each other, since they were isolated from the same environment. If we treat cells as samples, we are not testing the variation across a population of samples, rather the variation inside an individual one. Moreover, if a sample has more cells than another it might bias the results. 

The current best practice to correct for this is using a pseudo-bulk approach ([Squair J.W., et al 2021](https://doi.org/10.1038/s41467-021-25960-2)), which involves the following steps:

1. Subsetting the cell type(s) of interest to perform DEA.
2. Extracting their raw integer counts.
3. Summing their counts per gene into a single profile if they pass quality control.
4. Performing DEA if at least two biological replicates per condition are available (more replicates are recommended).

We can pseudobulk using the function `decoupler.get_pseudobulk`. In this example, we are interested in summing the counts but other
modes are available, for more information check its argument `mode`.

In [None]:
adata

In [None]:
# Keep only the meta-data columns that are used downstream.
columns = ['sample', 'pretty name','phase','line', 'target cell number', 'leiden', 'Project', 'Status','sequencing', 'condition', 'kit', 'strain', 'enriched','enrichment proportion', 'treatment', 'doublet_calls','diet', 'sequencing machine', 'cell_type_annotation_lv1', 'atlas']

# An earlier cell of this notebook already restricts adata.obs to a shorter
# column list (without e.g. 'line' or 'doublet_calls'), so some of the
# requested columns may be gone by the time this cell runs. Select the ones
# that exist and report the rest instead of failing with a KeyError.
available_columns = [col for col in columns if col in adata.obs.columns]
missing_columns = [col for col in columns if col not in adata.obs.columns]
if missing_columns:
    print(f"Meta-data columns not found in adata.obs and skipped: {missing_columns}")
adata.obs = adata.obs[available_columns]

In [None]:
# Get pseudo-bulk profile
# Sums the 'raw_counts' layer for every combination of sample and annotation_key group.
# min_cells and min_counts are both 0 here, i.e. no threshold on cell number or
# total counts is applied at this stage.
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',
    groups_col=annotation_key,
    layer='raw_counts',
    mode='sum',
    min_cells=0,
    min_counts=0
)

In [None]:
pdata

It has generated a profile for each sample and cell type. We can plot their quality control metrics:

In [None]:
dc.plot_psbulk_samples(pdata, groupby=annotation_key, figsize=(12, 4))

In [None]:
adata

In [None]:
# Build the neighbor graph on the 'X_scarches_emb' representation (30 neighbors)
# and recompute the UMAP embedding of adata from that graph
sc.pp.neighbors(adata, n_neighbors=30, use_rep='X_scarches_emb')
sc.tl.umap(adata, min_dist=0.3)

In [None]:
sc.pl.umap(adata,color=['atlas'])

In [None]:
dc.plot_psbulk_samples(pdata, groupby='atlas', figsize=(12, 4))

In [None]:
pdata

There are two criteria to filter low-quality samples: the number of cells (`psbulk_n_cells`) and the total sum of counts (`psbulk_counts`).
In these plots it can be seen that there are some samples of platelet cells that contain fewer than 10 cells; we might want to remove
them by using the arguments `min_cells` and `min_counts`. Note that these thresholds are arbitrary and will change depending on the
dataset, but a good rule of thumb is to have at least 10 cells with 1000 accumulated counts.

In [None]:
# Get filtered pseudo-bulk profile
# NOTE(review): the thresholds used here (3 cells, 10 counts) are far below the
# rule of thumb quoted in the accompanying text (10 cells, 1000 counts) —
# confirm that this permissive setting is intended.
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',
    groups_col=annotation_key,
    layer='raw_counts',
    mode='sum',
    min_cells=3,
    min_counts=10
)
pdata

### Exploration of pseudobulk profiles
Now that we have generated the pseudobulk profiles for each patient and each cell type, let's explore the variability between them. For that, we will first do some simple preprocessing and then run a PCA.

In [None]:
# Store raw counts in layers
pdata.layers['counts'] = pdata.X.copy()

# Normalize, scale and compute pca
# (these calls modify pdata in place; the raw counts saved in
# pdata.layers['counts'] above are moved back into X afterwards)
sc.pp.normalize_total(pdata, target_sum=1e4)
sc.pp.log1p(pdata)
sc.pp.scale(pdata, max_value=10)
sc.tl.pca(pdata)

# Return raw counts to X
# NOTE(review): X_layer_key=None presumably means the scaled matrix is not
# kept in a layer — confirm against the decoupler documentation
dc.swap_layer(pdata, 'counts', X_layer_key=None, inplace=True)

In [None]:
sc.pl.pca(pdata, color=['atlas', 'Status',annotation_key, 'condition', 'kit', 'sequencing machine', 'diet'], ncols=1, size=100, frameon=True, legend_fontsize=9)
sc.pl.pca_variance_ratio(pdata)

In [None]:
sc.pl.pca(pdata, color=[annotation_key], ncols=1, size=50, frameon=True, legend_fontsize=4, legend_loc='on data')


In [None]:
sc.pl.pca(pdata, color=[annotation_key],components=['1,3'], ncols=1, size=50, frameon=True, legend_fontsize=6)

In [None]:
sc.pl.pca(pdata, color=['atlas', annotation_key, 'condition', 'kit', 'sequencing machine', 'diet'],components=['2,3'], ncols=1, size=50, frameon=True, legend_fontsize=9)


When looking at the PCA, it seems like the first two components explain most of the variance and they easily separate cell types from one another. In contrast, the principal components do not seem to be associated with disease status as such.

In order to have a better overview of the association of PCs with sample metadata, let's perform ANOVA on each PC and see whether they are significantly associated with any technical or biological annotations of our samples

In [None]:
pdata

In [None]:
dc.get_metadata_associations(
    pdata,
    obs_keys = ['line', 'Project', 'Status', 'sequencing', 'condition', 'kit', 'strain', 'enriched', 'treatment', 'diet', 'atlas', annotation_key, 'psbulk_n_cells', 'psbulk_counts'],  # Metadata columns to associate to PCs
    obsm_key='X_pca',  # Where the PCs are stored
    uns_key='pca_anova',  # Where the results are stored
    inplace=True
)

In [None]:
pdata.uns['pca_anova']

In [None]:
print(pdata.uns['pca_anova'].describe())


In [None]:
# Plot the PC scores per pseudo-bulk profile next to the ANOVA results
# stored in pdata.uns['pca_anova'] for the first 10 factors.
dc.plot_associations(
    pdata,
    uns_key='pca_anova',  # Summary statistics from the anova tests
    obsm_key='X_pca',  # where the PCs are stored
    stat_col='p_adj',  # Which summary statistic to plot
    obs_annotation_cols = [annotation_key], # which sample annotations to plot
    # Figure title spelling fixed: "Principal", not "Principle"
    titles=['Principal component scores', 'Adjusted p-values from ANOVA'],
    figsize=(9, 7),
    n_factors=10,
)

### Pseudo-bulk profile gene filtering
In addition to filtering low-quality samples, we can also filter out noisily expressed genes in case we want to perform downstream analyses such as DEA afterwards. Note that this step should be done at the cell type level, since each cell type may express a different collection of genes.

For this vignette, we will explore the effects of COVID on T cells. Let's first select our samples of interest:

To filter genes, we will follow the strategy implemented in the function `filterByExpr` from [edgeR](https://rdrr.io/bioc/edgeR/man/filterByExpr.html).
It keeps genes that have a minimum total number of reads across samples (`min_total_count`), and that have a minimum number of counts in a number of samples (`min_count`).

We can plot how many genes we keep; you can play with `min_count` and `min_total_count` to check how many genes would be kept when they are changed:

In [None]:
# Get filtered pseudo-bulk profile
# Rebuilds pdata from adata with the same arguments as the earlier filtered
# call; this replaces the object on which the PCA and the ANOVA results were
# stored, so pdata holds freshly summed raw counts from here on.
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',
    groups_col=annotation_key,
    layer='raw_counts',
    mode='sum',
    min_cells=3,
    min_counts=10
)
pdata

In [None]:
pdata.obs['cell_type_annotation_lv1'].value_counts()

In [None]:
# Select EECs
# NOTE(review): in the visible part of this notebook `eecells` is only
# referenced from a commented-out line; the following cells keep working on
# the full `pdata` — confirm whether the EEC subset should be used instead.
eecells = pdata[pdata.obs['cell_type_annotation_lv1'] == 'EEC'].copy()

In [None]:
# Visualize the gene filter for the given thresholds, grouping profiles by 'atlas'.
# NOTE(review): this is run on `pdata` (all cell types), not on the `eecells`
# subset selected in the previous cell — confirm this is intended.
dc.plot_filter_by_expr(pdata, group='atlas', min_count=3, min_total_count=10)

Here we can observe the frequency of genes with different total sum of counts and number of samples. The dashed lines indicate the current thresholds, meaning that only the genes in the upper-right corner are going to be kept. Filtering parameters is completely arbitrary, but a good rule of thumb is to identify bimodal distributions and split them modifying the available thresholds.
In this example, with the default values we would keep a good quantity of genes while filtering potential noisy genes.

<div class="alert alert-info">

**Note**
    
Changing the value of `min_count` will drastically change the distribution of "Number of samples", not change its threshold.
In case you want to lower or increase it, you need to play with the `group`, `large_n` and `min_prop` parameters. 


</div>

Once we are content with the threshold parameters, we can perform the actual filtering:

In [None]:
# Obtain genes that pass the thresholds
genes = dc.filter_by_expr(pdata, group='atlas', min_count=3, min_total_count=10)

# Filter by these genes
# NOTE(review): the subsetting below is commented out, so `genes` is computed
# but never applied to any object; the name `genes` is also reassigned further
# down in the notebook. Confirm whether the gene filter should be active.
#tcells = eecells[:, genes].copy()
#tcells

Another filtering strategy is to filter out genes that are not expressed in a percentage of cells and samples, as implemented
in `decoupler.filter_by_prop`.

## Contrast between conditions
Once we have generated robust pseudo-bulk profiles, we can compute DEA. For this example, we will perform a simple
experimental design where we compare the gene expression of T cells from diseased patients against controls. We will use the
python implementation of the framework DESeq2, but we could have used any other one (`limma` or `edgeR` for example).
For a better understanding how it works, check [DESeq2's documentation](https://pydeseq2.readthedocs.io/en/latest/). Note that
more complex experimental designs can be used by adding more factors to the `design_factors` argument.

In [None]:
# Import DESeq2
from pydeseq2.dds import DeseqDataSet, DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
adata.obs['atlas'].value_counts()

In [None]:
# Build DESeq2 object
# Design factors are 'kit' and 'Status'; 'healthy' is the reference level of 'Status'.
# NOTE(review): the model is fit on `pdata`, i.e. on all pseudo-bulk profiles,
# while later cells label the resulting statistics 'EECs' — confirm whether the
# EEC-only subset was meant to be passed here.
inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
    adata=pdata,
    design_factors=['kit','Status'],
    ref_level=['Status', 'healthy'],
    refit_cooks=True,
    inference=inference,
)

In [None]:
# Compute LFCs
dds.deseq2()

In [None]:
# Extract the contrast for 'Status': 'diseased' vs 'healthy'
stat_res = DeseqStats(
    dds,
    contrast=["Status", 'diseased', 'healthy'],
    inference=inference,
)

In [None]:
# Compute Wald test
stat_res.summary()

In [None]:
# Extract results
results_df = stat_res.results_df
results_df

We can plot the obtained results in a volcano plot:

In [None]:
sc.pl.violin(adata,groupby='atlas',keys=['Fabp1', 'Defa17'])

In [None]:
dc.plot_volcano_df(
    results_df,
    x='log2FoldChange',
    y='padj',
    top=30,
    figsize=(10, 4)
)

After performing DEA, we can use the obtained gene level statistics to perform enrichment analysis. Any statistic can be used,
but we recommend using the t-values instead of logFCs since t-values incorporate the significance of change in their value.
We will transform the obtained t-values stored in the `stat` column to a wide matrix so that it can be used by `decoupler`:

In [None]:
# One-row wide matrix for decoupler: the 'stat' column is relabelled 'EECs'
# and transposed so that genes become columns
mat = results_df[['stat']].rename(columns={'stat': 'EECs'}).T
mat

## Transcription factor activity inference

The first functional analysis we can perform is to infer transcription factor (TF) activities from our transcriptomics data. We will need a gene regulatory network (GRN) and a statistical method.

### CollecTRI network
[CollecTRI](https://github.com/saezlab/CollecTRI) is a comprehensive resource
containing a curated collection of TFs and their transcriptional targets
compiled from 12 different resources. This collection provides an increased
coverage of transcription factors and a superior performance in identifying
perturbed TFs compared to our previous
[DoRothEA](https://saezlab.github.io/dorothea/) network and other literature
based GRNs. Similar to DoRothEA, interactions are weighted by their mode of
regulation (activation or inhibition).

For this example we will use the mouse version (human and rat are also
available). We can use `decoupler` to retrieve it from `omnipath`. The argument
`split_complexes` keeps complexes or splits them into subunits; by default we
recommend keeping complexes together.

<div class="alert alert-info">

**Note**

In this tutorial we use the network CollecTRI, but we could use any other GRN coming from an inference method such as [CellOracle](https://morris-lab.github.io/CellOracle.documentation/), [pySCENIC](https://pyscenic.readthedocs.io/en/latest/) or [SCENIC+](https://scenicplus.readthedocs.io/en/latest/). 

</div> 

In [None]:
# Retrieve CollecTRI gene regulatory network
# (mouse version; split_complexes=False keeps complexes together instead of
# splitting them into subunits)
collectri = dc.get_collectri(organism='mouse', split_complexes=False)
collectri

### Activity inference with Univariate Linear Model (ULM)

To infer TF enrichment scores we will run the Univariate Linear Model (`ulm`) method. For each sample in our dataset (`mat`) and each TF in our network (`net`), it fits a linear model that predicts the observed gene expression
based solely on the TF's TF-Gene interaction weights. Once fitted, the obtained t-value of the slope is the score. If it is positive, we interpret that the TF is active and if it is negative we interpret that it is inactive.

<img src="../ulm.png" />

We can run `ulm` with a one-liner:

### only kit and atlas

In [None]:
# Infer TF activities with ulm (the network is CollecTRI, so the scores are
# per transcription factor, not per pathway)
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri)
tf_acts

Let us plot the obtained scores for the top active/inactive transcription factors:

In [None]:
dc.plot_barplot(
    acts=tf_acts,
    contrast='EECs',
    top=25,
    vertical=True,
    figsize=(3, 6)
)

Like with pathways, we can explore how the target genes look like:

In [None]:
# Extract logFCs and pvals
logFCs = results_df[['log2FoldChange']].T.rename(index={'log2FoldChange': 'EECs'})
pvals = results_df[['padj']].T.rename(index={'padj': 'EECs'})

# Plot
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='EECs',
    name='Ctnnb1',
    net=collectri,
    top=200,
    sign_thr=0.05,
    lFCs_thr=0.5
)

In [None]:
# Plot
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='EECs',
    name='Ppara',
    net=collectri,
    top=100,
    sign_thr=0.05,
    lFCs_thr=0.5
)

In [None]:
# Plot
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='EECs',
    name='Myc',
    net=collectri,
    top=100,
    sign_thr=0.05,
    lFCs_thr=0.5
)

In [None]:
# Plot
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='EECs',
    name='Irf1',
    net=collectri,
    top=100,
    sign_thr=0.05,
    lFCs_thr=0.5
)

In [None]:
# Plot
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='EECs',
    name='Rel',
    net=collectri,
    top=100,
    sign_thr=0.05,
    lFCs_thr=0.5
)

In [None]:
# Plot
# NOTE(review): this cell is identical to the preceding 'Rel' volcano cell —
# possibly another TF name was intended here.
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='EECs',
    name='Rel',
    net=collectri,
    top=100,
    sign_thr=0.05,
    lFCs_thr=0.5
)

We can also plot the network of interesting TFs (top and bottom by activity) and color the nodes by activity and target gene expression:

In [None]:
# Plot the CollecTRI sub-network for an explicit list of TFs (n_sources),
# using `mat` for the target values and `tf_acts` for the TF activities
dc.plot_network(
    net=collectri,
    obs=mat,
    act=tf_acts,
    n_sources=['Myc', 'Ppara', 'Ppargc1b', 'Ctnnb1','Rel','Irf1','Neurog3', 'Sox18'],
    n_targets=15,
    node_size=100,
    figsize=(12, 12),
    c_pos_w='darkgreen', # edge color for positive weights (activation)
    c_neg_w='darkred', # edge color for negative weights
    t_cmap='YlOrRd',
    vcenter=True
)

Green edges are positive regulation (activation), red edges are negative regulation (inactivation):

In [None]:
# Run ulm on the single-cell object itself instead of the contrast matrix.
# The return value is not captured; the next cell reads the scores from
# adata.obsm['ulm_estimate'].
dc.run_ulm(
    mat=adata,
    net=collectri,
    source='source',
    target='target',
    weight='weight',
    verbose=True,
    use_raw=False
)

In [None]:
# Combined label: 'Status' and 'strain' are both cast to str and joined with '_'
adata.obs['Status_strain'] = adata.obs['Status'].astype(str).str.cat(
    adata.obs['strain'].astype(str), sep='_'
)

# Preview the two source columns next to the combined one
print(adata.obs[['Status', 'strain', 'Status_strain']].head())

In [None]:
acts = dc.get_acts(adata, obsm_key='ulm_estimate')
acts

In [None]:
# Plain list of all variable names of `acts` (the inferred activities)
genes = list(acts.var_names)

In [None]:
genes

In [None]:
sc.pl.umap(acts, color=['Egr1','atlas', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0, size=4)
sc.pl.violin(acts, keys=['Egr1'], groupby='Status_strain', rotation=90)

In [None]:
sc.pl.umap(adata, color=['Foxa2','Glis3','Hhex','Pax6','atlas', 'cell_type_annotation_lv1'], layer= 'log_dca_counts',cmap=mymap, size =4, frameon=True)
sc.pl.violin(adata, keys=['Nfkb1','Stat3','Egr1','Pax6'],layer='sct_logcounts', groupby='Status_strain', rotation=90)

### with diet in design factors

In [None]:
# Infer TF activities with ulm (CollecTRI network, so scores are per TF)
# NOTE(review): this uses whatever `mat` currently holds; it only reflects a
# design that includes diet if the DESeq2 cells were re-run with it — confirm.
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri)
tf_acts

Let us plot the obtained scores for the top active/inactive transcription factors:

In [None]:
dc.plot_barplot(
    acts=tf_acts,
    contrast='EECs',
    top=25,
    vertical=True,
    figsize=(3, 6)
)

In accordance to the previous pathway results, T cells seem to activate for TFs responsible for cell growth (E2F4, TFDP1, E2F1).

Like with pathways, we can explore how the target genes look like:

In [None]:
# Extract logFCs and pvals
logFCs = results_df[['log2FoldChange']].T.rename(index={'log2FoldChange': 'EECs'})
pvals = results_df[['padj']].T.rename(index={'padj': 'EECs'})

# Plot
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='EECs',
    name='Irf1',
    net=collectri,
    top=100,
    sign_thr=0.05,
    lFCs_thr=0.5
)

We can also plot the network of interesting TFs (top and bottom by activity) and color the nodes by activity and target gene expression:

In [None]:
dc.plot_network(
    net=collectri,
    obs=mat,
    act=tf_acts,
    n_sources=['Irf1', 'Bcl6', 'Ciita', 'Rela'],
    n_targets=15,
    node_size=100,
    figsize=(7, 7),
    c_pos_w='darkgreen',
    c_neg_w='darkred',
    vcenter=True
)

Green edges are positive regulation (activation), red edges are negative regulation (inactivation):

In [None]:
# Run ulm on the single-cell object; the return value is not captured and the
# next cell reads the scores from adata.obsm['ulm_estimate'].
dc.run_ulm(
    mat=adata,
    net=collectri,
    source='source',
    target='target',
    weight='weight',
    verbose=True,
    use_raw=False
)

In [None]:
acts = dc.get_acts(adata, obsm_key='ulm_estimate')
acts

In [None]:
# Plain list of all variable names of `acts`
acts_vars = list(acts.var_names)

In [None]:
# Variable names of `acts` that start with 'B'
# NOTE(review): the variable is called Lyz_vars but the filter prefix is 'B' —
# confirm which prefix is intended.
Lyz_vars = [name for name in acts.var_names if str(name).startswith('B')]

In [None]:
Lyz_vars

In [None]:
sc.pl.stacked_violin(acts, ['Sox2'], groupby='atlas', dendrogram=True)

In [None]:
# Combined "<cell type>_<atlas>" label used to group the plots below.
# 'cell_type_annotation_lv0' is not among the columns kept when adata.obs was
# subset earlier in this notebook (only 'cell_type_annotation_lv1' is), and
# `acts` is derived from adata, so indexing lv0 raises KeyError. Use the lv1
# annotation, which is the one that is still available.
acts.obs['atlas_celltype'] = acts.obs['cell_type_annotation_lv1'].astype(str) + '_' + acts.obs['atlas'].astype(str)

In [None]:
# Sox2 activity on the UMAP and per atlas/cell-type group
sc.pl.umap(acts, color=['Sox2', 'atlas'], cmap='RdBu_r', vcenter=0) # Socs2 does not exist (presumably among the inferred activities), neither does Akt1
sc.pl.violin(acts, keys=['Sox2'], groupby='atlas_celltype', rotation=90)

In [None]:
# Combined "<cell type>_<atlas>" label on the single-cell object.
# 'cell_type_annotation_lv0' was dropped when adata.obs was restricted to a
# fixed column list earlier in this notebook (KeyError); the retained
# 'cell_type_annotation_lv1' annotation is used instead.
adata.obs['atlas_celltype'] = adata.obs['cell_type_annotation_lv1'].astype(str) + '_' + adata.obs['atlas'].astype(str)

In [None]:
# Socs2 expression on the UMAP next to the cell-type annotation, and per
# atlas/cell-type group. The annotation column is 'cell_type_annotation_lv1':
# 'cell_type_annotation_lv0' is no longer in adata.obs after the column subset
# made earlier in this notebook, so coloring by it fails.
sc.pl.umap(adata, color=['Socs2', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0)
sc.pl.violin(adata, keys=['Socs2'], groupby='atlas_celltype', rotation=90)

## Pathway activity inference

Another analysis we can perform is to infer pathway activities from our transcriptomics data.

### PROGENy model

[PROGENy](https://saezlab.github.io/progeny/) is a comprehensive resource containing a curated collection of pathways and their target genes, with weights for each interaction.
For this example we will use the mouse weights (other organisms are available) and we will use the top 500 responsive genes ranked by p-value. Here is a brief description of each pathway:

- **Androgen**: involved in the growth and development of the male reproductive organs.
- **EGFR**: regulates growth, survival, migration, apoptosis, proliferation, and differentiation in mammalian cells
- **Estrogen**: promotes the growth and development of the female reproductive organs.
- **Hypoxia**: promotes angiogenesis and metabolic reprogramming when O2 levels are low.
- **JAK-STAT**: involved in immunity, cell division, cell death, and tumor formation.
- **MAPK**: integrates external signals and promotes cell growth and proliferation.
- **NFkB**: regulates immune response, cytokine production and cell survival.
- **p53**: regulates cell cycle, apoptosis, DNA repair and tumor suppression.
- **PI3K**: promotes growth and proliferation.
- **TGFb**: involved in development, homeostasis, and repair of most tissues.
- **TNFa**: mediates haematopoiesis, immune surveillance, tumour regression and protection from infection.
- **Trail**: induces apoptosis.
- **VEGF**: mediates angiogenesis, vascular permeability, and cell migration.
- **WNT**: regulates organ morphogenesis during development and tissue repair.

To access it we can use `decoupler`.

In [None]:
# Retrieve PROGENy model weights (mouse, top 500 genes per pathway)
progeny = dc.get_progeny(organism='mouse',top=500)
progeny

### Activity inference with Multivariate Linear Model (MLM)

To infer pathway enrichment scores we will run the Multivariate Linear Model (`mlm`) method. For each sample in our dataset (`adata`), it fits a linear model that predicts the observed gene expression based on all pathways' Pathway-Gene interactions weights.
Once fitted, the obtained t-values of the slopes are the scores. If it is positive, we interpret that the pathway is active and if it is negative we interpret that it is inactive.

<img src="../mlm.png" />
     
We can run `mlm` with a one-liner:

In [None]:
# Infer pathway activities with mlm
pathway_acts, pathway_pvals = dc.run_mlm(mat=mat, net=progeny)
pathway_acts

In [None]:
# Run mlm on the single-cell object; the return value is not captured and the
# next cell reads the scores from adata.obsm['mlm_estimate'].
dc.run_mlm(
    mat=adata,
    net=progeny,
    source='source',
    target='target',
    weight='weight',
    verbose=True,
    use_raw=False
)

In [None]:
acts = dc.get_acts(adata, obsm_key='mlm_estimate')
acts

In [None]:
sc.pl.matrixplot(acts, var_names=acts.var_names, groupby='cell_type_annotation_lv1', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')

In [None]:
sc.pl.matrixplot(acts, var_names=acts.var_names, groupby='Status', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')

In [None]:
sc.pl.matrixplot(acts, var_names=acts.var_names, groupby='Status_strain', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')

Let us plot the obtained scores:

In [None]:
dc.plot_barplot(
    acts=pathway_acts,
    contrast='EECs',
    top=25,
    vertical=False,
    figsize=(6, 3)
)

It looks like JAK-STAT, a known immunity pathway is more active in T cells from COVID-19 patients than in controls. To further explore how the target genes of a pathway of interest behave, we can plot them in scatter plot:

In [None]:
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='Trail',
    net=progeny,
    top=15
)

The observed activation of JAK-STAT is due to the fact that the majority of its target genes with positive weights have positive
t-values (1st quadrant), and the majority of the ones with negative weights have negative t-values (3rd quadrant).

In [None]:
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='TNFa',
    net=progeny,
    top=15
)

In [None]:
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='Hypoxia',
    net=progeny,
    top=15
)

In [None]:
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='NFkB',
    net=progeny,
    top=15
)

In [None]:
sc.pl.umap(acts, color=['TNFa','atlas', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0, frameon=True, save = 'umap_decoupler_TNFalpha.png')
sc.pl.violin(acts, keys=['TNFa'], groupby='Status_strain', rotation=90, save = 'violin_decoupler_TNFalpha.png')

In [None]:
# Trail pathway activity: UMAP and violin per Status_strain group.
# The violin file name now starts with 'violin_' (it was a copy of the UMAP
# name), matching the naming used for the TNFa plots.
sc.pl.umap(acts, color=['Trail','atlas', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0, frameon=True, save = 'umap_decoupler_Trail.png')
sc.pl.violin(acts, keys=['Trail'], groupby='Status_strain', rotation=90, save = 'violin_decoupler_Trail.png')

In [None]:
# NFkB pathway activity: UMAP and violin per Status_strain group.
# The violin file name now starts with 'violin_' (it was a copy of the UMAP
# name), matching the naming used for the TNFa plots.
sc.pl.umap(acts, color=['NFkB','atlas', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0, frameon=True, save = 'umap_decoupler_NFkB.png')
sc.pl.violin(acts, keys=['NFkB'], groupby='Status_strain', rotation=90, save = 'violin_decoupler_NFkB.png')

In [None]:
# Hypoxia pathway activity: UMAP and violin per Status_strain group.
# The violin file name now starts with 'violin_' (it was a copy of the UMAP
# name), matching the naming used for the TNFa plots.
sc.pl.umap(acts, color=['Hypoxia','atlas', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0, frameon=True, save = 'umap_decoupler_Hypoxia.png')
sc.pl.violin(acts, keys=['Hypoxia'], groupby='Status_strain', rotation=90, save = 'violin_decoupler_Hypoxia.png')

In [None]:
sc.pl.umap(acts, color=['WNT','atlas', 'Status_strain', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0)
sc.pl.violin(acts, keys=['WNT'], groupby='Status_strain', rotation=90)

In [None]:
sc.pl.umap(acts, color=['PI3K','atlas', 'cell_type_annotation_lv1'], cmap='RdBu_r', vcenter=0, frameon=True)
sc.pl.violin(acts, keys=['PI3K'], groupby='Status_strain', rotation=90)

In [None]:
sc.pl.umap(adata, color=['Neurog3','atlas','kit', 'cell_type_annotation_lv1'],layer = 'sct_logcounts', cmap=mymap, frameon=True, ncols = 5, wspace = 0.6)
sc.pl.violin(adata, keys=['Neurog3'], groupby='Status_strain',  rotation=90)

In [None]:
sc.pl.matrixplot(acts, var_names=acts.var_names, groupby='cell_type_annotation_lv1', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')

In [None]:
# Retrieve MSigDB resource
msigdb = dc.get_resource('MSigDB')
msigdb

In [None]:
# Filter by hallmark. The explicit copy makes the result an independent
# DataFrame, so the column assignments below do not write into a slice of the
# full resource (which triggers pandas' SettingWithCopyWarning).
msigdb = msigdb[msigdb['collection']=='hallmark'].copy()

# Remove duplicated entries
msigdb = msigdb[~msigdb.duplicated(['geneset', 'genesymbol'])].copy()

# Rename: strip the 'HALLMARK_' prefix. Splitting at most once and taking the
# last piece leaves already-stripped names unchanged, so re-running this cell
# does not raise IndexError.
msigdb['geneset'] = [name.split('HALLMARK_', 1)[-1] for name in msigdb['geneset']]
# Capitalize the symbols (first letter upper-case, the rest lower-case).
# NOTE(review): presumably meant to approximate mouse gene symbols from the
# human ones; this is a naming heuristic, not an orthology mapping — confirm.
msigdb['genesymbol'] = msigdb['genesymbol'].str.capitalize()

msigdb

In [None]:
# Infer enrichment with ora using significant deg
# (rows whose padj is NaN fail the `< 0.05` comparison and are excluded too)
top_genes = results_df[results_df['padj'] < 0.05]

# Run ora
# NOTE(review): presumably the gene names are taken from the index of `df` —
# confirm against the decoupler documentation
enr_pvals = dc.get_ora_df(
    df=top_genes,
    net=msigdb,
    source='geneset',
    target='genesymbol'
)

enr_pvals.head()

In [None]:
dc.plot_dotplot(
    enr_pvals.sort_values('Combined score', ascending=False).head(15),
    x='Combined score',
    y='Term',
    s='Odds ratio',
    c='FDR p-value',
    scale=0.5,
    figsize=(3, 9)
)

In [None]:
sc.pl.umap(adata,color='Igfbp4',cmap=mymap,layer='log_dca_counts', frameon=True)