# TNK Pseudo-bulk Functional Analysis - Timepoints Comparison

https://decoupler-py.readthedocs.io/en/latest/notebooks/pseudobulk.html

## Environment Setup

In [None]:
# load packages
import scanpy as sc
import decoupler as dc

# Only needed for processing
import numpy as np
import pandas as pd

# Needed for some plotting
import matplotlib.pyplot as plt

import os

# plotting options
# All options are passed in a single call: set_figure_params applies its defaults
# (dpi=80, frameon=True) for every argument that is omitted, so a sequence of
# separate calls undoes the dpi/frameon values set by the earlier ones.
sc.set_figure_params(dpi=300, frameon=False, figsize=(4, 4))

In [None]:
# set working and fig dir
work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE"
fig_dir = os.path.join(work_dir, "figures", "combined", "TNK", "decoupleR", "timepoints_comparison")
# the plt.savefig calls further down write into fig_dir, which must exist beforehand
os.makedirs(fig_dir, exist_ok=True)
# scanpy writes its own figures to the same folder
sc.settings.figdir = fig_dir

In [None]:
# load data
# annotated TNK object, read from <work_dir>/data/outputdata/combined
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_annotated_13-04-24.h5ad"))

In [None]:
# modify layers to suit decoupleR reqs
# expose the "rawcounts" layer as "counts" (the layer name passed to dc.get_pseudobulk below)
adata.layers["counts"] = adata.layers["rawcounts"]
# expose the "logcounts" layer as "normalized"
adata.layers["normalized"] = adata.layers["logcounts"]

In [None]:
# modify metadata cell type name to ease analysis
# copy the "Annotation_2.0" annotation into "cell_type", the column used for grouping below
adata.obs["cell_type"] = adata.obs["Annotation_2.0"]

In [None]:
# explore metadata (the per-cell annotations stored in adata.obs)
adata.obs

## Pseudobulk

### Generation of pseudo-bulk profiles

In [None]:
# Aggregate the single cells into pseudobulk profiles, one per sample x cell type combination
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',  # obs column identifying the sample
    groups_col='cell_type',  # obs column identifying the cell type
    layer='counts',  # aggregate the raw counts layer
    mode='sum',  # add up the counts of the cells in each profile
    min_cells=10,  # minimum number of cells required per profile
    min_counts=1000  # minimum total counts required per profile
)
pdata

### Exploration of pseudobulk profiles

In [None]:
# explore variability of pseudobulks between patient and cell type

# Store raw counts in layers (the preprocessing steps below are applied to pdata.X)
pdata.layers['counts'] = pdata.X.copy()

# Normalize, scale and compute pca
sc.pp.normalize_total(pdata, target_sum=1e4)
sc.pp.log1p(pdata)
sc.pp.scale(pdata, max_value=10)
sc.tl.pca(pdata)

# Return raw counts to X so that later steps see counts rather than the scaled values
dc.swap_layer(pdata, 'counts', X_layer_key=None, inplace=True)

In [None]:
# PCA of the pseudobulk profiles, coloured by timepoint and by cell type (one panel per row)
sc.pl.pca(pdata, color=['timepoint', 'cell_type'], ncols=1, size=300)
# variance ratio of the principal components
sc.pl.pca_variance_ratio(pdata)

In [None]:
# perform ANOVA on each PC to see whether it is significantly associated with any
# technical or biological annotation of our samples
dc.get_metadata_associations(
    pdata,
    obs_keys = ['patient', 'timepoint', 'response', 'cell_type', 'psbulk_n_cells', 'psbulk_counts'],  # Metadata columns to associate to PCs
    obsm_key='X_pca',  # Where the PCs are stored
    uns_key='pca_anova',  # Where the results are stored
    inplace=True,
)

In [None]:
# plot the PC scores next to the adjusted ANOVA p-values;
# the figure is also written to <fig_dir>/Association_plot.png
dc.plot_associations(
    pdata,
    uns_key='pca_anova',  # Summary statistics from the anova tests
    obsm_key='X_pca',  # where the PCs are stored
    stat_col='p_adj',  # Which summary statistic to plot
    obs_annotation_cols = ['timepoint', 'cell_type'], # which sample annotations to plot
    titles=['Principle component scores', 'Adjusted p-values from ANOVA'],
    figsize=(7, 5),
    n_factors=10,
    cmap_cats="tab20",
    cmap_scores="PuOr",
    save=os.path.join(fig_dir,"Association_plot.png"),
    return_fig=True, 
    dpi=600
)

## Contrast between conditions (DGEA)

In [None]:
# Import DESeq2
from pydeseq2.dds import DeseqDataSet, DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
# number of pseudobulk profiles (non-missing 'Condition' entries) per cell type
pdata.obs.groupby('cell_type')['Condition'].count()

In [None]:
# create list of cell types which DGE can be run
# (cell types with at least `min_profiles` pseudobulk profiles)
# The list is built in a single pass: calling remove() on a list while looping
# over it skips the element that follows every removed entry, so some cell types
# below the threshold could slip through.
min_profiles = 4
profiles_per_cell_type = pdata.obs.groupby('cell_type')['Condition'].count()
cell_types = [
    cell_type
    for cell_type in pdata.obs['cell_type'].unique().tolist()
    if profiles_per_cell_type[cell_type] >= min_profiles
]

print(cell_types)

In [None]:
# Run DESeq2 separately for every retained cell type and stack the 'stat' column
# of each result into `mat` (one row per cell type).
inference = DefaultInference(n_cpus=16)  # same settings for every cell type
stat_rows = []
for cell_type in cell_types:
    print('Running DEseq2 for ' + cell_type + '...')

    celltype_cells = pdata[pdata.obs['cell_type'] == cell_type].copy()

    dds = DeseqDataSet(
        adata=celltype_cells,
        design_factors='Condition',
        ref_level=['Condition', 'T1/+ICI'],
        refit_cooks=True,
        inference=inference,
    )

    # Compute LFCs
    dds.deseq2()

    # Extract contrast between timepoints
    stat_res = DeseqStats(
        dds,
        contrast=["Condition", 'T1/+ICI', 'T0/-ICI'],
        inference=inference,
    )

    # Compute Wald test
    print(stat_res.summary())

    # Extract results
    results_df = stat_res.results_df

    # '/' in a cell type name (e.g. "CD4 T CM/EarlyActivated") would be read as a
    # directory separator in the output path, so it is replaced in the label only;
    # cell_type itself is left untouched.
    file_label = cell_type.replace("/", "-")

    dc.plot_volcano_df(
        results_df,
        x='log2FoldChange',
        y='padj',
        top=20,
        figsize=(8, 4)
    )
    plt.title(file_label)
    plt.savefig(os.path.join(fig_dir, file_label + "_DGE_volcano_plot.png"), dpi=300, format="png", bbox_inches="tight")
    plt.show()

    # one row per cell type, labelled with the cell type it belongs to
    stat_rows.append(results_df[['stat']].T.rename(index={'stat': cell_type}))

# pd.concat is the public way to stack the rows (DataFrame._append is private API)
mat = pd.concat(stat_rows)
    

In [None]:
# row labels for mat: the cell types, in the order the DESeq2 loop processed them
names = cell_types

In [None]:
# show the row labels
names

In [None]:
# label each row of mat with its cell type (the loop added one row per entry of cell_types)
mat.index = names

In [None]:
# fill nans with 0s so that the matrix passed to run_ulm / run_mlm below has no missing values
mat = mat.fillna(0)

In [None]:
# show the cell type x gene matrix of DESeq2 'stat' values
mat

In [None]:
# save the matrix of DESeq2 statistics
# NOTE(review): the "Timepoints PD Comparison" analysis writes its matrix to this very same
# file name (Combined_TNK_decoupleR_mat.csv), so whichever analysis runs last overwrites the
# other — confirm this is intended or give the outputs distinct names.
mat.to_csv(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_TNK_decoupleR_mat.csv"), index=True)

## Transcription factor activity inference

In [None]:
# Retrieve CollecTRI gene regulatory network (human), used below as the network for run_ulm
collectri = dc.get_collectri(organism='human', split_complexes=False)
collectri

In [None]:
# Infer TF activities with ulm from the per-cell-type statistics in mat
# (returns the activity scores and their p-values)
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri)
tf_acts

In [None]:
# check most active/inactive TFs

# cell types excluded by hand from the plots
skipped_cell_types = {"CD4 T ISG+", "CD4 T Proliferative", "Cycling γδ T-like"}

for cell_type in pdata.obs['cell_type'].unique():

    # besides the manual exclusions, skip every cell type that has no row in tf_acts
    # (no DESeq2 statistics were computed for it), since there is nothing to plot
    if cell_type in skipped_cell_types or cell_type not in tf_acts.index:
        continue

    dc.plot_barplot(
        acts=tf_acts,
        contrast=cell_type,
        top=25,
        vertical=True,
        figsize=(3, 6)
    )

    plt.title(cell_type)
    # '/' in a cell type name would be read as a directory separator in the file name
    file_label = cell_type.replace("/", "-")
    plt.savefig(os.path.join(fig_dir, file_label + "_TF_Activity_plot.png"), dpi=300, format="png", bbox_inches="tight")
    plt.show()

In [None]:
# explore the target genes of a specific TF
# NOTE(review): results_df is whatever the DESeq2 loop left behind, i.e. the results of the
# LAST cell type in cell_types. The 'CD4 T Follicular Helper' label used below is only
# correct if that cell type was processed last — confirm, or select its results explicitly.

# Extract logFCs and pvals
logFCs = results_df[['log2FoldChange']].T.rename(index={'log2FoldChange': 'CD4 T Follicular Helper'})
pvals = results_df[['padj']].T.rename(index={'padj': 'CD4 T Follicular Helper'})

# Plot the CollecTRI targets of NFYB
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='CD4 T Follicular Helper',
    name='NFYB',
    net=collectri,
    top=10,
    sign_thr=0.05,
    lFCs_thr=0.5
)

In [None]:
'''
# plot network of TFs of interest
dc.plot_network(
    net=collectri,
    obs=mat,
    act=tf_acts,
    n_sources=['CTNBB1', 'MYC', 'E2F4', 'TBX21'],
    n_targets=15,
    node_size=100,
    figsize=(7, 7),
    c_pos_w='darkgreen',
    c_neg_w='darkred',
    vcenter=True
)
'''

In [None]:
# dimensions of the input matrix and of the TF activity table, one per line
print(mat.shape, tf_acts.shape, sep="\n")

## Pathway activity inference - PROGENy model

In [None]:
# Retrieve PROGENy model weights (top=500), used below as the network for run_mlm
progeny = dc.get_progeny(top=500)
progeny

In [None]:
# Infer pathway activities with mlm from the per-cell-type statistics in mat
# (returns the activity scores and their p-values)
pathway_acts, pathway_pvals = dc.run_mlm(mat=mat, net=progeny)
pathway_acts

In [None]:
# plot the inferred pathway activities of each cell type

# cell types excluded by hand from the plots
skipped_cell_types = {"CD4 T ISG+", "CD4 T Proliferative", "Cycling γδ T-like"}

for cell_type in pdata.obs['cell_type'].unique():

    # besides the manual exclusions, skip every cell type that has no row in pathway_acts
    # (no DESeq2 statistics were computed for it), since there is nothing to plot
    if cell_type in skipped_cell_types or cell_type not in pathway_acts.index:
        continue

    # plot obtained scores
    dc.plot_barplot(
        acts=pathway_acts,
        contrast=cell_type,
        top=25,
        vertical=False,
        figsize=(6, 3)
    )

    plt.title(cell_type)
    # '/' in a cell type name would be read as a directory separator in the file name
    file_label = cell_type.replace("/", "-")
    plt.savefig(os.path.join(fig_dir, file_label + "_Pathway_Activity_plot.png"), dpi=300, format="png", bbox_inches="tight")
    plt.show()

In [None]:
# explore target genes of pathway of interest (JAK-STAT)
# NOTE(review): results_df holds the DESeq2 results of the last cell type processed by the
# loop above — confirm that this is the cell type meant to be shown here.
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='JAK-STAT',
    net=progeny,
    top=15
)

## Functional enrichment of biological terms (GSEA) - MSigDB

In [None]:
# Retrieve MSigDB resource (restricted to the hallmark collection in a later cell)
msigdb = dc.get_resource('MSigDB')
msigdb

In [None]:
# show the gene set names in the resource
msigdb.geneset

In [None]:
# Filter by hallmark (can use any other)
msigdb = msigdb[msigdb['collection']=='hallmark']

# Remove duplicated entries
# .copy() makes the result an independent frame, so the rename below is not an
# assignment on a slice of the filtered resource (avoids SettingWithCopyWarning)
msigdb = msigdb[~msigdb.duplicated(['geneset', 'genesymbol'])].copy()

# Rename: strip the 'HALLMARK_' prefix
# split(..., 1)[-1] leaves names without the prefix unchanged, so running this cell
# twice does not fail on the already renamed gene sets
msigdb.loc[:, 'geneset'] = [name.split('HALLMARK_', 1)[-1] for name in msigdb['geneset']]

msigdb

In [None]:
# Infer enrichment with ora using significant deg (padj < 0.05)
# NOTE(review): results_df holds the DESeq2 results of the last cell type processed by the
# loop above — confirm that this is the cell type meant to be analysed here.
top_genes = results_df[results_df['padj'] < 0.05]

# Run ora
enr_pvals = dc.get_ora_df(
    df=top_genes,
    net=msigdb,
    source='geneset',
    target='genesymbol'
)

enr_pvals.head()

In [None]:
# visualize the most enriched terms (the 15 with the highest combined score)
dc.plot_dotplot(
    enr_pvals.sort_values('Combined score', ascending=False).head(15),
    x='Combined score',
    y='Term',
    s='Odds ratio',
    c='FDR p-value',
    scale=0.5,
    figsize=(3, 9)
)

In [None]:
# plot running score of the MYC_TARGETS_V2 gene set, using the 'stat' column of results_df
# NOTE(review): results_df holds the DESeq2 results of the last cell type processed by the
# loop above — confirm that this is the cell type meant to be shown here.
dc.plot_running_score(
    df=results_df,
    stat='stat',
    net=msigdb,
    source='geneset',
    target='genesymbol',
    set_name='MYC_TARGETS_V2'
)

# TNK Pseudo-bulk Functional Analysis - Timepoints PD Comparison

https://decoupler-py.readthedocs.io/en/latest/notebooks/pseudobulk.html

## Environment Setup

In [None]:
# load packages
import scanpy as sc
import decoupler as dc

# Only needed for processing
import numpy as np
import pandas as pd

# Needed for some plotting
import matplotlib.pyplot as plt

import os

# plotting options
# All options are passed in a single call: set_figure_params applies its defaults
# (dpi=80, frameon=True) for every argument that is omitted, so a sequence of
# separate calls undoes the dpi/frameon values set by the earlier ones.
sc.set_figure_params(dpi=300, frameon=False, figsize=(4, 4))

In [None]:
# set working and fig dir
work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE"
fig_dir = os.path.join(work_dir, "figures", "combined", "TNK", "decoupleR", "timepoints_PD_comparison")
# the plt.savefig calls further down write into fig_dir, which must exist beforehand
os.makedirs(fig_dir, exist_ok=True)
# scanpy writes its own figures to the same folder
sc.settings.figdir = fig_dir

In [None]:
# load data
# annotated TNK object, read from <work_dir>/data/outputdata/combined
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_annotated_13-04-24.h5ad"))

In [None]:
# modify layers to suit decoupleR reqs
# expose the "rawcounts" layer as "counts" (the layer name passed to dc.get_pseudobulk below)
adata.layers["counts"] = adata.layers["rawcounts"]
# expose the "logcounts" layer as "normalized"
adata.layers["normalized"] = adata.layers["logcounts"]

In [None]:
# modify metadata cell type name to ease analysis
# copy the "Annotation_2.0" annotation into "cell_type", the column used for grouping below
adata.obs["cell_type"] = adata.obs["Annotation_2.0"]

In [None]:
# subset non responders: keep only the cells whose "response" annotation is "PD"
# (.copy() stores the subset as its own object rather than a selection of the full one)
adata = adata[adata.obs["response"] == "PD"].copy()
adata

In [None]:
# explore metadata (the per-cell annotations stored in adata.obs)
adata.obs

## Pseudobulk

### Generation of pseudo-bulk profiles

In [None]:
# Aggregate the single cells into pseudobulk profiles, one per sample x cell type combination
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',  # obs column identifying the sample
    groups_col='cell_type',  # obs column identifying the cell type
    layer='counts',  # aggregate the raw counts layer
    mode='sum',  # add up the counts of the cells in each profile
    min_cells=10,  # minimum number of cells required per profile
    min_counts=1000  # minimum total counts required per profile
)
pdata

### Exploration of pseudobulk profiles

In [None]:
# explore variability of pseudobulks between patient and cell type

# Store raw counts in layers (the preprocessing steps below are applied to pdata.X)
pdata.layers['counts'] = pdata.X.copy()

# Normalize, scale and compute pca
sc.pp.normalize_total(pdata, target_sum=1e4)
sc.pp.log1p(pdata)
sc.pp.scale(pdata, max_value=10)
sc.tl.pca(pdata)

# Return raw counts to X so that later steps see counts rather than the scaled values
dc.swap_layer(pdata, 'counts', X_layer_key=None, inplace=True)

In [None]:
# PCA of the pseudobulk profiles, coloured by timepoint and by cell type (one panel per row)
sc.pl.pca(pdata, color=['timepoint', 'cell_type'], ncols=1, size=300)
# variance ratio of the principal components
sc.pl.pca_variance_ratio(pdata)

In [None]:
# perform ANOVA on each PC to see whether it is significantly associated with any
# technical or biological annotation of our samples
dc.get_metadata_associations(
    pdata,
    obs_keys = ['patient', 'timepoint', 'response', 'cell_type', 'psbulk_n_cells', 'psbulk_counts'],  # Metadata columns to associate to PCs
    obsm_key='X_pca',  # Where the PCs are stored
    uns_key='pca_anova',  # Where the results are stored
    inplace=True,
)

In [None]:
'''
dc.plot_associations(
    pdata,
    uns_key='pca_anova',  # Summary statistics from the anova tests
    obsm_key='X_pca',  # where the PCs are stored
    stat_col='p_adj',  # Which summary statistic to plot
    obs_annotation_cols = ['timepoint', 'cell_type'], # which sample annotations to plot
    titles=['Principle component scores', 'Adjusted p-values from ANOVA'],
    figsize=(7, 5),
    n_factors=10,
    cmap_cats="tab20",
    cmap_scores="PuOr",
    save=os.path.join(fig_dir,"Association_plot.png"),
    return_fig=True, 
    dpi=600
)
'''

## Contrast between conditions (DGEA)

In [None]:
# Import DESeq2
from pydeseq2.dds import DeseqDataSet, DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
# number of pseudobulk profiles (non-missing 'Condition' entries) per cell type
pdata.obs.groupby('cell_type')['Condition'].count()

In [None]:
# create list of cell types which DGE can be run
# (cell types with at least `min_profiles` pseudobulk profiles)
# The list is built in a single pass: calling remove() on a list while looping
# over it skips the element that follows every removed entry, so some cell types
# below the threshold could slip through.
min_profiles = 4
profiles_per_cell_type = pdata.obs.groupby('cell_type')['Condition'].count()
cell_types = [
    cell_type
    for cell_type in pdata.obs['cell_type'].unique().tolist()
    if profiles_per_cell_type[cell_type] >= min_profiles
]

print(cell_types)

In [None]:
# Run DESeq2 separately for every retained cell type and stack the 'stat' column
# of each result into `mat` (one row per cell type).
inference = DefaultInference(n_cpus=16)  # same settings for every cell type
stat_rows = []
for cell_type in cell_types:
    print('Running DEseq2 for ' + cell_type + '...')

    celltype_cells = pdata[pdata.obs['cell_type'] == cell_type].copy()

    dds = DeseqDataSet(
        adata=celltype_cells,
        design_factors='Condition',
        ref_level=['Condition', 'T1/+ICI'],
        refit_cooks=True,
        inference=inference,
    )

    # Compute LFCs
    dds.deseq2()

    # Extract contrast between timepoints
    stat_res = DeseqStats(
        dds,
        contrast=["Condition", 'T1/+ICI', 'T0/-ICI'],
        inference=inference,
    )

    # Compute Wald test
    print(stat_res.summary())

    # Extract results
    results_df = stat_res.results_df

    # '/' in a cell type name (e.g. "CD4 T CM/EarlyActivated") would be read as a
    # directory separator in the output path, so it is replaced in the label only;
    # cell_type itself is left untouched.
    file_label = cell_type.replace("/", "-")

    dc.plot_volcano_df(
        results_df,
        x='log2FoldChange',
        y='padj',
        top=20,
        figsize=(8, 4)
    )
    plt.title(file_label)
    plt.savefig(os.path.join(fig_dir, file_label + "_DGE_volcano_plot.png"), dpi=300, format="png", bbox_inches="tight")
    plt.show()

    # one row per cell type, labelled with the cell type it belongs to
    stat_rows.append(results_df[['stat']].T.rename(index={'stat': cell_type}))

# pd.concat is the public way to stack the rows (DataFrame._append is private API)
mat = pd.concat(stat_rows)
    

In [None]:
# row labels for mat: the cell types, in the order the DESeq2 loop processed them
names = cell_types

In [None]:
# show the row labels
names

In [None]:
# label each row of mat with its cell type (the loop added one row per entry of cell_types)
mat.index = names

In [None]:
# fill nans with 0s so that the matrix passed to run_ulm / run_mlm below has no missing values
mat = mat.fillna(0)

In [None]:
# show the cell type x gene matrix of DESeq2 'stat' values
mat

In [None]:
# save the matrix of DESeq2 statistics for the PD-only comparison
# NOTE(review): this is the very same file name the "Timepoints Comparison" analysis writes
# to (Combined_TNK_decoupleR_mat.csv), so whichever analysis runs last overwrites the
# other — confirm this is intended or give the outputs distinct names.
mat.to_csv(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_TNK_decoupleR_mat.csv"), index=True)

## Transcription factor activity inference

In [None]:
# Retrieve CollecTRI gene regulatory network (human), used below as the network for run_ulm
collectri = dc.get_collectri(organism='human', split_complexes=False)
collectri

In [None]:
# Infer TF activities with ulm from the per-cell-type statistics in mat
# (returns the activity scores and their p-values)
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri)
tf_acts

In [None]:
# check most active/inactive TFs

# cell types excluded by hand from the plots
skipped_cell_types = {"CD4 T ISG+", "CD4 T Proliferative", "Cycling γδ T-like"}

for cell_type in pdata.obs['cell_type'].unique():

    # besides the manual exclusions, skip every cell type that has no row in tf_acts
    # (no DESeq2 statistics were computed for it), since there is nothing to plot
    if cell_type in skipped_cell_types or cell_type not in tf_acts.index:
        continue

    dc.plot_barplot(
        acts=tf_acts,
        contrast=cell_type,
        top=25,
        vertical=True,
        figsize=(3, 6)
    )

    plt.title(cell_type)
    # '/' in a cell type name would be read as a directory separator in the file name
    file_label = cell_type.replace("/", "-")
    plt.savefig(os.path.join(fig_dir, file_label + "_TF_Activity_plot.png"), dpi=300, format="png", bbox_inches="tight")
    plt.show()

In [None]:
# explore the target genes of a specific TF
# NOTE(review): results_df is whatever the DESeq2 loop left behind, i.e. the results of the
# LAST cell type in cell_types. The 'CD4 T Follicular Helper' label used below is only
# correct if that cell type was processed last — confirm, or select its results explicitly.

# Extract logFCs and pvals
logFCs = results_df[['log2FoldChange']].T.rename(index={'log2FoldChange': 'CD4 T Follicular Helper'})
pvals = results_df[['padj']].T.rename(index={'padj': 'CD4 T Follicular Helper'})

# Plot the CollecTRI targets of NFYB
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='CD4 T Follicular Helper',
    name='NFYB',
    net=collectri,
    top=10,
    sign_thr=0.05,
    lFCs_thr=0.5
)

In [None]:
'''
# plot network of TFs of interest
dc.plot_network(
    net=collectri,
    obs=mat,
    act=tf_acts,
    n_sources=['CTNBB1', 'MYC', 'E2F4', 'TBX21'],
    n_targets=15,
    node_size=100,
    figsize=(7, 7),
    c_pos_w='darkgreen',
    c_neg_w='darkred',
    vcenter=True
)
'''

In [None]:
# dimensions of the input matrix and of the TF activity table, one per line
print(mat.shape, tf_acts.shape, sep="\n")

## Pathway activity inference - PROGENy model

In [None]:
# Retrieve PROGENy model weights (top=500), used below as the network for run_mlm
progeny = dc.get_progeny(top=500)
progeny

In [None]:
# Infer pathway activities with mlm from the per-cell-type statistics in mat
# (returns the activity scores and their p-values)
pathway_acts, pathway_pvals = dc.run_mlm(mat=mat, net=progeny)
pathway_acts

In [None]:
# plot the inferred pathway activities of each cell type

# cell types excluded by hand from the plots
skipped_cell_types = {"CD4 T ISG+", "CD4 T Proliferative", "Cycling γδ T-like"}

for cell_type in pdata.obs['cell_type'].unique():

    # besides the manual exclusions, skip every cell type that has no row in pathway_acts
    # (no DESeq2 statistics were computed for it), since there is nothing to plot
    if cell_type in skipped_cell_types or cell_type not in pathway_acts.index:
        continue

    # plot obtained scores
    dc.plot_barplot(
        acts=pathway_acts,
        contrast=cell_type,
        top=25,
        vertical=False,
        figsize=(6, 3)
    )

    plt.title(cell_type)
    # '/' in a cell type name would be read as a directory separator in the file name
    file_label = cell_type.replace("/", "-")
    plt.savefig(os.path.join(fig_dir, file_label + "_Pathway_Activity_plot.png"), dpi=300, format="png", bbox_inches="tight")
    plt.show()

In [None]:
# explore target genes of pathway of interest (JAK-STAT)
# NOTE(review): results_df holds the DESeq2 results of the last cell type processed by the
# loop above — confirm that this is the cell type meant to be shown here.
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='JAK-STAT',
    net=progeny,
    top=15
)

## Functional enrichment of biological terms (GSEA) - MSigDB

In [None]:
# Retrieve MSigDB resource (restricted to the hallmark collection in a later cell)
msigdb = dc.get_resource('MSigDB')
msigdb

In [None]:
# show the gene set names in the resource
msigdb.geneset

In [None]:
# Filter by hallmark (can use any other)
msigdb = msigdb[msigdb['collection']=='hallmark']

# Remove duplicated entries
# .copy() makes the result an independent frame, so the rename below is not an
# assignment on a slice of the filtered resource (avoids SettingWithCopyWarning)
msigdb = msigdb[~msigdb.duplicated(['geneset', 'genesymbol'])].copy()

# Rename: strip the 'HALLMARK_' prefix
# split(..., 1)[-1] leaves names without the prefix unchanged, so running this cell
# twice does not fail on the already renamed gene sets
msigdb.loc[:, 'geneset'] = [name.split('HALLMARK_', 1)[-1] for name in msigdb['geneset']]

msigdb

In [None]:
# Infer enrichment with ora using significant deg (padj < 0.05)
# NOTE(review): results_df holds the DESeq2 results of the last cell type processed by the
# loop above — confirm that this is the cell type meant to be analysed here.
top_genes = results_df[results_df['padj'] < 0.05]

# Run ora
enr_pvals = dc.get_ora_df(
    df=top_genes,
    net=msigdb,
    source='geneset',
    target='genesymbol'
)

enr_pvals.head()

In [None]:
# visualize the most enriched terms (the 15 with the highest combined score)
dc.plot_dotplot(
    enr_pvals.sort_values('Combined score', ascending=False).head(15),
    x='Combined score',
    y='Term',
    s='Odds ratio',
    c='FDR p-value',
    scale=0.5,
    figsize=(3, 9)
)

In [None]:
# plot running score of the MYC_TARGETS_V2 gene set, using the 'stat' column of results_df
# NOTE(review): results_df holds the DESeq2 results of the last cell type processed by the
# loop above — confirm that this is the cell type meant to be shown here.
dc.plot_running_score(
    df=results_df,
    stat='stat',
    net=msigdb,
    source='geneset',
    target='genesymbol',
    set_name='MYC_TARGETS_V2'
)

# TNK Pseudo-bulk Functional Analysis - Response Comparison

https://decoupler-py.readthedocs.io/en/latest/notebooks/pseudobulk.html

## Environment Setup

In [None]:
# load packages
import scanpy as sc
import decoupler as dc

# Only needed for processing
import numpy as np
import pandas as pd

# Needed for some plotting
import matplotlib.pyplot as plt

import os

# Plotting options, change to your liking
# All options are passed in a single call: set_figure_params applies its defaults
# (dpi=80, frameon=True) for every argument that is omitted, so a sequence of
# separate calls undoes the dpi/frameon values set by the earlier ones.
sc.set_figure_params(dpi=300, frameon=False, figsize=(4, 4))

In [None]:
# set working and fig dir
work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE"
fig_dir = os.path.join(work_dir, "figures", "combined", "TNK", "decoupleR", "response_comparison")
# the plt.savefig calls further down write into fig_dir, which must exist beforehand
os.makedirs(fig_dir, exist_ok=True)
# scanpy writes its own figures to the same folder
sc.settings.figdir = fig_dir

In [None]:
# load data
# annotated TNK object, read from <work_dir>/data/outputdata/combined
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_TNK_annotated_13-04-24.h5ad"))

In [None]:
# modify layers to suit decoupleR reqs
# expose the "rawcounts" layer as "counts" (the layer name passed to dc.get_pseudobulk below)
adata.layers["counts"] = adata.layers["rawcounts"]
# expose the "logcounts" layer as "normalized"
adata.layers["normalized"] = adata.layers["logcounts"]

In [None]:
# modify metadata cell type name to ease analysis
# copy the "Annotation_2.0" annotation into "cell_type", the column used for grouping below
adata.obs["cell_type"] = adata.obs["Annotation_2.0"]

In [None]:
# explore metadata (the per-cell annotations stored in adata.obs)
adata.obs

## Pseudobulk

### Generation of pseudo-bulk profiles

In [None]:
# Aggregate the single cells into pseudobulk profiles, one per sample x cell type combination
pdata = dc.get_pseudobulk(
    adata,
    sample_col='sample',  # obs column identifying the sample
    groups_col='cell_type',  # obs column identifying the cell type
    layer='counts',  # aggregate the raw counts layer
    mode='sum',  # add up the counts of the cells in each profile
    min_cells=10,  # minimum number of cells required per profile
    min_counts=1000  # minimum total counts required per profile
)
pdata

### Exploration of pseudobulk profiles

In [None]:
# explore variability of pseudobulks between patient and cell type

# Store raw counts in layers (the preprocessing steps below are applied to pdata.X)
pdata.layers['counts'] = pdata.X.copy()

# Normalize, scale and compute pca
sc.pp.normalize_total(pdata, target_sum=1e4)
sc.pp.log1p(pdata)
sc.pp.scale(pdata, max_value=10)
sc.tl.pca(pdata)

# Return raw counts to X so that later steps see counts rather than the scaled values
dc.swap_layer(pdata, 'counts', X_layer_key=None, inplace=True)

In [None]:
# PCA of the pseudobulk profiles, coloured by response and by cell type (one panel per row)
sc.pl.pca(pdata, color=['response', 'cell_type'], ncols=1, size=300)
# variance ratio of the principal components
sc.pl.pca_variance_ratio(pdata)

In [None]:
# perform ANOVA on each PC to see whether it is significantly associated with any
# technical or biological annotation of our samples
dc.get_metadata_associations(
    pdata,
    obs_keys = ['patient', 'timepoint', 'response', 'cell_type', 'psbulk_n_cells', 'psbulk_counts'],  # Metadata columns to associate to PCs
    obsm_key='X_pca',  # Where the PCs are stored
    uns_key='pca_anova',  # Where the results are stored
    inplace=True,
)

In [None]:
# plot the PC scores next to the adjusted ANOVA p-values;
# the figure is also written to <fig_dir>/Association_plot.png
dc.plot_associations(
    pdata,
    uns_key='pca_anova',  # Summary statistics from the anova tests
    obsm_key='X_pca',  # where the PCs are stored
    stat_col='p_adj',  # Which summary statistic to plot
    obs_annotation_cols = ['response', 'cell_type'], # which sample annotations to plot
    titles=['Principle component scores', 'Adjusted p-values from ANOVA'],
    figsize=(7, 5),
    n_factors=10,
    cmap_cats="tab20",
    cmap_scores="PuOr",
    save=os.path.join(fig_dir,"Association_plot.png"),
    return_fig=True, 
    dpi=600
)

## Contrast between conditions (DGEA)

In [None]:
# Import DESeq2
from pydeseq2.dds import DeseqDataSet, DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
# explore pseudobulks: non-missing entries of every obs column per cell type x response group
pdata.obs.groupby(['cell_type', 'response']).count()

In [None]:
# number of pseudobulk profiles (non-missing 'Condition' entries) per cell type
pdata.obs.groupby('cell_type')['Condition'].count()

In [None]:
# create list of cell types which DGE can be run:
# at least `min_profiles` pseudobulk profiles overall and at least
# `min_profiles_per_response` profiles in every response group listed for the cell type.
# The list is built in a single pass: calling remove() on a list while looping over it
# skips the element that follows every removed entry, and removing the same cell type
# once per failing response group raises ValueError on the second removal.
min_profiles = 4
min_profiles_per_response = 2
profiles_per_cell_type = pdata.obs.groupby('cell_type')['Condition'].count()
profiles_per_response = pdata.obs.groupby(['cell_type', 'response'])['cells'].count()
cell_types = [
    cell_type
    for cell_type in pdata.obs['cell_type'].unique().tolist()
    if profiles_per_cell_type[cell_type] >= min_profiles
    and (profiles_per_response[cell_type] >= min_profiles_per_response).all()
]

print(cell_types)

In [None]:
# print the per-response-group counts of the 'cells' column for 'CD4 T Helper-like'
# (manual check of the values used by the cell type filter above)
for i in pdata.obs.groupby(['cell_type', 'response']).count()['cells']['CD4 T Helper-like']:
    print(i)

In [None]:
# Run DESeq2 separately for every retained cell type and stack the 'stat' column
# of each result into `mat` (one row per cell type).
inference = DefaultInference(n_cpus=16)  # same settings for every cell type
stat_rows = []
for cell_type in cell_types:

    print('Running DEseq2 for ' + cell_type + '...')

    celltype_cells = pdata[pdata.obs['cell_type'] == cell_type].copy()

    dds = DeseqDataSet(
        adata=celltype_cells,
        design_factors='response',
        ref_level=['response', 'PD'],
        refit_cooks=True,
        inference=inference,
    )

    # Compute LFCs
    dds.deseq2()

    # Extract contrast between responses (PD vs SD)
    stat_res = DeseqStats(
        dds,
        contrast=["response", 'PD', 'SD'],
        inference=inference,
    )

    # Compute Wald test
    print(stat_res.summary())

    # Extract results
    results_df = stat_res.results_df

    # '/' in a cell type name (e.g. "CD4 T CM/EarlyActivated") would be read as a
    # directory separator in the output path, so it is replaced in the label only;
    # cell_type itself is left untouched.
    file_label = cell_type.replace("/", "-")

    dc.plot_volcano_df(
        results_df,
        x='log2FoldChange',
        y='padj',
        top=20,
        figsize=(8, 4)
    )
    plt.title(file_label)
    plt.savefig(os.path.join(fig_dir, file_label + "_DGE_volcano_plot.png"), dpi=300, format="png", bbox_inches="tight")
    plt.show()

    # one row per cell type, labelled with the cell type it belongs to
    stat_rows.append(results_df[['stat']].T.rename(index={'stat': cell_type}))

# pd.concat is the public way to stack the rows (DataFrame._append is private API)
mat = pd.concat(stat_rows)
    

In [None]:
# row labels for mat: the cell types, in the order the DESeq2 loop processed them
names = cell_types

In [None]:
# show the row labels
names

In [None]:
# label each row of mat with its cell type (the loop added one row per entry of cell_types)
mat.index = names

In [None]:
# fill nans with 0s so that the matrix passed to run_ulm / run_mlm below has no missing values
mat = mat.fillna(0)

In [None]:
# show the cell type x gene matrix of DESeq2 'stat' values
mat

In [None]:
# save the matrix of DESeq2 statistics for the response comparison (row labels included)
mat.to_csv(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_TNK_decoupleR_mat_response.csv"), index=True)

## Transcription factor activity inference

In [None]:
# Retrieve CollecTRI gene regulatory network (human), used below as the network for run_ulm
collectri = dc.get_collectri(organism='human', split_complexes=False)
collectri

In [None]:
# Infer TF activities with ulm from the per-cell-type statistics in mat
# (returns the activity scores and their p-values)
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri)
tf_acts

In [None]:
# check most active/inactive TFs

# cell types excluded by hand from the plots
skipped_cell_types = {"CD4 T ISG+", "CD4 T Proliferative", "Cycling γδ T-like"}

for cell_type in pdata.obs['cell_type'].unique():

    # besides the manual exclusions, skip every cell type that has no row in tf_acts
    # (no DESeq2 statistics were computed for it), since there is nothing to plot
    if cell_type in skipped_cell_types or cell_type not in tf_acts.index:
        continue

    dc.plot_barplot(
        acts=tf_acts,
        contrast=cell_type,
        top=25,
        vertical=True,
        figsize=(3, 6)
    )

    plt.title(cell_type)
    # '/' in a cell type name would be read as a directory separator in the file name
    file_label = cell_type.replace("/", "-")
    plt.savefig(os.path.join(fig_dir, file_label + "_TF_Activity_plot.png"), dpi=300, format="png", bbox_inches="tight")
    plt.show()

In [None]:
# explore the target genes of a specific TF
# NOTE(review): results_df is whatever the DESeq2 loop left behind, i.e. the results of the
# LAST cell type in cell_types. The 'CD4 T Follicular Helper' label used below is only
# correct if that cell type was processed last — confirm, or select its results explicitly.

# Extract logFCs and pvals
logFCs = results_df[['log2FoldChange']].T.rename(index={'log2FoldChange': 'CD4 T Follicular Helper'})
pvals = results_df[['padj']].T.rename(index={'padj': 'CD4 T Follicular Helper'})

# Plot the CollecTRI targets of NFYB
dc.plot_volcano(
    logFCs=logFCs,
    pvals=pvals,
    contrast='CD4 T Follicular Helper',
    name='NFYB',
    net=collectri,
    top=10,
    sign_thr=0.05,
    lFCs_thr=0.5
)

In [None]:
# plot network of TFs of interest
'''
dc.plot_network(
    net=collectri,
    obs=mat,
    act=tf_acts,
    n_sources=['CTNBB1', 'MYC', 'E2F4', 'TBX21'],
    n_targets=15,
    node_size=100,
    figsize=(7, 7),
    c_pos_w='darkgreen',
    c_neg_w='darkred',
    vcenter=True
)
'''

In [None]:
# dimensions of the input matrix and of the TF activity table, one per line
print(mat.shape, tf_acts.shape, sep="\n")

## Pathway activity inference - PROGENy model

In [None]:
# Retrieve PROGENy model weights (top=500), used below as the network for run_mlm
progeny = dc.get_progeny(top=500)
progeny

In [None]:
# Infer pathway activities with mlm from the per-cell-type statistics in mat
# (returns the activity scores and their p-values)
pathway_acts, pathway_pvals = dc.run_mlm(mat=mat, net=progeny)
pathway_acts

In [None]:
# plot the inferred pathway activities of each cell type

# cell types excluded by hand from the plots
skipped_cell_types = {"CD4 T ISG+", "CD4 T Proliferative", "Cycling γδ T-like"}

for cell_type in pdata.obs['cell_type'].unique():

    # besides the manual exclusions, skip every cell type that has no row in pathway_acts
    # (no DESeq2 statistics were computed for it), since there is nothing to plot
    if cell_type in skipped_cell_types or cell_type not in pathway_acts.index:
        continue

    # plot obtained scores
    dc.plot_barplot(
        acts=pathway_acts,
        contrast=cell_type,
        top=25,
        vertical=False,
        figsize=(6, 3)
    )

    plt.title(cell_type)
    # '/' in a cell type name would be read as a directory separator in the file name
    file_label = cell_type.replace("/", "-")
    plt.savefig(os.path.join(fig_dir, file_label + "_Pathway_Activity_plot.png"), dpi=300, format="png", bbox_inches="tight")
    plt.show()

In [None]:
# explore target genes of pathway of interest (JAK-STAT)
# NOTE(review): results_df holds the DESeq2 results of the last cell type processed by the
# loop above — confirm that this is the cell type meant to be shown here.
dc.plot_targets(
    data=results_df,
    stat='stat',
    source_name='JAK-STAT',
    net=progeny,
    top=15
)

## Functional enrichment of biological terms (GSEA) - MSigDB

In [None]:
# Retrieve MSigDB resource (restricted to the hallmark collection in a later cell)
msigdb = dc.get_resource('MSigDB')
msigdb

In [None]:
# show the gene set names in the resource
msigdb.geneset

In [None]:
# Filter by hallmark (can use any other)
msigdb = msigdb[msigdb['collection']=='hallmark']

# Remove duplicated entries
# .copy() makes the result an independent frame, so the rename below is not an
# assignment on a slice of the filtered resource (avoids SettingWithCopyWarning)
msigdb = msigdb[~msigdb.duplicated(['geneset', 'genesymbol'])].copy()

# Rename: strip the 'HALLMARK_' prefix
# split(..., 1)[-1] leaves names without the prefix unchanged, so running this cell
# twice does not fail on the already renamed gene sets
msigdb.loc[:, 'geneset'] = [name.split('HALLMARK_', 1)[-1] for name in msigdb['geneset']]

msigdb

In [None]:
# Infer enrichment with ora using significant deg (padj < 0.05)
# NOTE(review): results_df holds the DESeq2 results of the last cell type processed by the
# loop above — confirm that this is the cell type meant to be analysed here.
top_genes = results_df[results_df['padj'] < 0.05]

# Run ora
enr_pvals = dc.get_ora_df(
    df=top_genes,
    net=msigdb,
    source='geneset',
    target='genesymbol'
)

enr_pvals.head()

In [None]:
# visualize the most enriched terms (the 15 with the highest combined score)
dc.plot_dotplot(
    enr_pvals.sort_values('Combined score', ascending=False).head(15),
    x='Combined score',
    y='Term',
    s='Odds ratio',
    c='FDR p-value',
    scale=0.5,
    figsize=(3, 9)
)

In [None]:
# plot running score of the MYC_TARGETS_V2 gene set, using the 'stat' column of results_df
# NOTE(review): results_df holds the DESeq2 results of the last cell type processed by the
# loop above — confirm that this is the cell type meant to be shown here.
dc.plot_running_score(
    df=results_df,
    stat='stat',
    net=msigdb,
    source='geneset',
    target='genesymbol',
    set_name='MYC_TARGETS_V2'
)