In [1]:
import scanpy as sc
import decoupler as dc

# Only needed for processing
import numpy as np
import pandas as pd
from anndata import AnnData

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
# Run configuration. Both identifiers are interpolated into the input/output
# file names below; `comparison` also labels the single contrast row.
experiment = 'GSE164471_MAvO_DDS'
comparison = "middleAge.vs.old"

In [3]:
# Load the results table, index it by the identifiers stored in its 'row'
# column, discard the stray 'Unnamed: 0' column and name the index 'GeneName'.
results_df = (
    pd.read_csv(f'/home/amore/work/data/{experiment}.csv')
    .set_index('row')
    .drop(columns=['Unnamed: 0'])
    .rename_axis('GeneName')
)
results_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
GeneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000003.14,194.175388,0.184338,0.205575,0.896696,0.369881,0.999968
ENSG00000000005.5,1.577458,-0.454634,1.138998,-0.399153,0.689781,0.999968
ENSG00000000419.12,178.205796,-0.032840,0.205884,-0.159506,0.873270,0.999968
ENSG00000000457.13,135.945190,0.005344,0.235099,0.022732,0.981864,0.999968
ENSG00000000460.16,301.374341,-0.014087,0.201734,-0.069830,0.944329,0.999968
...,...,...,...,...,...,...
ENSG00000285480.1,1.183743,-2.735240,1.522186,-1.796916,0.072349,0.999968
ENSG00000285491.1,0.944793,0.307932,1.491112,0.206512,0.836391,0.999968
ENSG00000285505.1,0.546736,1.773289,1.918997,0.924071,0.355450,0.999968
ENSG00000285508.1,56.841619,0.485548,0.696718,0.696908,0.485860,0.999968


In [4]:
# Row labels of the results table as a plain Python list.
ensembl_names = results_df.index.to_list()

In [None]:
# Retrieve gene symbols
# Requests the two listed BioMart attributes for "hsapiens" through scanpy and
# indexes the returned table by "ensembl_gene_id", so `annot.index` can be used
# for membership tests and `annot.loc[...]` lookups in the following cells.
# NOTE(review): use_cache=False presumably forces a fresh remote query on every
# run — confirm against the scanpy documentation.
annot = sc.queries.biomart_annotations("hsapiens",
        ["ensembl_gene_id", "external_gene_name"],
        use_cache=False
    ).set_index("ensembl_gene_id")


In [None]:
# Keep only the IDs whose part before the first '.' is present in annot.index.
ensembl_names = [
    versioned_id
    for versioned_id in ensembl_names
    if versioned_id.partition('.')[0] in annot.index
]


In [None]:
# Keep the rows whose index label is one of the retained IDs.
results_df = results_df.loc[results_df.index.isin(ensembl_names)]

In [None]:
# Drop everything from the first '.' onwards (the version suffix) of every ID.
ensembl_names = [versioned_id.partition('.')[0] for versioned_id in ensembl_names]


In [None]:
# Assign gene symbols
# Symbol lookup keyed by Ensembl gene ID. If an ID occurs more than once in
# `annot`, only its first entry is kept so every ID resolves to one symbol
# (a repeated ID would otherwise make a label lookup return several values).
symbol_by_id = annot['external_gene_name']
symbol_by_id = symbol_by_id[~symbol_by_id.index.duplicated(keep='first')]

# `ensembl_names` must hold one unversioned ID per row of results_df, in row
# order: the symbols are attached positionally (hence .to_numpy()).
# .assign builds a new frame rather than writing a column into the filtered
# result of the earlier .query(), which pandas reports as SettingWithCopyWarning.
# An ID that is absent from `annot` yields a missing symbol (NaN).
results_df = results_df.assign(
    gene_symbol=symbol_by_id.reindex(ensembl_names).to_numpy()
)
results_df

In [None]:
# Write the table (index included) to '<experiment>_gene_symbol.csv'; a later
# cell reads this same file back.
results_df.to_csv(f'/home/amore/work/data/{experiment}_gene_symbol.csv')

In [3]:
# Reload the table with gene symbols from disk. No index column is specified,
# so the previously saved index comes back as an ordinary 'GeneName' column.
results_df =  pd.read_csv(f'/home/amore/work/data/{experiment}_gene_symbol.csv')


In [4]:
# Index the table by gene symbol and label that index 'GeneName'.
# NOTE(review): the frame also has a *column* called 'GeneName' (the Ensembl
# IDs), so index name and column label now coincide — pandas treats such a
# label as ambiguous in e.g. groupby/reset_index. Consider renaming one of them.
results_df = results_df.set_index('gene_symbol').rename_axis('GeneName')
results_df

Unnamed: 0_level_0,GeneName,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
GeneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TSPAN6,ENSG00000000003.14,194.175388,0.184338,0.205575,0.896696,0.369881,0.999968
TNMD,ENSG00000000005.5,1.577458,-0.454634,1.138998,-0.399153,0.689781,0.999968
DPM1,ENSG00000000419.12,178.205796,-0.032840,0.205884,-0.159506,0.873270,0.999968
SCYL3,ENSG00000000457.13,135.945190,0.005344,0.235099,0.022732,0.981864,0.999968
FIRRM,ENSG00000000460.16,301.374341,-0.014087,0.201734,-0.069830,0.944329,0.999968
...,...,...,...,...,...,...,...
H2BK1,ENSG00000285480.1,1.183743,-2.735240,1.522186,-1.796916,0.072349,0.999968
OR1Q1BP,ENSG00000285491.1,0.944793,0.307932,1.491112,0.206512,0.836391,0.999968
,ENSG00000285505.1,0.546736,1.773289,1.918997,0.924071,0.355450,0.999968
,ENSG00000285508.1,56.841619,0.485548,0.696718,0.696908,0.485860,0.999968


In [5]:
def make_index_unique(df):
    """Rename repeated index labels of ``df`` in place so every label is unique.

    The first occurrence of a label is kept unchanged; each later occurrence
    receives a numeric suffix (``label_1``, ``label_2``, ...). A suffix is
    skipped when the resulting name is already used by another row, so the
    final index never contains duplicates. (Without that check the index
    ``['A', 'A', 'A_1']`` would end up containing ``'A_1'`` twice.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is rewritten. It is modified in place: the index is
        replaced by a plain list of labels, so an existing index name is lost.

    Returns
    -------
    None
    """
    labels = list(df.index)
    # Reserve every existing label up front so that a generated
    # "<label>_<n>" can never clash with a label that appears further down.
    taken = set(labels)
    first_seen = set()
    # Last suffix handed out per label, so numbering continues where it stopped.
    last_suffix = {}

    new_index = []
    for label in labels:
        if label not in first_seen:
            first_seen.add(label)
            new_index.append(label)
            continue

        n = last_suffix.get(label, 0) + 1
        candidate = f"{label}_{n}"
        while candidate in taken:
            n += 1
            candidate = f"{label}_{n}"
        last_suffix[label] = n
        taken.add(candidate)
        new_index.append(candidate)

    df.index = new_index


In [6]:
# Discard rows whose index label (the gene symbol) is missing.
results_df = results_df.loc[results_df.index.notna()]
results_df

Unnamed: 0_level_0,GeneName,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
GeneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TSPAN6,ENSG00000000003.14,194.175388,0.184338,0.205575,0.896696,0.369881,0.999968
TNMD,ENSG00000000005.5,1.577458,-0.454634,1.138998,-0.399153,0.689781,0.999968
DPM1,ENSG00000000419.12,178.205796,-0.032840,0.205884,-0.159506,0.873270,0.999968
SCYL3,ENSG00000000457.13,135.945190,0.005344,0.235099,0.022732,0.981864,0.999968
FIRRM,ENSG00000000460.16,301.374341,-0.014087,0.201734,-0.069830,0.944329,0.999968
...,...,...,...,...,...,...,...
TUSC2P1,ENSG00000285470.1,0.403128,1.997943,1.939565,1.030098,0.302964,0.999968
OR4M2-OT1,ENSG00000285472.1,2.932671,-0.529675,1.412727,-0.374931,0.707712,0.999968
H2BK1,ENSG00000285480.1,1.183743,-2.735240,1.522186,-1.796916,0.072349,0.999968
OR1Q1BP,ENSG00000285491.1,0.944793,0.307932,1.491112,0.206512,0.836391,0.999968


In [7]:

# Rewrites results_df.index in place (the function returns nothing, so there is
# no reassignment); repeated gene symbols get a numeric suffix.
make_index_unique(results_df)

In [8]:
# Keep complete rows only: a missing value in any column removes the row.
results_df = results_df.dropna(axis=0, how='any')
results_df

Unnamed: 0,GeneName,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
TSPAN6,ENSG00000000003.14,194.175388,0.184338,0.205575,0.896696,0.369881,0.999968
TNMD,ENSG00000000005.5,1.577458,-0.454634,1.138998,-0.399153,0.689781,0.999968
DPM1,ENSG00000000419.12,178.205796,-0.032840,0.205884,-0.159506,0.873270,0.999968
SCYL3,ENSG00000000457.13,135.945190,0.005344,0.235099,0.022732,0.981864,0.999968
FIRRM,ENSG00000000460.16,301.374341,-0.014087,0.201734,-0.069830,0.944329,0.999968
...,...,...,...,...,...,...,...
TUSC2P1,ENSG00000285470.1,0.403128,1.997943,1.939565,1.030098,0.302964,0.999968
OR4M2-OT1,ENSG00000285472.1,2.932671,-0.529675,1.412727,-0.374931,0.707712,0.999968
H2BK1,ENSG00000285480.1,1.183743,-2.735240,1.522186,-1.796916,0.072349,0.999968
OR1Q1BP,ENSG00000285491.1,0.944793,0.307932,1.491112,0.206512,0.836391,0.999968


In [None]:
# Volcano plot of the results table: 'log2FoldChange' on the x axis and the
# adjusted p-value column 'padj' on the y axis.
# NOTE(review): `top=10` presumably limits how many genes are labelled —
# confirm against the decoupler documentation.
dc.plot_volcano_df(
    results_df,
    x='log2FoldChange',
    y='padj',
    top=10,
    figsize=(5, 5)
)

In [9]:
# One-row matrix for decoupler: the row is labelled with the contrast name,
# the columns are the gene symbols and the values come from the 'stat' column.
mat = results_df[['stat']].rename(columns={'stat': comparison}).T
mat

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,FIRRM,FGR,CFH,FUCA2,GCLC,NFYA,...,OR11J6P,PDGFRL2P,POLR2J3_1,TUSC2P2,C4orf36_1,TUSC2P1,OR4M2-OT1,H2BK1,OR1Q1BP,TBCEL-TECTA
middleAge.vs.old,0.896696,-0.399153,-0.159506,0.022732,-0.06983,-0.374732,0.604502,-0.486915,-0.526544,-1.281789,...,-0.37693,-0.761187,-1.499686,-0.279096,-0.633141,1.030098,-0.374931,-1.796916,0.206512,0.420466


In [10]:
# Fetch the CollecTRI network for human through decoupler. The displayed table
# has one row per source/target pair with a weight and PMID column; it is used
# below as the `net` argument for TF activity inference and plotting.
collectri = dc.get_collectri(organism='human', split_complexes=False)
collectri

Unnamed: 0,source,target,weight,PMID
0,MYC,TERT,1,10022128;10491298;10606235;10637317;10723141;1...
1,SPI1,BGLAP,1,10022617
2,SMAD3,JUN,1,10022869;12374795
3,SMAD4,JUN,1,10022869;12374795
4,STAT5A,IL2,1,10022878;11435608;17182565;17911616;22854263;2...
...,...,...,...,...
43173,NFKB,hsa-miR-143-3p,1,19472311
43174,AP1,hsa-miR-206,1,19721712
43175,NFKB,hsa-miR-21-5p,1,20813833;22387281
43176,NFKB,hsa-miR-224-5p,1,23474441;23988648


In [11]:
# Infer TF activities with ulm
# The call returns a pair that is unpacked into activity scores (tf_acts) and
# matching p-values (tf_pvals); both are combined into one table further down.
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri, verbose=True)
tf_acts

Running ulm on mat with 1 samples and 28851 targets for 750 sources.


Unnamed: 0,ABL1,AHR,AHRR,AIP,AIRE,AP1,APEX1,AR,ARID1A,ARID1B,...,ZNF382,ZNF384,ZNF395,ZNF410,ZNF436,ZNF699,ZNF76,ZNF804A,ZNF91,ZXDC
middleAge.vs.old,1.231489,-1.021376,1.766211,-1.760559,0.009928,-1.33532,-1.457494,-4.115004,-1.905634,0.708271,...,2.703219,-0.166023,-0.656235,1.617273,-1.417291,-0.831803,1.192362,0.913818,0.26542,-0.643625


In [12]:
# One row per TF: activity score in the contrast column, p-value in 'pvals'.
tf_df = pd.DataFrame(tf_acts.T)
tf_df['pvals'] = tf_pvals.T
# File name follows the pattern of the other exports in this notebook,
# results/<table>_{experiment}_{comparison}.csv (pathway_acts_, enr_pvals_,
# lr_score_): note the underscore between 'tf_acts' and the experiment name.
tf_df.to_csv(f'results/tf_acts_{experiment}_{comparison}.csv')
tf_df

Unnamed: 0,middleAge.vs.old,pvals
ABL1,1.231489,0.218150
AHR,-1.021376,0.307085
AHRR,1.766211,0.077371
AIP,-1.760559,0.078324
AIRE,0.009928,0.992079
...,...,...
ZNF699,-0.831803,0.405527
ZNF76,1.192362,0.233129
ZNF804A,0.913818,0.360820
ZNF91,0.265420,0.790688


In [None]:
# First row of tf_acts as a Series indexed by TF name (mat holds a single
# contrast, so this is the only row).
values = tf_acts.iloc[0]
values

In [None]:
# Names of the five TFs with the lowest activity scores.
down_reg = values.sort_values().head(5).index.tolist()

In [None]:
# Names of the five TFs with the highest activity scores.
up_reg = values.sort_values(ascending=False).head(5).index.tolist()

In [None]:
# Start the combined TF list from an independent copy of down_reg.
up_down_reg = list(down_reg)

In [None]:
# Combined list: the lowest-activity TFs followed by the highest-activity ones.
# Built from scratch instead of extending the existing list in place, so that
# re-running this cell does not append up_reg a second time.
up_down_reg = down_reg + up_reg
up_down_reg

In [None]:
# Bar plot of the TF activity scores for the contrast row named by `comparison`.
# NOTE(review): `top=10` presumably selects the strongest activities — confirm
# against the decoupler documentation.
dc.plot_barplot(
    acts=tf_acts,
    contrast=comparison,
    top=10,
    vertical=True,
    figsize=(3, 3)
)

In [None]:
# Network plot for the TFs listed in up_down_reg (passed as `n_sources`), using
# the gene-level statistics in `mat` and the TF activities in `tf_acts`.
# The figure is also written to the absolute path given in `save`.
# NOTE(review): `n_targets=5` presumably caps the targets drawn per TF —
# confirm against the decoupler documentation.
dc.plot_network(
    net=collectri,
    obs=mat,
    act=tf_acts,
    n_sources=up_down_reg,
    n_targets=5,
    node_size=50,
    figsize=(15, 15),
    c_pos_w='darkgreen',
    c_neg_w='darkred',
    vcenter=True,
    save=f'/home/amore/work/figures/{experiment}_{comparison}_network_TF.jpg'
)

In [None]:
# Extract logFCs and pvals as one-row frames labelled with the contrast name
logFCs = results_df[['log2FoldChange']].rename(columns={'log2FoldChange': comparison}).T
pvals = results_df[['padj']].rename(columns={'padj': comparison}).T

# Plot the targets of the source named 'SRF' in the CollecTRI network
dc.plot_volcano(logFCs, pvals, comparison, name='SRF', net=collectri, top=10, sign_thr=0.05, lFCs_thr=0.5)

In [13]:
# Retrieve PROGENy model weights
# Called with decoupler's defaults; the displayed table has one row per
# pathway (source) / gene (target) pair with a signed weight and a p_value.
progeny = dc.get_progeny()
progeny

Unnamed: 0,source,target,weight,p_value
0,Androgen,TMPRSS2,11.490631,0.000000e+00
1,Androgen,NKX3-1,10.622551,2.242078e-44
2,Androgen,MBOAT2,10.472733,4.624285e-44
3,Androgen,KLK2,10.176186,1.944414e-40
4,Androgen,SARG,11.386852,2.790209e-40
...,...,...,...,...
1395,p53,CCDC150,-3.174527,7.396252e-13
1396,p53,LCE1A,6.154823,8.475458e-13
1397,p53,TREM2,4.101937,9.739648e-13
1398,p53,GDF9,3.355741,1.087433e-12


In [14]:
# Infer pathway activities with mlm
# Same one-row input matrix as for the TF analysis, now scored against the
# PROGENy network; the returned pair is unpacked into scores and p-values.
pathway_acts, pathway_pvals = dc.run_mlm(mat=mat, net=progeny, verbose=True)
pathway_acts

Running mlm on mat with 1 samples and 28851 targets for 14 sources.


Unnamed: 0,Androgen,EGFR,Estrogen,Hypoxia,JAK-STAT,MAPK,NFkB,PI3K,TGFb,TNFa,Trail,VEGF,WNT,p53
middleAge.vs.old,-0.765123,3.788988,1.142182,-0.642152,-1.462402,-2.626986,0.306523,0.929358,0.023225,-0.131037,2.095984,0.457538,3.316274,-0.925828


In [15]:
# One row per pathway: activity score in the contrast column, p-value in 'pvals'.
pathway_df = pathway_acts.T.assign(pvals=pathway_pvals.T)
pathway_df.to_csv(f'results/pathway_acts_{experiment}_{comparison}.csv')
pathway_df

Unnamed: 0,middleAge.vs.old,pvals
Androgen,-0.765123,0.444205
EGFR,3.788988,0.000152
Estrogen,1.142182,0.253388
Hypoxia,-0.642152,0.520779
JAK-STAT,-1.462402,0.143642
MAPK,-2.626986,0.008619
NFkB,0.306523,0.759209
PI3K,0.929358,0.352711
TGFb,0.023225,0.981471
TNFa,-0.131037,0.895747


In [None]:
# Bar plot of the pathway activity scores for the contrast row `comparison`.
# `top=25` exceeds the 14 pathways scored above, so all of them fit the limit.
dc.plot_barplot(
    pathway_acts,
    comparison,
    top=25,
    vertical=False,
    figsize=(6, 3)
)

In [None]:
# Plot the PROGENy targets of the 'TGFb' pathway using the 'stat' column of the
# results table. NOTE(review): `top=15` presumably limits the labelled genes.
dc.plot_targets(results_df, stat='stat', source_name='TGFb', net=progeny, top=15)


In [None]:
# Plot the PROGENy targets of the 'PI3K' pathway using the 'stat' column of the
# results table. NOTE(review): `top=15` presumably limits the labelled genes.
dc.plot_targets(results_df, stat='stat', source_name='PI3K', net=progeny, top=15)


In [16]:
# Fetch the 'MSigDB' resource through decoupler. The displayed table is in long
# format: one row per gene symbol / gene set pair, plus the set's collection.
msigdb = dc.get_resource('MSigDB')
msigdb

Unnamed: 0,genesymbol,collection,geneset
0,MAFF,chemical_and_genetic_perturbations,BOYAULT_LIVER_CANCER_SUBCLASS_G56_DN
1,MAFF,chemical_and_genetic_perturbations,ELVIDGE_HYPOXIA_UP
2,MAFF,chemical_and_genetic_perturbations,NUYTTEN_NIPP1_TARGETS_DN
3,MAFF,immunesigdb,GSE17721_POLYIC_VS_GARDIQUIMOD_4H_BMDC_DN
4,MAFF,chemical_and_genetic_perturbations,SCHAEFFER_PROSTATE_DEVELOPMENT_12HR_UP
...,...,...,...
3838543,PRAMEF22,go_biological_process,GOBP_POSITIVE_REGULATION_OF_CELL_POPULATION_PR...
3838544,PRAMEF22,go_biological_process,GOBP_APOPTOTIC_PROCESS
3838545,PRAMEF22,go_biological_process,GOBP_REGULATION_OF_CELL_DEATH
3838546,PRAMEF22,go_biological_process,GOBP_NEGATIVE_REGULATION_OF_DEVELOPMENTAL_PROCESS


In [17]:
# No collection filter is applied and gene set names are left untouched, so
# every MSigDB collection is used with its original (prefixed) set names.
# (A hallmark-only run would filter on msigdb['collection'] == 'hallmark'.)

# Remove duplicated entries: keep the first row of each (geneset, genesymbol) pair
msigdb = msigdb.drop_duplicates(subset=['geneset', 'genesymbol'], keep='first')

msigdb

Unnamed: 0,genesymbol,collection,geneset
0,MAFF,chemical_and_genetic_perturbations,BOYAULT_LIVER_CANCER_SUBCLASS_G56_DN
1,MAFF,chemical_and_genetic_perturbations,ELVIDGE_HYPOXIA_UP
2,MAFF,chemical_and_genetic_perturbations,NUYTTEN_NIPP1_TARGETS_DN
3,MAFF,immunesigdb,GSE17721_POLYIC_VS_GARDIQUIMOD_4H_BMDC_DN
4,MAFF,chemical_and_genetic_perturbations,SCHAEFFER_PROSTATE_DEVELOPMENT_12HR_UP
...,...,...,...
3838543,PRAMEF22,go_biological_process,GOBP_POSITIVE_REGULATION_OF_CELL_POPULATION_PR...
3838544,PRAMEF22,go_biological_process,GOBP_APOPTOTIC_PROCESS
3838545,PRAMEF22,go_biological_process,GOBP_REGULATION_OF_CELL_DEATH
3838546,PRAMEF22,go_biological_process,GOBP_NEGATIVE_REGULATION_OF_DEVELOPMENTAL_PROCESS


In [19]:
# Infer enrichment with ora using significant deg (adjusted p-value below 0.05)
top_genes = results_df.loc[results_df['padj'].lt(0.05)]

# Run ora
enr_pvals = dc.get_ora_df(df=top_genes, net=msigdb, source='geneset', target='genesymbol')

enr_pvals.head()

Unnamed: 0,Term,Set size,Overlap ratio,p-value,FDR p-value,Odds ratio,Combined score,Features
0,AACWWCAANK_UNKNOWN,145,0.006897,0.014448,0.036548,136.463918,578.226727,DQX1
1,ACTAYRNNNCCCR_UNKNOWN,462,0.002165,0.045668,0.060536,42.245405,130.384867,DQX1
2,BROWNE_HCMV_INFECTION_12HR_DN,98,0.010204,0.009776,0.036548,202.055838,935.074199,MYH8
3,BROWNE_HCMV_INFECTION_24HR_DN,144,0.006944,0.014349,0.036548,137.415225,583.205146,MYH8
4,CARRILLOREIXACH_HEPATOBLASTOMA_VS_NORMAL_UP,831,0.001203,0.081376,0.092307,23.054119,57.835406,DQX1


In [20]:
# Save the full ORA result table under the relative 'results/' directory.
enr_pvals.to_csv(f'results/enr_pvals_{experiment}_{comparison}.csv')

In [None]:
# Dot plot of the 15 terms with the highest 'Combined score': score on x, term
# on y, 'Odds ratio' passed as `s` and 'FDR p-value' as `c`.
# NOTE(review): `s` and `c` presumably map to dot size and colour — confirm.
dc.plot_dotplot(
    enr_pvals.sort_values('Combined score', ascending=False).head(15),
    x='Combined score',
    y='Term',
    s='Odds ratio',
    c='FDR p-value',
    scale=0.05,
    figsize=(3, 10)
)

In [None]:
# Running-score plot for one MSigDB gene set, ranked by the 'stat' column.
# The gene set names in `msigdb` are not shortened anywhere in this notebook
# (the step that strips 'HALLMARK_' is disabled), so the set has to be
# requested under its full name; the bare 'TNFA_SIGNALING_VIA_NFKB' does not
# occur in the 'geneset' column.
dc.plot_running_score(
    df=results_df,
    stat='stat',
    net=msigdb,
    source='geneset',
    target='genesymbol',
    set_name='HALLMARK_TNFA_SIGNALING_VIA_NFKB'
)

In [None]:
# Running-score plot for one MSigDB gene set, ranked by the 'stat' column.
# The gene set names in `msigdb` are not shortened anywhere in this notebook
# (the step that strips 'HALLMARK_' is disabled), so the set has to be
# requested under its full name; the bare 'MYOGENESIS' does not occur in the
# 'geneset' column.
dc.plot_running_score(
    df=results_df,
    stat='stat',
    net=msigdb,
    source='geneset',
    target='genesymbol',
    set_name='HALLMARK_MYOGENESIS'
)

In [21]:
import liana as ln

# Ligand-receptor resource selected with liana's defaults, passed through
# explode_complexes (presumably one row per complex member — confirm).
liana_lr = ln.resource.explode_complexes(ln.resource.select_resource())

# Long format: one (interaction, genes) row for the ligand side and one for
# the receptor side of every interaction.
df1 = liana_lr[['interaction', 'ligand']].rename(columns={'ligand': 'genes'})
df2 = liana_lr[['interaction', 'receptor']].rename(columns={'receptor': 'genes'})

# Stack both halves and give every edge the same weight of 1.
liana_lr = pd.concat([df1, df2], axis=0).assign(weight=1)

# Flag rows that repeat an earlier row, then keep only the first copy of each.
duplicates = liana_lr.duplicated()
liana_lr = liana_lr.loc[~duplicates]

liana_lr

Unnamed: 0,interaction,genes,weight
0,LGALS9&PTPRC,LGALS9,1
1,LGALS9&MET,LGALS9,1
2,LGALS9&CD44,LGALS9,1
3,LGALS9&LRP1,LGALS9,1
4,LGALS9&CD47,LGALS9,1
...,...,...,...
5775,BMP2&ACTR2,ACTR2,1
5776,BMP15&ACTR2,ACTR2,1
5777,CSF1&CSF3R,CSF3R,1
5778,IL36G&IFNAR1,IFNAR1,1


In [22]:
# Infer lr activities with ulm
# The ligand-receptor table is used as the network: each 'interaction' is a
# source and its member 'genes' are the targets. min_n=2 is passed explicitly.
# NOTE(review): min_n presumably is the minimum number of targets a source
# needs to be scored — confirm against the decoupler documentation.
lr_score, lr_pvalue = dc.run_ulm(
    mat=mat,
    net=liana_lr,
    source='interaction',
    target='genes',
    min_n=2,
    verbose=True
)

Running ulm on mat with 1 samples and 28851 targets for 4106 sources.


In [None]:
# Bar plot of the ligand-receptor scores for the contrast row `comparison`.
# NOTE(review): `top=25` presumably selects the strongest interactions — confirm.
dc.plot_barplot(lr_score, comparison, top=25, vertical=True)


In [23]:
# One row per interaction: score in the contrast column, p-value in 'pvals'.
lr_df = lr_score.T.assign(pvals=lr_pvalue.T)
lr_df.to_csv(f'results/lr_score_{experiment}_{comparison}.csv')
lr_df

Unnamed: 0,middleAge.vs.old,pvals
A2M&LRP1,0.225797,0.821361
AANAT&MTNR1A,0.929301,0.352741
AANAT&MTNR1B,-0.098470,0.921560
ACE&BDKRB2,-0.164478,0.869356
ACTR2&ADRB2,-2.020147,0.043377
...,...,...
ZG16B&TLR5,1.887015,0.059168
ZG16B&TLR6,1.027848,0.304030
ZP3&CHRNA7,-1.721731,0.085129
ZP3&EGFR,-1.828419,0.067497
