In [1]:
import scanpy as sc
import decoupler as dc

# Only needed for processing
import numpy as np
import pandas as pd
from anndata import AnnData

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
# Identifier of the differential-expression result set; used to build input and output file names.
experiment = "GSE164471_MAvY_DDS"
# Label of the contrast; used as the sample (row) name of the matrices passed to decoupler.
comparison = 'young.vs.middleage'

In [3]:
# Load the differential-expression table, index it by the 'row' column,
# discard the leftover positional column and label the index 'GeneName'.
results_df = (
    pd.read_csv(f'/home/amore/work/data/{experiment}.csv')
    .set_index('row')
    .drop(columns=['Unnamed: 0'])
    .rename_axis('GeneName')
)
results_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
GeneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000003.14,175.685322,-0.049808,0.230487,-0.216097,0.828912,0.957591
ENSG00000000005.5,1.797225,0.425876,0.968858,0.439565,0.660252,
ENSG00000000419.12,162.870939,-0.194526,0.228771,-0.850309,0.395153,0.830629
ENSG00000000457.13,120.250411,-0.208984,0.268479,-0.778400,0.436333,0.841090
ENSG00000000460.16,295.833043,0.182003,0.220999,0.823547,0.410197,0.833987
...,...,...,...,...,...,...
ENSG00000285480.1,1.621407,-0.024949,1.324500,-0.018836,0.984972,
ENSG00000285491.1,1.489357,1.778577,1.351951,1.315564,0.188320,
ENSG00000285505.1,0.408963,1.297987,1.624595,0.798960,0.424313,
ENSG00000285508.1,40.905166,-0.819949,0.675841,-1.213228,0.225042,0.799189


In [4]:
# Plain Python list of the (versioned) Ensembl gene IDs held in the index.
ensembl_names = results_df.index.to_list()

In [None]:
# Retrieve gene symbols
# Queries BioMart for the "hsapiens" dataset with the cache disabled and
# indexes the resulting table by unversioned Ensembl gene ID.
annot = sc.queries.biomart_annotations("hsapiens",
        ["ensembl_gene_id", "external_gene_name"],
        use_cache=False
    ).set_index("ensembl_gene_id")


In [None]:
# Keep only the versioned IDs whose stem (text before the first '.') is annotated.
ensembl_names = [
    versioned_id
    for versioned_id in ensembl_names
    if versioned_id.partition('.')[0] in annot.index
]


In [None]:
# Restrict the table to rows whose index label is one of the annotated IDs.
results_df = results_df.loc[results_df.index.isin(ensembl_names)]

In [None]:
# Strip the version suffix: keep the text before the first '.' of every ID.
ensembl_names = [versioned_id.partition('.')[0] for versioned_id in ensembl_names]


In [None]:
# Assign gene symbols
# `results_df` was produced by a row filter, so plain item assignment
# (`results_df['gene_symbol'] = ...`) may raise SettingWithCopyWarning.
# `.assign` returns a new frame with the extra column instead.
# `ensembl_names` is expected to hold one unversioned ID per row, in row order.
results_df = results_df.assign(
    gene_symbol=[
        annot.loc[ensembl_id, 'external_gene_name'] for ensembl_id in ensembl_names
    ]
)
results_df

In [None]:
# Cache the symbol-annotated table on disk; the index (Ensembl IDs) is written as the first column.
results_df.to_csv(f'/home/amore/work/data/{experiment}_gene_symbol.csv')

In [3]:
# Reload the symbol-annotated table from disk. No index column is specified,
# so every CSV column (including the saved Ensembl IDs) is read as a regular column.
results_df =  pd.read_csv(f'/home/amore/work/data/{experiment}_gene_symbol.csv')


In [4]:
# Index the table by gene symbol and label that index 'GeneName'.
# NOTE(review): per the rendered output the frame already carries a 'GeneName'
# column (the Ensembl IDs), so index name and column label coincide — confirm
# nothing downstream refers to 'GeneName' ambiguously.
results_df = results_df.set_index('gene_symbol').rename_axis('GeneName')
results_df

Unnamed: 0_level_0,GeneName,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
GeneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TSPAN6,ENSG00000000003.14,175.685322,-0.049808,0.230487,-0.216097,0.828912,0.957591
TNMD,ENSG00000000005.5,1.797225,0.425876,0.968858,0.439565,0.660252,
DPM1,ENSG00000000419.12,162.870939,-0.194526,0.228771,-0.850309,0.395153,0.830629
SCYL3,ENSG00000000457.13,120.250411,-0.208984,0.268479,-0.778400,0.436333,0.841090
FIRRM,ENSG00000000460.16,295.833043,0.182003,0.220999,0.823547,0.410197,0.833987
...,...,...,...,...,...,...,...
H2BK1,ENSG00000285480.1,1.621407,-0.024949,1.324500,-0.018836,0.984972,
OR1Q1BP,ENSG00000285491.1,1.489357,1.778577,1.351951,1.315564,0.188320,
,ENSG00000285505.1,0.408963,1.297987,1.624595,0.798960,0.424313,
,ENSG00000285508.1,40.905166,-0.819949,0.675841,-1.213228,0.225042,0.799189


In [5]:
def make_index_unique(df):
    """Make the index labels of ``df`` unique, modifying ``df`` in place.

    The first occurrence of a label is kept unchanged. Every later occurrence
    is renamed to ``"<label>_<n>"``, with ``n`` counting up from 1 per label.
    A suffix is skipped when the resulting name is already present in the
    original index or was already handed out, so the final index is guaranteed
    to be unique (a plain counter would turn ``['A', 'A', 'A_1']`` into
    ``['A', 'A_1', 'A_1']``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is replaced in place. The new index is built from a
        plain list, so any index name is not carried over.

    Returns
    -------
    None
    """
    # Number of suffixes tried so far for each label already seen.
    counts = {}
    # Every name that must not be generated: original labels plus issued ones.
    reserved = set(df.index)

    def unique_index(label):
        # First sighting: keep the label as it is.
        if label not in counts:
            counts[label] = 0
            return label
        # Repeat: advance the counter until the candidate name is free.
        while True:
            counts[label] += 1
            candidate = f"{label}_{counts[label]}"
            if candidate not in reserved:
                reserved.add(candidate)
                return candidate

    # Apply the renaming to each index value, in order.
    df.index = [unique_index(label) for label in df.index]


In [6]:
# Discard rows that have no gene symbol (missing index label).
has_symbol = results_df.index.notna()
results_df = results_df.loc[has_symbol]
results_df

Unnamed: 0_level_0,GeneName,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
GeneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TSPAN6,ENSG00000000003.14,175.685322,-0.049808,0.230487,-0.216097,0.828912,0.957591
TNMD,ENSG00000000005.5,1.797225,0.425876,0.968858,0.439565,0.660252,
DPM1,ENSG00000000419.12,162.870939,-0.194526,0.228771,-0.850309,0.395153,0.830629
SCYL3,ENSG00000000457.13,120.250411,-0.208984,0.268479,-0.778400,0.436333,0.841090
FIRRM,ENSG00000000460.16,295.833043,0.182003,0.220999,0.823547,0.410197,0.833987
...,...,...,...,...,...,...,...
TUSC2P1,ENSG00000285470.1,0.132631,0.585087,3.172873,0.184403,0.853697,
OR4M2-OT1,ENSG00000285472.1,3.172226,0.095544,1.197045,0.079817,0.936383,
H2BK1,ENSG00000285480.1,1.621407,-0.024949,1.324500,-0.018836,0.984972,
OR1Q1BP,ENSG00000285491.1,1.489357,1.778577,1.351951,1.315564,0.188320,


In [7]:

# De-duplicate gene symbols in place: repeated index labels get a numeric suffix.
make_index_unique(results_df)

In [8]:
# Drop every row that has a missing value in any column.
# NOTE(review): this runs after the index was de-duplicated, so a suffixed label
# (e.g. `POLR2J3_1` in the output) may survive while its unsuffixed twin is
# dropped here; suffixed labels presumably will not match gene symbols in the
# prior-knowledge networks used later — confirm and consider filtering first.
results_df = results_df.dropna()
results_df

Unnamed: 0,GeneName,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
TSPAN6,ENSG00000000003.14,175.685322,-0.049808,0.230487,-0.216097,0.828912,0.957591
DPM1,ENSG00000000419.12,162.870939,-0.194526,0.228771,-0.850309,0.395153,0.830629
SCYL3,ENSG00000000457.13,120.250411,-0.208984,0.268479,-0.778400,0.436333,0.841090
FIRRM,ENSG00000000460.16,295.833043,0.182003,0.220999,0.823547,0.410197,0.833987
FGR,ENSG00000000938.12,18.763385,0.377124,0.515070,0.732180,0.464058,0.845462
...,...,...,...,...,...,...,...
OR11J6P,ENSG00000285405.1,18.022689,-0.274647,0.944019,-0.290934,0.771102,0.940642
PDGFRL2P,ENSG00000285420.1,9.210927,-0.751170,0.689271,-1.089805,0.275799,0.805466
POLR2J3_1,ENSG00000285437.1,1511.002799,-0.190077,0.194396,-0.977783,0.328182,0.818797
C4orf36_1,ENSG00000285458.1,347.370170,0.174586,0.370647,0.471030,0.637619,0.901866


In [None]:
# Volcano plot of the differential-expression table: log2 fold change on x,
# adjusted p-value column on y; `top=10` is passed to decoupler's plotting helper.
dc.plot_volcano_df(
    results_df,
    x='log2FoldChange',
    y='padj',
    top=10,
    figsize=(5, 5)
)

In [9]:
# One-row matrix for decoupler: the 'stat' column laid out as a single row
# named after the contrast, with one column per gene.
mat = results_df['stat'].to_frame(name=comparison).T
mat

Unnamed: 0,TSPAN6,DPM1,SCYL3,FIRRM,FGR,CFH,FUCA2,GCLC,NFYA,STPG1,...,OR4H6BP,THSD1P1,PDE4C_1,OOSP3,ABCF2-H2BK1,OR11J6P,PDGFRL2P,POLR2J3_1,C4orf36_1,TBCEL-TECTA
young.vs.middleage,-0.216097,-0.850309,-0.7784,0.823547,0.73218,1.300634,0.460345,-0.628157,-1.016378,0.857304,...,0.648509,-1.160598,0.507305,1.208897,-1.650777,-0.290934,-1.089805,-0.977783,0.47103,-0.661925


In [10]:
# Download the CollecTRI TF–target network for human. With split_complexes=False
# the rendered output shows complexes such as AP1 and NFKB as single sources.
collectri = dc.get_collectri(organism='human', split_complexes=False)
collectri

Unnamed: 0,source,target,weight,PMID
0,MYC,TERT,1,10022128;10491298;10606235;10637317;10723141;1...
1,SPI1,BGLAP,1,10022617
2,SMAD3,JUN,1,10022869;12374795
3,SMAD4,JUN,1,10022869;12374795
4,STAT5A,IL2,1,10022878;11435608;17182565;17911616;22854263;2...
...,...,...,...,...
43173,NFKB,hsa-miR-143-3p,1,19472311
43174,AP1,hsa-miR-206,1,19721712
43175,NFKB,hsa-miR-21-5p,1,20813833;22387281
43176,NFKB,hsa-miR-224-5p,1,23474441;23988648


In [11]:
# Infer TF activities with ulm
# Returns two frames (activity scores and p-values) with one row per row of `mat`
# and one column per source in `collectri`.
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri, verbose=True)
tf_acts

Running ulm on mat with 1 samples and 18244 targets for 688 sources.


Unnamed: 0,ABL1,AHR,AIP,AIRE,AP1,APEX1,AR,ARID1A,ARID1B,ARID3A,...,ZNF362,ZNF382,ZNF384,ZNF395,ZNF436,ZNF699,ZNF76,ZNF804A,ZNF91,ZXDC
young.vs.middleage,-0.259054,0.256422,0.12763,1.706539,-3.205193,-0.502371,-4.942704,-1.588408,0.960041,0.263229,...,-1.947352,1.089465,-0.983566,-0.534556,-1.640045,-0.702148,-0.677438,-0.807104,-0.209226,0.394685


In [12]:
# Long-format TF table: one row per TF, with its activity score (column named
# after the contrast) and the matching p-value.
tf_df = pd.DataFrame(tf_acts.T)
tf_df['pvals']=tf_pvals.T
# File name now follows the `<kind>_{experiment}_{comparison}.csv` pattern used by
# the other result exports; the `_` after `tf_acts` was previously missing.
tf_df.to_csv(f'results/tf_acts_{experiment}_{comparison}.csv')
tf_df

Unnamed: 0,young.vs.middleage,pvals
ABL1,-0.259054,0.795597
AHR,0.256422,0.797628
AIP,0.127630,0.898443
AIRE,1.706539,0.087925
AP1,-3.205193,0.001352
...,...,...
ZNF699,-0.702148,0.482596
ZNF76,-0.677438,0.498137
ZNF804A,-0.807104,0.419617
ZNF91,-0.209226,0.834274


In [None]:
# First (and, for this single-contrast matrix, only) row of TF activities as a Series indexed by TF.
values = tf_acts.iloc[0]
values

In [None]:
# Names of the five TFs with the lowest activity scores.
down_reg = values.sort_values().head(5).index.tolist()

In [None]:
# Names of the five TFs with the highest activity scores.
up_reg = values.sort_values(ascending=False).head(5).index.tolist()

In [None]:
# Start the combined TF list as an independent copy of the down-regulated names.
up_down_reg = list(down_reg)

In [None]:
# Combined list: down-regulated TFs followed by up-regulated TFs.
# Built from its two inputs rather than extended in place, so re-running this
# cell cannot append `up_reg` a second time.
up_down_reg = down_reg + up_reg
up_down_reg

In [None]:
# Bar plot of TF activity scores for this contrast (`top=10` is passed to the helper).
dc.plot_barplot(
    acts=tf_acts,
    contrast=comparison,
    top=10,
    vertical=True,
    figsize=(3, 3)
)

In [None]:
# Network plot for the selected TFs: `n_sources` receives the explicit list of
# TF names built above, and `n_targets=5` is passed for the targets.
# The figure is also written to the absolute path given in `save`.
dc.plot_network(
    net=collectri,
    obs=mat,
    act=tf_acts,
    n_sources=up_down_reg,
    n_targets=5,
    node_size=50,
    figsize=(15, 15),
    c_pos_w='darkgreen',
    c_neg_w='darkred',
    vcenter=True,
    save=f'/home/amore/work/figures/{experiment}_{comparison}_network_TF.jpg'
)

In [None]:
# One-row frames (row named after the contrast, one column per gene) holding
# the log2 fold changes and the adjusted p-values.
logFCs = results_df['log2FoldChange'].to_frame(name=comparison).T
pvals = results_df['padj'].to_frame(name=comparison).T

# Volcano plot for the 'SRF' source of the CollecTRI network.
dc.plot_volcano(
    logFCs, pvals, comparison,
    name='SRF', net=collectri,
    top=10, sign_thr=0.05, lFCs_thr=0.5,
)

In [13]:
# Retrieve PROGENy model weights
# Called with decoupler's default arguments; the output lists
# (source pathway, target gene, weight, p_value) rows.
progeny = dc.get_progeny()
progeny

Unnamed: 0,source,target,weight,p_value
0,Androgen,TMPRSS2,11.490631,0.000000e+00
1,Androgen,NKX3-1,10.622551,2.242078e-44
2,Androgen,MBOAT2,10.472733,4.624285e-44
3,Androgen,KLK2,10.176186,1.944414e-40
4,Androgen,SARG,11.386852,2.790209e-40
...,...,...,...,...
1395,p53,CCDC150,-3.174527,7.396252e-13
1396,p53,LCE1A,6.154823,8.475458e-13
1397,p53,TREM2,4.101937,9.739648e-13
1398,p53,GDF9,3.355741,1.087433e-12


In [14]:
# Infer pathway activities with mlm
pathway_acts, pathway_pvals = dc.run_mlm(mat=mat, net=progeny, verbose=True)
pathway_acts

Running mlm on mat with 1 samples and 18244 targets for 14 sources.


Unnamed: 0,Androgen,EGFR,Estrogen,Hypoxia,JAK-STAT,MAPK,NFkB,PI3K,TGFb,TNFa,Trail,VEGF,WNT,p53
young.vs.middleage,-1.132366,-0.133314,-1.015983,-1.205596,0.689553,-0.250806,1.002253,2.347256,-1.634072,-1.438217,0.842139,-1.196202,1.587776,0.357393


In [15]:
# Long-format pathway table: one row per pathway, with its activity score
# (column named after the contrast) and the matching p-value; also saved to disk.
pathway_df = pathway_acts.T.assign(pvals=pathway_pvals.T)
pathway_df.to_csv(f'results/pathway_acts_{experiment}_{comparison}.csv')
pathway_df

Unnamed: 0,young.vs.middleage,pvals
Androgen,-1.132366,0.257496
EGFR,-0.133314,0.893947
Estrogen,-1.015983,0.309651
Hypoxia,-1.205596,0.227989
JAK-STAT,0.689553,0.490484
MAPK,-0.250806,0.801967
NFkB,1.002253,0.316235
PI3K,2.347256,0.018923
TGFb,-1.634072,0.102261
TNFa,-1.438217,0.15039


In [None]:
# Bar plot of pathway activity scores for this contrast (top=25 exceeds the 14 PROGENy pathways, so all are eligible).
dc.plot_barplot(
    pathway_acts,
    comparison,
    top=25,
    vertical=False,
    figsize=(6, 3)
)

In [None]:
# Target-gene plot for the PROGENy 'TGFb' pathway, using the 'stat' column (top=15).
dc.plot_targets(results_df, stat='stat', source_name='TGFb', net=progeny, top=15)


In [None]:
# Target-gene plot for the PROGENy 'PI3K' pathway, using the 'stat' column (top=15).
dc.plot_targets(results_df, stat='stat', source_name='PI3K', net=progeny, top=15)


In [16]:
# Fetch the MSigDB resource and keep a single row per (geneset, genesymbol) pair.
msigdb = dc.get_resource('MSigDB').drop_duplicates(subset=['geneset', 'genesymbol'])

msigdb

Unnamed: 0,genesymbol,collection,geneset
0,MAFF,chemical_and_genetic_perturbations,BOYAULT_LIVER_CANCER_SUBCLASS_G56_DN
1,MAFF,chemical_and_genetic_perturbations,ELVIDGE_HYPOXIA_UP
2,MAFF,chemical_and_genetic_perturbations,NUYTTEN_NIPP1_TARGETS_DN
3,MAFF,immunesigdb,GSE17721_POLYIC_VS_GARDIQUIMOD_4H_BMDC_DN
4,MAFF,chemical_and_genetic_perturbations,SCHAEFFER_PROSTATE_DEVELOPMENT_12HR_UP
...,...,...,...
3838543,PRAMEF22,go_biological_process,GOBP_POSITIVE_REGULATION_OF_CELL_POPULATION_PR...
3838544,PRAMEF22,go_biological_process,GOBP_APOPTOTIC_PROCESS
3838545,PRAMEF22,go_biological_process,GOBP_REGULATION_OF_CELL_DEATH
3838546,PRAMEF22,go_biological_process,GOBP_NEGATIVE_REGULATION_OF_DEVELOPMENTAL_PROCESS


In [17]:
# Filter by hallmark
# (disabled: every MSigDB collection is kept)
#msigdb = msigdb[msigdb['collection']=='hallmark']

# Remove duplicated entries
# (same (geneset, genesymbol) de-duplication as applied when the resource was loaded)
msigdb = msigdb[~msigdb.duplicated(['geneset', 'genesymbol'])]

# Rename
# (disabled: gene set names keep their original form, including any 'HALLMARK_' prefix)
#msigdb.loc[:, 'geneset'] = [name.split('HALLMARK_')[1] for name in msigdb['geneset']]

msigdb

Unnamed: 0,genesymbol,collection,geneset
0,MAFF,chemical_and_genetic_perturbations,BOYAULT_LIVER_CANCER_SUBCLASS_G56_DN
1,MAFF,chemical_and_genetic_perturbations,ELVIDGE_HYPOXIA_UP
2,MAFF,chemical_and_genetic_perturbations,NUYTTEN_NIPP1_TARGETS_DN
3,MAFF,immunesigdb,GSE17721_POLYIC_VS_GARDIQUIMOD_4H_BMDC_DN
4,MAFF,chemical_and_genetic_perturbations,SCHAEFFER_PROSTATE_DEVELOPMENT_12HR_UP
...,...,...,...
3838543,PRAMEF22,go_biological_process,GOBP_POSITIVE_REGULATION_OF_CELL_POPULATION_PR...
3838544,PRAMEF22,go_biological_process,GOBP_APOPTOTIC_PROCESS
3838545,PRAMEF22,go_biological_process,GOBP_REGULATION_OF_CELL_DEATH
3838546,PRAMEF22,go_biological_process,GOBP_NEGATIVE_REGULATION_OF_DEVELOPMENTAL_PROCESS


In [18]:
# Over-representation analysis on the significant genes (adjusted p-value < 0.05).
top_genes = results_df.loc[results_df['padj'].lt(0.05)]

# Test the selected genes against every MSigDB gene set.
enr_pvals = dc.get_ora_df(df=top_genes, net=msigdb, source='geneset', target='genesymbol')

enr_pvals.head()

Unnamed: 0,Term,Set size,Overlap ratio,p-value,FDR p-value,Odds ratio,Combined score,Features
0,AAAYRNCTG_UNKNOWN,370,0.002703,0.200798,0.276949,6.910931,11.095199,CYRIA
1,AAAYWAACM_HFH4_01,260,0.003846,0.145353,0.222249,9.884253,19.062695,CYRIA
2,AACTTT_UNKNOWN,1916,0.001044,0.321635,0.38082,2.246717,2.548539,CLIC4;CYRIA
3,ACACTCC_MIR122A,75,0.013333,0.044095,0.188788,34.423553,107.449783,CLIC4
4,ACEVEDO_LIVER_CANCER_UP,970,0.001031,0.449405,0.4882,2.557691,2.045721,GTPBP4


In [19]:
# Save the full ORA result table for this experiment and contrast.
enr_pvals.to_csv(f'results/enr_pvals_{experiment}_{comparison}.csv')

In [None]:
# Interactive check of the object type returned by the ORA step (debugging aid).
type(enr_pvals)

In [None]:
# Dot plot of the 15 terms with the highest 'Combined score': dot size from
# 'Odds ratio', colour from 'FDR p-value'.
dc.plot_dotplot(
    enr_pvals.sort_values('Combined score', ascending=False).head(15),
    x='Combined score',
    y='Term',
    s='Odds ratio',
    c='FDR p-value',
    scale=0.005,
    figsize=(3, 10)
)

In [None]:
# Running-score plot for the hallmark TNFA/NFKB gene set, ranked by 'stat'.
# The step that strips the 'HALLMARK_' prefix from `msigdb['geneset']` is
# commented out in this notebook, so the set must be addressed by its full
# MSigDB name; the short name would not match any row of `msigdb`.
dc.plot_running_score(
    df=results_df,
    stat='stat',
    net=msigdb,
    source='geneset',
    target='genesymbol',
    set_name='HALLMARK_TNFA_SIGNALING_VIA_NFKB'
)

In [None]:
# Running-score plot for the hallmark myogenesis gene set, ranked by 'stat'.
# The step that strips the 'HALLMARK_' prefix from `msigdb['geneset']` is
# commented out in this notebook, so the set must be addressed by its full
# MSigDB name; the short name would not match any row of `msigdb`.
dc.plot_running_score(
    df=results_df,
    stat='stat',
    net=msigdb,
    source='geneset',
    target='genesymbol',
    set_name='HALLMARK_MYOGENESIS'
)

In [20]:
import liana as ln

# Ligand–receptor resource with complexes expanded into their members.
liana_lr = ln.resource.explode_complexes(ln.resource.select_resource())

# Two views of the resource, one per interaction partner, sharing the
# column layout ('interaction', 'genes').
df1 = liana_lr[['interaction', 'ligand']].rename(columns={'ligand': 'genes'})
df2 = liana_lr[['interaction', 'receptor']].rename(columns={'receptor': 'genes'})

# Stack ligands on top of receptors and give every row a unit weight.
liana_lr = pd.concat([df1, df2], axis=0).assign(weight=1)

# Keep only the first occurrence of each fully identical row.
duplicates = liana_lr.duplicated()
liana_lr = liana_lr.loc[~duplicates]

liana_lr

Unnamed: 0,interaction,genes,weight
0,LGALS9&PTPRC,LGALS9,1
1,LGALS9&MET,LGALS9,1
2,LGALS9&CD44,LGALS9,1
3,LGALS9&LRP1,LGALS9,1
4,LGALS9&CD47,LGALS9,1
...,...,...,...
5775,BMP2&ACTR2,ACTR2,1
5776,BMP15&ACTR2,ACTR2,1
5777,CSF1&CSF3R,CSF3R,1
5778,IL36G&IFNAR1,IFNAR1,1


In [21]:
# Infer lr activities with ulm
# Each 'interaction' is a source and its member 'genes' are the targets;
# min_n=2 is passed as the minimum number of targets per source.
lr_score, lr_pvalue = dc.run_ulm(
    mat=mat,
    net=liana_lr,
    source='interaction',
    target='genes',
    min_n=2,
    verbose=True
)

Running ulm on mat with 1 samples and 18244 targets for 2803 sources.


In [None]:
# Bar plot of ligand–receptor interaction scores for this contrast (`top=25` is passed to the helper).
dc.plot_barplot(lr_score, comparison, top=25, vertical=True)


In [22]:
# Long-format interaction table: one row per ligand–receptor pair, with its
# score (column named after the contrast) and the matching p-value; also saved to disk.
lr_df = lr_score.T.assign(pvals=lr_pvalue.T)
lr_df.to_csv(f'results/lr_score_{experiment}_{comparison}.csv')
lr_df

Unnamed: 0,young.vs.middleage,pvals
A2M&LRP1,0.305769,0.759784
AANAT&MTNR1B,1.208114,0.227019
ACE&BDKRB2,-1.051802,0.292904
ACTR2&ADRB2,-3.278332,0.001046
ACTR2&LDLR,-1.657092,0.097518
...,...,...
WNT9B&FZD9_LRP6,-0.618921,0.535976
YBX1&NOTCH1,-1.527957,0.126541
ZP3&CHRNA7,-0.935336,0.349627
ZP3&EGFR,-2.177770,0.029436
