In [1]:
# --- Imports ---
import pandas as pd
from typing import List, TypedDict
from scipy.stats import hypergeom

In [2]:
# Read the K562 network table (tab-separated), report its dimensions,
# and preview the first three rows.
df1 = pd.read_csv('K562/K562.network.tsv', sep='\t')

print(df1.shape)
df1.head(3)

(78062, 9)


Unnamed: 0,chrom,chromStart,chromEnd,name,class,targetGene,targetEnsemblID,cellType,MPRabc
0,chr1,9863,10662,intergenic|chr1:9863-10662,intergenic,OR4F5,ENSG00000186092,K562,0.352643
1,chr1,9863,10662,intergenic|chr1:9863-10662,intergenic,FAM138F,ENSG00000282591,K562,0.45579
2,chr1,9863,10662,intergenic|chr1:9863-10662,intergenic,FAM138A,ENSG00000237613,K562,0.45579


In [3]:
# Read the HepG2 network table (tab-separated) and report its dimensions.
df2 = pd.read_csv('HepG2/HepG2.network.tsv', sep='\t')

print(df2.shape)

(117160, 9)


In [4]:
# Read the hiPSC network table (tab-separated) and report its dimensions.
df3 = pd.read_csv('hiPSC/hiPSC.network.tsv', sep='\t')

print(df3.shape)

(67568, 9)


In [5]:
# Read the liver gene list (tab-separated), report its dimensions,
# and preview the first three rows.
genes = pd.read_csv('genes/Liver.genes.tsv', sep='\t')

print(genes.shape)
genes.head(3)

(249, 1)


Unnamed: 0,conditionSpecificGenes
0,AADAC
1,ABCB4
2,ABCC2


In [6]:
# Enrichment test
class EnrichmentResult(TypedDict):
    cell_type: str
    M: int
    K: int
    n: int
    k: int
    expected_overlap: float
    p_val: float
    overlap_genes: List[str]

def test_gene_set_enrichment(dfs: List[pd.DataFrame], genes_df: pd.DataFrame, fg_biosample: str) -> EnrichmentResult:
    networks = pd.concat(dfs, ignore_index = True)
    universe = pd.Index(networks['targetGene'].dropna().unique())

    gene_set = pd.Index(genes_df['conditionSpecificGenes'].dropna().unique())
    gene_set = gene_set.intersection(universe)

    fg_df = networks[networks['cellType'] == fg_biosample]
    fg_genes = pd.Index(fg_df['targetGene'].dropna().unique()).intersection(universe)

    M = len(universe)
    K = len(gene_set)
    n = len(fg_genes)
    overlap_genes = fg_genes.intersection(gene_set)
    k = len(overlap_genes)

    if M == 0 or K == 0 or n == 0:
        expected_overlap: float = float('nan')
        p_val: float = float('nan')
    else:
        expected_overlap = n * (K / M)
        p_val = float(hypergeom.sf(k - 1, M, K, n))

    result: EnrichmentResult = {
        'cell_type': fg_biosample,
        'M': M,
        'K': K,
        'n': n,
        'k': k,
        'expected_overlap': expected_overlap,
        'p_val': p_val,
        'overlap_genes': list(overlap_genes),
    }

    return result

In [7]:
# Results: enrichment of the liver gene list among HepG2 target genes,
# with all three networks pooled as the background.
res = test_gene_set_enrichment(
    dfs=[df1, df2, df3],
    genes_df=genes,
    fg_biosample='HepG2',
)

print(res['p_val'])

6.6274567193882876e-09


In [8]:
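# TODO: fill in the K562 and hiPSC thresholds by rerunning the test with
# fg_biosample set to each of those cell types.
# K562  ---> p < X
# HepG2 ---> p < 1e-08 (observed p = 6.63e-09 in the Results cell)
# hiPSC ---> p < X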