In [1]:
# --- Imports ---
import pandas as pd
from scipy.stats import fisher_exact
import numpy as np
from statsmodels.stats.multitest import multipletests

In [2]:
# Per-biosample input files (tab-separated): the network table under 'E-Gs'
# and the FIMO motif-hit table under 'TFs'.
biosamples = {
    'K562':  {'E-Gs': 'K562/K562.network.tsv',   'TFs': 'K562/K562.FIMO.VERIFIED.tsv'},
    'HepG2': {'E-Gs': 'HepG2/HepG2.network.tsv', 'TFs': 'HepG2/HepG2.FIMO.VERIFIED.tsv'},
    'hiPSC': {'E-Gs': 'hiPSC/hiPSC.network.tsv', 'TFs': 'hiPSC/hiPSC.FIMO.VERIFIED.tsv'},
}

# data[biosample] -> {'E-Gs': network table, 'TFs': motif hits restricted to network regions}
data = {}

for biosample, fnames in biosamples.items():
    network = pd.read_csv(fnames['E-Gs'], sep='\t')
    motif_hits = pd.read_csv(fnames['TFs'], sep='\t')

    # Keep only motif hits whose 'name' also appears in this biosample's network.
    in_network = motif_hits['name'].isin(set(network['name'].unique()))

    data[biosample] = {'E-Gs': network, 'TFs': motif_hits[in_network].copy()}

In [3]:
# Gather the distinct (chrom, chromStart, chromEnd, name) rows of every
# biosample's network, tag each row with its biosample, and stack everything
# into a single coordinate-sorted table.
coord_cols = ['chrom', 'chromStart', 'chromEnd', 'name']

regions_list = [
    d['E-Gs'][coord_cols].drop_duplicates().assign(biosample=biosample)
    for biosample, d in data.items()
]

regions = (
    pd.concat(regions_list, ignore_index=True)
    .sort_values(['chrom', 'chromStart', 'chromEnd'])
    .reset_index(drop=True)
)

In [4]:
# Give every run of overlapping intervals a common integer region_id.
#
# `regions` is swept in its current row order; the sweep is only meaningful
# when rows are ordered by chrom and then chromStart. A new id is opened when
# the chromosome changes or when an interval starts strictly after the furthest
# end seen so far in the current cluster. An interval whose start equals that
# end is therefore merged into the cluster rather than starting a new one.
region_ids = []

curr_region_id = 0   # ids start at 1; 0 means "no cluster opened yet"
curr_chrom = None
curr_end = -1

# Iterate the three columns directly: DataFrame.iterrows() would build a
# Series for every row, and the row index is not needed here.
for chrom, start, end in zip(regions['chrom'], regions['chromStart'], regions['chromEnd']):
    if (chrom != curr_chrom) or (start > curr_end):
        # Open a new cluster.
        curr_region_id += 1
        curr_chrom = chrom
        curr_end = end
    elif end > curr_end:
        # Same cluster: extend its right edge.
        curr_end = end

    region_ids.append(curr_region_id)

regions['region_id'] = region_ids

In [5]:
# For each merged region: the distinct biosamples contributing an interval,
# and how many there are.
region_biosamples = regions.groupby('region_id')['biosample'].unique()
region_n_biosamples = region_biosamples.map(len)

# Broadcast the per-region count back onto every row of `regions`.
regions['n_biosamples'] = region_n_biosamples.reindex(regions['region_id']).to_numpy()

In [6]:
# Attach region_id and n_biosamples to every row of each biosample's network
# table by joining on the full interval key.
region_keys = ['chrom', 'chromStart', 'chromEnd', 'name']
region_annot = ['region_id', 'n_biosamples']

for biosample, d in data.items():
    # Drop annotations left by a previous run of this cell; otherwise a re-run
    # would produce region_id_x / region_id_y columns and break later cells
    # that read nw['region_id'].
    nw = d['E-Gs'].drop(columns=region_annot, errors='ignore')

    reg_sub = regions.loc[regions['biosample'] == biosample, region_keys + region_annot]

    # Each key must match at most one row of reg_sub; validate makes pandas
    # raise instead of silently duplicating network rows if that ever fails.
    nw = nw.merge(reg_sub, on=region_keys, how='left', validate='many_to_one')

    data[biosample]['E-Gs'] = nw

In [7]:
# Split each biosample's merged regions into ids found in exactly one
# biosample (n_biosamples == 1) and ids found in more than one, store both
# sets on the biosample's entry in `data`, and print the counts.
for biosample, d in data.items():
    reg_sub = regions.loc[regions['biosample'].eq(biosample)]
    n_bio = reg_sub['n_biosamples']

    specific_region_ids = set(reg_sub.loc[n_bio.eq(1), 'region_id'])
    shared_region_ids = set(reg_sub.loc[n_bio.gt(1), 'region_id'])

    d.update(region_ids_specific=specific_region_ids,
             region_ids_shared=shared_region_ids)

    print(f"{biosample}: Regions = {len(reg_sub)} | "f"Specific = {len(specific_region_ids)} | "f"Shared = {len(shared_region_ids)}")

K562: Regions = 48589 | Specific = 21347 | Shared = 25307
HepG2: Regions = 57440 | Specific = 29731 | Shared = 25526
hiPSC: Regions = 45294 | Specific = 20045 | Shared = 23638


In [8]:
# Map every TF motif hit to the merged region_id of the region it lies in
# (joined through the shared 'name' column), then build both lookups:
#   region_to_tfs : region_id  -> array of distinct motifAltID values
#   tf_to_regions : motifAltID -> set of region_ids containing that motif
for biosample, d in data.items():
    nw = d['E-Gs']

    # Discard a region_id column left by an earlier run of this cell. Without
    # this, a re-run merges into region_id_x / region_id_y and the dropna
    # below raises KeyError.
    tf = d['TFs'].drop(columns='region_id', errors='ignore')

    region_map = nw[['name', 'region_id']].drop_duplicates()

    tf = tf.merge(region_map, on='name', how='left')

    # Hits without a region are dropped; the left merge leaves region_id as
    # float when any value is missing, so cast back to int afterwards.
    tf = tf.dropna(subset=['region_id']).copy()
    tf['region_id'] = tf['region_id'].astype(int)

    region_to_tfs = tf.groupby('region_id')['motifAltID'].unique()

    tf_to_regions = tf.groupby('motifAltID')['region_id'].agg(lambda x: set(x.unique()))

    d['TFs'] = tf
    d['region_to_tfs'] = region_to_tfs
    d['tf_to_regions'] = tf_to_regions

    print(biosample, '| TF hits mapped to regions =', tf.shape[0], '| unique TFs =', len(tf_to_regions))

K562 | TF hits mapped to regions = 1472664 | unique TFs = 491
HepG2 | TF hits mapped to regions = 1404087 | unique TFs = 491
hiPSC | TF hits mapped to regions = 1612535 | unique TFs = 491


In [9]:
# Per biosample and per TF motif: one-sided Fisher exact test asking whether
# the motif occurs in a larger fraction of biosample-specific regions than of
# shared regions.
#
# Contingency table (rows: specific / shared, columns: with / without motif):
#     [[a, b],
#      [c, d_no]]
enrich_columns = ['biosample', 'motifAltID',
                  'n_specific_regions', 'n_shared_regions',
                  'n_specific_with_motif', 'n_shared_with_motif',
                  'odds_ratio', 'log2_or', 'p_value']

rows = []

for biosample, d in data.items():
    specific = d['region_ids_specific']
    shared   = d['region_ids_shared']
    tf_to_regions = d['tf_to_regions']

    n_spec   = len(specific)
    n_shared = len(shared)

    print(biosample, '| n_spec =', n_spec, '| n_shared =', n_shared)

    # The comparison is undefined when either group is empty.
    if (n_spec == 0) or (n_shared == 0):
        continue

    for tf_name, region_ids_with_motif in tf_to_regions.items():
        a = len(region_ids_with_motif & specific)
        c = len(region_ids_with_motif & shared)

        # Motif absent from both groups: nothing to test.
        if (a + c) == 0:
            continue

        b = n_spec - a
        d_no = n_shared - c

        odds_ratio, p_value = fisher_exact([[a, b],
                                            [c, d_no]], alternative = 'greater')

        # Odds ratio with 0.5 added to every cell so that its log2 stays
        # finite when a cell is zero.
        or_pc = (a + 0.5) * (d_no + 0.5) / ((b + 0.5) * (c + 0.5))
        log2_or = np.log2(or_pc)

        rows.append({'biosample': biosample,
                     'motifAltID': tf_name,
                     'n_specific_regions': n_spec,
                     'n_shared_regions': n_shared,
                     'n_specific_with_motif': a,
                     'n_shared_with_motif': c,
                     'odds_ratio': odds_ratio,
                     'log2_or': log2_or,
                     'p_value': p_value})

# Passing the column list keeps the frame's schema even when no row was
# produced, so later cells that index enrich['biosample'] do not fail.
enrich = pd.DataFrame(rows, columns = enrich_columns)
print(enrich.head())

K562 | n_spec = 21347 | n_shared = 25307
HepG2 | n_spec = 29731 | n_shared = 25526
hiPSC | n_spec = 20045 | n_shared = 23638
  biosample motifAltID  n_specific_regions  n_shared_regions  \
0      K562         AR               21347             25307   
1      K562      ASCL1               21347             25307   
2      K562       ATF2               21347             25307   
3      K562       ATF3               21347             25307   
4      K562       ATF4               21347             25307   

   n_specific_with_motif  n_shared_with_motif  odds_ratio   log2_or  \
0                    476                  557    1.013407  0.019428   
1                   1855                 3461    0.600700 -0.735106   
2                    558                  618    1.072298  0.100826   
3                     87                  577    0.175390 -2.504350   
4                    383                  310    1.473163  0.558469   

        p_value  
0  4.284923e-01  
1  1.000000e+00  
2  1.2497

In [10]:
# Benjamini-Hochberg ('fdr_bh') adjustment of the Fisher p-values, applied
# separately within each biosample, followed by a printout of the ten rows
# with the largest log2_or per biosample.
enrich['q_value'] = np.nan

sample_order = enrich['biosample'].unique()

for biosample in sample_order:
    in_sample = enrich['biosample'].eq(biosample)
    adjusted = multipletests(enrich.loc[in_sample, 'p_value'].to_numpy(), method='fdr_bh')[1]
    enrich.loc[in_sample, 'q_value'] = adjusted

shown_cols = ['motifAltID', 'log2_or', 'p_value', 'q_value', 'n_specific_with_motif', 'n_shared_with_motif']

for biosample in sample_order:
    top_hits = (
        enrich[enrich['biosample'] == biosample]
        .sort_values('log2_or', ascending=False)
        .head(10)
    )
    print('\nTop TFs in', biosample)
    print(top_hits[shown_cols])


Top TFs in K562
      motifAltID   log2_or        p_value        q_value  \
130  GATA1::TAL1  1.838601  9.753040e-173  4.788743e-170   
483         ZNF8  1.457599   3.304046e-20   1.802541e-18   
435       ZNF211  1.387827   1.306933e-09   2.916837e-08   
233      NEUROG2  1.207656   3.733432e-08   5.554894e-07   
264        OLIG1  1.194150   3.818031e-09   7.150161e-08   
8          ATOH7  1.184351   3.913784e-10   9.608340e-09   
265        OLIG2  1.158375   4.223110e-09   7.150161e-08   
15       BHLHE22  1.158375   4.223110e-09   7.150161e-08   
279         PBX1  1.067907   7.161526e-07   8.372165e-06   
232      NEUROG1  1.031807   1.083791e-05   9.019349e-05   

     n_specific_with_motif  n_shared_with_motif  
130                   1680                  590  
483                    249                  108  
435                    115                   52  
233                    117                   60  
264                    137                   71  
8                     