In [None]:
# --- Imports ---
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
import pyranges as pr

In [None]:
# Read the K562 network table, keep only the interval + target-gene columns,
# and collapse repeated rows so each (interval, gene) edge appears once
df1 = (
    pd.read_csv('K562/K562.network.tsv', sep = '\t')
    [['chrom', 'chromStart', 'chromEnd', 'targetGene']]
    .drop_duplicates()
)

print(df1.shape)
df1.head(n = 3)

In [None]:
# Read the blood SNV eQTL table, keep only the interval + target-gene columns,
# and collapse repeated rows so each (SNV interval, gene) pair appears once
df2 = (
    pd.read_csv('eQTLs/Blood.SNV.eQTLs.hg38.tsv', sep = '\t')
    [['chrom', 'chromStart', 'chromEnd', 'targetGene']]
    .drop_duplicates()
)

print(df2.shape)
df2.head(n = 3)

In [None]:
# Permutation test
def _build_histogram(null_counts: np.ndarray, obs_overlap: int) -> pd.DataFrame:
    """Stack null and observed overlap counts into one labelled frame (null rows first)."""
    return pd.DataFrame({
        'Overlaps': np.concatenate([null_counts, np.array([obs_overlap])]),
        'Label': ['Null'] * len(null_counts) + ['Observed']
    })


def permutation_test(df1: pd.DataFrame, df2: pd.DataFrame, permutations: int, random_state: int) -> Tuple[int, float, pd.DataFrame]:
    """Permutation test for gene-consistent overlap between two interval tables.

    Both inputs must provide the columns 'chrom', 'chromStart', 'chromEnd' and
    'targetGene'; rows with missing values and duplicate rows are dropped first.

    The test statistic is the number of distinct df2 rows (interval + gene) that
    overlap at least one df1 interval carrying the same 'targetGene'. The null
    distribution is obtained by shuffling the 'targetGene' labels across the
    df1 rows while keeping all interval overlaps fixed.

    Parameters
    ----------
    df1 : pd.DataFrame
        Interval-to-gene table whose gene labels are permuted (joined as the
        'enhancer' side).
    df2 : pd.DataFrame
        Interval-to-gene table whose rows are counted (joined as the 'SNV' side).
    permutations : int
        Number of label shuffles; must be >= 0.
    random_state : int
        Seed passed to np.random.default_rng.

    Returns
    -------
    obs_overlap : int
        Observed value of the statistic.
    p_val : float
        One-sided empirical p-value, (#{null >= observed} + 1) / (permutations + 1).
    histogram_df : pd.DataFrame
        Columns 'Overlaps' and 'Label'; `permutations` rows labelled 'Null'
        followed by one row labelled 'Observed'. This shape is the same on every
        return path, including the degenerate ones.

    Raises
    ------
    ValueError
        If `permutations` is negative.
    """
    if permutations < 0:
        raise ValueError(f'permutations must be non-negative, got {permutations}')

    RNG = np.random.default_rng(random_state)

    columns = ['chrom', 'chromStart', 'chromEnd', 'targetGene']
    df1 = df1[columns].dropna().drop_duplicates().reset_index(drop = True)
    df2 = df2[columns].dropna().drop_duplicates().reset_index(drop = True)

    # Without any overlap every permutation also yields 0, so the null is all
    # zeros and the empirical p-value is (permutations + 1) / (permutations + 1) = 1
    if df1.empty or df2.empty:
        return 0, 1.0, _build_histogram(np.zeros(permutations, dtype = np.int64), 0)

    # Positional ids: edge_id doubles as an index into the df1 gene array below
    df1 = df1.reset_index(names = 'edge_id')
    df2 = df2.reset_index(names = 'snv_id')

    pyranges_names = {'chrom': 'Chromosome', 'chromStart': 'Start', 'chromEnd': 'End'}
    enh_pr = pr.PyRanges(df1.rename(columns = pyranges_names))
    snv_pr = pr.PyRanges(df2.rename(columns = pyranges_names))

    # One row per (df2 interval, overlapping df1 interval); colliding df1 columns get '_enh'
    overlaps = snv_pr.join(enh_pr, suffix = '_enh').df

    if overlaps.empty:
        return 0, 1.0, _build_histogram(np.zeros(permutations, dtype = np.int64), 0)

    # Encode genes as integers so the permutation loop is pure numpy
    all_genes = pd.unique(pd.concat([df2['targetGene'], df1['targetGene']], ignore_index = True))
    gene_to_idx = {g: i for i, g in enumerate(all_genes)}
    n_genes = len(all_genes)

    # 'targetGene' in the join output is the df2-side gene (df1's copy was suffixed)
    gene_eqtl_idx = overlaps['targetGene'].map(gene_to_idx).to_numpy(dtype = np.int64)
    edge_id_idx = overlaps['edge_id'].to_numpy(dtype = np.int64)
    snv_ids = overlaps['snv_id'].to_numpy(dtype = np.int64)

    # Gene of each df1 row, ordered by edge_id
    gene_enh_idx = df1['targetGene'].map(gene_to_idx).to_numpy(dtype = np.int64)

    # Single integer key per (df2 row, gene) pair, used to count distinct pairs
    pair_ids = snv_ids * n_genes + gene_eqtl_idx

    supported_mask_obs = gene_eqtl_idx == gene_enh_idx[edge_id_idx]
    obs_overlap = np.unique(pair_ids[supported_mask_obs]).size

    null_counts = np.zeros(permutations, dtype = np.int64)

    for b in range(permutations):
        # Shuffle gene labels across df1 rows; interval overlaps stay fixed
        perm_gene_enh_idx = RNG.permutation(gene_enh_idx)

        supported_mask_perm = gene_eqtl_idx == perm_gene_enh_idx[edge_id_idx]
        null_counts[b] = np.unique(pair_ids[supported_mask_perm]).size

    # +1 in numerator and denominator counts the observed value as one draw,
    # so the p-value can never be exactly 0
    p_val = (np.sum(null_counts >= obs_overlap) + 1) / (permutations + 1)

    return obs_overlap, p_val, _build_histogram(null_counts, obs_overlap)

In [None]:
# Run the permutation test on the two loaded tables and report the outcome
obs_overlap, p_val, histogram_df = permutation_test(
    df1 = df1,
    df2 = df2,
    permutations = 1000,
    random_state = 42,
)

print(f'Observed: {obs_overlap}', f'p-value: {p_val}', sep = '\n')
histogram_df.head(n = 5)

In [None]:
# Save the histogram table (null + observed overlap counts) as CSV
# to_csv raises OSError when the parent folder is missing, so create it first
histogram_path = Path('histograms/K562.eQTLs.histogram.csv')
histogram_path.parent.mkdir(parents = True, exist_ok = True)
histogram_df.to_csv(histogram_path, index = False)

In [None]:
# Histogram

In [None]:
# K562  ---> p < 0.01
# HepG2 ---> p < 0.01
# hiPSC ---> p < 0.01