# GeneHancer
- https://genome.ucsc.edu/cgi-bin/hgTables?db=hg38&hgta_group=regulation&hgta_track=geneHancer&hgta_table=geneHancerGenesDoubleElite&hgta_doSchema=describe+table+schema
- https://www.weizmann.ac.il/molgen/genehancer-genome-wide-integration-enhancers-and-target-genes-genecards

- http://www.enhanceratlas.org/linkv2.php#l

In [100]:
import sys 
import pandas as pd 

sys.path.append('../../scripts')

from util import *

In [104]:
# Load the DMR table, normalise the chromosome column name, and order the
# regions by genomic position so the generated names follow that order.
DMR = (
    pd.read_csv('delta_CpG_DMR.bed', sep='\t')
    .rename(columns={"# chr": "chrom"})
    .sort_values(['chrom', 'start', 'end'])
    .reset_index(drop=True)
)
# Name each region by its 1-based rank in the sorted table: DMR_1, DMR_2, ...
DMR['name'] = [f'DMR_{rank}' for rank in range(1, len(DMR) + 1)]

In [106]:
# Split the DMR table into two frames with `find_top`, which is not defined in
# this notebook (presumably provided by the `from util import *` star import).
# NOTE(review): 10 and 0.01 look like a 'meth.diff' threshold and a 'qvalue'
# cutoff respectively — confirm against the helper's signature.
DMR_up, DMR_down = find_top(DMR,'meth.diff',10,'qvalue',0.01)

up:  594
down: 79204


In [156]:
import pandas as pd
import numpy as np


def find_neighbors(df, dist):
    """List, for every genomic interval, the intervals that overlap it or lie within ``dist``.

    Two intervals on the same chromosome are neighbors when the gap between
    them is at most ``dist``. Overlapping intervals (partial overlap as well
    as full containment) have a non-positive gap and therefore always
    qualify. Every interval is listed among its own neighbors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``chrom``, ``start``, ``end`` and ``name``.
        ``start <= end`` is assumed and ``name`` values must be strings.
        Any other column is ignored.
    dist : int or float
        Largest gap (in the units of ``start``/``end``) that still counts as
        neighboring. Must be non-negative.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``(chrom, start, end)``, sorted by those keys,
        with columns ``chrom_a``, ``start_a``, ``end_a``, ``name_a`` (first
        name seen for that interval) and ``neighbors`` (comma-separated,
        lexicographically sorted, de-duplicated names).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If ``dist`` is negative.

    Notes
    -----
    Pairs are enumerated per chromosome, so memory grows with the square of
    the largest per-chromosome interval count.
    """
    required = ["chrom", "start", "end", "name"]
    out_columns = ["chrom_a", "start_a", "end_a", "name_a", "neighbors"]

    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"find_neighbors: missing required column(s): {missing}")
    if dist < 0:
        raise ValueError("dist must be non-negative")

    regions = df[required]
    if regions.empty:
        return pd.DataFrame(columns=out_columns)

    # Pair intervals only within a chromosome. The self-merge keeps the
    # original dtypes (no object-array round trip) and never materialises
    # cross-chromosome pairs that would be discarded anyway.
    pairs_df = regions.merge(regions, on="chrom", suffixes=("_a", "_b")).rename(
        columns={"chrom": "chrom_a"}
    )

    # b is a neighbor of a when the gap between them is <= dist in both
    # directions. For overlapping intervals both differences are negative,
    # so partial overlaps are kept as well as touching/nearby intervals.
    within_reach = ((pairs_df["start_b"] - pairs_df["end_a"]) <= dist) & (
        (pairs_df["start_a"] - pairs_df["end_b"]) <= dist
    )
    pairs_df = pairs_df.loc[within_reach]

    # Collapse to one row per interval. observed=True prevents categorical
    # chromosome columns from emitting unobserved key combinations.
    pairs_df = (
        pairs_df.groupby(["chrom_a", "start_a", "end_a"], observed=True)
        .agg({"name_a": "first", "name_b": lambda names: ",".join(sorted(set(names)))})
        .reset_index()
        .rename(columns={"name_b": "neighbors"})
    )

    return pairs_df[out_columns]


def find_neighbors_dataframe(df, dist):
    """Return ``df`` with a ``neighbors`` column produced by ``find_neighbors``.

    Parameters
    ----------
    df : pandas.DataFrame
        Regions with at least ``chrom``, ``start``, ``end`` and ``name``
        columns. An existing ``neighbors`` column, if any, is replaced.
    dist : int or float
        Maximum gap passed through to ``find_neighbors``.

    Returns
    -------
    pandas.DataFrame
        The rows and columns of ``df`` (in the original row order) plus a
        ``neighbors`` column. Rows whose ``name`` has no entry in the
        neighbor table get a missing value.
    """
    # Find overlapping and neighboring regions (one row per interval).
    pairs_df = find_neighbors(df, dist)

    # Keep a single neighbor string per name so the left merge below can
    # never multiply rows of the input frame.
    neighbor_table = (
        pairs_df.rename(columns={"name_a": "name"})[["name", "neighbors"]]
        .dropna(subset=["name"])
        .drop_duplicates(subset=["name"])
    )

    # errors="ignore": the input usually has no "neighbors" column yet; only
    # a stale one from a previous run needs to be discarded.
    merged_df = pd.merge(
        df.drop(columns=["neighbors"], errors="ignore"),
        neighbor_table,
        on="name",
        how="left",
    )

    return merged_df


In [172]:
# Small reproducible sample: coordinates and names of the first five rows of
# DMR_up, as a {column: list} dict.
(
    DMR_up.loc[:, ['chrom', 'start', 'end', 'name']]
    .iloc[:5]
    .reset_index(drop=True)
    .to_dict(orient='list')
)

{'chrom': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
 'start': [1131001, 1435001, 1435601, 1504601, 3711401],
 'end': [1132000, 1436000, 1436600, 1505600, 3712400],
 'name': ['DMR_87', 'DMR_215', 'DMR_216', 'DMR_233', 'DMR_1058']}

In [None]:
# Run the neighbor search on the DMR_up regions with dist=10.
find_neighbors(
    DMR_up.loc[:, ['chrom', 'start', 'end', 'strand', 'name']],
    dist=10,
)

In [110]:
# NOTE(review): `res` is not defined in any cell visible in this notebook —
# presumably a neighbor table indexed by region name; confirm and create it
# in an earlier cell so the notebook survives Restart & Run All.
# Splits DMR_79562's comma-separated `neighbors` string and shows the rows of
# `res` for each listed name.
res.loc[res.loc['DMR_79562','neighbors'].split(','),]

Unnamed: 0_level_0,chrom,start,end,neighbors
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DMR_79307,chrX,40095001,40096000,DMR_79307
DMR_79453,chrX,68894401,68895400,"DMR_79307,DMR_79453"
DMR_79476,chrX,71617201,71618200,"DMR_79307,DMR_79453,DMR_79476"
DMR_79560,chrX,133414401,133415400,"DMR_79307,DMR_79453,DMR_79476,DMR_79560,DMR_79..."
DMR_79561,chrX,133414601,133415600,"DMR_79307,DMR_79453,DMR_79476,DMR_79560,DMR_79..."
DMR_79562,chrX,133414801,133415800,"DMR_79307,DMR_79453,DMR_79476,DMR_79560,DMR_79..."


In [8]:
# Display DMR_up (the first frame returned by find_top) for inspection.
DMR_up

Unnamed: 0,# chr,start,end,strand,pvalue,qvalue,meth.diff,dist.to.feature,feature.name,feature.strand
313,chr6,170295801,170296800,*,2.439006e-18,5.806186e-15,32.682898,482,FAM120B,+
764,chr6,170295601,170296600,*,8.356887e-16,8.102105e-13,31.105991,282,FAM120B,+
810,chr7,27130001,27131000,*,1.148918e-15,1.051174e-12,34.346662,0,HOXA4,-
1362,chr6,10719801,10720800,*,2.492391e-14,1.355930e-11,46.236842,-2115,TMEM14C,+
1809,chr7,27129801,27130800,*,1.386669e-13,5.669166e-11,30.314573,0,HOXA4,-
...,...,...,...,...,...,...,...,...,...,...
79374,chr22,39468801,39469800,*,1.153594e-04,9.746197e-04,29.664430,7023,MGAT3-AS1,-
79532,chr9,127922001,127923000,*,1.167465e-04,9.840702e-04,39.664179,4492,PIP5KL1,-
79567,chr15,89361601,89362600,*,1.169342e-04,9.852190e-04,22.888915,0,AC133637.2,-
79644,chr13,106349001,106350000,*,1.177014e-04,9.904433e-04,23.593520,21083,AL139379.1,-


In [6]:
# Rows of DMR whose 'meth.diff' is strictly greater than 10.
DMR.loc[DMR['meth.diff'].gt(10)]

Unnamed: 0,# chr,start,end,strand,pvalue,qvalue,meth.diff,dist.to.feature,feature.name,feature.strand
313,chr6,170295801,170296800,*,2.439006e-18,5.806186e-15,32.682898,482,FAM120B,+
764,chr6,170295601,170296600,*,8.356887e-16,8.102105e-13,31.105991,282,FAM120B,+
810,chr7,27130001,27131000,*,1.148918e-15,1.051174e-12,34.346662,0,HOXA4,-
1362,chr6,10719801,10720800,*,2.492391e-14,1.355930e-11,46.236842,-2115,TMEM14C,+
1809,chr7,27129801,27130800,*,1.386669e-13,5.669166e-11,30.314573,0,HOXA4,-
...,...,...,...,...,...,...,...,...,...,...
79374,chr22,39468801,39469800,*,1.153594e-04,9.746197e-04,29.664430,7023,MGAT3-AS1,-
79532,chr9,127922001,127923000,*,1.167465e-04,9.840702e-04,39.664179,4492,PIP5KL1,-
79567,chr15,89361601,89362600,*,1.169342e-04,9.852190e-04,22.888915,0,AC133637.2,-
79644,chr13,106349001,106350000,*,1.177014e-04,9.904433e-04,23.593520,21083,AL139379.1,-


In [3]:
# cat -t SE_02_1028_SE_hg38.bed | head 

In [6]:
# !bedtools intersect \
#     -b SE_02_1028_SE_hg38.bed \
#     -a delta_CpG_DMR.bed -wb

In [None]:
# pd.read_csv('SE_02_1028_SE_hg38.bed',sep='\t').head()
# pd.read_csv('delta_DNAme_table.txt',sep='\t').head()