In [2]:
import numpy as np
import os, sys
import pandas as pd
import pybedtools as pbt
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append("/dors/capra_lab/users/fongsl/tools/py_")
import config_readwrite as crw
import fet
import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

# functions

In [3]:
def check_section(config, section):
    
    if config.has_section(section) is False:
        config.add_section(section)
        
    return config

def pbtInt(a, b, out):
    
    A, B = pbt.BedTool(a), pbt.BedTool(b)
    
    C = A.intersect(B, wao=True, output=out)
    
    return C

# config

In [4]:
cfn = os.path.join(os.path.dirname(os.getcwd()), "config")
config, cfn = crw.read_config(cfn)

## read

In [5]:
# read
BED = config["CIS_TRANS"]["regions"]
ANNOT = config["CIS_TRANS"]["regions_annotations"]
SHUF=config["SHUFFLES"]["shuf-all"]

## write

In [6]:
section = 'SEdb2'
config = check_section(config, section)

RE = f"/data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/{section}"
if os.path.exists(RE) is False:
    os.mkdir(RE)

# super enhancer element, super enhancer, and typical enhancer files
SE_ELE = "/data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/superenhancers/SE_01_0030_SE_ele_hg38.bed"
SE = "/data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/superenhancers/SE_01_0030_SE_hg38.bed"  
TE = "/data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/superenhancers/SE_01_0030_TE_hg38.bed"

config[section]["GM12878_SE_ELEMENT"] = SE_ELE
config[section]["GM12878_SE"] = SE
config[section]["GM12878_TE"] = TE
config[section]["results"] = RE

In [7]:
datasets = {"superenhancer_element":SE_ELE, 
            #"superenhancer": SE, 
            "typical_enhancer": TE, 
           }

# intersection

In [8]:
intersections = {}
for name, dataset in datasets.items():
    out = os.path.join(RE, f"regions.x.GM12878.{name}.bed")
    print(out)
    if os.path.exists(out) is False:
        C = pbtInt(BED, dataset, out)

    out_shuf = os.path.join(RE, f"shuf.regions.x.GM12878.{name}.bed")
    if os.path.exists(out_shuf) is False:
        C = pbtInt(SHUF, dataset, out_shuf)
    intersections[name] = (out, out_shuf)

/data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/SEdb2/regions.x.GM12878.superenhancer_element.bed
/data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/SEdb2/regions.x.GM12878.typical_enhancer.bed


# review results

## atac-starr

In [9]:
#intersections["superenhancer"][0]

In [10]:
df_col_dict ={
            #"superenhancer": [0,1,2,3,4,5,6,7,8,25],
            "superenhancer_element":[0,1,2,3,4,5,6,7,8,9], 
            "typical_enhancer" :[0,1,2,3,4,5,6,7,8,17]
            }

In [11]:
def inspectDF(value):
    
    df = pd.read_csv(intersections[value][0], sep='\t', header=None)
    print(len(list(df)))

    print(df.loc[df[df.columns[-1]]>0])

## annots

In [12]:
annot = pd.read_csv(ANNOT, sep='\t')
annot.head()

Unnamed: 0,#chr,start,end,region_id,conserved_active.regions,cis,trans,trans_only,cis_only,cis+trans,...,HH-active_MM-inactive_MH-inactive_cis,HH-active_MM-inactive_HM-inactive_trans,HH-active_MM-inactive_cis-only,HH-active_MM-inactive_trans-only,HH-active_MM-inactive_cis+trans,MM-active_HH-inactive_HM-inactive_cis,MM-active_HH-inactive_MH-inactive_trans,MM-active_HH-inactive_cis-only,MM-active_HH-inactive_trans-only,MM-active_HH-inactive_cis+trans
0,chr1,959034,959154,chr1:959034-959154,0.0,1.0,1.0,0,0,1,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,chr1,1002089,1002179,chr1:1002089-1002179,0.0,1.0,1.0,0,0,1,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,chr1,1064140,1064260,chr1:1064140-1064260,0.0,1.0,0.0,0,1,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chr1,1214948,1215208,chr1:1214948-1215208,0.0,1.0,1.0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,chr1,1215328,1215508,chr1:1215328-1215508,0.0,1.0,0.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


                                cCRE annot+     cCRE annot- 
                    HH_active    A               B
    matched_shuffle HH_active    C               D

# HH, MM cis, trans, cis+trans

In [13]:
cols = ['HH_active', "MM_active", "cis_only", "trans_only", "cis+trans",
    "conserved_active.regions",
       'HH-active_MM-inactive_cis-only',
        'HH-active_MM-inactive_trans-only',
        'HH-active_MM-inactive_cis+trans',
        'MM-active_HH-inactive_cis-only',
        'MM-active_HH-inactive_trans-only',
        'MM-active_HH-inactive_cis+trans'
       ]
names = ["#chr", "start", "end", "region_id", 
         "chr_cCRE", "startcCRE", "endcCRE", 'cCRE_id', "label", "len"]

fdr_results={}
for key, values, in intersections.items():
    print(key)
    out, outshuf = values
    
    # opne intersections
    usecols = df_col_dict[key]
    df = pd.read_csv(out, sep='\t', header=None, 
                     usecols=usecols, names=names).drop_duplicates()
    print(df.shape)

    validation_annots = df["label"].unique()
    
    ## shuffles

    # open shuffles
    shuf = pd.read_csv(out_shuf, sep='\t', header=None, 
                     usecols=usecols, names=names).drop_duplicates()
    print(shuf.shape)


    collection={}
    
    
    # compute 2x2
    for col in cols:
        # get the region_ids that are HH/MM active
        pos = annot.loc[annot[col]==1, "region_id"]
        
        # per annotation (e.g. DELS, PELS, etc.)
        for a in validation_annots:
            if a != ".":
                
                # get dataset and matched shuffle ids
                test = df.loc[df["region_id"].isin(pos)]
                test_shuf = shuf.loc[shuf["region_id"].isin(pos)]

                # within set, how many elements have annotation, or don't.
                in_annot = test.loc[test["label"]==a].shape[0]
                out_annot = test.loc[test["label"]!=a].shape[0]

                # and in the shuffles?
                inshuf_annot = test_shuf.loc[test_shuf["label"]==a].shape[0]
                outshuf_annot = test_shuf.loc[test_shuf["label"]!=a].shape[0]
                
                # do 2x2
                result = fet.get_2x2(in_annot, out_annot, 
                                     inshuf_annot,outshuf_annot, 
                                     f"regions_x_shuf_{col}.{a}")
                
                result["col"], result["cCRE_annot"] = col, a
                key_=a+"."+col
                
                print(key_)
                
                collection[key_] = result

    ## FDR correction per shuffle, per test
    fdr_res = fet.fdr_correction(collection)

    ## fraction of annotations 
    fdr_res["frac"] = fdr_res["a"]/(fdr_res["a"]+fdr_res["b"])
    fdr_res["frac_shuf"] = fdr_res["c"]/(fdr_res["c"]+fdr_res["d"])
    fdr_res["sample"] = key
    fdr_results[key]=fdr_res

superenhancer_element
(16898, 10)
(168976, 10)
[[441, 9515], [27967, 71596]]
regions_x_shuf_HH_active.SE_01_0030 [[441, 9515], [27967, 71596]] 0.11865134723284429 0.0
SE_01_0030.HH_active
[[504, 9472], [27995, 71758]]
regions_x_shuf_MM_active.SE_01_0030 [[504, 9472], [27995, 71758]] 0.13638879771001577 0.0
SE_01_0030.MM_active
[[117, 2037], [6081, 15459]]
regions_x_shuf_cis_only.SE_01_0030 [[117, 2037], [6081, 15459]] 0.14601626205286075 4.254839486630868e-151
SE_01_0030.cis_only
[[111, 1872], [5544, 14287]]
regions_x_shuf_trans_only.SE_01_0030 [[111, 1872], [5544, 14287]] 0.1528040824915825 3.5011404469693913e-134
SE_01_0030.trans_only
[[380, 8245], [23996, 62259]]
regions_x_shuf_cis+trans.SE_01_0030 [[380, 8245], [23996, 62259]] 0.1195793598508585 0.0
SE_01_0030.cis+trans
[[139, 2895], [8573, 21767]]
regions_x_shuf_conserved_active.regions.SE_01_0030 [[139, 2895], [8573, 21767]] 0.1219079380639744 9.099447556120465e-234
SE_01_0030.conserved_active.regions
[[48, 1066], [3171, 7969]]
r

  result = getattr(ufunc, method)(*inputs, **kwargs)


(168976, 10)
[[4298, 5658], [27967, 71596]]
regions_x_shuf_HH_active.SE_01_0030 [[4298, 5658], [27967, 71596]] 1.944671927702299 3.268357873031555e-204
SE_01_0030.HH_active
[[4229, 5747], [27995, 71758]]
regions_x_shuf_MM_active.SE_01_0030 [[4229, 5747], [27995, 71758]] 1.8861939259145215 1.4633288874072802e-185
SE_01_0030.MM_active
[[951, 1203], [6081, 15459]]
regions_x_shuf_cis_only.SE_01_0030 [[951, 1203], [6081, 15459]] 2.009653960805928 2.4286917350733656e-50
SE_01_0030.cis_only
[[911, 1072], [5544, 14287]]
regions_x_shuf_trans_only.SE_01_0030 [[911, 1072], [5544, 14287]] 2.1899863843660485 1.2069580515755333e-58
SE_01_0030.trans_only
[[3649, 4976], [23996, 62259]]
regions_x_shuf_cis+trans.SE_01_0030 [[3649, 4976], [23996, 62259]] 1.902640684956067 9.48504770974772e-165
SE_01_0030.cis+trans
[[1270, 1764], [8573, 21767]]
regions_x_shuf_conserved_active.regions.SE_01_0030 [[1270, 1764], [8573, 21767]] 1.8279777014425662 2.917543873127943e-52
SE_01_0030.conserved_active.regions
[[477

In [14]:
fdr_res = pd.concat(fdr_results.values())

### clean up and make tables

fdr_res.sort_values(by="frac", ascending=False).drop_duplicates()
fdr_res["col2"] = fdr_res["col"] +"."+ fdr_res["sample"]

# scientific notatio for FDRP value
fdr_res["FDR_P2"]= fdr_res["FDR_P"].apply(lambda x: "{:.1E}".format(x))

# Fraction heat map table
table = pd.pivot(fdr_res, index="col", columns = "sample",  values= 'frac')
table = table.replace(-np.Inf, -4)
p = pd.pivot(fdr_res, index="col", columns = "sample",  values= 'asterisks')
p = p.fillna("")

In [15]:
fdr_res[[ "col", "sample",'frac',  "OR", "FDR_P"]]

Unnamed: 0,col,sample,frac,OR,FDR_P
0,HH_active,superenhancer_element,0.044295,0.118651,0.0
0,MM_active,superenhancer_element,0.050521,0.136389,0.0
0,cis_only,superenhancer_element,0.054318,0.146016,7.294011e-151
0,trans_only,superenhancer_element,0.055976,0.152804,5.251711e-134
0,cis+trans,superenhancer_element,0.044058,0.119579,0.0
0,conserved_active.regions,superenhancer_element,0.045814,0.121908,1.81989e-233
0,HH-active_MM-inactive_cis-only,superenhancer_element,0.043088,0.11316,1.907419e-90
0,HH-active_MM-inactive_trans-only,superenhancer_element,0.061576,0.171187,1.627985e-51
0,HH-active_MM-inactive_cis+trans,superenhancer_element,0.038868,0.104466,0.0
0,MM-active_HH-inactive_cis-only,superenhancer_element,0.066346,0.182902,2.3114569999999998e-63
