In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools

# 1. Analysis of homotypic TFBSs
    e.g. ESR1
    - From the TFBS BED file, get GeneID, TSS_dist, and homotypic_count for each ESR1 site in the region
    - Split the data into 4 bins by the distance of ESR1 from the TSS (0-50, 50-100, 100-150, 150-200 bp)
    - Generate one table per bin with the number of TFBSs in col_1 and the gene expression in col_2. Every tissue can be a separate row.

In [24]:
# Input files. Judging by the file names: promoter intervals annotated with
# TFBSs (BED) and a GTEx gene-expression table (CSV) — confirm against the data.
TFBS_BED_PATH = "/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Promotor_with_TFBS/All_GTEx_Prom_with_TFBS.bed"
GTEX_EXPR_CSV_PATH = "/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/GTEx_GenExpr_ucsc.csv"

data = pybedtools.BedTool(TFBS_BED_PATH)
gtex_df = pd.read_csv(GTEX_EXPR_CSV_PATH, sep=",")

## Filter for TF

In [27]:
# Predicate used to filter a BED file for intervals that contain specific TFBSs
def func_for_tfbs_subset(BedTool_Interval, tfbs_lst):
    """
    Return True if the interval contains every TFBS in tfbs_lst often enough.

    The TFBS names of the interval are read from ``BedTool_Interval.fields[6]``,
    a comma-separated string. Multiplicity matters: if tfbs_lst names the same
    TFBS twice, the interval must contain that TFBS at least twice. Additional
    TFBSs in the interval, or additional copies, never cause a rejection.

    Parameters
    ----------
    BedTool_Interval
        Any object exposing ``fields``; only ``fields[6]`` is read.
    tfbs_lst
        Iterable of TFBS names. A single string is treated as one name.
        An empty iterable matches every interval.

    Returns
    -------
    bool
        True if every requested TFBS occurs in the interval at least as often
        as it occurs in tfbs_lst, otherwise False.
    """
    # A bare string would otherwise be counted character by character.
    if isinstance(tfbs_lst, str):
        tfbs_lst = [tfbs_lst]

    required_counts = Counter(tfbs_lst)
    interval_counts = Counter(BedTool_Interval.fields[6].split(","))

    # Counter yields 0 for a missing name, so "absent" and "present too rarely"
    # are both rejected by the same comparison.
    return all(
        interval_counts[name] >= needed
        for name, needed in required_counts.items()
    )
 
def get_tfbs_subset(BedTool, tfbs_lst):
    """
    Keep only the intervals of a BedTool that carry all requested TFBSs.

    Each interval is tested with func_for_tfbs_subset: it is kept when every
    TFBS named in tfbs_lst occurs in the interval at least as many times as it
    is listed in tfbs_lst (so listing a TFBS twice requires two occurrences).

    BedTool  -- object providing ``filter(func, *args)``; presumably a
                pybedtools.BedTool (verify against caller).
    tfbs_lst -- list of TFBS names handed to the predicate unchanged.

    Returns whatever ``BedTool.filter`` returns.
    """
    filtered_intervals = BedTool.filter(func_for_tfbs_subset, tfbs_lst)
    return filtered_intervals

In [30]:
# Keep only the intervals whose TFBS list contains ESR1 at least once.
# NOTE(review): saveas() is called without a filename; per the pybedtools docs this
# should materialize the filtered result into a temporary file — confirm.
esr1 = get_tfbs_subset(data, ["ESR1"]).saveas()

In [31]:
# Preview the first records of the ESR1-filtered intervals as a sanity check.
esr1.head()

chr1	778569	778769	ENSG00000237491.8	262	+	REST,KLF9,SRF,SRF,USF2,KLF11,KLF15,KLF5,KLF1,E2F1,KLF4,KLF12,SP3,RXRA,USF1,JUN,JUND,TFE3,CREM,ATF7,ATF2,JUND,CREB1,NFE2L2,ATF3,JUN,PAX5,CREM,TCF7L2,TFAP2C,TFAP2C,PPARG,YY1,YY2,E2F1,NFE2L2,JUN,EGR1,TCF3,EGR2,BHLHE40,GRHL2,RFX3,PPARG,RFX5,RXRB,NEUROD1,MYB,MAFF,NFKB2,SP2,RELA,SP1,SP4,NFYC,THAP11,PKNOX1,RXRA,PBX2,EGR3,EGR1,EGR2,MLX,POU5F1,IRF3,NFYB,FOS,TP53,FOS,ELF1,NFYA,MYC,MYC,MNT,MAX,MAX,GLIS1,RUNX1,RUNX2,ZNF143,OSR2,HAND2,RUNX1,RELA,GABPA,SP1,TEAD3,THAP11,RELA,CTCF,CTCFL,JUND,CREM,NFIL3,ATF7,CREB1,CEBPA,ATF3,REL,RELB,NFKB1,RFX5,ESR1,ESR1,RXRA,JUN,JUND,ATF3,E2F1,E2F1,CREB1	69,80,74,74,76,83,83,84,84,83,83,80,82,63,76,69,72,78,70,71,72,72,71,69,72,72,69,120,191,169,170,154,161,160,158,147,148,133,135,134,134,84,116,109,113,114,114,115,89,88,81,88,82,80,16,27,30,27,30,26,27,28,26,24,11,17,15,9,16,64,16,7,6,7,7,6,0,5,5,-8,-7,-6,-8,37,64,59,66,55,63,51,56,47,45,47,46,47,45,47,37,37,35,38,39,39,40,45,46,47,47,47,46	90,96,90,90,90,94,94,94,95,95,95,9