In [1]:
import pybedtools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# TFBS Combinations

## Extract GTEx Genexpression

In [2]:
GTEx_with_tfbs = pybedtools.BedTool("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Promotor_with_TFBS/All_GTEx_Prom_with_TFBS.bed")

In [4]:
first = GTEx_with_tfbs[0]

In [10]:
first.fields

['chr1',
 '17436',
 '17636',
 'ENSG00000278267.1',
 '0',
 '-',
 'MYCN',
 '17510',
 '17522',
 '-',
 'miRNA',
 '0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,']

In [15]:
# with open("write_csv.csv", "w") as f:
#     for i in GTEx_with_tfbs[0:100]:
#         f.write(f"{i.name},{i.fields[11][:-1]}\n")

In [16]:
df = pd.read_csv("write_csv.csv")

In [19]:
test = pd.read_csv("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/raw_data/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct", sep="\t", skiprows=2)

In [26]:
csv_string = ",".join(test.columns)
print(csv_string)

Name,Description,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),Brain - Cerebellar Hemisphere,Brain - Cerebellum,Brain - Cortex,Brain - Frontal Cortex (BA9),Brain - Hippocampus,Brain - Hypothalamus,Brain - Nucleus accumbens (basal ganglia),Brain - Putamen (basal ganglia),Brain - Spinal cord (cervical c-1),Brain - Substantia nigra,Breast - Mammary Tissue,Cells - Cultured fibroblasts,Cells - EBV-transformed lymphocytes,Cervix - Ectocervix,Cervix - Endocervix,Colon - Sigmoid,Colon - Transverse,Esophagus - Gastroesophageal Junction,Esophagus - Mucosa,Esophagus - Muscularis,Fallopian Tube,Heart - Atrial Appendage,Heart - Left Ventricle,Kidney - Cortex,Kidney - Medulla,Liver,Lung,Minor Salivary Gland,Muscle - Skeletal,Nerve - Tibial,Ovary,Pancreas,Pituitary,Prostate,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower l

## Filter for TFBS or TFBS combinations

In [4]:
data = pybedtools.BedTool("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Promotor_with_TFBS/All_GTEx_Prom_with_TFBS.bed")

In [5]:
data.head()

chr1	17436	17636	ENSG00000278267.1	0	-	MYCN	74	86	-	miRNA	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 chr1	629439	629639	ENSG00000225630.1	730	+	OTX2	-11	1	-	unprocessed_pseudogene	961.4,1024,949.35,328.1,497.15,363.6,518.8,1182,1160,1568,638.6,851.2,1275,990.9,1320,1213.5,1315,1591,1228,1440,902.9,207.95,180.7,370.8,427.8,826.4,777.5,814.1,335.2,798.7,510.5,1157,1283.5,1149,864.8,754.9,447.2,484.3,760.1,602.9,385.65,320.05,541.9,772.4,604.95,573.3,748.9,342.4,887.1,562,559,441.2,394.05,53.19,
 chr1	634176	634376	ENSG00000198744.5	483	+	EGR2	170	181	+	unprocessed_pseudogene	19.46,17.82,28.275,9.633,12.73,13.52,11.34,40.195,36.9,30.34,21.63,23.05,29.58,32.36,49.13,36.375,37.295,31.25,22.06,32.93,18.46,11.145,8.255,8.538,9.101,16.43,22.555,18.51,12.34,18.61,10.82,35.74,48.98,40.82,18.575,21.435,12.21,14.485,25.13,14,10.052,6.905,14.03,17.25,18.495,19.47,20.65,10.04,19.89,19.92,19.29,10.595,11.63,4.395,
 chr1	758336	758536	

Genereating Subsets based on TFBS or TFBS pair in data.

In [69]:
def func_for_tfbs_subset(BedTool_Interval, tfbs_lst):
    """
    Function that returns TRUE, if all of the tfbs in tfbs_lst are in Interval.fields[6]
    It also checks if tfbs in tfbs_lst occur more often then tfbs in Interval.fields[6]. 
    Therefore if tfbs_lst contains the same tfbs twice, it will only return True if Interval.fields[6] also contains the tfbs at least twice. 
    """

    # Processing tfbs_lst --> unique with counts
    input_tfbs_unique, input_tfbs_counts = np.unique(np.array(tfbs_lst), return_counts=True)
    input_tfbs_dict = dict(zip(input_tfbs_unique, input_tfbs_counts))
    
    # Processing tfbs in interval --> split tfbs, distance, strand into lists
    tfbs_arr = np.array(BedTool_Interval.fields[6].split(","))

    # get unique tfbs and counts and save in dict
    tfbs_unique, tfbs_counts = np.unique(tfbs_arr, return_counts=True)
    tfbs_dict = dict(zip(tfbs_unique, tfbs_counts))

    # Compare unique values and counts (equal or more (?))

    # Checks if all unique TFBS in tfbs_lst occur in tfbs_unique.
    if np.all(np.isin(input_tfbs_unique, tfbs_unique)):
        
        # Checks how often tfbs occur in input list, compared to interval tfbs. If tfbs count in input list is higher then in interval it will return False.
        bool_lst = []
        for tfbs, input_count in input_tfbs_dict.items():
            tfbs_count = tfbs_dict[tfbs]
            if input_count > tfbs_count:
                bool_lst.append(False)
            else:
                bool_lst.append(True)
        if np.all(bool_lst):
            return True
        else:
            return False
        
        
    else:
        return False
    





In [74]:
func_for_tfbs_subset(data[3], tfbs_lst=["JUN", "AR"])

True

In [39]:
# Minimal Example:
test_int = data[3]
tfbs = test_int.fields[6]
tfbs_arr = np.array(tfbs.split(","))
tfbs_unique, counts = np.unique(tfbs_arr, return_counts=True)
tfbs_dict = dict(zip(tfbs_unique, counts))

In [68]:
tfbs_test = np.array(["JUN", "JUN", "A"])
input_tfbs_unique, input_tfbs_counts = np.unique(np.array(tfbs_test), return_counts=True)
input_tfbs_dict = dict(zip(input_tfbs_unique, input_tfbs_counts))

np.all(np.isin(tfbs_test, tfbs_unique))

bool_lst = []
for tfbs, input_count in input_tfbs_dict.items():
    tfbs_count = tfbs_dict[tfbs]
    if input_count > tfbs_count:
        bool_lst.append(False)
    else:
        bool_lst.append(True)
bool_lst

KeyError: 'A'