# Processing and Combining Preprocessed Datasets (Beginning)

In [1]:
import pybedtools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Processing UniBind TFBS

In [2]:
# Load the TFBS BED file (judging by its name, presumably TFBS already intersected with
# promoter regions — confirm) and convert it to a pandas DataFrame for column-wise editing.
reduce_tfbs = pybedtools.BedTool("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/tfbs/tfbs_intersect_promotors_wa.bed")
df_tfbs_raw = reduce_tfbs.to_dataframe()

### Refining the dataset and changing columns

The name column of the TFBS dataframe packs several pieces of information into the format "ChipSeq-ID_Zelllinie_TF-name_JASPAR-ID" (Zelllinie = cell line).

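To address these values more easily, the name is split into its parts. The TF name stays in the name column, while the ChIP-seq ID, the cell line and the JASPAR ID replace the thickStart, thickEnd and itemRgb columns, which are of no use in our case.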

In [4]:
# Pull the packed "name" column out as a NumPy array so that each entry can be split below.
raw_name = df_tfbs_raw.name.to_numpy()
raw_name

array(['EXP038397_NGP--neuroblastoma-_MYCN_MA0104.4',
       'EXP036801_HUES64--embryonic-stem-cells-_OTX2_MA0712.2',
       'EXP039511_HEK293--embryonic-kidney-_EGR2_MA0472.2', ...,
       'EXP047817_THP-1--acute-monocytic-leukemia-_SPI1_MA0080.5',
       'EXP047818_THP-1--acute-monocytic-leukemia-_SPI1_MA0080.5',
       'EXP049121_THP-1--acute-monocytic-leukemia-_SPI1_MA0080.5'],
      dtype=object)

In [5]:
# Check how a single packed name splits on underscores.
raw_name[0].split("_")

['EXP038397', 'NGP--neuroblastoma-', 'MYCN', 'MA0104.4']

In [6]:
def name_field(position):
    """Return, for every packed name in raw_name, the underscore-separated part at `position`."""
    return np.array([entry.split("_")[position] for entry in raw_name])


# Positions follow the packed layout: ChIP-seq ID, cell line, TF name, JASPAR ID.
chipseq_id = name_field(0)
tissue = name_field(1)
tf_name = name_field(2)
jaspar_id = name_field(3)


In [7]:
# Build the refined table from the raw one without touching df_tfbs_raw:
# "name" now holds only the TF name, and the three BED columns that are not needed
# here are reused for the ChIP-seq ID, the cell line and the JASPAR ID.
df_tfbs = df_tfbs_raw.assign(
    name=tf_name,
    thickStart=chipseq_id,
    thickEnd=tissue,
    itemRgb=jaspar_id,
)
df_tfbs

Unnamed: 0,chrom,start,end,name,score,strand,thickStart,thickEnd,itemRgb
0,chr1,17510,17522,MYCN,0,-,EXP038397,NGP--neuroblastoma-,MA0104.4
1,chr1,629638,629650,OTX2,0,-,EXP036801,HUES64--embryonic-stem-cells-,MA0712.2
2,chr1,634195,634206,EGR2,0,+,EXP039511,HEK293--embryonic-kidney-,MA0472.2
3,chr1,758332,758344,JUN,0,+,EXP038043,HAEC--human-aortic-endothelial-cells-,MA1130.1
4,chr1,758332,758345,JUN,0,-,EXP038043,HAEC--human-aortic-endothelial-cells-,MA1128.1
...,...,...,...,...,...,...,...,...,...
3667829,chrY,24570023,24570035,FOXA1,0,-,EXP038436,VCaP--prostate-carcinoma-,MA0148.4
3667830,chrY,26360803,26360816,NEUROD1,0,-,EXP038206,D341-Med--medulloblastoma-,MA1109.1
3667831,chrY,26453785,26453805,SPI1,0,-,EXP047817,THP-1--acute-monocytic-leukemia-,MA0080.5
3667832,chrY,26453785,26453805,SPI1,0,-,EXP047818,THP-1--acute-monocytic-leukemia-,MA0080.5


In [28]:
#tfbs_new = pybedtools.BedTool.from_dataframe(df_tfbs).saveas("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data//tfbs/tfbs_reduced_pro.bed")


### Delete/summarize repetitive entries

Some TFBS occur more than once in the data because they were reported by different experiments or cell lines/tissues. Sometimes the binding sites also differ by just one nucleotide. However, since only one TF can bind at a given location at a time, it makes more sense to collapse these entries into a single entry so that they do not distort the significance of the data.

In [8]:
# Load the refined BED file from disk (the call that writes it is commented out in the cell above)
# and preview the first 12 records.
tfbs_new = pybedtools.BedTool("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/tfbs/tfbs_reduced_pro.bed")
tfbs_new.head(12)

chr1	17510	17522	MYCN	0	-	EXP038397	NGP--neuroblastoma-	MA0104.4
 chr1	629638	629650	OTX2	0	-	EXP036801	HUES64--embryonic-stem-cells-	MA0712.2
 chr1	634195	634206	EGR2	0	+	EXP039511	HEK293--embryonic-kidney-	MA0472.2
 chr1	758332	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1130.1
 chr1	758332	758345	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1128.1
 chr1	758333	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0462.2
 chr1	758333	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1127.1
 chr1	758333	758346	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0488.1
 chr1	758334	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0099.3
 chr1	758334	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1132.1
 chr1	758334	758348	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0489.1
 chr1	758509	758526	AR	0	+	EXP049391	22RV1--prostate-carcinoma-	MA0007.3
 

In [14]:
# Distinct transcription-factor names in the dataset; the cell displays how many there are.
# (Two leftover bare expressions that were never displayed have been removed.)
tf_name_unique = np.unique(tf_name)
len(tf_name_unique)

269

In [20]:
# Trial run for a single TF: keep only the JUN intervals, merge them and write the
# result to "JUN.bed" in the current working directory.
# NOTE(review): per the bedtools merge documentation, s=True should merge each strand separately
# and c/o should collapse the listed columns with the given operation — confirm for the installed version.
tfbs_JUN = tfbs_new.filter(lambda x: x.name == "JUN").merge(s=True, c=[4,5,6,7,8,9], o=["distinct","sum","distinct","distinct", "distinct", "distinct" ]).saveas("JUN.bed")

In [24]:
# Preview the merged JUN intervals.
tfbs_JUN.head()

chr1	758332	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0099.3,MA1130.1
 chr1	758332	758348	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0462.2,MA0488.1,MA0489.1,MA1127.1,MA1128.1,MA1132.1
 chr1	778608	778621	JUN	0	+	EXP039507	A549--lung-carcinoma-	MA0488.1
 chr1	778681	778700	JUN	0	+	ENCSR000EEK,ENCSR000FAH,EXP000309,EXP037742,EXP038042,EXP038043,EXP039416,EXP039496,EXP040248,EXP040263,EXP040320,EXP040322,EXP047660,EXP047661,EXP048210,EXP057912,EXP057919,EXP058065,EXP058211	786-O--renal-carcinoma-,A549--lung-carcinoma-,HAEC--human-aortic-endothelial-cells-,HUVEC-C--HUVEC--umbilical-vein-endothelial-cells-,HepG2,K562,K562--myelogenous-leukemia-,Kasumi-1--acute-myeloblastic-leukemia-,MCF7--Invasive-ductal-breast-carcinoma-,MDA-MB-231p27CK-DD--breast-cancer-cells--phosphimimetic-p27-cell-line-,definitive-endoderm-from-HUES8	MA0462.2,MA0488.1,MA0489.1,MA1127.1,MA1128.1
 chr1	778685	778697	JUN	0	-	EXP039869,EXP058211	A549--lung-carcinoma-,Kasumi-1--acute-m

In [30]:
# Take the first two TF names as a small test subset.
tf_test = tf_name_unique[:2]
tf_test

array(['AR', 'ARNT'], dtype='<U9')

In [40]:

# Start from an empty BedTool and append the merged intervals of one TF per iteration.
merged_tfbs_tmp = pybedtools.BedTool(())
for tfbs_name in tf_name_unique:
    # Each merge call only sees the intervals of a single TF, so sites of different TFs are never
    # merged with each other. The extra "count" on column 1 adds a trailing column with the number
    # of original entries per merged interval.
    # NOTE(review): the lambda looks up tfbs_name when it is called; this is only safe if merge()
    # consumes the filtered stream within the same iteration — confirm.
    single_tfbs_merged = tfbs_new.filter(lambda x: x.name == tfbs_name).merge(s=True, c=[4,5,6,7,8,9,1], o=["distinct","sum","distinct","distinct", "distinct", "distinct","count"])
    # NOTE(review): postmerge=False presumably stops cat() from merging overlapping intervals again — confirm.
    merged_tfbs_tmp = merged_tfbs_tmp.cat(single_tfbs_merged, postmerge=False)

# Sorting and saving is disabled; the merged file is read back from disk further down.
# merged_tfbs = merged_tfbs_tmp.sort().saveas("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/tfbs/tfbs_reduced_pro_merged.bed")

In [43]:
# Load the previously saved merged TFBS file from disk and preview it.
merged_tfbs = pybedtools.BedTool("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/tfbs/tfbs_reduced_pro_merged.bed")
merged_tfbs.head()

chr1	17510	17522	MYCN	0	-	EXP038397	NGP--neuroblastoma-	MA0104.4	1
 chr1	629638	629650	OTX2	0	-	EXP036801	HUES64--embryonic-stem-cells-	MA0712.2	1
 chr1	634195	634206	EGR2	0	+	EXP039511	HEK293--embryonic-kidney-	MA0472.2	1
 chr1	758332	758348	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0462.2,MA0488.1,MA0489.1,MA1127.1,MA1128.1,MA1132.1	6
 chr1	758332	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0099.3,MA1130.1	2
 chr1	758509	758526	AR	0	+	EXP049391,EXP049392	22RV1--prostate-carcinoma-	MA0007.3	2
 chr1	778564	778578	TCF7L2	0	-	ENCSR000EXL	Panc1	MA0523.1	1
 chr1	778585	778600	TFAP2C	0	-	EXP047542,GSE36351	UCLA1-hESCs,bt474,mdamb453,skbr3	MA0524.2,MA0814.2,MA0815.1	9
 chr1	778587	778599	TFAP2C	0	+	EXP047542	UCLA1-hESCs	MA0524.2	1
 chr1	778595	778615	PPARG	0	+	EXP034422	subcutaneous-white-adipose-tissue	MA0066.1	1
 

In [48]:
# Number of intervals in the merged file.
# NOTE(review): the stored output of this cell is a NameError, i.e. it was last run in a kernel
# where merged_tfbs had not been defined — re-run the loading cell first (Restart & Run All).
merged_tfbs.count()

NameError: name 'merged_tfbs' is not defined

# Test

In [7]:
# Preview the raw BED records; their name field is still in the packed form.
reduce_tfbs.head()

chr1	17510	17522	EXP038397_NGP--neuroblastoma-_MYCN_MA0104.4	0	-	17510	17522	126,46,9
 chr1	629638	629650	EXP036801_HUES64--embryonic-stem-cells-_OTX2_MA0712.2	0	-	629638	629650	207,77,229
 chr1	634195	634206	EXP039511_HEK293--embryonic-kidney-_EGR2_MA0472.2	0	+	634195	634206	124,130,29
 chr1	758332	758344	EXP038043_HAEC--human-aortic-endothelial-cells-_JUN_MA1130.1	0	+	758332	758344	161,131,62
 chr1	758332	758345	EXP038043_HAEC--human-aortic-endothelial-cells-_JUN_MA1128.1	0	-	758332	758345	161,131,62
 chr1	758333	758344	EXP038043_HAEC--human-aortic-endothelial-cells-_JUN_MA0462.2	0	-	758333	758344	161,131,62
 chr1	758333	758344	EXP038043_HAEC--human-aortic-endothelial-cells-_JUN_MA1127.1	0	-	758333	758344	161,131,62
 chr1	758333	758346	EXP038043_HAEC--human-aortic-endothelial-cells-_JUN_MA0488.1	0	-	758333	758346	161,131,62
 chr1	758334	758344	EXP038043_HAEC--human-aortic-endothelial-cells-_JUN_MA0099.3	0	+	758334	758344	161,131,62
 chr1	758334	758344	EXP038043_HAEC--human-aortic-end

In [5]:
def refine_BedTool_intervall(BedTool_intervall):
    """
    Unpack the UniBind name field of a single interval into separate fields.

    The name (field 3) is expected to look like
    "ChipSeq-ID_CellLine_TF-name_JASPAR-ID". After the call, field 3 holds the
    TF name, field 6 the ChIP-seq ID, field 7 the cell line and field 8 the
    JASPAR ID. The interval is modified in place and returned.

    Raises IndexError if the name has fewer than four underscore-separated parts.
    """
    parts = BedTool_intervall.fields[3].split("_")

    # (target field index, position of the value inside the split name)
    field_sources = ((3, 2), (6, 0), (7, 1), (8, 3))

    # Look up every value before writing anything, so a malformed name leaves
    # the interval untouched.
    new_values = [(target, parts[source]) for target, source in field_sources]
    for target, value in new_values:
        BedTool_intervall[target] = value

    return BedTool_intervall

In [15]:
# Apply the refinement to every interval of the raw BedTool and store the result.
# NOTE(review): saveas() without a path presumably writes to a temporary file — confirm in the pybedtools docs.
new_bed = reduce_tfbs.each(refine_BedTool_intervall).saveas()

In [35]:
# Keep the first refined interval for inspection and restrict the data to chr1.
mycn = new_bed[0]
chr_bed = new_bed.filter(lambda x: x.chrom == "chr1").saveas()
# mycn.chrom

In [36]:
# Preview the chr1 subset.
chr_bed.head()

chr1	17510	17522	MYCN	0	-	EXP038397	NGP--neuroblastoma-	MA0104.4
 chr1	629638	629650	OTX2	0	-	EXP036801	HUES64--embryonic-stem-cells-	MA0712.2
 chr1	634195	634206	EGR2	0	+	EXP039511	HEK293--embryonic-kidney-	MA0472.2
 chr1	758332	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1130.1
 chr1	758332	758345	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1128.1
 chr1	758333	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0462.2
 chr1	758333	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1127.1
 chr1	758333	758346	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0488.1
 chr1	758334	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0099.3
 chr1	758334	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1132.1
 

In [40]:
# Sorted array of the distinct TF names that occur on chr1.
tfbs_chr1 = np.unique([interval.name for interval in chr_bed])

In [70]:
# Plain-list copy of the distinct chr1 TF names.
# NOTE(review): the filter call further down builds its own list, so this variable is not
# referenced by the cells shown here.
tfbs_lst = list(tfbs_chr1)

In [71]:
def filter_interval_for_tfbs(interval, tfbs_lst):
    """
    Accept an interval only the first time its name is seen.

    `tfbs_lst` holds the names that are still waiting for their first hit. When
    `interval.name` is in it, the name is removed from the list (the caller's
    list is mutated) and True is returned; otherwise False is returned.
    """
    name = interval.name
    if name not in tfbs_lst:
        return False

    # First occurrence: mark the name as used so later intervals are rejected.
    tfbs_lst.remove(name)
    return True

In [73]:
# A fresh list of distinct TF names is built for this call because the filter function
# removes every name from the list once it has been matched.
remaining_tf_names = list(np.unique([interval.name for interval in chr_bed]))
chr_bed_filter = chr_bed.filter(filter_interval_for_tfbs, tfbs_lst=remaining_tf_names).saveas()

In [134]:
# NOTE(review): `complement` is not used in the cells shown here.
complement = chr_bed_filter.complement(genome="hg38")
# Extract the sequence of every filtered interval from the chr1 FASTA; the path of the
# resulting FASTA file is available as a.seqfn.
a = chr_bed_filter.sequence(fi="/sybig/projects/GeneRegulation/data/jme/Genome_FASTA/chr1.fa")
# Read through a context manager so the file handle is closed again.
with open(a.seqfn) as fasta_file:
    fasta_text = fasta_file.read()
fasta_text

'>chr1:17510-17522\nGCCTATGTGCTG\n>chr1:629638-629650\nAATTAATCCCCT\n>chr1:634195-634206\nCCACCTACTCA\n>chr1:758332-758344\ngcaTGAGTAATT\n>chr1:758509-758526\nTGGAACATAATGTCTCC\n>chr1:778564-778578\nCCACTCTGATGAGA\n>chr1:778585-778599\nCCAGCTCCAGGCAC\n>chr1:778595-778615\nGCACCATGGCGCCCCAGTGA\n>chr1:778596-778608\nCACCATGGCGCC\n>chr1:778598-778609\nCCATGGCGCCC\n>chr1:778599-778611\nCATGGCGCCCCA\n>chr1:778607-778622\nCCCAGTGATGTAGCC\n>chr1:778622-778636\nGAACACCCGCGCCT\n>chr1:778623-778634\nAACACCCGCGC\n>chr1:778625-778635\nCACCCGCGCC\n>chr1:778633-778649\nCCTCTAACGTCGCCAA\n>chr1:778639-778653\nACGTCGCCAACGGC\n>chr1:778640-778656\nCGTCGCCAACGGCCCA\n>chr1:778641-778655\nGTCGCCAACGGCCC\n>chr1:778642-778655\nTCGCCAACGGCCC\n>chr1:778644-778654\nGCCAACGGCC\n>chr1:778664-778680\nGTGTCGGCGAAGACCC\n>chr1:778668-778681\nCGGCGAAGACCCG\n>chr1:778671-778681\nCGAAGACCCG\n>chr1:778671-778688\nCGAAGACCCGCCCTTGT\n>chr1:778672-778687\nGAAGACCCGCCCTTG\n>chr1:778672-778689\nGAAGACCCGCCCTTGTG\n>chr1:778673

In [132]:
# Inspect the first interval of the BedTool returned by sequence().
a[0]

Interval(chr1:17510-17522)

In [163]:
# Read the extracted FASTA once and close the file again.
with open(a.seqfn) as fasta_file:
    fasta_lines = fasta_file.readlines()

# True for sequence lines, False for ">" header lines. The explicit bool dtype keeps the
# mask usable for indexing even when the file is empty.
bool_lst = np.array([not line.startswith(">") for line in fasta_lines], dtype=bool)
# First whitespace-separated token of every line (this strips the trailing newline).
lst_all = np.array([line.split()[0] for line in fasta_lines])
# Only the sequences, in file order.
seq_lst = lst_all[bool_lst]
seq_lst

array(['GCCTATGTGCTG', 'AATTAATCCCCT', 'CCACCTACTCA', 'gcaTGAGTAATT',
       'TGGAACATAATGTCTCC', 'CCACTCTGATGAGA', 'CCAGCTCCAGGCAC',
       'GCACCATGGCGCCCCAGTGA', 'CACCATGGCGCC', 'CCATGGCGCCC',
       'CATGGCGCCCCA', 'CCCAGTGATGTAGCC', 'GAACACCCGCGCCT', 'AACACCCGCGC',
       'CACCCGCGCC', 'CCTCTAACGTCGCCAA', 'ACGTCGCCAACGGC',
       'CGTCGCCAACGGCCCA', 'GTCGCCAACGGCCC', 'TCGCCAACGGCCC',
       'GCCAACGGCC', 'GTGTCGGCGAAGACCC', 'CGGCGAAGACCCG', 'CGAAGACCCG',
       'CGAAGACCCGCCCTTGT', 'GAAGACCCGCCCTTG', 'GAAGACCCGCCCTTGTG',
       'AAGACCCGCCCT', 'AAGACCCGCCCTTGTG', 'AGACCCGCCCT', 'AGACCCGCCCTT',
       'AGACCCGCCCTTG', 'AGACCCGCCCTTGTG', 'GACCCGCCCT', 'GACCCGCCCTT',
       'GACCCGCCCTT', 'CGCCCTTGTGACGT', 'CGCCCTTGTGACGT',
       'CGCCCTTGTGACGTCA', 'CGCCCTTGTGACGTCACGGAA', 'CCTTGTGACGTCACG',
       'CTTGTGAC', 'TTGTGACGTCACG', 'TTGTGACGTCACGG', 'TGTGACGTCACG',
       'TGTGACGTCACGG', 'GACGTCACGGAA', 'ACGTCACGGAAGGCGCA',
       'GTCACGGAAGGCGC', 'GTCACGGAAGGCGC', 'CGGAAGGC',
       

In [95]:
def verify(sequence):
    """Classify a sequence as DNA, RNA or invalid.

    Parameters
    ----------
    sequence : iterable of str
        The bases to check. The check is case-sensitive: only upper-case
        letters are accepted, so soft-masked (lower-case) input must be
        upper-cased by the caller first.

    Returns
    -------
    str
        "DNA" if every symbol is one of A, T, C, G; "RNA" if every symbol is
        one of A, U, C, G; otherwise "Invalid sequence". A sequence that fits
        both alphabets (no T and no U) is reported as "DNA". An empty
        sequence is reported as "Invalid sequence".
    """
    seq = set(sequence)

    # An empty sequence carries no bases and is treated as invalid.
    if not seq:
        return "Invalid sequence"

    # The symbols must be a SUBSET of the alphabet. The previous test
    # (seq == alphabet.union(seq)) checked the opposite direction: it rejected
    # valid sequences that lack one of the four bases and accepted sequences
    # with extra symbols such as "N".
    if seq <= {"A", "T", "C", "G"}:
        return "DNA"
    elif seq <= {"A", "U", "C", "G"}:
        return "RNA"
    else:
        return "Invalid sequence"


# Watson-Crick partner of every base, per alphabet.
_DNA_COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G"}
_RNA_COMPLEMENT = {"A": "U", "U": "A", "G": "C", "C": "G"}


def rev_comp_if(seq):
    """Return the reverse complement of a DNA or RNA sequence.

    Parameters
    ----------
    seq : iterable of str
        Upper-case bases; the alphabet is detected with verify().

    Returns
    -------
    str
        The reverse complement, or the string "Invalid Sequence" when verify()
        does not recognise the input as DNA or RNA.
    """
    # Classify once instead of once per branch.
    kind = verify(seq)
    if kind == "DNA":
        complement = _DNA_COMPLEMENT
    elif kind == "RNA":
        complement = _RNA_COMPLEMENT
    else:
        return "Invalid Sequence"

    # verify() guarantees that every base has an entry in the chosen table.
    comp = [complement[base] for base in seq]

    # Reverse the complemented bases and join them into one string.
    return "".join(comp[::-1])

In [180]:
# Upper-case first: the extracted sequences can contain lower-case bases.
seq_lst_upper = [sequence.upper() for sequence in seq_lst]
# Reverse complement of every sequence, in the same order.
seq_reveres = [rev_comp_if(sequence) for sequence in seq_lst_upper]
# A site is palindromic when it equals its own reverse complement.
is_palindrom = np.array([forward == reverse for forward, reverse in zip(seq_lst_upper, seq_reveres)])



In [182]:
# TF name of every interval, in the same order as the extracted sequences,
# then keep only the names whose binding site is palindromic.
tfbs = np.array([interval.name for interval in a])
tfbs[is_palindrom]

array([], dtype='<U9')