# Processing and Combining Preprocessed Datasets (Beginning)

In [2]:
import pybedtools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Processing UniBind TFBS

In [20]:
reduce_tfbs = pybedtools.BedTool("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/tfbs_intersect_promotors_wa.bed")
df_tfbs_raw = reduce_tfbs.to_dataframe()

### Refining the Dataset and chaning columns

The name coloumn of the tfbs dataframe contains many information in the Format ”ChipSeq-ID_Zelllinie_TF-name_JASPAR-ID”.

In order to adress these data more easily, it should be seperated and replace the in our case useless, thickStart, thickEnd and itemRgb column.

In [21]:
raw_name = df_tfbs_raw.name.to_numpy()
raw_name

array(['EXP038397_NGP--neuroblastoma-_MYCN_MA0104.4',
       'EXP036801_HUES64--embryonic-stem-cells-_OTX2_MA0712.2',
       'EXP039511_HEK293--embryonic-kidney-_EGR2_MA0472.2', ...,
       'EXP047817_THP-1--acute-monocytic-leukemia-_SPI1_MA0080.5',
       'EXP047818_THP-1--acute-monocytic-leukemia-_SPI1_MA0080.5',
       'EXP049121_THP-1--acute-monocytic-leukemia-_SPI1_MA0080.5'],
      dtype=object)

In [22]:
raw_name[0].split("_")

['EXP038397', 'NGP--neuroblastoma-', 'MYCN', 'MA0104.4']

In [13]:
chipseq_id = np.array([i.split("_")[0] for i in raw_name])
tissue = np.array([i.split("_")[1] for i in raw_name])
tf_name = np.array([i.split("_")[2] for i in raw_name])
jaspar_id = np.array([i.split("_")[3] for i in raw_name])


In [26]:
df_tfbs = df_tfbs_raw.copy()
df_tfbs["name"] = tf_name
df_tfbs["thickStart"] = chipseq_id
df_tfbs["thickEnd"] = tissue
df_tfbs["itemRgb"] = jaspar_id
df_tfbs

Unnamed: 0,chrom,start,end,name,score,strand,thickStart,thickEnd,itemRgb
0,chr1,17510,17522,MYCN,0,-,EXP038397,NGP--neuroblastoma-,MA0104.4
1,chr1,629638,629650,OTX2,0,-,EXP036801,HUES64--embryonic-stem-cells-,MA0712.2
2,chr1,634195,634206,EGR2,0,+,EXP039511,HEK293--embryonic-kidney-,MA0472.2
3,chr1,758332,758344,JUN,0,+,EXP038043,HAEC--human-aortic-endothelial-cells-,MA1130.1
4,chr1,758332,758345,JUN,0,-,EXP038043,HAEC--human-aortic-endothelial-cells-,MA1128.1
...,...,...,...,...,...,...,...,...,...
3667829,chrY,24570023,24570035,FOXA1,0,-,EXP038436,VCaP--prostate-carcinoma-,MA0148.4
3667830,chrY,26360803,26360816,NEUROD1,0,-,EXP038206,D341-Med--medulloblastoma-,MA1109.1
3667831,chrY,26453785,26453805,SPI1,0,-,EXP047817,THP-1--acute-monocytic-leukemia-,MA0080.5
3667832,chrY,26453785,26453805,SPI1,0,-,EXP047818,THP-1--acute-monocytic-leukemia-,MA0080.5


In [28]:
#tfbs_new = pybedtools.BedTool.from_dataframe(df_tfbs).saveas("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data//tfbs/tfbs_reduced_pro.bed")


### Delete/summarize repetitive Entrys

Some TFBS occur more then once in the data, because the data came from diffrent experiments or cell tissues. Sometimes the binding sites also differs just by one nucleotide. However, since only one TFBS can bind to the same location, it makes more sense to reduce these entries to one entry in order not to falsify the significance of the data.

In [41]:
tfbs_new = pybedtools.BedTool("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/tfbs/tfbs_reduced_pro.bed")
tfbs_new.head(12)

chr1	17510	17522	MYCN	0	-	EXP038397	NGP--neuroblastoma-	MA0104.4
 chr1	629638	629650	OTX2	0	-	EXP036801	HUES64--embryonic-stem-cells-	MA0712.2
 chr1	634195	634206	EGR2	0	+	EXP039511	HEK293--embryonic-kidney-	MA0472.2
 chr1	758332	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1130.1
 chr1	758332	758345	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1128.1
 chr1	758333	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0462.2
 chr1	758333	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1127.1
 chr1	758333	758346	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0488.1
 chr1	758334	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0099.3
 chr1	758334	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1132.1
 chr1	758334	758348	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0489.1
 chr1	758509	758526	AR	0	+	EXP049391	22RV1--prostate-carcinoma-	MA0007.3
 

In [45]:
tfbs_new[0].fields
tf_name

array(['MYCN', 'OTX2', 'EGR2', ..., 'SPI1', 'SPI1', 'SPI1'], dtype='<U9')

In [47]:
tfbs_JUN = tfbs_new.filter(lambda x: x.name == "JUN").merge(s=True, c=[4,5,6,7,8,9], o=["distinct","sum","distinct","distinct", "distinct", "distinct" ]).saveas("JUN.bed")

In [57]:
tfbs_new.head(12)

chr1	17510	17522	MYCN	0	-	EXP038397	NGP--neuroblastoma-	MA0104.4
 chr1	629638	629650	OTX2	0	-	EXP036801	HUES64--embryonic-stem-cells-	MA0712.2
 chr1	634195	634206	EGR2	0	+	EXP039511	HEK293--embryonic-kidney-	MA0472.2
 chr1	758332	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1130.1
 chr1	758332	758345	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1128.1
 chr1	758333	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0462.2
 chr1	758333	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1127.1
 chr1	758333	758346	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0488.1
 chr1	758334	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0099.3
 chr1	758334	758344	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA1132.1
 chr1	758334	758348	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0489.1
 chr1	758509	758526	AR	0	+	EXP049391	22RV1--prostate-carcinoma-	MA0007.3
 

In [49]:
tfbs_JUN.head()

chr1	758332	758344	JUN	0	+	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0099.3,MA1130.1
 chr1	758332	758348	JUN	0	-	EXP038043	HAEC--human-aortic-endothelial-cells-	MA0462.2,MA0488.1,MA0489.1,MA1127.1,MA1128.1,MA1132.1
 chr1	778608	778621	JUN	0	+	EXP039507	A549--lung-carcinoma-	MA0488.1
 chr1	778681	778700	JUN	0	+	ENCSR000EEK,ENCSR000FAH,EXP000309,EXP037742,EXP038042,EXP038043,EXP039416,EXP039496,EXP040248,EXP040263,EXP040320,EXP040322,EXP047660,EXP047661,EXP048210,EXP057912,EXP057919,EXP058065,EXP058211	786-O--renal-carcinoma-,A549--lung-carcinoma-,HAEC--human-aortic-endothelial-cells-,HUVEC-C--HUVEC--umbilical-vein-endothelial-cells-,HepG2,K562,K562--myelogenous-leukemia-,Kasumi-1--acute-myeloblastic-leukemia-,MCF7--Invasive-ductal-breast-carcinoma-,MDA-MB-231p27CK-DD--breast-cancer-cells--phosphimimetic-p27-cell-line-,definitive-endoderm-from-HUES8	MA0462.2,MA0488.1,MA0489.1,MA1127.1,MA1128.1
 chr1	778685	778697	JUN	0	-	EXP039869,EXP058211	A549--lung-carcinoma-,Kasumi-1--acute-m

Ziel: Die TFBS mit dem selbem TF Name zusammen mergen! 
Wahrscheinlich bester ansatz: Die 268 einzelnen TFs filtern und dann direkt mergen und als Date abspeichern. Dann hat man zwar 268 einzelne TFBS datein, aber die kann man dann wieder zusammenführen und sortieren.
Ist für den Arbeitsprozess auch nicht schlecht, wenn man einzelne TFs anschauen will und nicht immer die große BED file einlesen will.
ABER: Ist später für den automatisier-Fall doof, da so immer viele Datein zwischengespeichert und generiert werden und wenn man den Promotorbereich vergrößert, führt dass ggfls zu nicht handlebaren File-Größen.