In [1]:
import warnings
warnings.filterwarnings('ignore')

from IPython.core.display import display, HTML
display(HTML('<style>.container {width: 90% !important; }</style>'))

In [2]:
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, pybedtools, subprocess, shutil

In [3]:
def collapseTBD(re):
    new_re = []
    for r, me1, me3, ctcf in zip(re['RE'], re['H3K4me1_rank'], re['H3K4me3_rank'], re['CTCF_rank']):

        if r=='TBD_1':
            if me1 <= me3:
                new_re.append("Promoter")
            else:
                new_re.append("Enhancer")

        elif r=='TBD_2':
            if me1 <= me3:
                new_re.append("Promoter/CTCF")
            else:
                new_re.append("Enhancer/CTCF")

        elif r=='TBD_3':
            if me1 <= me3:
                new_re.append("Promoter/CTCF")
            else:
                new_re.append("Enhancer/CTCF")

        else:
            new_re.append(r)
    re['new_re'] = new_re
    re = re[['chromosome', 'start', 'end', 'new_re']]
    re.columns = ['chromosome', 'start', 'finish', 'RE']
    return re

In [4]:
huvec  = pd.read_csv('metadata/huvec_mlv_REgulamentary.csv', sep='\t')
CD14pos = pd.read_csv('metadata/CD14pos_monocyte_mlv_REgulamentary.csv', sep='\t')
kera = pd.read_csv('metadata/keratinocyte_mlv_REgulamentary.csv', sep='\t')

In [5]:
huvec  = collapseTBD(huvec)
huvec['origin'] = 'huvec'

CD14pos = collapseTBD(CD14pos)
CD14pos['origin'] = 'CD14pos'

kera = collapseTBD(kera)
kera['origin'] = 'kera'

In [6]:
df = pd.concat([huvec, CD14pos, kera])

In [7]:
df = df.sort_values(by=['chromosome', 'start', 'finish'])
df.reset_index(drop=True, inplace=True)

In [8]:
tmp_path = 'tmp'
if not os.path.exists(tmp_path):
    os.makedirs(tmp_path)
pybedtools.set_tempdir(tmp_path)

In [9]:
df = df.sort_values(by=['chromosome', 'start', 'finish'])
df.reset_index(drop=True, inplace=True)

In [10]:
df['id'] = df['origin'].astype(str)+"-"+df['RE'].astype(str)#df['chromosome'].astype(str)+":"+df['start'].astype(str)+"-"+df['finish'].astype(str)

In [11]:
df.to_csv('to_be_merged.bed', sep='\t', index=False, header=False)

In [12]:
df

Unnamed: 0,chromosome,start,finish,RE,origin,id
0,chr1,817082,817826,Enhancer,CD14pos,CD14pos-Enhancer
1,chr1,826746,828028,Promoter,huvec,huvec-Promoter
2,chr1,826861,828019,Promoter,CD14pos,CD14pos-Promoter
3,chr1,858107,858213,CTCF,huvec,huvec-CTCF
4,chr1,869164,869568,Enhancer/CTCF,CD14pos,CD14pos-Enhancer/CTCF
...,...,...,...,...,...,...
147417,chrY,25463606,25465282,Promoter/CTCF,CD14pos,CD14pos-Promoter/CTCF
147418,chrY,25463606,25465566,Promoter/CTCF,CD14pos,CD14pos-Promoter/CTCF
147419,chrY,25474611,25475123,CTCF,CD14pos,CD14pos-CTCF
147420,chrY,25490263,25492198,CTCF,kera,kera-CTCF


In [13]:
command = ['bedtools merge -i to_be_merged.bed -c 6 -o collapse > merged.bed']
subprocess.run(command, shell=True)

CompletedProcess(args=['bedtools merge -i to_be_merged.bed -c 6 -o collapse > merged.bed'], returncode=0)

In [14]:
merged = pd.read_csv("merged.bed", sep='\t', names=['chromosome', 'start', 'finish', 'RE'])
merged

Unnamed: 0,chromosome,start,finish,RE
0,chr1,817082,817826,CD14pos-Enhancer
1,chr1,826746,828028,"huvec-Promoter,CD14pos-Promoter"
2,chr1,858107,858213,huvec-CTCF
3,chr1,869164,870570,"CD14pos-Enhancer/CTCF,CD14pos-Enhancer/CTCF,ke..."
4,chr1,904313,905615,"CD14pos-Enhancer/CTCF,huvec-CTCF,kera-CTCF,CD1..."
...,...,...,...,...
80196,chrY,25421943,25423722,kera-CTCF
80197,chrY,25463595,25465566,"kera-CTCF,CD14pos-Promoter/CTCF,CD14pos-Promot..."
80198,chrY,25474611,25475123,CD14pos-CTCF
80199,chrY,25490263,25492198,kera-CTCF


In [15]:
len(merged['RE'].tolist())*2

160402

In [16]:
m = pd.DataFrame(columns=['huvec', 'CD14pos', 'kera'])

hv = []
d1 = []
d2 = []

for i in merged['RE'].tolist():
  
    tmp = i.split(",")

    #huvec
    with_s = [x for x in tmp if x.startswith('huvec')]
    if len(with_s)==1:
        hv.append(with_s[0].split("-")[-1])
    else:
        hv.append("")

    #CD14pos
    with_s = [x for x in tmp if x.startswith('CD14pos')]
    if len(with_s)==1:
        d1.append(with_s[0].split("-")[-1])
    else:
        d1.append("")
        
    #kera
    with_s = [x for x in tmp if x.startswith('kera')]
    if len(with_s)==1:
        d2.append(with_s[0].split("-")[-1])        
    else:
        d2.append("")        

In [17]:
merged['huvec'] = hv
merged['CD14pos'] = d1
merged['kera'] = d2

merged = merged[['chromosome', 'start', 'finish', 'huvec', 'CD14pos', 'kera']]

In [18]:
merged

Unnamed: 0,chromosome,start,finish,huvec,CD14pos,kera
0,chr1,817082,817826,,Enhancer,
1,chr1,826746,828028,Promoter,Promoter,
2,chr1,858107,858213,CTCF,,
3,chr1,869164,870570,CTCF,,CTCF
4,chr1,904313,905615,,,CTCF
...,...,...,...,...,...,...
80196,chrY,25421943,25423722,,,CTCF
80197,chrY,25463595,25465566,,,CTCF
80198,chrY,25474611,25475123,,CTCF,
80199,chrY,25490263,25492198,,,CTCF


In [19]:
merged.to_csv('cellType_comparison.csv', sep='\t', index=False)

In [22]:
shutil.rmtree(tmp_path)