In [33]:
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt
import glob
import seaborn as sns
import tqdm
import re
from sklearn import metrics
import logomaker
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay
import seqlogo
import scipy
import re
from matplotlib import gridspec
import scipy
import xgboost
from sklearn.metrics import auc, average_precision_score
from collections import OrderedDict
import torch.nn as nn
import torch
import joblib
import torch.nn.functional as F
import torch.optim as optim
import os
from joblib import Parallel, delayed
%matplotlib notebook

In [34]:
# Seed every RNG used in this notebook so reruns are repeatable.
torch.manual_seed(666)
# `torch.set_deterministic` was deprecated in favour of
# `torch.use_deterministic_algorithms` and is missing from recent PyTorch
# releases; only fall back to the old name on versions that lack the new one.
if hasattr(torch, "use_deterministic_algorithms"):
    torch.use_deterministic_algorithms(True)
else:
    torch.set_deterministic(True)
np.random.seed(666)

In [3]:
# DeepMotifSyn generator
class deeper_u_net(nn.Module):
    """1-D U-Net-style generator.

    ``forward`` expects a float tensor laid out as ``(batch, positions, 108)``.
    The first 8 features of every position are re-attached just before the
    output head. The result has shape ``(batch, 4, positions)`` with a softmax
    taken over the 4 output channels. Kernel sizes of the decoder mirror those
    of the encoder, so the sequence length is restored before each
    concatenation.
    """

    def __init__(self, device='cuda'):
        super().__init__()
        # Only stored; nothing inside this class reads it.
        self.device = device

        def conv_bn_relu(conv):
            # conv -> batch norm -> ReLU, indexed 0/1/2 inside the Sequential.
            return nn.Sequential(conv, nn.BatchNorm1d(conv.out_channels), nn.ReLU())

        # Sub-modules are created in a fixed order so that parameter
        # initialisation and state-dict keys stay stable.
        self.encoder1 = conv_bn_relu(
            nn.Conv1d(in_channels=108, out_channels=64, kernel_size=4, stride=1))
        self.encoder2 = conv_bn_relu(
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=2, stride=1))

        self.decoder1 = conv_bn_relu(
            nn.ConvTranspose1d(in_channels=128, out_channels=64, kernel_size=2, stride=1))
        # Input is decoder1 output concatenated with the encoder1 skip (64 + 64).
        self.decoder2 = conv_bn_relu(
            nn.ConvTranspose1d(in_channels=64 * 2, out_channels=64, kernel_size=4, stride=1))

        # Note the ReLU -> BatchNorm order here, unlike the other stages.
        self.bottleneck = nn.Sequential(
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=1, stride=1),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.ConvTranspose1d(in_channels=256, out_channels=128, kernel_size=3, stride=1),
        )

        # Point-wise head: 64 decoder channels + 8 raw input channels -> 4.
        self.cnn_out = nn.Sequential(
            nn.Conv1d(in_channels=64 + 8, out_channels=32, kernel_size=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Conv1d(in_channels=32, out_channels=16, kernel_size=1),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Conv1d(in_channels=16, out_channels=4, kernel_size=1),
            nn.BatchNorm1d(4),
            nn.Softmax(dim=1),
        )

    def forward(self, x):
        """Run the generator on ``x`` (``(batch, positions, 108)``)."""
        # Channels-first copies: the raw first 8 features and the full input.
        raw_channels = x[:, :, :8].permute(0, 2, 1)
        features = x.permute(0, 2, 1)

        skip = self.encoder1(features)
        hidden = self.bottleneck(self.encoder2(skip))

        hidden = self.decoder1(hidden)
        hidden = self.decoder2(torch.cat((hidden, skip), 1))

        return self.cnn_out(torch.cat((hidden, raw_channels), 1))

In [12]:
def get_rev_com_y(seq_mat):
    """Return the reverse complement of a position matrix.

    Rows are taken in reverse order and, inside every row, the first four
    entries are mirrored; entries from column 4 onward keep their place.
    The work is done on a copy, so ``seq_mat`` itself is left untouched.
    """
    flipped = seq_mat[::-1].copy()
    for pos, row in enumerate(flipped):
        # row[3::-1] walks columns 3,2,1,0 (the mirrored first four entries).
        flipped[pos] = np.concatenate((row[3::-1], row[4:]))
    return flipped

def get_possible_motifpair(name, motif1, motif2, max_mp_len = 35):
    """Enumerate candidate placements of two monomer motifs on a shared grid.

    Four orientation cases are tried (1: motif1 then motif2, 2: motif2 then
    motif1, 3: reverse-complemented motif2 then motif1, 4: motif1 then
    reverse-complemented motif2). For each case the first motif is placed at
    every admissible start and the second motif at every admissible start at
    or after it.

    Parameters
    ----------
    name : str
        Label copied into the ``dimer_name`` column of every row.
    motif1, motif2 : numpy.ndarray
        Position matrices with 4 columns (they are written into 4-column
        slots and must support ``.copy()``).
    max_mp_len : int, default 35
        Number of positions of the grid the motifs are placed on.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The frame has one row per candidate with the columns ``dimer_name``,
        ``orientation_case``, ``overlapping_len`` (positive: number of
        positions covered by both motifs; zero or negative: minus the gap
        between them), ``m1_start_idx`` and ``m2_start_idx``. The array has
        shape ``(n_candidates, max_mp_len, 8)``: first motif in channels 0-3,
        second motif in channels 4-7.
    """
    info_rows = []
    candidate_codes = []

    for case_i in range(1, 5):
        if case_i == 1:
            m1, m2 = motif1.copy(), motif2.copy()
        elif case_i == 2:
            m1, m2 = motif2.copy(), motif1.copy()
        elif case_i == 3:
            m1, m2 = get_rev_com_y(motif2).copy(), motif1.copy()
        else:
            m1, m2 = motif1.copy(), get_rev_com_y(motif2).copy()
        m1_len, m2_len = len(m1), len(m2)

        # NOTE(review): a placement is only accepted when its exclusive end is
        # strictly below max_mp_len, so the last grid position is never used.
        # Kept as in the original enumeration; confirm whether that is wanted.
        for m1_si in range(max_mp_len - m1_len):
            m1_ei = m1_si + m1_len
            for m2_si in range(m1_si, max_mp_len - m2_len):
                m2_ei = m2_si + m2_len
                # Grid size follows max_mp_len instead of a fixed 35.
                m1_code = np.zeros((max_mp_len, 4))
                m2_code = np.zeros((max_mp_len, 4))
                m1_code[m1_si:m1_ei] = m1
                m2_code[m2_si:m2_ei] = m2
                mp_code = np.concatenate([m1_code, m2_code], axis=-1)

                if m2_si >= m1_ei:
                    # No shared position: report minus the gap width.
                    overlap_len = m1_ei - m2_si
                else:
                    # Positions where both motifs contribute probability mass.
                    overlap_len = int(np.count_nonzero(mp_code.sum(-1) > 1.1))

                candidate_codes.append(mp_code)
                info_rows.append([case_i, overlap_len, m1_si, m2_si])

    # reshape keeps the 2-D / 3-D layout even when no placement fits.
    info = np.array(info_rows, dtype=np.int64).reshape(-1, 4)
    df = pd.DataFrame({
        'dimer_name': [name] * len(info),
        'orientation_case': info[:, 0],
        'overlapping_len': info[:, 1],
        'm1_start_idx': info[:, 2],
        'm2_start_idx': info[:, 3],
    })
    return df, np.array(candidate_codes).reshape(-1, max_mp_len, 8)

In [14]:
def get_icDimer(dimer):
    """Scale a position matrix by its per-position information content.

    The matrix is transposed, rounded to 6 decimals and turned into integer
    pseudo-counts (x 1e6) before being handed to ``seqlogo.CompletePm``.

    Returns a pair: ``dimer`` with every row multiplied by that row's
    information content, and the information-content vector itself.
    """
    pseudo_counts = (np.round(dimer.T, 6)*1e6).astype(int)
    cpm = seqlogo.CompletePm(pfm=pseudo_counts)
    # Repeat the per-position value across the 4 base columns.
    ic_per_base = np.repeat(cpm.ic.to_numpy()[:, np.newaxis], 4, axis=1)
    return ic_per_base*dimer, cpm.ic.to_numpy()

def build_784features(possible_mp_df, possible_mp_code, dimerFam):
    """Assemble the hand-crafted evaluator features for every candidate.

    Parameters
    ----------
    possible_mp_df : pandas.DataFrame
        One row per candidate. The columns ``overlapping_len`` and
        ``orientation_case`` are read, and the row label is used to index
        ``possible_mp_code``.
    possible_mp_code : numpy.ndarray
        Candidate encodings; channels ``0:4`` are taken as motif 1 and
        channels ``4:8`` as motif 2.
        # assumes 35 positions per candidate and family vectors of length
        # 4 / 41, which yields 784 values per row - TODO confirm
    dimerFam : hashable
        Key into the notebook-level ``dimer_familly_mpCase_dict``.

    Returns
    -------
    numpy.ndarray
        One feature vector per row of ``possible_mp_df``. Layout: overlap
        block (slots padded with -1), IC-weighted motifs and their IC
        vectors, family / candidate one-hot codes, their element-wise
        products and the product sums.

    Notes
    -----
    Reads only its arguments plus ``dimer_familly_mpCase_dict``,
    ``get_icDimer`` and ``tqdm``; it does not touch the generator output.
    """
    # Overlap length (-20 .. 20) -> one-hot slot (0 .. 40).
    overlap_len_dict = {olen: slot for slot, olen in enumerate(range(-20, 21))}

    def encode_aligned_features(olen, case):
        # One-hot encode the orientation case (1-4) and the overlap length.
        case_code = np.zeros(4)
        olen_code = np.zeros(41)
        case_code[case-1] += 1
        olen_code[overlap_len_dict[olen]] += 1
        return case_code, olen_code

    # The family profile is the same for every candidate: look it up once.
    case_fam, olen_fam, _, _ = dimer_familly_mpCase_dict[dimerFam]

    uniform_row = np.array([0.25, 0.25, 0.25, 0.25])

    # NOTE(review): these per-base sums are refreshed only for candidates with
    # at least one overlapping position; for all others the values of the
    # previous candidate are reused. That carry-over is how the features have
    # been produced so far and is preserved here. The zero initialisation only
    # covers the case where the very first candidate has no overlap (which
    # used to fail with an unbound-variable error). Confirm against the
    # evaluator's training code before resetting them per row.
    motif1_overlap_base = np.zeros(4)
    motif2_overlap_base = np.zeros(4)

    possible_mp_784feaures = []
    for idx, row in tqdm.tqdm(possible_mp_df.iterrows()):
        motif1 = possible_mp_code[idx, :, :4].copy()
        motif2 = possible_mp_code[idx, :, 4:8].copy()
        # Positions not covered by a motif (row sum < 0.3) become uniform.
        motif1[motif1.sum(-1) < 0.3] = uniform_row
        motif2[motif2.sum(-1) < 0.3] = uniform_row

        motif1_ic, ic1 = get_icDimer(motif1)
        motif2_ic, ic2 = get_icDimer(motif2)
        # Positions where both IC-weighted motifs are non-zero.
        overlap_locus = np.logical_and(motif2_ic.sum(-1)>10e-7, motif1_ic.sum(-1)>10e-7)
        overlap_len = int(overlap_locus.sum())

        # Fixed-size overlap slots (18 positions), -1 marks "unused".
        motif1_ol = np.full((18, 4), -1.0)
        motif2_ol = np.full((18, 4), -1.0)
        mp_ol_diff = np.full(18, -1.0)
        mp_ol_sum = np.full((18, 4), -1.0)
        motif1_ol_ic = np.full(18, -1.0)
        motif2_ol_ic = np.full(18, -1.0)
        mp_ic_diff = np.full(18, -1.0)

        if overlap_len > 0:
            # IC-weighted rows of both motifs at the overlapping positions
            motif1_ol[:overlap_len] = motif1_ic[overlap_locus]
            motif2_ol[:overlap_len] = motif2_ic[overlap_locus]
            # euclidean distance between the IC-weighted rows
            mp_ol_diff[:overlap_len] = np.sqrt(((motif1_ic[overlap_locus] - motif2_ic[overlap_locus])**2).sum(-1))
            # element-wise sum of the IC-weighted rows
            mp_ol_sum[:overlap_len] = motif1_ic[overlap_locus] + motif2_ic[overlap_locus]
            # per-position IC of each motif and their absolute difference
            motif1_ol_ic[:overlap_len] = ic1[overlap_locus]
            motif2_ol_ic[:overlap_len] = ic2[overlap_locus]
            mp_ic_diff[:overlap_len] = np.sqrt((ic1[overlap_locus] - ic2[overlap_locus])**2)
            # per-base totals over the overlapping positions
            motif1_overlap_base = motif1_ic[overlap_locus,:].sum(0)
            motif2_overlap_base = motif2_ic[overlap_locus,:].sum(0)

        # NOTE(review): motif1_ol_ic is emitted twice and motif2_ol_ic never.
        # This looks like a typo, but the slot layout is left untouched
        # because the evaluator consumes these columns positionally - confirm
        # against the training code before changing it.
        overlap_feats = np.concatenate([motif1_ol.flatten(), motif2_ol.flatten(), motif1_ol_ic, motif1_ol_ic, mp_ol_diff.flatten(), mp_ol_sum.flatten(), mp_ic_diff, motif2_overlap_base.flatten(), motif1_overlap_base.flatten()])
        seq_feats = np.concatenate([motif1_ic.flatten(), motif2_ic.flatten(), ic1.flatten(), ic2.flatten()])
        feats = np.concatenate([overlap_feats, seq_feats])

        case_gmp, olen_gmp = encode_aligned_features(row['overlapping_len'], row['orientation_case'])
        # Agreement between the family profile and this candidate.
        case_mul, olen_mul = case_fam*case_gmp, olen_fam*olen_gmp
        mul_sum = [sum(case_mul), sum(olen_mul)]
        total_sum = [sum(mul_sum)]

        feat_784 = np.concatenate([feats, case_fam, olen_fam, case_gmp, olen_gmp, case_mul, olen_mul, mul_sum, total_sum])
        possible_mp_784feaures.append(feat_784)

    return np.array(possible_mp_784feaures)

In [4]:
def _load_pickle(path):
    """Unpickle ``path`` and close the file handle afterwards."""
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(path, "rb") as handle:
        return pkl.load(handle)


dnameToFamily_dict = _load_pickle("../data/dimer_name_family_upper_dict.pkl")
family_onehot_encode = _load_pickle("../data/kc_heterodimer_family_all614dimers_upper_oneHotDict.pkl")
dimer_familly_mpCase_dict = _load_pickle("../data/dimerMotif_87family_feaures_dict.pkl")
kc_dimer_info = pd.read_csv("../data/kc_dimer_info.csv")
# homomotif_seq_dict = pkl.load(open("../data/homodimerMotifDatabase_dict.pkl", "rb"))
# motif_seq_dict = pkl.load(open("../data/motifDatabase_dict.pkl", "rb"))
dimer_seq_dict = _load_pickle("../data/dimerMotifDatabase_dict.pkl")
dimerfamily_dict = _load_pickle("../data/dimerMotifFamily_dict.pkl")
# monomeric motif PPM
monomeric_PPM_dict = _load_pickle("../data/MonomericMotif_PPM_dict.pkl")

# Load motif generator and evaluator

In [5]:
# Read the checkpoint, build the generator on the GPU in inference mode and
# restore the stored weights (the last expression shows the key-matching report).
checkpoint = torch.load('../model/deeper_uNet_200epochs.checkpoint')
net = deeper_u_net().cuda().eval()
net.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [6]:
# Evaluator used later via `predict_proba` on the 924-feature vectors.
# joblib.load unpickles the file, so only load artefacts from a trusted source.
xgboost_evaluator = joblib.load("../model/XGBoost_nobalanced_bestHyper_924features.joblib")

# Load ELF1-related monomeric motif pairs

In [38]:
# Table of ELF1-related pairs; its 'monomeric pair' and 'family' columns are read in the next cell.
elf1_df = pd.read_csv("./ELF1_related_motifpair.csv")

In [39]:
# Parallel columns: "<monomer1>_<monomer2>" pair names and the family label of each pair.
motif_list = elf1_df['monomeric pair']
family_list = elf1_df['family']

In [15]:
# Per input pair: enumerate candidate placements, synthesise a heterodimer
# motif for each with the generator, and score every candidate with the
# XGBoost evaluator. Results are collected in four parallel lists.
mp_name_list = []
synthesized_motifs = []
motif_dfs = []
input_motifpairs = []
for i in range(len(motif_list)):
    pair_name, pair_family = motif_list[i], family_list[i]
    m1, m2 = pair_name.split("_")
    # A monomer missing from monomeric_PPM_dict raises KeyError and stops the loop.
    m1_ppm, m2_ppm = monomeric_PPM_dict[m1], monomeric_PPM_dict[m2]

    possible_mp_df, possible_mp_code = get_possible_motifpair(pair_name, m1_ppm, m2_ppm)

    # Repeat the family code over the 35 positions and over all candidates,
    # then append it to the motif-pair channels.
    fcode = family_onehot_encode[pair_family][np.newaxis, :]
    fcode = np.repeat(np.repeat(fcode, 35, axis=1), len(possible_mp_code), axis=0)
    possible_mp_code_witFAM = np.concatenate([possible_mp_code, fcode], axis=-1)

    with torch.no_grad():
        X_input = torch.from_numpy(possible_mp_code_witFAM).cuda().float()
        pred_dimer = net(X_input).cpu().detach().numpy()

    # 784 hand-crafted features + the flattened generator output = 924 features.
    possible_mp_784feaures = build_784features(possible_mp_df, possible_mp_code, pair_family)
    flat_pred_dimer = pred_dimer.reshape(len(pred_dimer), -1)
    possible_mp_924feaures = np.concatenate([possible_mp_784feaures, flat_pred_dimer], axis=-1)
    possible_mp_score = xgboost_evaluator.predict_proba(possible_mp_924feaures)
    possible_mp_df['score'] = possible_mp_score[:, 1]

    mp_name_list.append(pair_name)
    input_motifpairs.append(possible_mp_code)
    synthesized_motifs.append(pred_dimer)
    motif_dfs.append(possible_mp_df)
    print(pair_name, 'NumOfCandidates', len(possible_mp_df))

816it [00:09, 88.41it/s]
0it [00:00, ?it/s]

E2F3_ELF1 NumOfCandidates 816


1196it [00:13, 88.37it/s]
0it [00:00, ?it/s]

GCM2_ELF1 NumOfCandidates 1196


960it [00:10, 89.15it/s]
0it [00:00, ?it/s]

RFX3_ELF1 NumOfCandidates 960


1196it [00:13, 88.27it/s]
0it [00:00, ?it/s]

ELK1_ELF1 NumOfCandidates 1196


1150it [00:13, 88.19it/s]
0it [00:00, ?it/s]

ETV7_ELF1 NumOfCandidates 1150


1196it [00:13, 88.11it/s]
0it [00:00, ?it/s]

ARNTL_ELF1 NumOfCandidates 1196


1196it [00:13, 88.76it/s]
0it [00:00, ?it/s]

ERF_ELF1 NumOfCandidates 1196


1008it [00:11, 89.28it/s]
0it [00:00, ?it/s]

CREB3L1_ELF1 NumOfCandidates 1008


1196it [00:13, 87.24it/s]
0it [00:00, ?it/s]

TFAP4_ELF1 NumOfCandidates 1196


1196it [00:13, 87.29it/s]
0it [00:00, ?it/s]

ETV5_ELF1 NumOfCandidates 1196


1008it [00:11, 88.56it/s]
0it [00:00, ?it/s]

FLI1_ELF1 NumOfCandidates 1008


1150it [00:12, 92.72it/s]
0it [00:00, ?it/s]

ETV2_ELF1 NumOfCandidates 1150


1056it [00:11, 94.82it/s]
0it [00:00, ?it/s]

ALX4_ELF1 NumOfCandidates 1056


1196it [00:12, 93.23it/s]
0it [00:00, ?it/s]

HOXA3_ELF1 NumOfCandidates 1196


1196it [00:12, 93.42it/s]
0it [00:00, ?it/s]

EVX1_ELF1 NumOfCandidates 1196


1242it [00:12, 95.58it/s]
0it [00:00, ?it/s]

HOXD12_ELF1 NumOfCandidates 1242


1150it [00:12, 94.31it/s]
0it [00:00, ?it/s]

HOXB13_ELF1 NumOfCandidates 1150


1288it [00:13, 94.51it/s]
0it [00:00, ?it/s]

PITX1_ELF1 NumOfCandidates 1288


1196it [00:12, 95.61it/s]


HOXC10_ELF1 NumOfCandidates 1196


# Collect Top5 heterodimeric motif candidates with a score higher than 0.1

In [35]:
# For every pair keep at most the five best-scoring candidates above 0.1.
syntheiszed_motif_dict = {}
for i, candidates in enumerate(motif_dfs):
    tdf = (
        candidates[candidates['score'] > 0.1]
        .sort_values(by='score', ascending=False)
        .iloc[:5]
    )
    selex_idx = list(tdf.index)
    mp_code = input_motifpairs[i]
    heterdimers = synthesized_motifs[i]
    # Drop positions not covered by either input motif, then transpose so
    # that each row of the stored matrix is one position.
    good_motif = [
        heterdimers[sidx][:, mp_code[sidx].sum(-1) > 0].T
        for sidx in selex_idx
    ]
    print(mp_name_list[i], len(good_motif))
    syntheiszed_motif_dict[mp_name_list[i]] = good_motif

E2F3_ELF1 3
GCM2_ELF1 2
RFX3_ELF1 5
ELK1_ELF1 4
ETV7_ELF1 5
ARNTL_ELF1 4
ERF_ELF1 4
CREB3L1_ELF1 1
TFAP4_ELF1 3
ETV5_ELF1 4
FLI1_ELF1 3
ETV2_ELF1 4
ALX4_ELF1 1
HOXA3_ELF1 2
EVX1_ELF1 4
HOXD12_ELF1 1
HOXB13_ELF1 1
PITX1_ELF1 2
HOXC10_ELF1 1


# Convert the selected motif to MEME format for FIMO 

In [17]:
# Write every selected motif to its own minimal MEME text file:
# <folder>/<pair>_<rank>_motif.meme
meme_format_folder = "./ELF1_related_meme_format"
if not os.path.exists(meme_format_folder):
    os.mkdir(meme_format_folder)

for mp_name, selected_motifs in syntheiszed_motif_dict.items():
    for i, m in enumerate(selected_motifs):
        # File header plus the MOTIF line (rank and pair name).
        name = 'MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies (from unknown source):\nA 0.250 C 0.250 G 0.250 T 0.250\n\nMOTIF ' + str(i) + " " + mp_name + " \n\n"
        name1 = 'letter-probability matrix: alength= 4 w= ' + str(len(m)) + ' nsites= 1 E= 0e+0\n'
        # One text line per position; every value is followed by a single space.
        pfm_str = "".join(
            "".join(str(item) + " " for item in row) + "\n"
            for row in m
        )
        meme_format_motif = name+name1+pfm_str
        with open(meme_format_folder+"/" + mp_name + "_" + str(i) + "_motif.meme", "w") as f:
            f.write(meme_format_motif)

# Apply FIMO to search for motif instances across the whole genome

In [20]:
fimo_res_folder = "./ELF1_fimo"
if not os.path.exists(fimo_res_folder):
    os.mkdir(fimo_res_folder)
fimo_cmds = []
for meme_file in glob.glob("./ELF1_related_meme_format/*meme"):
    mp_name = re.split("/|_motif", meme_file)[-2]
    output_fold = fimo_res_folder + "/" + mp_name + "_fimo"
    cmd = "./meme-5.0.1/src/fimo -oc " + output_fold + " " + meme_file + " " +"./meme-5.3.3/db/hg19.fa"
    fimo_cmds.append(cmd)

In [21]:
# Spot check: show the first generated FIMO command.
fimo_cmds[0]

'./meme-5.0.1/src/fimo -oc ./ELF1_fimo/ELF1_related_meme_format\\ALX4_ELF1_0_fimo ./ELF1_related_meme_format\\ALX4_ELF1_0_motif.meme ./meme-5.3.3/db/hg19.fa'

In [22]:
# Number of FIMO commands, i.e. one per MEME file found.
len(fimo_cmds)

54

In [24]:
# Collect the fimo.tsv result tables, one per sub-folder of ./ELF1_fimo.
elf1_fimo_files = glob.glob("./ELF1_fimo/*/fimo.tsv")

In [27]:
# convert fimo.tsv into bedfile
for f in elf1_fimo_files:
    print(f)
    # `sep` is passed by keyword: recent pandas no longer accepts it positionally.
    # `comment="#"` skips '#'-prefixed lines so they are not parsed as data rows
    # (presumably the source of the mixed-dtype warnings recorded below).
    tdf = pd.read_csv(f, sep="\t", comment="#")
    # last3idx = [len(tdf)-1, len(tdf)-2, len(tdf)-3]
    tdf = tdf.dropna()
    # NOTE(review): FIMO reports 1-based inclusive coordinates while BED is
    # 0-based half-open; `start` is written unchanged - confirm whether a -1
    # shift is wanted.
    sdf = tdf[['sequence_name', 'start', 'stop', 'motif_alt_id', 'score', 'strand']]
    sdf = sdf.astype({'start': 'int32', 'stop': 'int32'})
    save_file = f.replace(".tsv", '.bed')
    sdf.to_csv(save_file, sep='\t', header=False, index=False)

./ELF1_fimo\ALX4_ELF1_0_fimo\fimo.tsv
./ELF1_fimo\ARNTL_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ARNTL_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ARNTL_ELF1_2_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ARNTL_ELF1_3_fimo\fimo.tsv
./ELF1_fimo\CREB3L1_ELF1_0_fimo\fimo.tsv
./ELF1_fimo\E2F3_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\E2F3_ELF1_1_fimo\fimo.tsv
./ELF1_fimo\E2F3_ELF1_2_fimo\fimo.tsv
./ELF1_fimo\ELK1_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ELK1_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ELK1_ELF1_2_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ELK1_ELF1_3_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ERF_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ERF_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ERF_ELF1_2_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ERF_ELF1_3_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ETV2_ELF1_0_fimo\fimo.tsv
./ELF1_fimo\ETV2_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ETV2_ELF1_2_fimo\fimo.tsv
./ELF1_fimo\ETV2_ELF1_3_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ETV5_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ETV5_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ETV5_ELF1_2_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ETV5_ELF1_3_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ETV7_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ETV7_ELF1_1_fimo\fimo.tsv
./ELF1_fimo\ETV7_ELF1_2_fimo\fimo.tsv
./ELF1_fimo\ETV7_ELF1_3_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\ETV7_ELF1_4_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\EVX1_ELF1_0_fimo\fimo.tsv
./ELF1_fimo\EVX1_ELF1_1_fimo\fimo.tsv
./ELF1_fimo\EVX1_ELF1_2_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\EVX1_ELF1_3_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\FLI1_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\FLI1_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\FLI1_ELF1_2_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\GCM2_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\GCM2_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\HOXA3_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\HOXA3_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\HOXB13_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\HOXC10_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\HOXD12_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\PITX1_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\PITX1_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\RFX3_ELF1_0_fimo\fimo.tsv
./ELF1_fimo\RFX3_ELF1_1_fimo\fimo.tsv
./ELF1_fimo\RFX3_ELF1_2_fimo\fimo.tsv
./ELF1_fimo\RFX3_ELF1_3_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\RFX3_ELF1_4_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\TFAP4_ELF1_0_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\TFAP4_ELF1_1_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo\TFAP4_ELF1_2_fimo\fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
# download ENCODE DNase cluster peaks
# wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeRegDnaseClustered/wgEncodeRegDnaseClustered.bed.gz

In [6]:
from pybedtools import BedTool

In [3]:
# convert fimo.tsv into bedfile
def fimo_tsv_to_bed(tsv_path):
    """Write the hits of one ``fimo.tsv`` as a 6-column, header-less BED file.

    The output path is ``tsv_path`` with ``.tsv`` replaced by ``.bed``; it is
    returned. Columns written: sequence_name, start, stop, motif_alt_id,
    score, strand. Rows with any missing value are dropped.
    """
    # `sep` is passed by keyword: recent pandas no longer accepts it positionally.
    # `comment="#"` skips '#'-prefixed lines so they are not parsed as data rows
    # (presumably the source of the mixed-dtype warnings recorded below).
    hits = pd.read_csv(tsv_path, sep="\t", comment="#").dropna()
    # NOTE(review): FIMO reports 1-based inclusive coordinates while BED is
    # 0-based half-open; `start` is written unchanged - confirm whether a -1
    # shift is wanted.
    bed = hits[['sequence_name', 'start', 'stop', 'motif_alt_id', 'score', 'strand']]
    bed = bed.astype({'start': 'int32', 'stop': 'int32'})
    save_file = tsv_path.replace(".tsv", '.bed')
    bed.to_csv(save_file, sep='\t', header=False, index=False)
    return save_file


elf1_fimo_files = glob.glob("./ELF1_fimo/*/fimo.tsv")
for f in elf1_fimo_files:
    print(f)
    fimo_tsv_to_bed(f)

./ELF1_fimo/ETV7_ELF1_3_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/RFX3_ELF1_4_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ARNTL_ELF1_3_fimo/fimo.tsv
./ELF1_fimo/E2F3_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV5_ELF1_2_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV7_ELF1_2_fimo/fimo.tsv
./ELF1_fimo/TFAP4_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ERF_ELF1_3_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/CREB3L1_ELF1_0_fimo/fimo.tsv
./ELF1_fimo/ETV2_ELF1_2_fimo/fimo.tsv
./ELF1_fimo/ELK1_ELF1_3_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV5_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ELK1_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ELK1_ELF1_2_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV7_ELF1_4_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/HOXA3_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/HOXC10_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/RFX3_ELF1_0_fimo/fimo.tsv
./ELF1_fimo/ERF_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/FLI1_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ERF_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/GCM2_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/TFAP4_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ERF_ELF1_2_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV2_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV2_ELF1_3_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/EVX1_ELF1_2_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/PITX1_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/EVX1_ELF1_0_fimo/fimo.tsv
./ELF1_fimo/ETV7_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/HOXD12_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV5_ELF1_3_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ELK1_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/EVX1_ELF1_1_fimo/fimo.tsv
./ELF1_fimo/EVX1_ELF1_3_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV2_ELF1_0_fimo/fimo.tsv
./ELF1_fimo/E2F3_ELF1_1_fimo/fimo.tsv
./ELF1_fimo/E2F3_ELF1_2_fimo/fimo.tsv
./ELF1_fimo/HOXB13_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/FLI1_ELF1_2_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/RFX3_ELF1_3_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ARNTL_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV7_ELF1_1_fimo/fimo.tsv
./ELF1_fimo/TFAP4_ELF1_2_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ETV5_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/RFX3_ELF1_1_fimo/fimo.tsv
./ELF1_fimo/ALX4_ELF1_0_fimo/fimo.tsv
./ELF1_fimo/FLI1_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/HOXA3_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ARNTL_ELF1_2_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/GCM2_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/ARNTL_ELF1_0_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


./ELF1_fimo/RFX3_ELF1_2_fimo/fimo.tsv
./ELF1_fimo/PITX1_ELF1_1_fimo/fimo.tsv


  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# DNase cluster peaks used as the accessibility reference for the intersection below.
# NOTE(review): presumably the decompressed form of the wgEncodeRegDnaseClustered.bed.gz
# download noted earlier in this notebook - confirm.
dnaseClusterBed = BedTool("./wgEncodeRegDnaseClustered.bed")

In [8]:
# Intersect every motif-hit BED file with the DNase clusters and store the
# result next to it as fimo_dnasecluster.bed.
# (The list keeps its historical name although it holds the ELF1 hit files.)
pax5_fimo_bed = glob.glob("./ELF1_fimo/*/fimo.bed")
for f in pax5_fimo_bed:
    print(f)
    save_bed_f = f.replace("fimo.bed", "fimo_dnasecluster.bed")
    # wa/wb: keep both the motif hit and the overlapping cluster record.
    overlaps = BedTool(f).intersect(dnaseClusterBed, wa=True, wb=True)
    overlaps.moveto(save_bed_f)

./ELF1_fimo/ETV7_ELF1_3_fimo/fimo.bed
./ELF1_fimo/RFX3_ELF1_4_fimo/fimo.bed
./ELF1_fimo/ARNTL_ELF1_3_fimo/fimo.bed
./ELF1_fimo/E2F3_ELF1_0_fimo/fimo.bed
./ELF1_fimo/ETV5_ELF1_2_fimo/fimo.bed
./ELF1_fimo/ETV7_ELF1_2_fimo/fimo.bed
./ELF1_fimo/TFAP4_ELF1_0_fimo/fimo.bed
./ELF1_fimo/ERF_ELF1_3_fimo/fimo.bed
./ELF1_fimo/CREB3L1_ELF1_0_fimo/fimo.bed
./ELF1_fimo/ETV2_ELF1_2_fimo/fimo.bed
./ELF1_fimo/ELK1_ELF1_3_fimo/fimo.bed
./ELF1_fimo/ETV5_ELF1_1_fimo/fimo.bed
./ELF1_fimo/ELK1_ELF1_1_fimo/fimo.bed
./ELF1_fimo/ELK1_ELF1_2_fimo/fimo.bed
./ELF1_fimo/ETV7_ELF1_4_fimo/fimo.bed
./ELF1_fimo/HOXA3_ELF1_1_fimo/fimo.bed
./ELF1_fimo/HOXC10_ELF1_0_fimo/fimo.bed
./ELF1_fimo/RFX3_ELF1_0_fimo/fimo.bed
./ELF1_fimo/ERF_ELF1_0_fimo/fimo.bed
./ELF1_fimo/FLI1_ELF1_0_fimo/fimo.bed
./ELF1_fimo/ERF_ELF1_1_fimo/fimo.bed
./ELF1_fimo/GCM2_ELF1_0_fimo/fimo.bed
./ELF1_fimo/TFAP4_ELF1_1_fimo/fimo.bed
./ELF1_fimo/ERF_ELF1_2_fimo/fimo.bed
./ELF1_fimo/ETV2_ELF1_1_fimo/fimo.bed
./ELF1_fimo/ETV2_ELF1_3_fimo/fimo.bed
./ELF1_

In [12]:
ELF1_overfrac = {}
ELF1_fimo_bed = glob.glob("./ELF1_fimo/*/fimo.bed")
for f in ELF1_fimo_bed:
    mp_name = re.split("/|_fimo", f)[-3]
    fbed = pd.read_csv(f, "\t", header=None)
    dnasebed = pd.read_csv(f.replace("fimo.bed", "fimo_dnasecluster.bed"), "\t", header=None)
    print(f, len(dnasebed)/len(fbed))
    ELF1_overfrac[mp_name] = [len(fbed), len(dnasebed), len(dnasebed)/len(fbed)]

./ELF1_fimo/ETV7_ELF1_3_fimo/fimo.bed 0.19321505166699163
./ELF1_fimo/RFX3_ELF1_4_fimo/fimo.bed 0.20863159150560726
./ELF1_fimo/ARNTL_ELF1_3_fimo/fimo.bed 0.24854726886546707
./ELF1_fimo/E2F3_ELF1_0_fimo/fimo.bed 0.14499930578053408
./ELF1_fimo/ETV5_ELF1_2_fimo/fimo.bed 0.25425168921197083
./ELF1_fimo/ETV7_ELF1_2_fimo/fimo.bed 0.2776925953627524
./ELF1_fimo/TFAP4_ELF1_0_fimo/fimo.bed 0.20459850784980851
./ELF1_fimo/ERF_ELF1_3_fimo/fimo.bed 0.16737292911078552
./ELF1_fimo/CREB3L1_ELF1_0_fimo/fimo.bed 0.2024846574269246
./ELF1_fimo/ETV2_ELF1_2_fimo/fimo.bed 0.19954779987245638
./ELF1_fimo/ELK1_ELF1_3_fimo/fimo.bed 0.16513511467716296
./ELF1_fimo/ETV5_ELF1_1_fimo/fimo.bed 0.23817555257102999
./ELF1_fimo/ELK1_ELF1_1_fimo/fimo.bed 0.19821665113084433
./ELF1_fimo/ELK1_ELF1_2_fimo/fimo.bed 0.2238355755886637
./ELF1_fimo/ETV7_ELF1_4_fimo/fimo.bed 0.18022221544529496
./ELF1_fimo/HOXA3_ELF1_1_fimo/fimo.bed 0.19343240907684595
./ELF1_fimo/HOXC10_ELF1_0_fimo/fimo.bed 0.12556504877900987
./ELF1_fim

In [13]:
# Summary table indexed by motif id. It is built in one pass rather than grown
# cell by cell through `.loc`, which also keeps the two count columns integral.
df = pd.DataFrame(
    {
        # "<monomer1>_<monomer2>": the motif id without its trailing rank.
        'heterodimeric motif': ["_".join(motif_id.split("_")[:2]) for motif_id in ELF1_overfrac],
        'geonomewide occurrence': [stats[0] for stats in ELF1_overfrac.values()],
        'DNase overlap peaks count': [stats[1] for stats in ELF1_overfrac.values()],
        'DNase overlap fraction': [stats[-1] for stats in ELF1_overfrac.values()],
    },
    index=pd.Index(list(ELF1_overfrac.keys()), name='motif_id'),
)

In [22]:
# Persist the validation table; the index is written as the first CSV column.
df.to_csv("./ELF1_related_motif_dnase_validation.csv")

In [36]:
# Reload from disk: no index_col is given, so the first CSV column is read
# back as an ordinary column rather than as the index.
df = pd.read_csv("./ELF1_related_motif_dnase_validation.csv")

In [42]:
# Pair name -> family, keeping the first family listed for a pair. The table
# is built once instead of rebuilding two arrays for every row.
family_by_pair = {}
for pair_name, pair_family in zip(motif_list, family_list):
    family_by_pair.setdefault(pair_name, pair_family)
# A pair missing from the table raises KeyError instead of passing silently.
df['family'] = [family_by_pair[pair_name] for pair_name in df['heterodimeric motif']]
# NOTE(review): 'family' is added to df but is not among the columns selected below.
sorted_ELF1_dnase = df.sort_values(by='DNase overlap fraction', ascending=False)[['heterodimeric motif', 'motif_id', 'geonomewide occurrence', 'DNase overlap peaks count', 'DNase overlap fraction']]

In [57]:
# Save the table ordered by DNase overlap fraction (the row index is written as the first column).
sorted_ELF1_dnase.to_csv("ELF1_related_motif_dnase_validation_sorted.csv")

In [56]:
# Bar chart of the DNase overlap fraction per synthesized motif, highest first.
sns.set_style('darkgrid')
# sns.set_palette("RdBu")
plt.figure(figsize = (12, 5))
bar_positions = np.arange(len(sorted_ELF1_dnase))
# x / y are passed by keyword: current seaborn does not accept them positionally.
bar = sns.barplot(x=bar_positions, y=list(sorted_ELF1_dnase['DNase overlap fraction']), palette='GnBu_d')
plt.xticks(bar_positions, list(sorted_ELF1_dnase['motif_id']), rotation = 90)
plt.ylim(0, 0.35)
# NOTE(review): the 0.1 "Background" reference level is hard-coded; its
# derivation is not shown in this notebook.
bar.axhline(0.1, color='red',  linestyle="-")
plt.xlabel('ELF1-related heterodimeric motifs')
plt.ylabel('DNase Cluster Peaks Overlapping Fraction')
plt.text(0, 0.085, 'Background', color = 'red',  weight='bold')
plt.tight_layout()
plt.savefig('ELF1related_DNase_result.pdf')

<IPython.core.display.Javascript object>

