In [119]:
# File name: genLabeledLoops.py
# Author: Joshua Price
# Date created: 04/03/2018
# Date last modified: 04/11/2018
# Python Version: 2.7

# Purpose: addTadCols and addEPCols annotate known contacts (loops) with TAD labels and enhancer/promoter labels
	# addTadCols: takes df with TAD information (tads) and df with loops information, adds two TAD cols to loops df
	# addEPCols: takes df with regulatory information (regs) and df with loops information, adds two reg cols and an Int_Status col to loops df
	# mergeLabeledLoops: concatenates the TAD-labeled df with the regA/regB cols of the reg-labeled df and returns the single df (saving it to a .csv file is currently commented out)
	# locFilter: method for pbt filtering, returns true if feature encapsulates given bp info
	# approxLocFilter: method for pbt filtering, returns true if feature overlaps with given bp info
	# containsPromoter: method for pbt filtering, returns true if feature is labeled 'feature_type=Promoter' (any other label, e.g. a promoter-proximal one, is not matched)
	# containsEnhancer: method for pbt filtering, returns true if feature is labeled 'feature_type=Enhancer'
	# extractLoopInfo: helper function that extracts useful contact info from df row
	# bedCopyAndLen: helper function to get BedTool length without erasing it

import pybedtools as pbt
import pandas as pd
import numpy as np

def _firstEnclosingTad(tads, chrom, startBp, endBp):
	# helper: looks up the first TAD row on chrom that strictly contains [startBp, endBp]
	# Outputs: (found, name) -- found is False and name is NaN when no TAD contains the contact point
	names = tads.loc[(tads['chr'] == chrom) & (tads['startbp'] < startBp) & (tads['endbp'] > endBp), "name"]
	if len(names) == 0:
		return False, np.nan
	return True, names.iloc[0]

def addTadCols(tads, loops):
	# Purpose: labels both contact points of every loop with the TAD that fully contains them
	# Inputs:
		# tads: pd df with TAD information; the columns 'chr', 'startbp', 'endbp' and 'name' are read
		# loops: pd df with the columns read by extractLoopInfo (chrA, startbpA, endbpA, chrB, startbpB, endbpB)
	# Outputs:
		# loops: the SAME df object (modified in place) with two new columns (tadA, tadB) holding the
		#        name of the first enclosing TAD, or NaN when the contact point is not inside any TAD
	# Notes:
		# extractLoopInfo strips the 'chr' prefix, so tads['chr'] must hold un-prefixed names to match
		# both tadA and tadB use NaN for "no TAD" so that pd.isnull() detects either of them downstream
		# print is called with one pre-formatted argument so the text is identical on Python 2.7 and 3

	tadALabels, tadBLabels = [], []

	# Initialize counters used to produce early report
	numLoops, numIntra, numProblem = 0, 0, 0

	for idx, row in loops.iterrows():
		chrA, startBpA, endBpA, chrB, startBpB, endBpB = extractLoopInfo(row)

		inTadA, tadA = _firstEnclosingTad(tads, chrA, startBpA, endBpA)
		inTadB, tadB = _firstEnclosingTad(tads, chrB, startBpB, endBpB)
		tadALabels.append(tadA)
		tadBLabels.append(tadB)

		# if both contact points in same TAD (and in a TAD at all), note as intra-TAD
		if inTadA and inTadB and tadA == tadB:
			numIntra += 1

		# if either contact point is not within a TAD, note as TADless ('problem') contact
		if not (inTadA and inTadB):
			numProblem += 1

		numLoops += 1
		print("Rows completed:  %d" % numLoops)

	# Labels are collected positionally (iterrows order) and attached once the scan is complete
	loops["tadA"] = tadALabels
	loops["tadB"] = tadBLabels

	# Provide initial report about fraction of intra-TAD contacts and fraction of TADless contacts;
	# both ratios are guarded because their denominators are zero for empty or fully TADless input
	numInTads = numLoops - numProblem
	if numInTads > 0:
		print("Percent intra-TAD:  %s" % (float(numIntra) / numInTads))
	else:
		print("Percent intra-TAD:  n/a (no loop has both contact points inside a TAD)")
	if numLoops > 0:
		print("Percent overlapping with TAD boundary (not counted in inter/intra):  %s" % (float(numProblem) / numLoops))
	else:
		print("Percent overlapping with TAD boundary (not counted in inter/intra):  n/a (no loops)")
	print(loops)
	return loops

def _overlappingRegs(regs, chrom, startBp, endBp):
	# helper: rows of regs on chrom that overlap the contact point [startBp, endBp]
	# Uses the standard interval test (feature starts before the contact ends AND ends after the
	# contact starts), so features sharing a start/end coordinate with the contact are included;
	# features that merely touch the contact end-to-start are not (BED-style half-open intervals)
	return regs[(regs['chr'] == chrom) & (regs['startbp'] < endBp) & (regs['endbp'] > startBp)]

def _anchorRegLabel(regs, chrom, startBp, endBp):
	# helper: returns (promoterPresent, enhancerPresent, label) for one contact point
	# label is 'EP' (both enhancer AND promoter overlap), 'P', 'E', or 'N' (neither)
	featureTypes = _overlappingRegs(regs, chrom, startBp, endBp)['feature_type']
	promoterPresent = bool((featureTypes == 'feature_type=Promoter').any())
	enhancerPresent = bool((featureTypes == 'feature_type=Enhancer').any())

	if promoterPresent and enhancerPresent:
		label = 'EP'
	elif promoterPresent:
		label = 'P'
	elif enhancerPresent:
		label = 'E'
	else:
		label = 'N'
	return promoterPresent, enhancerPresent, label

def _tadMissing(tadLabel):
	# helper: True when a TAD label is null (NaN/None) or the empty-string placeholder
	return bool(pd.isnull(tadLabel)) or tadLabel == ""

def addEPCols(regs, loops):
	# Purpose: labels both contact points of every loop with the regulatory features overlapping them
	#          and classifies the loop by enhancer-promoter status and TAD relationship
	# Inputs:
		# regs: pd df with regulatory information; the columns 'chr', 'startbp', 'endbp' and
		#       'feature_type' are read ('feature_type=Promoter' / 'feature_type=Enhancer' are matched)
		# loops: pd df with the columns read by extractLoopInfo plus 'tadA' and 'tadB'
		#        (both TAD columns are required here; a KeyError is raised if they are absent)
	# Outputs:
		# loops: the SAME df object (modified in place) with three new columns:
		#        regA, regB: 'EP', 'P', 'E' or 'N' for each contact point
		#        Int_Status: one of intraEP, interEP, tadlessEP, intraN, interN, tadlessN
	# Notes:
		# print is called with one pre-formatted argument so the text is identical on Python 2.7 and 3

	regALabels, regBLabels, intStatuses = [], [], []

	for numLoops, (idx, row) in enumerate(loops.iterrows(), 1):
		chrA, startBpA, endBpA, chrB, startBpB, endBpB = extractLoopInfo(row)

		# extractLoopInfo strips 'chr'; regs['chr'] is matched against the 'chr'-prefixed name
		promoterPresentA, enhancerPresentA, regA = _anchorRegLabel(regs, 'chr' + str(chrA), startBpA, endBpA)
		promoterPresentB, enhancerPresentB, regB = _anchorRegLabel(regs, 'chr' + str(chrB), startBpB, endBpB)
		regALabels.append(regA)
		regBLabels.append(regB)

		# note as EP if one contact point contains E/EP and other contains P/EP
		isEP = (enhancerPresentA and promoterPresentB) or (promoterPresentA and enhancerPresentB)

		# TAD relationship: a missing label on either side makes the loop TAD-less; otherwise the
		# loop is intra-TAD when both labels are identical and inter-TAD when they differ
		tadA, tadB = row['tadA'], row['tadB']
		if _tadMissing(tadA) or _tadMissing(tadB):
			tadRelation = "tadless"
		elif tadA == tadB:
			tadRelation = "intra"
		else:
			tadRelation = "inter"

		# "Int_Status" value is the TAD relationship followed by EP, or N for non-EP
		intStatuses.append(tadRelation + ("EP" if isEP else "N"))

		print("Rows completed:  %d" % numLoops)

	# Labels are collected positionally (iterrows order) and attached once the scan is complete
	loops["regA"] = regALabels
	loops["regB"] = regBLabels
	loops["Int_Status"] = intStatuses

	# returned original loops df with cols (regA, regB and Int_Status) added
	return loops

def mergeLabeledLoops(tadLabelsDf, regLabelsDf):
	# Purpose: builds one df holding the TAD-labeled loops plus their regulatory labels
	# Inputs:
		# tadLabelsDf: pd df whose columns are all kept as-is
		# regLabelsDf: pd df from which only the 'regA' and 'regB' columns are taken
	# Outputs:
		# new pd df: tadLabelsDf's columns followed by regA and regB, joined column-wise on the index
	# Note: nothing is written to disk here; the caller decides where the result is saved

	regColA = regLabelsDf['regA']
	regColB = regLabelsDf['regB']
	pieces = [tadLabelsDf, regColA, regColB]
	return pd.concat(pieces, axis=1)

def locFilter(feature, chrom, startBp, endBp):
	# pybedtools filter function: Returns True only if feature is on chrom and strictly
	# encloses the contact (starts before startBp and ends after endBp)
	if str(feature.chrom) != chrom:
		return False
	if not feature.start < startBp:
		return False
	return feature.end > endBp

def approxLocFilter(feature, chrom, startBp, endBp):
	# pybedtools filter function: Returns True if feature overlaps with contact
	# Inputs:
		# feature: object exposing chrom, start and end
		# chrom: chromosome name WITHOUT the 'chr' prefix (the prefix is added here before comparing)
		# startBp, endBp: bounds of the contact point
	# Overlap is the standard interval test: the feature starts before the contact ends and ends
	# after the contact starts. This also accepts features that share a start or end coordinate
	# with the contact (including an exact match), while features that only touch the contact
	# end-to-start are still rejected
	chrom = 'chr' + str(chrom)
	return str(feature.chrom) == chrom and feature.start < endBp and feature.end > startBp

def containsPromoter(feature):
	# pybedtools filter function: Returns True when the feature's name is exactly
	# 'feature_type=Promoter'
	featureLabel = feature.name
	return featureLabel == 'feature_type=Promoter'

def containsEnhancer(feature):
	# pybedtools filter function: Returns True when the feature's name is exactly
	# 'feature_type=Enhancer'
	featureLabel = feature.name
	return featureLabel == 'feature_type=Enhancer'

def _stripChrPrefix(chrom):
	# helper: drops a leading 'chr' when present; names without the prefix are returned unchanged
	# (slicing off three characters unconditionally would turn e.g. '12' into '')
	if chrom.startswith('chr'):
		return chrom[3:]
	return chrom

def extractLoopInfo(row):
	# helper function: extracts bed-formatted information from pandas df row
	# Inputs:
		# row: mapping (e.g. a df row) with keys chrA, startbpA, endbpA, chrB, startbpB, endbpB;
		#      chrA and chrB must be strings
	# Outputs:
		# (chrA, startBpA, endBpA, chrB, startBpB, endBpB) with any 'chr' prefix removed from
		# the two chromosome names; the bp values are passed through untouched
	chrA = _stripChrPrefix(row["chrA"])
	chrB = _stripChrPrefix(row["chrB"])
	return chrA, row["startbpA"], row["endbpA"], chrB, row["startbpB"], row["endbpB"]

def bedCopyAndLen(bTool):
	# helper function: reports a BedTool's length by measuring a saved copy
	# The BedTool is written to 'btool.bed' in the working directory, the copy is measured with
	# len(), and a new BedTool opened on that file is handed back together with the length
	# Outputs: (BedTool backed by 'btool.bed', length of the saved copy)
	savedCopy = bTool.saveas('btool.bed')
	featureCount = len(savedCopy)
	reloaded = pbt.BedTool('btool.bed')
	return reloaded, featureCount

In [107]:
if __name__ == "__main__":
    
	# Use CH12 TADs BED file for now
	tadFile = '/data2/josh/expCH12/CH12_lieberman_intra_5kb_domains.bed'
	# NOTE(review): no header argument is passed, so pandas consumes the first line of the
	# file as a header row before the names below replace it -- confirm the file really
	# starts with a header line (the loops file is read with header=None instead)
	tads = pd.read_table(tadFile)
	# Nine column names are assigned; addTadCols reads 'chr', 'startbp', 'endbp' and 'name'
	tads.columns = ["chr", "startbp", "endbp", "name", "score", "dir", "startbpB", "endbpB", 'rgb']

In [108]:
	# Loops file: tab-separated and read with header=None, so every line is kept as data
	loopFile = '/data2/josh/expCH12/lieberman_loops_CH12.txt'
	loops = pd.read_csv(loopFile, sep="\t", header=None)
	# All of these except "score" are the keys extractLoopInfo reads from each row
	loops.columns = ["chrA", "startbpA", "endbpA", "chrB", "startbpB", "endbpB", "score"]

In [117]:
	# addTadCols adds tadA/tadB to `loops` itself and returns that same df, so
	# tadLabeledLoops and loops refer to one object from here on
	tadLabeledLoops = addTadCols(tads, loops)
	# Written tab-separated (sep='\t') despite the .csv extension, with index and header
	tadLabeledLoops.to_csv('/data2/josh/expCH12/tad_labeled_loops.csv', index=True, sep='\t', header=True)

Rows completed:  1
Rows completed:  2
Rows completed:  3
Rows completed:  4
Rows completed:  5
Rows completed:  6
Rows completed:  7
Rows completed:  8
Rows completed:  9
Rows completed:  10
Rows completed:  11
Rows completed:  12
Rows completed:  13
Rows completed:  14
Rows completed:  15
Rows completed:  16
Rows completed:  17
Rows completed:  18
Rows completed:  19
Rows completed:  20
Rows completed:  21
Rows completed:  22
Rows completed:  23
Rows completed:  24
Rows completed:  25
Rows completed:  26
Rows completed:  27
Rows completed:  28
Rows completed:  29
Rows completed:  30
Rows completed:  31
Rows completed:  32
Rows completed:  33
Rows completed:  34
Rows completed:  35
Rows completed:  36
Rows completed:  37
Rows completed:  38
Rows completed:  39
Rows completed:  40
Rows completed:  41
Rows completed:  42
Rows completed:  43
Rows completed:  44
Rows completed:  45
Rows completed:  46
Rows completed:  47
Rows completed:  48
Rows completed:  49
Rows completed:  50
Rows comp

In [120]:
	# Regulatory feature file used for the enhancer/promoter labels
	regFile = '/data2/josh/expCH12/mm9_regulatory_converted.bed'
	# NOTE(review): no header argument is passed, so pandas consumes the first line of the
	# file as a header row before the names below replace it -- confirm that is intended
	regs = pd.read_table(regFile)
	# addEPCols reads 'chr', 'startbp', 'endbp' and 'feature_type'
	regs.columns = ["chr", "startbp", "endbp", "feature_type", "extra1", "extra2"]
	# Return value is not captured: addEPCols writes regA/regB/Int_Status into the df it is given
	addEPCols(regs, tadLabeledLoops)
    

has TADs
intraN
Rows completed:  1
has TADs
intraN
Rows completed:  2
is Tadless
tadlessEP
Rows completed:  3
is Tadless
tadlessN
Rows completed:  4
has TADs
intraEP
Rows completed:  5
has TADs
intraEP
Rows completed:  6
has TADs
interEP
Rows completed:  7
is Tadless
tadlessN
Rows completed:  8
has TADs
intraEP
Rows completed:  9
has TADs
interEP
Rows completed:  10
has TADs
interN
Rows completed:  11
has TADs
interEP
Rows completed:  12
has TADs
intraEP
Rows completed:  13
has TADs
intraEP
Rows completed:  14
has TADs
intraEP
Rows completed:  15
has TADs
intraEP
Rows completed:  16
has TADs
intraN
Rows completed:  17
has TADs
intraEP
Rows completed:  18
has TADs
intraN
Rows completed:  19
has TADs
intraEP
Rows completed:  20
has TADs
intraEP
Rows completed:  21
is Tadless
tadlessN
Rows completed:  22
has TADs
interEP
Rows completed:  23
has TADs
intraEP
Rows completed:  24
has TADs
interN
Rows completed:  25
has TADs
interEP
Rows completed:  26
has TADs
intraEP
Rows completed:  27
has

KeyboardInterrupt: 

In [None]:
	# `loops` is the df that addTadCols/addEPCols annotate in place, so this saves the loop
	# columns plus whatever label columns had been added by the time this cell runs
	# (tab-separated despite the .csv extension, with index and header)
	loops.to_csv('/data2/josh/expCH12/labeled_loops.csv', index=True, sep='\t', header=True)