In [1]:
# Generate distributions of contact (pixel) strength for FOCS-predicted enhancer-promoter interactions (EPIs)
# Author: Joshua Price
# Started: Nov 20, 2018
# Last Edited: Nov 23, 2018
# Runtime: ~20 min on alineos

import math
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import cooler

In [2]:
# Load the FOCS enhancer-promoter (E-P) pair table: one row per predicted
# interaction, promoter coordinates in columns 0-3 and enhancer coordinates
# in columns 5-7 (the file has no header row, so names are assigned here).
ep = pd.read_csv('/data2/josh/ep/fantom_mm10_E-P.txt', sep='\t', header=None, usecols=[0,1,2,3,4,5,6,7])
ep.columns = ['p_chr', 'p_start', 'p_end', 'p_dir', 'contact_id','e_chr','e_start','e_end']

# Midpoint of each element; the distance between midpoints is used as the
# separation of the pair.
# NOTE(review): p_chr and e_chr are not compared here, so for any pair that
# spans two chromosomes 'distance' is not a genomic distance.
ep['p_center'] = (ep['p_start']+ep['p_end'])/2
ep['e_center'] = (ep['e_start']+ep['e_end'])/2
ep['distance'] = abs(ep['p_center'] - ep['e_center'])

# Keep only pairs whose midpoints are more than 5000 bp apart. The explicit
# .copy() makes ep_long an independent frame, so columns added to it later do
# not raise SettingWithCopyWarning or depend on view-vs-copy behaviour.
ep_long = ep[ep['distance'] > 5000].copy()
ep_long.head()

Unnamed: 0,p_chr,p_start,p_end,p_dir,contact_id,e_chr,e_start,e_end,p_center,e_center,distance
3,chr1,9631149,9631173,-,EP00004,chr1,9613909,9614261,9631161.0,9614085.0,17076.0
4,chr1,9748247,9748272,-,EP00005,chr1,9798749,9799747,9748259.5,9799248.0,50988.5
5,chr1,9748247,9748272,-,EP00006,chr1,9396999,9397125,9748259.5,9397062.0,351197.5
6,chr1,9748277,9748320,-,EP00007,chr1,9798749,9799747,9748298.5,9799248.0,50949.5
7,chr1,9748277,9748320,-,EP00008,chr1,9396999,9397125,9748298.5,9397062.0,351236.5


In [2]:
# Open the contact matrix stored as a cooler file.
# NOTE(review): the "res200" in the file name presumably means 200 bp bins,
# which is what the /200 binning further down relies on — confirm.
c = cooler.Cooler('/data2/josh/stan/merge_res200.cool')

In [3]:
# Create a DataFrame of pixels (columns bin1_id, bin2_id, count) from the
# first 100,000,000 rows of the cooler pixel table - takes a while
# (sometimes crashes kernel).
# NOTE(review): the saved output of this cell ends at row index 49999999,
# which looks like it came from a run with a 50,000,000-row slice; re-run
# the cell so the displayed output matches the code.
pix = c.pixels()
p = pix[:100000000]
p.tail()

Unnamed: 0,bin1_id,bin2_id,count
49999995,620743,11784127,1
49999996,620743,12444790,1
49999997,620743,13131240,1
49999998,620744,620745,3
49999999,620744,620746,8


In [5]:
# Sanity check: how many loaded pixel rows have bin 15000 as their first bin.
len(p[p['bin1_id'] == 15000])

29

In [None]:
def to_int(f):
    """Convert a number to ``int``, mapping NaN to 0.

    Parameters
    ----------
    f : float or int
        Value to convert.

    Returns
    -------
    int
        0 when ``f`` is NaN, otherwise ``int(f)`` (truncation toward zero).
    """
    return 0 if math.isnan(f) else int(f)

In [None]:
# Work on an explicit copy: ep_long was produced by boolean-mask filtering,
# and adding columns to such a slice triggers SettingWithCopyWarning.
# Rebinding to a copy also makes this cell safe to re-run.
ep_long = ep_long.copy()

# Map each element's midpoint to a bin index by dividing by 200 and
# truncating (NaN midpoints become bin 0 via to_int).
# NOTE(review): 200 is presumably the bin size of the cooler file — confirm.
# NOTE(review): this is an index counted from the start of the element's own
# chromosome; it equals the cooler's bin id only where that chromosome's bin
# offset is 0 — confirm before using it for anything other than the first
# chromosome in the cooler.
ep_long['p_bin'] = (ep_long['p_center']/200).apply(to_int)
ep_long['e_bin'] = (ep_long['e_center']/200).apply(to_int)

In [None]:
# Restrict to pairs whose promoter and enhancer are both on chr1, then show
# the largest bin index used by either end of any remaining pair.
ep_long1 = ep_long[ep_long['p_chr'].eq('chr1') & ep_long['e_chr'].eq('chr1')]
max(max(ep_long1[col]) for col in ('p_bin', 'e_bin'))

In [None]:
# Preview the last five chr1 pairs.
ep_long1.tail(5)

In [None]:
# Keep only pixels whose second bin id is below 975883.
# NOTE(review): 975883 is hard-coded; presumably it was read off the maximum
# chr1 bin printed earlier — confirm it still matches the data.
p_chr1 = p.loc[p['bin2_id'] < 975883]

In [None]:
def contact_scores(pairs, pixels):
    """Look up the pixel count for each promoter/enhancer bin pair.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Must have 'p_bin' and 'e_bin' columns.
    pixels : pandas.DataFrame
        Must have 'bin1_id', 'bin2_id' and 'count' columns.

    Returns
    -------
    list
        One entry per row of ``pairs``, in row order: the count of the first
        pixel row matching (p_bin, e_bin); failing that, of the first pixel
        row matching (e_bin, p_bin); 0 when neither orientation is present.
    """
    # A pixel can only match if both of its bins occur somewhere in `pairs`,
    # so filter the pixel table once instead of scanning it for every pair.
    wanted_bins = pd.unique(pairs[['p_bin', 'e_bin']].values.ravel())
    candidates = pixels[pixels['bin1_id'].isin(wanted_bins)
                        & pixels['bin2_id'].isin(wanted_bins)]

    # setdefault keeps the first row seen for a (bin1, bin2) key, which is
    # the row a positional "first match" lookup would return.
    count_by_bins = {}
    for bin1, bin2, count in zip(candidates['bin1_id'],
                                 candidates['bin2_id'],
                                 candidates['count']):
        count_by_bins.setdefault((bin1, bin2), count)

    scores = []
    for p_bin, e_bin in zip(pairs['p_bin'], pairs['e_bin']):
        # Try promoter-first orientation, then enhancer-first, then give up.
        score = count_by_bins.get((p_bin, e_bin))
        if score is None:
            score = count_by_bins.get((e_bin, p_bin), 0)
        scores.append(score)
    return scores


# Attach the contact count of every chr1 pair as a 'score' column on
# ep_long1 (the frame the plotting cell reads). assign() returns a new frame,
# so nothing is written onto a slice of ep_long.
ep_long1 = ep_long1.assign(score=contact_scores(ep_long1, p_chr1))

# Small scored preview, kept under its original name.
ep_long_few = ep_long1.head()

In [None]:
# Bar chart of how many chr1 E-P pairs have each contact count ('score').
sns.countplot(data=ep_long1, x='score').set_title('Score Distribution of EPIs')

In [None]:
# Background distribution: contact counts of the first 10,000 rows of p_chr1.
# NOTE(review): this is the leading slice of the table, not a random sample;
# if the pixel rows are ordered by bin1_id (as the tail shown earlier
# suggests) they all come from the start of the table — confirm this is the
# intended comparison set for "all contacts".
plt.title('Score Distribution of all contacts')
sns.countplot(data=p_chr1.iloc[:10000], x='count')