# RoadmapEpigenomics

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os, re
import glob
from tqdm.auto import tqdm
import subprocess, tempfile

import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Single track per modification per cell.
# Cell types / tissues considered in this notebook.
CELLS = [
    "IMR90",
    "H1_Derived_Neuronal_Progenitor_Cultured_Cells",
    "iPS_DF_6.9",
    "Small_Intestine",
    "H1",
    "Pancreas",
    "iPS_DF_19.11",
    "Esophagus",
    "H1_Derived_Mesenchymal_Stem_Cells",
    "Sigmoid_Colon",
    "Spleen",
]

# Histone modifications considered in this notebook.
MODIFICATIONS = [
    "H3K27me3",
    "H3K27ac",
    "H3K4me3",
    "H3K4me1",
    "H3K36me3",
    "H3K9me3",
]


# Overlapping

In [None]:
from pathlib import Path
import bed_metrics as bm

def overlap_with_name_level(overlaps, m, l):
    """Return one overlap table in long format, tagged with its key.

    Parameters:
        overlaps: mapping from ``(modification, level)`` to a DataFrame.
        m: modification part of the key; written to the ``modification`` column.
        l: level part of the key; written to the ``level`` column.

    Returns:
        The melted table (values in the ``overlap`` column) with the two
        constant columns ``modification`` and ``level`` appended.
    """
    long_table = overlaps[(m, l)].melt(value_name='overlap')
    return long_table.assign(modification=m, level=l)

def show_overlap(df):
    """Plot mean and standard deviation of the overlap metric per modification.

    For every (modification, level) pair found in ``df`` the files listed in
    the ``file`` column are passed to ``bm.load_or_build_metrics_table`` (with a
    table path under /tmp), the resulting tables are melted and pooled, and one
    "mean" and one dotted "std" trace per level is drawn against modification.

    Parameters:
        df: DataFrame with at least ``modification``, ``level`` and ``file`` columns.
    """
    levels = sorted(set(df['level']))
    overlaps = {}
    for m in set(df['modification']):
        for l in levels:
            print('Processing', m, l)
            selected = df.loc[(df['modification'] == m) & (df['level'] == l), 'file']
            paths = [Path(f) for f in selected]
            table_path = Path(f'/tmp/overlap_{m}_{l}.tsv')
            overlaps[(m, l)] = bm.load_or_build_metrics_table(paths, paths, table_path, jaccard=False)

    dfo = pd.concat([overlap_with_name_level(overlaps, m, l) for (m, l) in overlaps])
    grouped = dfo.groupby(['modification', 'level'])['overlap']
    dfo_mean = grouped.mean().reset_index().sort_values(by=['modification'])
    # A group with a single value has an undefined std; it is shown as 0.
    dfo_std = grouped.std().reset_index().fillna(0).sort_values(by=['modification'])

    fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Overlap")))
    for l in levels:
        mean_l = dfo_mean.loc[dfo_mean['level'] == l]
        std_l = dfo_std.loc[dfo_std['level'] == l]
        fig.add_trace(go.Scatter(x=mean_l['modification'], y=mean_l['overlap'],
                                 name=f"{l} mean", line_shape='linear'))
        fig.add_trace(go.Scatter(x=std_l['modification'], y=std_l['overlap'],
                                 name=f"{l} std", line_shape='linear',
                                 line=dict(dash='dot')))
    fig.show()

# MACS2

In [None]:
# glob.glob() does not expand '~', so the home directory is resolved explicitly;
# otherwise the patterns below match nothing.
MACS2_FOLDER1 = os.path.expanduser('~/data/2020_roadmapepigenomics/bed_macs2_q0.05')
MACS2_FOLDER2 = os.path.expanduser('~/data/2020_roadmapepigenomics/bed_macs2_broad_0.1')
MACS2_LEVELS = ['q0.05', 'broad_0.1']

macs2_files = glob.glob(MACS2_FOLDER1 + '/*.narrowPeak') + glob.glob(MACS2_FOLDER2 + '/*.broadPeak')
macs2_records = []
for file in tqdm(macs2_files):
    # Level, modification and cell are recognised by substring match on the path.
    level = next((l for l in MACS2_LEVELS if f'_{l}' in file), None)
    modification = next((m for m in MODIFICATIONS if f'.{m}.' in file), None)
    cell = next((c for c in CELLS if f'.{c}.' in file), None)
    if level and modification and cell:
        # Sample id: file name up to the first underscore.
        gsm = re.sub('_.*', '', os.path.basename(file))
        # awk counts the lines (N) and sums column 3 minus column 2 (L). It is run
        # without a shell, so characters in the path cannot alter the command.
        out = subprocess.check_output(
            ['awk', '{N+=1;L+=($3-$2)} END{print(N,L)}', file],
            universal_newlines=True,
        ).strip()
        if out != '':
            peaks, length = out.split(' ')
        else:
            # Empty file: awk prints only the field separator.
            peaks, length = 0, 0
        macs2_records.append((gsm, cell, modification, f'macs2 {level}', file, peaks, length))

# Build the frame once instead of appending row by row.
dfm = pd.DataFrame(macs2_records,
                   columns=['gsm', 'cell', 'modification', 'level', 'file', 'peaks', 'length'])

# Fix types
dfm['peaks'] = dfm['peaks'].astype(int)
dfm['length'] = dfm['length'].astype(int)

In [None]:
# Number of MACS2 peak files that were recognised and loaded.
dfm.shape[0]

In [None]:
# Mean (solid) and standard deviation (dotted) of the number of peaks,
# per modification and MACS2 level.
dfm['f'] = dfm['modification']
macs2_peaks = dfm.groupby(['f', 'level'])['peaks']
dfm_mean = macs2_peaks.mean().reset_index().sort_values(by=['f', 'level'])
dfm_std = macs2_peaks.std().reset_index().fillna(0).sort_values(by=['f', 'level'])

fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
for l in sorted(set(dfm_mean['level'])):
    mean_l = dfm_mean.loc[dfm_mean['level'] == l]
    std_l = dfm_std.loc[dfm_std['level'] == l]
    fig.add_trace(go.Scatter(x=mean_l['f'], y=mean_l['peaks'],
                             name=f"{l} mean", line_shape='linear'))
    fig.add_trace(go.Scatter(x=std_l['f'], y=std_l['peaks'],
                             name=f"{l} std", line_shape='linear',
                             line=dict(dash='dot')))
fig.show()

In [None]:
# Show me overlaps
# Only rows whose level is in this set are passed to show_overlap().
macs2levels2process = {'macs2 broad_0.1', 'macs2 q0.05'}
show_overlap(dfm.loc[dfm['level'].isin(macs2levels2process)])

# SICER

In [None]:
# glob.glob() does not expand '~', so the home directory is resolved explicitly;
# otherwise the pattern below matches nothing.
SICER_FOLDER = os.path.expanduser('~/data/2020_roadmapepigenomics/bams_sicer')
SICER_LEVELS = ['FDR0.01']

sicer_records = []
for file in tqdm(glob.glob(SICER_FOLDER + '/*-island.bed')):
    # Level, modification and cell are recognised by substring match on the path.
    level = next((l for l in SICER_LEVELS if f'{l}' in file), None)
    modification = next((m for m in MODIFICATIONS if f'.{m}.' in file), None)
    cell = next((c for c in CELLS if f'.{c}.' in file), None)
    if level and modification and cell:
        # Sample id: file name up to the first underscore.
        gsm = re.sub('_.*', '', os.path.basename(file))
        # awk counts the lines (N) and sums column 3 minus column 2 (L). It is run
        # without a shell, so characters in the path cannot alter the command.
        out = subprocess.check_output(
            ['awk', '{N+=1;L+=($3-$2)} END{print(N,L)}', file],
            universal_newlines=True,
        ).strip()
        if out != '':
            peaks, length = out.split(' ')
        else:
            # Empty file: awk prints only the field separator.
            peaks, length = 0, 0
        sicer_records.append((gsm, cell, modification, f'sicer {level}', file, peaks, length))

# Build the frame once instead of appending row by row.
dfsc = pd.DataFrame(sicer_records,
                    columns=['gsm', 'cell', 'modification', 'level', 'file', 'peaks', 'length'])

# Fix types
dfsc['peaks'] = dfsc['peaks'].astype(int)
dfsc['length'] = dfsc['length'].astype(int)

In [None]:
# Number of SICER island files that were recognised and loaded.
dfsc.shape[0]

In [None]:
# Mean (solid) and standard deviation (dotted) of the number of peaks,
# per modification and SICER level.
dfsc['f'] = dfsc['modification']
sicer_peaks = dfsc.groupby(['f', 'level'])['peaks']
dfsc_mean = sicer_peaks.mean().reset_index().sort_values(by=['f', 'level'])
dfsc_std = sicer_peaks.std().reset_index().fillna(0).sort_values(by=['f', 'level'])

fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
for l in sorted(set(dfsc_mean['level'])):
    mean_l = dfsc_mean.loc[dfsc_mean['level'] == l]
    std_l = dfsc_std.loc[dfsc_std['level'] == l]
    fig.add_trace(go.Scatter(x=mean_l['f'], y=mean_l['peaks'],
                             name=f"{l} mean", line_shape='linear'))
    fig.add_trace(go.Scatter(x=std_l['f'], y=std_l['peaks'],
                             name=f"{l} std", line_shape='linear',
                             line=dict(dash='dot')))
fig.show()

In [None]:
# Show me overlaps
# Only rows whose level is in this set are passed to show_overlap().
sicerlevels2process = {'sicer FDR0.01', 'sicer FDR0.1'}
show_overlap(dfsc.loc[dfsc['level'].isin(sicerlevels2process)])

# SPAN

In [None]:
# glob.glob() does not expand '~', so the home directory is resolved explicitly;
# otherwise the pattern below matches nothing.
SPAN_FOLDER = os.path.expanduser('~/data/2020_roadmapepigenomics/bed_span')
GAPS = [0, 5, 10]
FDRS = ['0.1', '0.05', '0.01', '0.001', '0.0001', '1e-05', '1e-06', '1e-07', '1e-08', '1e-10', '1e-15', '1e-20']
# SPAN_LEVELS = ['200_1E-6_5', '200_0.01_5']
# One level label per (gap, fdr) combination: '200_<fdr>_<gap>'.
SPAN_LEVELS = [f'200_{fdr}_{gap}' for gap in GAPS for fdr in FDRS]

span_records = []
for file in tqdm(glob.glob(SPAN_FOLDER + '/*.peak')):
    if 'Input' in file:
        continue
    # Level, modification and cell are recognised by substring match on the path.
    level = next((l for l in SPAN_LEVELS if f'{l}' in file), None)
    modification = next((m for m in MODIFICATIONS if f'.{m}.' in file), None)
    cell = next((c for c in CELLS if f'.{c}.' in file), None)
    if level and modification and cell:
        # Sample id: file name up to the first underscore.
        gsm = re.sub('_.*', '', os.path.basename(file))
        # awk counts the lines (N) and sums column 3 minus column 2 (L). It is run
        # without a shell, so characters in the path cannot alter the command.
        out = subprocess.check_output(
            ['awk', '{N+=1;L+=($3-$2)} END{print(N,L)}', file],
            universal_newlines=True,
        ).strip()
        if out != '':
            peaks, length = out.split(' ')
        else:
            # Empty file: awk prints only the field separator.
            peaks, length = 0, 0
        span_records.append((gsm, cell, modification, f'span {level}', file, peaks, length))

# Build the frame once instead of appending row by row.
dfs = pd.DataFrame(span_records,
                   columns=['gsm', 'cell', 'modification', 'level', 'file', 'peaks', 'length'])

# Fix types
dfs['peaks'] = dfs['peaks'].astype(int)
dfs['length'] = dfs['length'].astype(int)

In [None]:
# Number of SPAN peak files that were recognised and loaded.
dfs.shape[0]

In [None]:
import re
# Split each level label ('span 200_<fdr>_<gap>') into numeric columns:
# 'fdr' is what remains after removing the prefix and the gap suffix,
# 'gap' is the text after the last underscore.
level_labels = list(dfs['level'])
dfs['fdr'] = [float(re.sub('span 200_|_(0|5|10)', '', label)) for label in level_labels]
dfs['gap'] = [int(re.sub('.*_', '', label)) for label in level_labels]
dfs = dfs.sort_values(by=['fdr', 'gap'])
dfs.head()

In [None]:
# dfs['f'] = dfs['modification']
# dfs_mean = dfs.groupby(['f', 'level'])['peaks'].mean().reset_index().sort_values(by=['f', 'level'])
# dfs_std = dfs.groupby(['f', 'level'])['peaks'].std().reset_index().fillna(0).sort_values(by=['f', 'level'])

# fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
# for l in sorted(set(dfs_mean['level'])):
#     fig.add_trace(go.Scatter(x=dfs_mean.loc[dfs_mean['level']==l]['f'], 
#                              y=dfs_mean.loc[dfs_mean['level']==l]['peaks'], 
#                              name=f"{l} mean", line_shape='linear'))
#     fig.add_trace(go.Scatter(x=dfs_std.loc[dfs_std['level']==l]['f'], 
#                              y=dfs_std.loc[dfs_std['level']==l]['peaks'], 
#                              name=f"{l} std", line_shape='linear', 
#                              line=dict(dash='dot')))
# fig.show()

In [None]:
# Plot peaks number versus FDR for different modifications and GAPs:
# one figure per (modification, gap), one line per cell.
all_gaps = sorted(set(dfs['gap']))
for m in sorted(set(dfs['modification'])):
    by_modification = dfs.loc[dfs['modification'] == m]
    for gap in all_gaps:
        by_gap = by_modification.loc[by_modification['gap'] == gap]
        fig = go.Figure()
        for cell in set(by_gap['cell']):
            by_cell = by_gap.loc[by_gap['cell'] == cell]
            fig.add_trace(go.Scatter(x=np.log10(by_cell["fdr"]),
                                     y=by_cell["peaks"],
                                     mode='lines+markers',
                                     name=cell))
        fig.update_xaxes(title='log10 fdr')
        fig.update_yaxes(title=f'{m} gap {gap} peaks')
        fig.show()

In [None]:
# Plot estimated signal to noise ratio for cells and modifications
# The table is read without a header row; column names are supplied here.
SN_COLUMNS = ['file', 'gsm', 'cell', 'modification', 'sn']
sndf = pd.read_csv('~/data/2020_roadmapepigenomics/bed_span/sn.tsv', sep='\t', names=SN_COLUMNS)
sndf.head()

In [None]:
# One line per modification: log10 of the estimated signal-to-noise, cells sorted by name.
fig = go.Figure()
for m in sorted(set(sndf['modification'])):
    sn_by_cell = sndf.loc[sndf['modification'] == m].sort_values(by=['cell'])
    fig.add_trace(go.Scatter(x=sn_by_cell['cell'],
                             y=np.log10(sn_by_cell['sn']),
                             mode='lines+markers',
                             name=m))
fig.update_xaxes(title='cell')
fig.update_yaxes(title='Signal to noise estimation log10')
fig.show()

# Coverage analysis

In [None]:
from io import StringIO

# Collect the *.info reports. From each report this reads the 'Signal to noise:'
# value, the 'Total Score:' value and the tab-separated table that follows the
# 'Total Score:' line. Reports containing 'Error' are skipped.
ts = []
for info_path in tqdm(glob.glob('/mnt/stripe/bio/experiments/span_peak_calling_contrast/*.info')):
    name = os.path.basename(info_path)
    gsm = re.findall('(GSM[0-9]+)', name)[0]
    with open(info_path) as info_file:
        info = info_file.read()
    if 'Error' in info:
        continue
    # Raw strings keep the regex escapes valid on current Python versions.
    sn = float(re.findall(r'Signal to noise: ([0-9.]+)', info)[0])
    totalscore = int(re.findall(r'Total Score: ([0-9]+)', info)[0])
    # Drop everything up to and including the (last) 'Total Score:' line.
    table_text = re.sub(r'.+Total Score:[^\n]+\n', '', info, flags=re.DOTALL)
    table = pd.read_csv(StringIO(table_text), sep='\t')
    table['GSM'] = gsm
    table['SNR'] = sn
    table['TOTAL_SCORE'] = totalscore
    ts.append(table)
infodf = pd.concat(ts)

# Score per 1000 units of length per million of total score, inside peaks and inside shores.
infodf['PEAKS_RPKM'] = infodf['PEAKS_SCORE'] / (infodf['PEAKS_LENGTH'] / 1000) / (infodf['TOTAL_SCORE'] / 1000000)
infodf['SHORES_RPKM'] = infodf['SHORES_SCORE'] / (infodf['SHORES_LENGTH'] / 1000) / (infodf['TOTAL_SCORE'] / 1000000)
infodf['CONTRAST'] = infodf['PEAKS_RPKM'] / infodf['SHORES_RPKM']
infodf.head()

In [None]:
# Attach cell and modification to every report row via the sample id.
sample_annotation = dfs[['gsm', 'cell', 'modification']].drop_duplicates()
t = infodf.merge(sample_annotation, left_on='GSM', right_on='gsm')
# Cleanup outliers
OUTLIER_CELLS = ['iPS_DF_6.9', 'iPS_DF_19.11', 'Spleen', 'IMR90']
t = t.loc[~t['cell'].isin(OUTLIER_CELLS)]
# `t` stays bound to the same frame as `infodf`; later cells read it under that name.
infodf = t
infodf.head()

In [None]:
import matplotlib

def rgb2hex(r, g, b):
    """Convert three channel values to a ``#rrggbb`` string.

    Each channel is multiplied by 255 and truncated with ``int`` before being
    formatted as two lowercase hex digits.
    """
    channels = [int(value * 255) for value in (r, g, b)]
    return "#{0:02x}{1:02x}{2:02x}".format(*channels)

# One colour per cell. Cells are sorted so the assignment is the same on every run,
# and the colormap comes from pyplot because matplotlib.cm.get_cmap is no longer
# available in recent Matplotlib releases.
cells = sorted(set(infodf['cell']))
cmap = plt.get_cmap('tab20', len(cells))
cell_colors = {cell: rgb2hex(*cmap(i)[:3]) for i, cell in enumerate(cells)}
cell_colors

In [None]:
# Peaks number vs FDR
# One figure per modification; colour encodes the cell, line style the gap.
# Only gaps listed here are drawn (None means a solid line).
gap_dash = {0: 'dash', 5: None, 10: 'dot'}
for m in set(infodf['modification']):
    by_modification = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()

    for g in sorted(set(by_modification['GAP'])):
        if g not in gap_dash:
            continue
        by_gap = by_modification.loc[by_modification['GAP'] == g]
        for c in sorted(set(by_gap['cell'])):
            curve = by_gap.loc[by_gap['cell'] == c].sort_values(by=['FDR'])
            line_style = dict(color=cell_colors[c], width=2)
            if gap_dash[g] is not None:
                line_style['dash'] = gap_dash[g]
            fig.add_trace(go.Scatter(x=np.log10(curve['FDR']),
                                     y=curve['PEAKS_NUMBER'],
                                     mode='lines',
                                     name=f'{c} {g}',
                                     line=line_style))

    fig.update_xaxes(title=f'{m} Log10 FDR')
    fig.update_yaxes(title=f'{m} Peaks')
    fig.show()

In [None]:
infodf['PEAKS_AVG_LENGTH'] = infodf['PEAKS_LENGTH'] / infodf['PEAKS_NUMBER']
# Peaks number vs Peaks average length
# One figure per modification; colour encodes the cell, line style the gap.
# Only gaps listed here are drawn (None means a solid line).
gap_dash = {0: 'dash', 5: None, 10: 'dot'}
for m in set(infodf['modification']):
    by_modification = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()

    for g in sorted(set(by_modification['GAP'])):
        if g not in gap_dash:
            continue
        by_gap = by_modification.loc[by_modification['GAP'] == g]
        for c in sorted(set(by_gap['cell'])):
            curve = by_gap.loc[by_gap['cell'] == c].sort_values(by=['FDR'])
            line_style = dict(color=cell_colors[c], width=2)
            if gap_dash[g] is not None:
                line_style['dash'] = gap_dash[g]
            fig.add_trace(go.Scatter(x=curve['PEAKS_NUMBER'],
                                     y=curve['PEAKS_AVG_LENGTH'],
                                     mode='lines',
                                     name=f'{c} {g}',
                                     line=line_style))

    fig.update_xaxes(title=f'{m} Peaks number')
    fig.update_yaxes(title=f'{m} Peaks average length')
    fig.show()

In [None]:
# # Contrast vs FDR
# for m in set(infodf['modification']):
#     tm = infodf.loc[infodf['modification'] == m]
#     for g in sorted(set(tm['GAP'])):
#         tmg = tm.loc[tm['GAP'] == g]
#         fig = go.Figure()
#         for c in sorted(set(tmg['cell'])):
#             tmgc = tmg.loc[tmg['cell']==c].copy()
#             tmgc.sort_values(by=['FDR'], inplace=True)
#             fig.add_trace(go.Scatter(x=np.log10(tmgc['FDR']), y=tmgc['CONTRAST'], 
#                                      mode='lines+markers',
#                                      name=c))
#         fig.update_xaxes(title=f'{m} Log10 FDR')
#         fig.update_yaxes(title=f'{m} gap {g} Contrast')
#         fig.show()

In [None]:
# Contrast vs Peaks RPKM
# For each modification: a summary table (gap 5, FDR 0.1 and 1e-6), then one
# figure per gap with one line per cell.
summary_columns = ['modification', 'cell', 'GAP', 'FDR', 'SNR', 'PEAKS_NUMBER', 'PEAKS_RPKM', 'CONTRAST']
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    # The mask is built from tm itself, so the cell does not depend on another
    # variable still pointing at the same frame as infodf.
    summary_mask = (tm['GAP'] == 5) & tm['FDR'].isin([0.1, 1e-6])
    display(tm.loc[summary_mask, summary_columns].sort_values(by=['cell']))
    for g in sorted(set(tm['GAP'])):
        tmg = tm.loc[tm['GAP'] == g]
        fig = go.Figure()
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            fig.add_trace(go.Scatter(x=np.log10(tmgc['PEAKS_RPKM']), y=tmgc['CONTRAST'],
                                     mode='lines+markers',
                                     name=c))
        fig.update_xaxes(title=f'{m} Peaks log10 RPKM')
        fig.update_yaxes(title=f'{m} gap {g} Contrast')
        fig.show()

In [None]:
# Contrast vs Peaks RPKM
# For each modification: a summary table (gap 5, FDR 0.1 and 1e-6), then a single
# figure with all gaps; colour encodes the cell, line style the gap.
summary_columns = ['modification', 'cell', 'GAP', 'FDR', 'SNR', 'PEAKS_NUMBER', 'PEAKS_RPKM', 'CONTRAST']
# Only gaps listed here are drawn (None means a solid line).
gap_dash = {0: 'dash', 5: None, 10: 'dot'}
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()
    # The mask is built from tm itself, so the cell does not depend on another
    # variable still pointing at the same frame as infodf.
    summary_mask = (tm['GAP'] == 5) & tm['FDR'].isin([0.1, 1e-6])
    display(tm.loc[summary_mask, summary_columns].sort_values(by=['cell']))

    for g in sorted(set(tm['GAP'])):
        if g not in gap_dash:
            continue
        tmg = tm.loc[tm['GAP'] == g]
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            line_style = dict(color=cell_colors[c], width=2)
            if gap_dash[g] is not None:
                line_style['dash'] = gap_dash[g]
            fig.add_trace(go.Scatter(x=np.log10(tmgc['PEAKS_RPKM']), y=tmgc['CONTRAST'],
                                     mode='lines',
                                     name=f'{c} {g}',
                                     line=line_style))

    fig.update_xaxes(title=f'{m} Peaks log10 RPKM')
    fig.update_yaxes(title=f'{m} Contrast')
    fig.show()

In [None]:
# # Contrast vs Peaks number
# for m in set(infodf['modification']):
#     tm = infodf.loc[infodf['modification'] == m]
#     display(tm.loc[np.logical_and(t['GAP']==5, t['FDR']==0.1)][
#         ['modification', 'cell', 'SNR', 'PEAKS_NUMBER', 'PEAKS_RPKM', 'CONTRAST']
#     ].sort_values(by=['cell']))
#     for g in sorted(set(tm['GAP'])):
#         tmg = tm.loc[tm['GAP'] == g]
#         fig = go.Figure()
#         for c in sorted(set(tmg['cell'])):
#             tmgc = tmg.loc[tmg['cell']==c].copy()
#             tmgc.sort_values(by=['FDR'], inplace=True)
#             fig.add_trace(go.Scatter(x=np.log10(tmgc['PEAKS_NUMBER']), y=tmgc['CONTRAST'], 
#                                      mode='lines+markers',
#                                      name=c))
#         fig.update_xaxes(title=f'{m} Peaks log10 number')
#         fig.update_yaxes(title=f'{m} gap {g} Contrast')
#         fig.show()

In [None]:
# Model signal to noise vs number of peaks
import plotly.express as px
# One scatter plot per selected SPAN level: value from the sn table (x) against
# number of peaks (y), coloured by modification; H3K9me3 rows are left out.
snr_levels = [
    'span 200_0.01_0', 'span 200_0.01_5', 'span 200_0.01_10',
    'span 200_1e-06_0', 'span 200_1e-06_5', 'span 200_1e-06_10',
    'span 200_1e-10_0', 'span 200_1e-10_5', 'span 200_1e-10_10',
    'span 200_0.0001_10',
]
for level in snr_levels:
    t = dfs.loc[dfs['level'] == level].merge(sndf, left_on='gsm', right_on='gsm')
    # Columns present in both tables carry the _x (dfs) / _y (sndf) suffixes.
    t = t.loc[t['modification_x'] != 'H3K9me3']
    fig = px.scatter(t, x='sn', y='peaks', color='modification_x')
    fig.update_yaxes(title=f'peaks {level}')
    fig.show()

In [None]:
# Real signal-to-noise ratio:
#   (score per unit of length inside peaks) / (score per unit of length outside peaks)
# NOTE(review): the total length was written as 3*10e9, which evaluates to 3e10.
# 3e9 is used instead — presumably the intended ~3 * 10^9 bp genome length; confirm.
GENOME_LENGTH = 3e9
signal_density = infodf['PEAKS_SCORE'] / infodf['PEAKS_LENGTH']
noise_density = (infodf['TOTAL_SCORE'] - infodf['PEAKS_SCORE']) / (GENOME_LENGTH - infodf['PEAKS_LENGTH'])
infodf['REAL_SNR'] = signal_density / noise_density
infodf.head()

In [None]:
# FDR vs REAL signal to noise
# One figure per modification with all gaps; colour encodes the cell, line style the gap.
# Only gaps listed here are drawn (None means a solid line).
gap_dash = {0: 'dash', 5: None, 10: 'dot'}
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()

    for g in sorted(set(tm['GAP'])):
        if g not in gap_dash:
            continue
        tmg = tm.loc[tm['GAP'] == g]
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            line_style = dict(color=cell_colors[c], width=1)
            if gap_dash[g] is not None:
                line_style['dash'] = gap_dash[g]
            fig.add_trace(go.Scatter(x=np.log10(tmgc['FDR']), y=np.log10(tmgc['REAL_SNR']),
                                     mode='lines+markers',
                                     name=f'{c} {g}',
                                     line=line_style))

    fig.update_xaxes(title='Log10 FDR')
    # The figure holds every gap, so the title no longer names the last loop value of g.
    fig.update_yaxes(title=f'{m} Log10 SNR')
    fig.show()

In [None]:
# PEAKS NUMBER vs REAL signal to noise
# One figure per modification with all gaps; colour encodes the cell, line style the gap.
# Only gaps listed here are drawn (None means a solid line).
gap_dash = {0: 'dash', 5: None, 10: 'dot'}
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()

    for g in sorted(set(tm['GAP'])):
        if g not in gap_dash:
            continue
        tmg = tm.loc[tm['GAP'] == g]
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            line_style = dict(color=cell_colors[c], width=1)
            if gap_dash[g] is not None:
                line_style['dash'] = gap_dash[g]
            fig.add_trace(go.Scatter(x=np.log10(tmgc['PEAKS_NUMBER']), y=np.log10(tmgc['REAL_SNR']),
                                     mode='lines+markers',
                                     name=f'{c} {g}',
                                     line=line_style))

    fig.update_xaxes(title='Log10 Peaks number')
    # The figure holds every gap, so the title no longer names the last loop value of g.
    fig.update_yaxes(title=f'{m} Log10 SNR')
    fig.show()

In [None]:
# Show me overlaps
# Level labels are built from the FDRS strings, so the 1e-6 level is spelled
# 'span 200_1e-06_5'. The older spellings are kept; they match nothing unless
# such labels are present in dfs.
spanlevels2process = {'span tuned', 'span 200_1E-6_5', 'span 200_1e-06_5', 'span 200_0.01_5'}
show_overlap(dfs.loc[dfs['level'].isin(spanlevels2process)])

# Summary

In [None]:
# Pool the selected levels of all three peak callers into one table.
dfa = pd.concat([
    dfm.loc[dfm['level'].isin(macs2levels2process)],
    dfsc.loc[dfsc['level'].isin(sicerlevels2process)],
    dfs.loc[dfs['level'].isin(spanlevels2process)],
])

In [None]:
# Mean (solid) and standard deviation (dotted) of the number of peaks,
# per modification and level, across all tools.
dfa['f'] = dfa['modification']
all_peaks = dfa.groupby(['f', 'level'])['peaks']
dfa_mean = all_peaks.mean().reset_index().sort_values(by=['f', 'level'])
dfa_std = all_peaks.std().reset_index().fillna(0).sort_values(by=['f', 'level'])

fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
for l in sorted(set(dfa_mean['level'])):
    mean_l = dfa_mean.loc[dfa_mean['level'] == l]
    std_l = dfa_std.loc[dfa_std['level'] == l]
    fig.add_trace(go.Scatter(x=mean_l['f'], y=mean_l['peaks'],
                             name=f"{l} mean", line_shape='linear'))
    fig.add_trace(go.Scatter(x=std_l['f'], y=std_l['peaks'],
                             name=f"{l} std", line_shape='linear',
                             line=dict(dash='dot')))
fig.show()

In [None]:
# Average peak length; non-finite results (e.g. from files with zero peaks) become 0.0.
length_per_peak = dfa['length'] / dfa['peaks']
dfa['avg_length'] = length_per_peak.where(np.isfinite(length_per_peak), 0.0)

In [None]:
# # List file to create session
# for m in MODIFICATIONS:
#     for c in CELLS:
#         bw = glob.glob(f'~/data/2020_roadmapepigenomics/bams_bws/*{c}.{m}.*.bw')[0]
#         print(bw)
#         dfcm = dfa.loc[np.logical_and(dfa['cell']==c, dfa['modification']==m)]
#         for l in sorted(set(dfa['level'])):
#             peaks = list(dfcm.loc[dfcm['level'] == l]['file'])
#             if peaks:
#                 peaks = peaks[0]
#                 print(f'{os.path.dirname(peaks)}/bb/{os.path.basename(peaks)}.bb')

# Groups analysis

In [None]:
def plot_data_cells(df, cid, value, description):
    """Draw one bar + swarm panel per modification, side by side, on a shared y range.

    Parameters:
        df: DataFrame with a ``modification`` column plus the ``cid`` and ``value`` columns.
        cid: name of the categorical column placed on the x axis of every panel.
        value: name of the numeric column to plot.
        description: y-axis label, shown on the first panel only.

    The figure is left as the current matplotlib figure; the caller shows or saves it.
    """
    cids = sorted(set(df[cid]))
    w = len(cids)
    # One block of w grid columns per modification. Sizing the grid from
    # MODIFICATIONS keeps every panel inside the grid.
    total = w * len(MODIFICATIONS)
    plt.figure(figsize=(int(total * .75), 4))
    axs = {}
    for index, m in enumerate(MODIFICATIONS):
        offset = index * w
        data = df.loc[df['modification'] == m].sort_values(by=[cid])
        # Distinct x categories in order of appearance.
        xlabels = list(dict.fromkeys(data[cid]))
        ax = plt.subplot2grid((1, total), (0, offset), colspan=w)

        # NOTE(review): ci= / errwidth= are the legacy seaborn spelling; newer
        # releases expect errorbar= / err_kws= — confirm against the installed version.
        sns.barplot(data=data,
                    x=cid, y=value,
                    ci="sd", capsize=.2, errwidth=2,
                    edgecolor="black",
                    ax=ax)

        sns.swarmplot(data=data,
                      x=cid, y=value,
                      size=1,
                      color="black",
                      alpha=0.5,
                      ax=ax)
        ax.legend().set_visible(False)
        # Remember this panel's y range so all panels can share one range below.
        axs[ax] = plt.ylim()
        if offset > 0:
            ax.get_yaxis().set_ticklabels([])
            ax.set_ylabel('')
        else:
            ax.set_ylabel(description)

        ax.set_xlabel('')
        ax.set_title(m)
        plt.xticks(range(0, len(xlabels)), xlabels, rotation=45)

    ymin = np.min([v[0] for v in axs.values()])
    ymax = np.max([v[1] for v in axs.values()])

    for ax in axs.keys():
        ax.set_ylim(bottom=ymin, top=ymax)
    plt.tight_layout()

In [None]:
# Peaks and average peak length per level, one figure each.
for column, label in (('peaks', 'Peaks'), ('avg_length', 'Average peak length')):
    plot_data_cells(dfa, 'level', column, label)
    plt.show()

In [None]:
# Same two figures, grouped by the 'f' column, which is set to the level here.
dfa['f'] = dfa['level']
for column, label in (('peaks', 'Peaks'), ('avg_length', 'Average peak length')):
    plot_data_cells(dfa, 'f', column, label)
    plt.show()

# Overlap

In [None]:
def compute_overlap(df, modifications=None):
    """Collect pairwise overlap values for every (modification, level) group.

    Parameters:
        df: DataFrame with ``modification``, ``level`` and ``file`` columns.
        modifications: modifications to process. Defaults to the distinct values
            of ``df['modification']`` (sorted), so the function follows whatever
            naming the table uses instead of a fixed list.

    Returns:
        DataFrame with columns ``id`` (``'<row>@<col>'`` of the metrics table),
        ``modification``, ``level`` and ``overlap`` — one row per table cell.
    """
    if modifications is None:
        modifications = sorted(set(df['modification']))
    levels = sorted(set(df['level']))
    rows = []
    for m in modifications:
        for l in levels:
            selected = df.loc[(df['modification'] == m) & (df['level'] == l), 'file']
            paths = [Path(f) for f in selected]
            if not paths:
                # Nothing to compare for this combination.
                continue
            df_path = f'/tmp/overlap_{m}_{l}.tsv'
            mt = bm.load_or_build_metrics_table(paths, paths, Path(df_path),
                                                jaccard=False,
                                                threads=30)
            for row in mt.index:
                for col in mt.columns:
                    rows.append((f'{row}@{col}', m, l, mt.loc[row, col]))
    # Build the frame once instead of appending row by row.
    return pd.DataFrame(rows, columns=['id', 'modification', 'level', 'overlap'])

In [None]:
# Pairwise overlaps for the pooled table, plotted per level.
df_overlap = compute_overlap(dfa)
plot_data_cells(df=df_overlap, cid='level', value='overlap', description='Overlaps')

In [None]:
# Sample table; later code reads its 'Sample Name' and 'M Total seqs' columns.
LIBRARY_SIZES_TSV = '~/data/2020_GSE104284_replicated_k27ac_k4me1_k4me3/samples-filtered.tsv'
library_sizes = pd.read_csv(LIBRARY_SIZES_TSV, sep='\t')
library_sizes

In [None]:
# 'M Total seqs' for every row of dfa, looked up by substring match of the
# sample id in 'Sample Name'; 0 when no sample matches.
sample_names = library_sizes['Sample Name'].astype(str)
mseqs = np.zeros(len(dfa))
for i, gsm in enumerate(tqdm(dfa['gsm'])):
    matches = library_sizes.loc[sample_names.str.contains(gsm, regex=False), 'M Total seqs']
    if len(matches) == 0:
        print(f'Nothing found for {gsm}')
        mseqs[i] = 0
        continue
    if len(matches) > 1:
        # A Series with several values cannot be stored in a single array slot.
        print(f'Several samples match {gsm}, using the first one')
    # Store a scalar, not a Series.
    mseqs[i] = matches.iloc[0]

In [None]:
# Peaks and library size for the selected level of each tool.
LEVELS_TO_COMPARE = ['span tuned', 'macs2 broad_0.1', 'sicer FDR0.01']
t = dfa[['level', 'peaks', 'modification']].assign(mseqs=mseqs)
t = t.loc[t['level'].isin(LEVELS_TO_COMPARE)]
t

In [None]:
# One scatter plot per modification: library size vs number of peaks, coloured by level.
for m in set(t['modification']):
    rows_for_modification = t.loc[t['modification'] == m]
    plt.figure(figsize=(10, 10))
    sns.scatterplot(data=rows_for_modification, x='mseqs', y='peaks', hue='level', s=100)
    plt.suptitle(m)
    plt.show()

In [None]:
from matplotlib.patches import Patch

# One density plot per level, one colormap per modification.
# The palette has one entry per possible modification, and the same
# modification -> colormap mapping feeds both the legend and the plot, so the
# colours agree even when a level lacks some modification.
cmaps = ['Reds', 'Blues', 'Greens', 'Purples', 'Oranges', 'Greys']
kde_modifications = sorted(set(t['modification']))
if len(kde_modifications) > len(cmaps):
    raise ValueError(f'{len(kde_modifications)} modifications but only {len(cmaps)} colormaps')
cmap_by_modification = dict(zip(kde_modifications, cmaps))
# Dropping the trailing 's' of the colormap name gives the matching colour name.
legend_elements = [Patch(facecolor=cmap_by_modification[m][:-1], label=m)
                   for m in kde_modifications]
for l in sorted(set(t['level'])):
    plt.figure(figsize=(5, 5))
    tl = t.loc[t['level'] == l]
    for m in sorted(set(tl['modification'])):
        tlm = tl.loc[tl['modification'] == m]
        # NOTE(review): positional x/y and shade=/shade_lowest= are the legacy seaborn
        # spelling; newer releases expect x=, y=, fill= — confirm against the installed version.
        sns.kdeplot(tlm['mseqs'], tlm['peaks'], cmap=cmap_by_modification[m],
                    shade=True, shade_lowest=False, alpha=0.4)
    axes = plt.gca()
    axes.set_xlim([-5, 50])
    axes.set_ylim([-10000, 70000])
    plt.suptitle(l)
    plt.legend(handles=legend_elements)
    plt.show()

In [None]:
# One density plot per modification, one colormap per level.
# The same level -> colormap mapping feeds both the legend and the plot, so the
# colours agree even when a modification lacks some level.
cmaps = ['Reds', 'Blues', 'Greens', 'Purples', 'Oranges', 'Greys']
kde_levels = sorted(set(t['level']))
if len(kde_levels) > len(cmaps):
    raise ValueError(f'{len(kde_levels)} levels but only {len(cmaps)} colormaps')
cmap_by_level = dict(zip(kde_levels, cmaps))
# Dropping the trailing 's' of the colormap name gives the matching colour name.
legend_elements = [Patch(facecolor=cmap_by_level[l][:-1], label=l)
                   for l in kde_levels]
for m in sorted(set(t['modification'])):
    plt.figure(figsize=(5, 5))
    tm = t.loc[t['modification'] == m]
    for l in sorted(set(tm['level'])):
        tml = tm.loc[tm['level'] == l]
        # NOTE(review): positional x/y and shade=/shade_lowest= are the legacy seaborn
        # spelling; newer releases expect x=, y=, fill= — confirm against the installed version.
        sns.kdeplot(tml['mseqs'], tml['peaks'], cmap=cmap_by_level[l],
                    shade=True, shade_lowest=False, alpha=0.3)
    axes = plt.gca()
    axes.set_xlim([-5, 50])
    axes.set_ylim([-10000, 70000])
    plt.suptitle(m)
    plt.legend(handles=legend_elements)
    plt.show()