# 2020_GSE26320_ENCODE

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import pandas as pd
import seaborn as sns
from IPython.display import display

sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import numpy as np
import os, re
import glob
from tqdm.auto import tqdm

import plotly.graph_objects as go

In [None]:
# Root folder holding the peak-caller outputs; the cells below read the
# `macs2`, `sicer` and `span` sub-folders of it.
# NOTE(review): the notebook title says 2020_GSE26320 while this path says 2022_GSE26320 — confirm which is intended.
PATH = os.path.expanduser('~/data/2022_GSE26320')
# Don't have H1 H3K27ac rep2
# Don't have Huvec H3K4me3 rep1
# Don't have HepG2 H3K4me1 rep2
# CELLS = ['GM12878', 'HMEC', 'HSMM', 'K562', 'NHEK', 'NHLF']  # Ignore H1, Huvec, HepG2
# Cell names are matched against file names as the substring `_<cell>`.
CELLS = ['GM12878']
# Histone modifications, matched against file names as the substring `_<modification>`.
MODIFICATIONS = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3']
# NOTE(review): REPS is not referenced by any cell visible in this notebook.
REPS = ['rep1']

# MACS2

In [None]:
MACS2_FOLDER=f'{PATH}/macs2'
MACS2_LEVELS = ['q0.05', 'broad0.1']

dfm = pd.DataFrame(columns=['gsm', 'cell', 'modification', 'level', 'file', 'peaks', 'length'], dtype=object)
for file in tqdm(glob.glob(MACS2_FOLDER + '/*.*Peak')):
    if 'gapped' in file:
        continue
    level = next((l for l in MACS2_LEVELS if f'_{l}' in file), None)
    modification = next((m for m in MODIFICATIONS if f'_{m}' in file), None)
    cell = next((c for c in CELLS if f'_{c}' in file), None)
#     print(file, level, modification, cell)
    if level and modification and cell:
        gsm = re.sub('_.*', '', os.path.basename(file))
        out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
        if out[0].strip() != '':
            peaks, length = out[0].split(' ') 
        else:
            peaks, length = 0, 0
        dfm.loc[len(dfm)] = (gsm, cell, modification, f'macs2 {level}', file, peaks, length)
        
# Fix types
dfm['peaks'] = dfm['peaks'].astype(int)
dfm['length'] = dfm['length'].astype(int)

In [None]:
len(dfm)

In [None]:
# Mean and standard deviation of the peak count per (modification, level).
dfm['f'] = dfm['modification']
_dfm_peaks = dfm.groupby(['f', 'level'])['peaks']
dfm_mean = _dfm_peaks.mean().reset_index().sort_values(by=['f', 'level'])
# std() is NaN for single-member groups; report those as 0.
dfm_std = _dfm_peaks.std().reset_index().fillna(0).sort_values(by=['f', 'level'])

# fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
# for l in sorted(set(dfm_mean['level'])):
#     fig.add_trace(go.Scatter(x=dfm_mean.loc[dfm_mean['level']==l]['f'], 
#                              y=dfm_mean.loc[dfm_mean['level']==l]['peaks'], 
#                              name=f"{l} mean", line_shape='linear'))
#     fig.add_trace(go.Scatter(x=dfm_std.loc[dfm_std['level']==l]['f'], 
#                              y=dfm_std.loc[dfm_std['level']==l]['peaks'], 
#                              name=f"{l} std", line_shape='linear', 
#                              line=dict(dash='dot')))
# fig.show()

In [None]:
# MACS2 level labels (as stored in dfm['level']) that go into the summary.
macs2levels2process = {'macs2 q0.05', 'macs2 broad0.1'}

# SICER

In [None]:
SICER_FOLDER=f'{PATH}/sicer'
SICER_LEVELS = ['FDR0.01']

dfsc = pd.DataFrame(columns=['gsm', 'cell', 'modification', 'level', 'file', 'peaks', 'length'], dtype=object)
for file in tqdm(glob.glob(SICER_FOLDER + '/*islands-summary*')):
    level = next((l for l in SICER_LEVELS if f'-{l}' in file), None)
    modification = next((m for m in MODIFICATIONS if f'_{m}' in file), None)
    cell = next((c for c in CELLS if f'_{c}' in file), None)
#     print(file, level, modification, cell)
    if level and modification and cell:
        gsm = re.sub('_.*', '', os.path.basename(file))
        out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
        if out[0].strip() != '':
            peaks, length = out[0].split(' ') 
        else:
            peaks, length = 0, 0
        dfsc.loc[len(dfsc)] = (gsm, cell, modification, f'sicer {level}', file, peaks, length)

# Fix types
dfsc['peaks'] = dfsc['peaks'].astype(int)
dfsc['length'] = dfsc['length'].astype(int)

In [None]:
display(dfsc)
len(dfsc)

In [None]:
# Mean and standard deviation of the peak count per (modification, level).
dfsc['f'] = dfsc['modification']
_dfsc_peaks = dfsc.groupby(['f', 'level'])['peaks']
dfsc_mean = _dfsc_peaks.mean().reset_index().sort_values(by=['f', 'level'])
# std() is NaN for single-member groups; report those as 0.
dfsc_std = _dfsc_peaks.std().reset_index().fillna(0).sort_values(by=['f', 'level'])

# fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
# for l in sorted(set(dfsc_mean['level'])):
#     fig.add_trace(go.Scatter(x=dfsc_mean.loc[dfsc_mean['level']==l]['f'], 
#                              y=dfsc_mean.loc[dfsc_mean['level']==l]['peaks'], 
#                              name=f"{l} mean", line_shape='linear'))
#     fig.add_trace(go.Scatter(x=dfsc_std.loc[dfsc_std['level']==l]['f'], 
#                              y=dfsc_std.loc[dfsc_std['level']==l]['peaks'], 
#                              name=f"{l} std", line_shape='linear', 
#                              line=dict(dash='dot')))
# fig.show()

In [None]:
# SICER level labels (as stored in dfsc['level']) that go into the summary.
sicerlevels2process = {'sicer FDR0.01'}

# SPAN

In [None]:
from itertools import product
SPAN_FOLDER=f'{PATH}/span'
GAPS = [3]
FDRS = ['0.05']
# SPAN_LEVELS = ['200_1E-6_5', '200_0.01_5']
SPAN_LEVELS = [f'100_{fdr}_{gap}' for fdr, gap in product(FDRS, GAPS)]

dfs = pd.DataFrame(columns=['gsm', 'cell', 'modification', 'level', 'file', 'peaks', 'length'], dtype=object)
for file in tqdm(glob.glob(SPAN_FOLDER + '/*.peak')):
    if 'Input' in file:
        continue
    level = next((l for l in SPAN_LEVELS if f'_{l}' in file), None)
    modification = next((m for m in MODIFICATIONS if f'_{m}' in file), None)
    cell = next((c for c in CELLS if f'_{c}' in file), None)
#     print(file, level, modification, cell)
    if level and modification and cell:
        gsm = re.sub('_.*', '', os.path.basename(file))
        out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
        if out[0].strip() != '':
            peaks, length = out[0].split(' ') 
        else:
            peaks, length = 0, 0
        dfs.loc[len(dfs)] = (gsm, cell, modification, f'span {level}', file, peaks, length)
        
# Fix types
dfs['peaks'] = dfs['peaks'].astype(int)
dfs['length'] = dfs['length'].astype(int)

In [None]:
display(dfs)
len(dfs)

In [None]:
# Recover FDR and gap from the level label, which has the form
# 'span <bin>_<fdr>_<gap>' (e.g. 'span 100_0.05_3').
# The label is split on its last two underscores instead of being cleaned with a
# regex: the regex used before only stripped gap suffixes 0, 5 and 10, so for
# gap 3 the text '0.05_3' reached float(), which treats '_' as a digit
# separator and silently produced 0.053 instead of 0.05.
_span_level_parts = [level.rsplit('_', 2) for level in dfs['level']]
dfs['fdr'] = [float(parts[1]) for parts in _span_level_parts]
dfs['gap'] = [int(parts[2]) for parts in _span_level_parts]
dfs.sort_values(by=['fdr', 'gap'], inplace=True)
dfs.head()

In [None]:
# Mean and standard deviation of the peak count per (modification, level).
dfs['f'] = dfs['modification']
_dfs_peaks = dfs.groupby(['f', 'level'])['peaks']
dfs_mean = _dfs_peaks.mean().reset_index().sort_values(by=['f', 'level'])
# std() is NaN for single-member groups; report those as 0.
dfs_std = _dfs_peaks.std().reset_index().fillna(0).sort_values(by=['f', 'level'])

# fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
# for l in sorted(set(dfs_mean['level'])):
#     fig.add_trace(go.Scatter(x=dfs_mean.loc[dfs_mean['level']==l]['f'], 
#                              y=dfs_mean.loc[dfs_mean['level']==l]['peaks'], 
#                              name=f"{l} mean", line_shape='linear'))
#     fig.add_trace(go.Scatter(x=dfs_std.loc[dfs_std['level']==l]['f'], 
#                              y=dfs_std.loc[dfs_std['level']==l]['peaks'], 
#                              name=f"{l} std", line_shape='linear', 
#                              line=dict(dash='dot')))
# fig.show()

In [None]:
# One figure per (modification, gap): number of peaks against log10(FDR),
# with one trace per cell.
all_gaps = sorted(set(dfs['gap']))
for m in sorted(set(dfs['modification'])):
    by_modification = dfs[dfs['modification'] == m]
    for gap in all_gaps:
        by_gap = by_modification[by_modification['gap'] == gap]
        fig = go.Figure()
        for cell in set(by_gap['cell']):
            by_cell = by_gap[by_gap['cell'] == cell]
            trace = go.Scatter(x=np.log10(by_cell["fdr"]),
                               y=by_cell["peaks"],
                               mode='lines+markers',
                               name=cell)
            fig.add_trace(trace)
        fig.update_xaxes(title='log10 fdr')
        fig.update_yaxes(title=f'{m} gap {gap} peaks')
        fig.show()

In [None]:
# SPAN level labels (as stored in dfs['level']) that go into the summary.
spanlevels2process = {'span 100_0.05_3'}

# Summary

In [None]:
# Combine the selected levels of all three peak callers into one table.
# ignore_index=True gives the result a fresh 0..N-1 index; without it the rows
# keep the labels of their source frames, so the same label appears several times.
# Columns that exist only in dfs ('fdr', 'gap') are NaN for MACS2/SICER rows.
dfa = pd.concat(
    [dfm.loc[dfm['level'].isin(macs2levels2process)],
     dfsc.loc[dfsc['level'].isin(sicerlevels2process)],
     dfs.loc[dfs['level'].isin(spanlevels2process)]],
    ignore_index=True)

In [None]:
# Mean and standard deviation of the peak count per (modification, level).
dfa['f'] = dfa['modification']
_dfa_peaks = dfa.groupby(['f', 'level'])['peaks']
dfa_mean = _dfa_peaks.mean().reset_index().sort_values(by=['f', 'level'])
# std() is NaN for single-member groups; report those as 0.
dfa_std = _dfa_peaks.std().reset_index().fillna(0).sort_values(by=['f', 'level'])

# Two traces per level: solid line for the mean, dotted line for the std.
fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
for l in sorted(set(dfa_mean['level'])):
    level_mean = dfa_mean.loc[dfa_mean['level'] == l]
    level_std = dfa_std.loc[dfa_std['level'] == l]
    fig.add_trace(go.Scatter(x=level_mean['f'], y=level_mean['peaks'],
                             name=f"{l} mean", line_shape='linear'))
    fig.add_trace(go.Scatter(x=level_std['f'], y=level_std['peaks'],
                             name=f"{l} std", line_shape='linear',
                             line=dict(dash='dot')))
fig.show()

In [None]:
# Average peak length; rows with zero peaks give inf/NaN, which is reported as 0.
_avg_length = dfa['length'] / dfa['peaks']
dfa['avg_length'] = _avg_length.where(np.isfinite(_avg_length), 0.0)

In [None]:
# # List file to create session
# for m in MODIFICATIONS:
#     for c in CELLS:
#         bw = glob.glob(f'/mnt/stripe/shpynov/2020_roadmapepigenomics/bams_bws/*{c}.{m}.*.bw')[0]
#         print(bw)
#         dfcm = dfa.loc[np.logical_and(dfa['cell']==c, dfa['modification']==m)]
#         for l in sorted(set(dfa['level'])):
#             peaks = list(dfcm.loc[dfcm['level'] == l]['file'])
#             if peaks:
#                 peaks = peaks[0]
#                 print(f'{os.path.dirname(peaks)}/bb/{os.path.basename(peaks)}.bb')

# Group analysis

In [None]:
def plot_data_cells(df, cid, value, description):
    """Draw one bar+swarm panel per histone modification on a shared y scale.

    Panels are laid out side by side in a single row, one per entry of the
    global MODIFICATIONS list. Within a panel the x axis holds the distinct
    values of column `cid` and the bars show column `value`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'modification', `cid` and `value`.
    cid : str
        Name of the categorical column shown on the x axis.
    value : str
        Name of the numeric column shown on the y axis.
    description : str
        y-axis label, shown on the left-most panel only.

    The function draws on a new matplotlib figure and returns nothing; the
    caller is expected to call plt.show().
    """
    cids = sorted(set(df[cid]))
    axs = {}
    w = len(cids)
    # Grid width follows the number of modifications instead of a hard-coded 5,
    # so the layout stays valid if MODIFICATIONS is edited.
    total = w * len(MODIFICATIONS)
    plt.figure(figsize=(int(total * .75), 4))
    for panel, m in enumerate(MODIFICATIONS):
        offset = panel * w
        data = df.loc[df['modification'] == m].sort_values(by=[cid])
        # Distinct x categories in order of first appearance.
        xlabels = list(dict.fromkeys(data[cid]))
        ax = plt.subplot2grid((1, total), (0, offset), colspan=w)

        sns.barplot(data=data,
                    x=cid, y=value,
                    capsize=.2, errwidth=2,
                    edgecolor="black",
                    ax=ax)

        sns.swarmplot(data=data,
                      x=cid, y=value,
                      size=1,
                      color="black",
                      alpha=0.5,
                      ax=ax)
        ax.legend().set_visible(False)
        # Remember this panel's y range so all panels can be unified below.
        axs[ax] = ax.get_ylim()
        if offset > 0:
            # Only the left-most panel keeps y tick labels and the axis label.
            ax.get_yaxis().set_ticklabels([])
            ax.set_ylabel('')
        else:
            ax.set_ylabel(description)

        ax.set_xlabel('')
        ax.set_title(m)
        ax.set_xticks(range(len(xlabels)))
        ax.set_xticklabels(xlabels, rotation=45)

    # Apply the widest y range to every panel so bars are comparable.
    ymin = np.min([v[0] for v in axs.values()])
    ymax = np.max([v[1] for v in axs.values()])
    for ax in axs:
        ax.set_ylim(bottom=ymin, top=ymax)
    plt.tight_layout()

In [None]:
# Peak count and average peak length per level, one figure each.
for value, description in (('peaks', 'Peaks'), ('avg_length', 'Average peak length')):
    plot_data_cells(dfa, 'level', value, description)
    plt.show()

In [None]:
# Same two figures, grouped by the helper column 'f' (set to the level here).
dfa['f'] = dfa['level']
for value, description in (('peaks', 'Peaks'), ('avg_length', 'Average peak length')):
    plot_data_cells(dfa, 'f', value, description)
    plt.show()

# Consistency analysis

In [None]:
def bedl(file):
    """Return the length (column 3 minus column 2) of every row of a tab-separated file.

    Returns a pandas Series of lengths, or an empty list when the file has no
    content. Only the empty-file case is handled; any other problem (missing
    file, malformed rows) raises instead of being reported as "no peaks".
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return []  # Empty file
    return tf[2] - tf[1]

def lines(file):
    """Return the number of rows in a tab-separated file, 0 when it has no content.

    Only the empty-file case is handled; any other problem (for example a
    missing file) raises instead of being reported as zero rows.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return 0  # Empty file
    return len(tf)

def join(file):
    """Return the rows of a tab-separated file as 'chr:start-end' items joined by commas.

    The first three columns are used. Returns '' when the file has no content.
    Only the empty-file case is handled; any other problem raises.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return ''  # Empty file
    return ','.join(f'{c}:{s}-{e}' for c, s, e in zip(tf[0], tf[1], tf[2]))

def rank_correlation(file):
    """Return the Spearman correlation between columns 3 and 7 (0-based) of a tab-separated file.

    Returns 0 when the file has no content. `spearmanr` (scipy.stats) must be
    imported before this function is called. Only the empty-file case is
    handled: a missing import, a missing file or a file with too few columns
    raises instead of being silently reported as a correlation of 0.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return 0  # Empty file
    return spearmanr(tf[3], tf[7])[0]
    
def d(a, b):
    """Divide a by b, returning 0 when b is zero."""
    if b == 0:
        return 0
    return a / b


In [None]:
import tempfile
from scipy.stats import spearmanr

# Folder for the 4-column BED files derived from the peak files.
BED4_PATH = f'{PATH}/bed4'
# Created from Python rather than through `! mkdir -p`, so a path containing
# spaces or shell metacharacters is handled correctly.
os.makedirs(BED4_PATH, exist_ok=True)

# One row per (sample, tool pair, top-N setting); filled by the comparison loop.
# Columns with an 'l' suffix hold summed lengths, the others hold row counts.
tools_comparison = pd.DataFrame(columns=['gsm', 'modification',
                                         'tool1', 'tool2',
                                         'file1', 'file2',
                                         'peaks1', 'peaks2',
                                         'length1', 'length2',
                                         'intersect', 'intersectl',
                                         'union', 'unionl',
                                         'overlap1', 'overlap1l', 'overlap2', 'overlap2l',
                                         'diff1', 'diff1l', 'diff2', 'diff2l',
                                         'diff1_significant', 'diff2_significant',
                                         'ranks_corr', 'n'], dtype=object)
                                      
def _write_temp_script(body):
    """Write `body` to a newly created temporary file and return its path.

    Uses tempfile.mkstemp, which creates the file atomically, instead of the
    deprecated tempfile.mktemp, which only returns a name and leaves a window
    in which another process can create that path first.
    """
    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'w') as f:
        f.write(body)
    return path


# $1, $2: input files; $3: output. Concatenates the first three columns of both
# inputs and sorts by chromosome and start. Overlapping intervals are NOT merged.
tf_union = _write_temp_script("""
        cat $1 > $3.tmp
        cat $2 >> $3.tmp
        cat $3.tmp | awk -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > $3
        rm $3.tmp
    """)

# $1, $2: input files; $3: output. Keeps the 10 rows of $1 with the highest
# 4th column among those that have no overlap in $2.
tf_diff_significant = _write_temp_script("""
        bedtools intersect -a $1 -b $2 -wa -v > $3.tmp
        if [ -s $3.tmp ]; then
            cat $3.tmp | sort -k4,4nr > $3.tmp2;
            head -n 10 $3.tmp2 > $3;
        else
            > $3;
        fi;
        rm $3.tmp*;
    """)

# $1: input; $2: output. Keeps columns 1, 2, 3 and 5, sorted by chromosome and start.
to_bed4 = _write_temp_script("""
        cat $1 | awk -v OFS='\t' '{print $1,$2,$3,$5}' | sort -k1,1 -k2,2n > $2;
    """)

# $1: input; $2: output; $3: number of rows to keep; $4: score column; $5: score multiplier.
# Keeps the $3 rows with the highest (score * multiplier), sorted by chromosome and start.
# Note that the intermediate file $1.tmp is written next to the input file.
to_bed4_top = _write_temp_script("""
        echo "INPUT $1"
        echo "OUTPUT $2"
        echo "N $3"
        echo "COL $4"
        echo "SORT $5"
        cat $1 | awk -v OFS='\t' -v N=$4 -v SORT=$5 '{print $1,$2,$3,$N*SORT}' | sort -k4,4nr > $1.tmp;
        head -n $3 $1.tmp | sort -k1,1 -k2,2n > $2;
        rm $1.tmp;
    """)


def prepare_bed4(file, top=None):
    name = os.path.basename(file)
    file_bed4 = f'{BED4_PATH}/{name}'.replace(' ', '_')
    
    if n is not None:
        file_bed4 += f'_{n}.bed4'
    else:
        file_bed4 += '.bed4'
    
    if not os.path.exists(file_bed4):
        if top is not None:
            score_col, sort = 5, 5
            print(f'bash {to_bed4_top} {file} {file_bed4} {n} {score_col} {sort}')
            !bash {to_bed4_top} {file} {file_bed4} {top} {score_col} {sort}
        else:
            print(f'bash {to_bed4} {file} {file_bed4}')
            !bash {to_bed4} {file} {file_bed4}
    return file_bed4

    
tools = list(sorted(set(dfa['level'])))
for gsm in tqdm(sorted(set(dfa['gsm']))):
    for i in range(len(tools)):
        for j in range(i + 1, len(tools)):
            tool1, tool2 = tools[i], tools[j]
            t1 = dfa.loc[(dfa['gsm'] == gsm) & (dfa['level'] == tool1)]
            t2 = dfa.loc[(dfa['gsm'] == gsm) & (dfa['level'] == tool2)]
            m = t1['modification'].values[0]
            print(gsm, m, tool1, tool2)

            tf = tempfile.mktemp()
            tf2 = tempfile.mktemp()
            for n in [None, 10000, 1000]:
                print(n)
                file1 = t1['file'].values[0]
                file2 = t2['file'].values[0]

                file1, file2 = prepare_bed4(file1, n), prepare_bed4(file2, n)
                peaks1, length1 = lines(file1), sum(bedl(file1))
                peaks2, length2 = lines(file2), sum(bedl(file2))

                !bedtools intersect -b {file1} -a {file2} > {tf}
                intersect, intersectl = lines(tf), sum(bedl(tf))


                !bash {tf_union} {file1} {file2} {tf}
                union, unionl = lines(tf), sum(bedl(tf))

                !bedtools intersect -a {file1} -b {file2} -wa -u > {tf}
                overlap1, overlap1l = lines(tf), sum(bedl(tf))
                !bedtools intersect -b {file1} -a {file2} -wa -u > {tf}
                overlap2, overlap2l = lines(tf), sum(bedl(tf))

                !bedtools intersect -a {file1} -b {file2} -v > {tf}
                diff1, diff1l = lines(tf), sum(bedl(tf))
                !bedtools intersect -b {file1} -a {file2} -v > {tf}
                diff2, diff2l = lines(tf), sum(bedl(tf))

                !bash {tf_diff_significant} {file1} {file2} {tf}
                diff1_significant = join(tf)            
                !bash {tf_diff_significant} {file2} {file1} {tf}
                diff2_significant = join(tf)       
                
                !bedtools intersect -a {file1} -b {file2} -wa -wb > {tf}
                rank_corr = rank_correlation(tf)    
                
                tools_comparison.loc[len(tools_comparison)] = \
                    (gsm, m, 
                     tool1, tool2, 
                     file1, file2, 
                     peaks1, peaks2, 
                     length1, length2, 
                     intersect, intersectl,
                     union, unionl,
                     overlap1, overlap1l, overlap2, overlap2l,
                     diff1, diff1l, diff2, diff2l, 
                     diff1_significant, diff2_significant,
                     rank_corr, n)

print('Done')

In [None]:
# Jaccard index = intersection / union.
# The 'union' / 'unionl' columns come from a plain concatenation of the two
# peak files (no merging), i.e. they hold |A| + |B|. The size of the real union
# is therefore (|A| + |B|) - |A ∩ B|. Dividing by the raw 'union' column, as
# was done before, caps the index at 0.5 even for identical peak sets.
tools_comparison['jaccard'] = [d(i, u - i) for i, u in zip(tools_comparison['intersect'], tools_comparison['union'])]
tools_comparison['jaccardl'] = [d(i, u - i) for i, u in zip(tools_comparison['intersectl'], tools_comparison['unionl'])]
# Share of each tool's peaks that overlap a peak of the other tool.
tools_comparison['overlap1p'] = [d(o, p) for o, p in zip(tools_comparison['overlap1'], tools_comparison['peaks1'])]
tools_comparison['overlap2p'] = [d(o, p) for o, p in zip(tools_comparison['overlap2'], tools_comparison['peaks2'])]

tools_comparison.head()

In [None]:
tools_comparison.to_csv(f'{PATH}/tools_comparison.tsv', sep='\t', index=False)

In [None]:
# tools_comparison[tools_comparison['gsm']=='GSM646316']

## Overlap analysis

In [None]:
# Overlap: one bar chart per modification, grouped by top-N setting.
for m in sorted(set(tools_comparison['modification'])):
    print(m)
    # Missing values become 1000000; for column 'n' that stands for "all peaks".
    tm = (tools_comparison[tools_comparison['modification'] == m]
          .sort_values(by=['modification', 'gsm', 'tool1', 'tool2'])
          .fillna(1000000))
    to = pd.DataFrame(columns=['GSM', 'N', 'Comparison', 'Overlap'], dtype=object)
    for _, row in tm.iterrows():
        gsm, n, t1, t2 = row['gsm'], row['n'], row['tool1'], row['tool2']
        # Each pair contributes two rows, one per direction.
        for first, second, share in ((t1, t2, row['overlap1p']), (t2, t1, row['overlap2p'])):
            to.loc[len(to)] = (gsm, n, f'{first} vs {second}', share)

    fig, ax = plt.subplots(figsize=(18, 5))
    sns.barplot(data=to,
                x='N', y='Overlap', hue='Comparison', capsize=.1, errwidth=2, edgecolor="black",
                ax=ax)
    plt.show()

In [None]:
# Jaccard: one bar chart per modification, grouped by top-N setting.
for m in sorted(set(tools_comparison['modification'])):
    print(m)
    # Missing values become 1000000; for column 'n' that stands for "all peaks".
    tm = (tools_comparison[tools_comparison['modification'] == m]
          .sort_values(by=['modification', 'gsm', 'tool1', 'tool2'])
          .fillna(1000000))
    to = pd.DataFrame(columns=['GSM', 'N', 'Comparison', 'Jaccard'], dtype=object)
    for _, row in tm.iterrows():
        gsm, n, t1, t2 = row['gsm'], row['n'], row['tool1'], row['tool2']
        to.loc[len(to)] = (gsm, n, f'{t1} vs {t2}', row['jaccard'])

    fig, ax = plt.subplots(figsize=(18, 5))
    sns.barplot(data=to,
                x='N', y='Jaccard', hue='Comparison', capsize=.1, errwidth=2, edgecolor="black",
                ax=ax)
    plt.show()

In [None]:
# Jaccard heatmaps: per modification, one tool-vs-tool heatmap for each top-N
# setting, averaged over samples.
for m in sorted(set(tools_comparison['modification'])):
    print(m)
    # Missing values become 1000000; for column 'n' that stands for "all peaks".
    tm = (tools_comparison[tools_comparison['modification'] == m]
          .sort_values(by=['modification', 'gsm', 'tool1', 'tool2'])
          .fillna(1000000))

    # The grid is sized from the data. A fixed 4-column grid with the colorbar
    # tied to the 4th panel left an empty slot and never drew the colorbar when
    # fewer than four top-N settings were present.
    ns = sorted(set(tm['n']))
    fig = plt.figure(figsize=(18, 3))
    for i, n in enumerate(ns):
        ax = plt.subplot(1, len(ns), i + 1)
        tmn = tm[tm['n'] == n]
        ts = []
        for gsm in sorted(set(tmn['gsm'])):
            tmng = tmn[tmn['gsm'] == gsm][['tool1', 'tool2', 'jaccard']]
            ts.append(tmng.pivot(index='tool1', columns='tool2', values='jaccard'))
        # Stack the per-sample matrices and average rows that share a tool1 label.
        ts = pd.concat(ts)
        ts = ts.groupby(ts.index).mean()
        sns.heatmap(ts, ax=ax, vmin=0, vmax=1, annot=True, fmt='.2f',
                    yticklabels=(i == 0),
                    cbar=(i == len(ns) - 1))
        ax.set_xlabel(str(n))

    plt.show()

In [None]:
# Overlap heatmaps: per modification, one tool-vs-tool heatmap for each top-N
# setting, averaged over samples. Cell (row, column) is the share of the row
# tool's peaks that overlap the column tool.
for m in sorted(set(tools_comparison['modification'])):
    print(m)
    # Missing values become 1000000; for column 'n' that stands for "all peaks".
    tm = (tools_comparison[tools_comparison['modification'] == m]
          .sort_values(by=['modification', 'gsm', 'tool1', 'tool2'])
          .fillna(1000000))

    # The grid is sized from the data. A fixed 4-column grid with the colorbar
    # tied to the 4th panel left an empty slot and never drew the colorbar when
    # fewer than four top-N settings were present.
    ns = sorted(set(tm['n']))
    fig = plt.figure(figsize=(18, 3))
    for i, n in enumerate(ns):
        ax = plt.subplot(1, len(ns), i + 1)
        tmn = tm[tm['n'] == n]
        ts = []
        for gsm in sorted(set(tmn['gsm'])):
            sample = tmn[tmn['gsm'] == gsm]
            # tool1 -> tool2 direction
            forward = sample[['tool1', 'tool2', 'overlap1p']].rename(
                columns={'overlap1p': 'overlap'})
            # tool2 -> tool1 direction: swap the tool columns so rows stay "from" and columns "to"
            backward = sample[['tool1', 'tool2', 'overlap2p']].rename(
                columns={'overlap2p': 'overlap', 'tool1': 'tool2', 'tool2': 'tool1'})
            tmng = pd.concat([forward, backward])
            tmngp = tmng.pivot(index='tool1', columns='tool2', values='overlap')
            # Cells without a value (e.g. a tool against itself) are shown as full overlap.
            ts.append(tmngp.fillna(1.))
        # Stack the per-sample matrices and average rows that share a tool1 label.
        ts = pd.concat(ts)
        ts = ts.groupby(ts.index).mean()
        sns.heatmap(ts, ax=ax, vmin=0, vmax=1, annot=True, fmt='.2f',
                    yticklabels=(i == 0),
                    cbar=(i == len(ns) - 1))
        ax.set_xlabel(str(n))

    plt.show()

In [None]:
__BREAK__

# Coverage analysis
```
# Experimental signal-to-noise ratio and peak calling contrast
for M in $(ls {PATH}/span/fit/*.span); do echo $M; java -cp experiments/build/libs/experiments-dev.jar -Dconfig.path=/home/user/.epigenome/config.properties org.jetbrains.bio.experiments.SPANPeakCallingContrastExperiment $M; done
```

In [None]:
from io import StringIO

# Parse the *.info reports of the peak-calling contrast experiment.
# Each report holds a 'Signal to noise' value, a 'Total Score' value and,
# after the 'Total Score' line, a tab-separated table.
# NOTE(review): absolute, machine-specific path — consider deriving it from PATH.
ts = []
for info_path in tqdm(glob.glob('/mnt/stripe/bio/experiments/span_peak_calling_contrast/*.info')):
    name = os.path.basename(info_path)
    gsm = re.findall('(GSM[0-9]+)', name)[0]
    with open(info_path) as info_file:
        info = info_file.read()
    # Reports containing 'Error' are skipped.
    if 'Error' in info:
        continue
    # Raw strings for the patterns: '\.' inside a normal string literal is an
    # invalid escape sequence and triggers a warning on recent Python versions.
    sn = float(re.findall(r'Signal to noise: ([0-9.]+)', info)[0])
    totalscore = int(re.findall(r'Total Score: ([0-9]+)', info)[0])
    # Drop everything up to and including the 'Total Score' line; the rest is the table.
    dft = re.sub('(.|\n)+Total Score:[^\n]+\n', '', info)
    table = pd.read_csv(StringIO(dft), sep='\t')
    table['GSM'] = gsm
    table['SNR'] = sn
    table['TOTAL_SCORE'] = totalscore
    ts.append(table)
t = pd.concat(ts)

# Score per kilobase of region per million of total score, for peaks and shores.
t['PEAKS_RPKM'] = t['PEAKS_SCORE'] / (t['PEAKS_LENGTH'] / 1000) / (t['TOTAL_SCORE'] / 1000000)
t['SHORES_RPKM'] = t['SHORES_SCORE'] / (t['SHORES_LENGTH'] / 1000) / (t['TOTAL_SCORE'] / 1000000)
t['CONTRAST'] = t['PEAKS_RPKM'] / t['SHORES_RPKM']
infodf = t
# Keep only rows with FDR >= 1e-10.
infodf = infodf.loc[infodf['FDR'].astype(float) >= 1e-10]
infodf.head()

In [None]:
# Attach cell and modification to every report row via the sample id.
_span_samples = dfs[['gsm', 'cell', 'modification']].drop_duplicates()
infodf = t = infodf.merge(_span_samples, left_on='GSM', right_on='gsm')
infodf.head()

In [None]:
import matplotlib

def rgb2hex(r, g, b):
    """Convert RGB components given as fractions of 1 to a '#rrggbb' string.

    Each component is multiplied by 255 and truncated to an integer.
    """
    channels = (int(r * 255), int(g * 255), int(b * 255))
    return '#' + ''.join(f'{channel:02x}' for channel in channels)

# Fixed colour per cell line, taken from the 'tab20' colormap.
# Cells are sorted so the same cell gets the same colour on every run
# (iteration order of a set of strings is not stable between sessions).
cells = sorted(set(infodf['cell']))
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
cmap = plt.get_cmap('tab20', len(cells))
cell_colors = {cell: rgb2hex(*cmap(i)[:3]) for i, cell in enumerate(cells)}
cell_colors

In [None]:
# Peaks number vs FDR: one figure per modification, one trace per (gap, cell).
# Colour encodes the cell; line style encodes the gap: 0 dashed, 5 solid
# (no dash given), 10 dotted. Other gap values are not drawn.
_gap_line_style = {0: dict(dash='dash'), 5: dict(), 10: dict(dash='dot')}
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()

    for g in sorted(set(tm['GAP'])):
        if g not in _gap_line_style:
            continue
        tmg = tm.loc[tm['GAP'] == g]
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            fig.add_trace(go.Scatter(
                x=np.log10(tmgc['FDR']), y=tmgc['PEAKS_NUMBER'],
                mode='lines',
                name=f'{c} {g}',
                line=dict(color=cell_colors[c], width=2, **_gap_line_style[g])))

    fig.update_xaxes(title=f'{m} Log10 FDR')
    fig.update_yaxes(title=f'{m} Peaks')
    fig.show()

In [None]:
infodf['PEAKS_AVG_LENGTH'] = infodf['PEAKS_LENGTH'] / infodf['PEAKS_NUMBER']
# Peaks number vs peaks average length: one figure per modification, one trace
# per (gap, cell), points ordered by FDR.
# Colour encodes the cell; line style encodes the gap: 0 dashed, 5 solid
# (no dash given), 10 dotted. Other gap values are not drawn.
_gap_line_style = {0: dict(dash='dash'), 5: dict(), 10: dict(dash='dot')}
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()

    for g in sorted(set(tm['GAP'])):
        if g not in _gap_line_style:
            continue
        tmg = tm.loc[tm['GAP'] == g]
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            fig.add_trace(go.Scatter(
                x=tmgc['PEAKS_NUMBER'], y=tmgc['PEAKS_AVG_LENGTH'],
                mode='lines',
                name=f'{c} {g}',
                line=dict(color=cell_colors[c], width=2, **_gap_line_style[g])))

    fig.update_xaxes(title=f'{m} Peaks number')
    fig.update_yaxes(title=f'{m} Peaks average length')
    fig.show()

In [None]:
# # Contrast vs FDR
# for m in set(infodf['modification']):
#     tm = infodf.loc[infodf['modification'] == m]
#     for g in sorted(set(tm['GAP'])):
#         tmg = tm.loc[tm['GAP'] == g]
#         fig = go.Figure()
#         for c in sorted(set(tmg['cell'])):
#             tmgc = tmg.loc[tmg['cell']==c].copy()
#             tmgc.sort_values(by=['FDR'], inplace=True)
#             fig.add_trace(go.Scatter(x=np.log10(tmgc['FDR']), y=tmgc['CONTRAST'], 
#                                      mode='lines+markers',
#                                      name=c))
#         fig.update_xaxes(title=f'{m} Log10 FDR')
#         fig.update_yaxes(title=f'{m} gap {g} Contrast')
#         fig.show()

In [None]:
# Contrast vs Peaks RPKM: per modification a summary table (gap 5, FDR 0.1 and
# 1e-6), then one figure per gap with one trace per cell.
_summary_columns = ['modification', 'cell', 'GAP', 'FDR', 'SNR', 'PEAKS_NUMBER', 'PEAKS_RPKM', 'CONTRAST']
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    # The row mask is built from `tm` itself. It used to be built from the
    # global `t`, which only worked while `t` happened to be the frame `tm`
    # was sliced from.
    display(tm.loc[(tm['GAP'] == 5) & tm['FDR'].isin([0.1, 1e-6])][
        _summary_columns
    ].sort_values(by=['cell']))
    for g in sorted(set(tm['GAP'])):
        tmg = tm.loc[tm['GAP'] == g]
        fig = go.Figure()
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            fig.add_trace(go.Scatter(x=np.log10(tmgc['PEAKS_RPKM']), y=tmgc['CONTRAST'],
                                     mode='lines+markers',
                                     name=c))
        fig.update_xaxes(title=f'{m} Peaks log10 RPKM')
        fig.update_yaxes(title=f'{m} gap {g} Contrast')
        fig.show()

In [None]:
# Contrast vs Peaks RPKM, all gaps in one figure per modification.
# Colour encodes the cell; line style encodes the gap: 0 dashed, 5 solid
# (no dash given), 10 dotted. Other gap values are not drawn.
_gap_line_style = {0: dict(dash='dash'), 5: dict(), 10: dict(dash='dot')}
_summary_columns = ['modification', 'cell', 'GAP', 'FDR', 'SNR', 'PEAKS_NUMBER', 'PEAKS_RPKM', 'CONTRAST']
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()
    # The row mask is built from `tm` itself. It used to be built from the
    # global `t`, which only worked while `t` happened to be the frame `tm`
    # was sliced from.
    display(tm.loc[(tm['GAP'] == 5) & tm['FDR'].isin([0.1, 1e-6])][
        _summary_columns
    ].sort_values(by=['cell']))

    for g in sorted(set(tm['GAP'])):
        if g not in _gap_line_style:
            continue
        tmg = tm.loc[tm['GAP'] == g]
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            fig.add_trace(go.Scatter(
                x=np.log10(tmgc['PEAKS_RPKM']), y=tmgc['CONTRAST'],
                mode='lines',
                name=f'{c} {g}',
                line=dict(color=cell_colors[c], width=2, **_gap_line_style[g])))

    fig.update_xaxes(title=f'{m} Peaks log10 RPKM')
    fig.update_yaxes(title=f'{m} Contrast')
    fig.show()

In [None]:
# # Contrast vs Peaks number
# for m in set(infodf['modification']):
#     tm = infodf.loc[infodf['modification'] == m]
#     display(tm.loc[np.logical_and(t['GAP']==5, t['FDR']==0.1)][
#         ['modification', 'cell', 'SNR', 'PEAKS_NUMBER', 'PEAKS_RPKM', 'CONTRAST']
#     ].sort_values(by=['cell']))
#     for g in sorted(set(tm['GAP'])):
#         tmg = tm.loc[tm['GAP'] == g]
#         fig = go.Figure()
#         for c in sorted(set(tmg['cell'])):
#             tmgc = tmg.loc[tmg['cell']==c].copy()
#             tmgc.sort_values(by=['FDR'], inplace=True)
#             fig.add_trace(go.Scatter(x=np.log10(tmgc['PEAKS_NUMBER']), y=tmgc['CONTRAST'], 
#                                      mode='lines+markers',
#                                      name=c))
#         fig.update_xaxes(title=f'{m} Peaks log10 number')
#         fig.update_yaxes(title=f'{m} gap {g} Contrast')
#         fig.show()

In [None]:
# Real signal-to-noise ratio: score density inside peaks divided by score
# density outside peaks, where "outside" is the genome minus the peak length.
# The genome length was previously written as 3*10e9, which evaluates to 3e10
# (10e9 is 1e10), ten times larger than a ~3e9 bp genome.
# NOTE(review): assumes a human genome of about 3e9 bp — confirm the assembly.
GENOME_LENGTH = 3e9
infodf['REAL_SNR'] = (infodf['PEAKS_SCORE'] / infodf['PEAKS_LENGTH']) / (infodf['TOTAL_SCORE'] - infodf['PEAKS_SCORE']) * (GENOME_LENGTH - infodf['PEAKS_LENGTH'])
infodf.head()

In [None]:
# FDR vs REAL signal to noise: one figure per modification, one trace per (gap, cell).
# Colour encodes the cell; line style encodes the gap: 0 dashed, 5 solid
# (no dash given), 10 dotted. Other gap values are not drawn.
_gap_line_style = {0: dict(dash='dash'), 5: dict(), 10: dict(dash='dot')}
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()

    for g in sorted(set(tm['GAP'])):
        if g not in _gap_line_style:
            continue
        tmg = tm.loc[tm['GAP'] == g]
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            fig.add_trace(go.Scatter(
                x=np.log10(tmgc['FDR']), y=np.log10(tmgc['REAL_SNR']),
                mode='lines+markers',
                name=f'{c} {g}',
                line=dict(color=cell_colors[c], width=1, **_gap_line_style[g])))

    fig.update_xaxes(title='Log10 FDR')
    # The figure overlays every gap, so the title no longer names a single gap
    # (it used to show whichever gap the loop above visited last).
    fig.update_yaxes(title=f'{m} Log10 SNR')
    fig.show()

In [None]:
# PEAKS NUMBER vs REAL signal to noise: one figure per modification, one trace
# per (gap, cell), points ordered by FDR.
# Colour encodes the cell; line style encodes the gap: 0 dashed, 5 solid
# (no dash given), 10 dotted. Other gap values are not drawn.
_gap_line_style = {0: dict(dash='dash'), 5: dict(), 10: dict(dash='dot')}
for m in set(infodf['modification']):
    tm = infodf.loc[infodf['modification'] == m]
    fig = go.Figure()

    for g in sorted(set(tm['GAP'])):
        if g not in _gap_line_style:
            continue
        tmg = tm.loc[tm['GAP'] == g]
        for c in sorted(set(tmg['cell'])):
            tmgc = tmg.loc[tmg['cell'] == c].sort_values(by=['FDR'])
            fig.add_trace(go.Scatter(
                x=np.log10(tmgc['PEAKS_NUMBER']), y=np.log10(tmgc['REAL_SNR']),
                mode='lines+markers',
                name=f'{c} {g}',
                line=dict(color=cell_colors[c], width=1, **_gap_line_style[g])))

    fig.update_xaxes(title='Log10 Peaks number')
    # The figure overlays every gap, so the title no longer names a single gap
    # (it used to show whichever gap the loop above visited last).
    fig.update_yaxes(title=f'{m} Log10 SNR')
    fig.show()