# SPAN noise experiment

Logbook: https://docs.google.com/document/d/10ItWypr53n7GlS-XKvvR7WteSpdjgUdBeMAFBoFj00k/edit#heading=h.15aayc8a5f19


In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os, re
import glob
from tqdm.auto import tqdm
import subprocess, tempfile
from pathlib import Path
import downstream.bed_metrics as bm
import plotly.graph_objects as go
import plotly.express as px

# Simulated peaks

In [None]:
# Root directory of the experiment; the cells below look for peak files under
# <FOLDER>/<modification>/<tool>/.
FOLDER = '/mnt/stripe/shpynov/span-noise-experiment'

# Histone modifications iterated over by the cells below.
MODIFICATIONS = [
    'H3K27ac',
    'H3K27me3',
    'H3K36me3',
    'H3K4me3',
    'H3K4me1',
]

# MACS2

In [None]:
MACS2_LEVELS = ['q1e-10', 'q1e-6', 'broad0.1', 'q0.05', 'q0.1', 'q0.2', 'q0.5']

dfm = pd.DataFrame(columns=['modification', 'alpha', 'replicate', 'level', 'file', 'peaks', 'length'])
for modification in MODIFICATIONS:
    print(modification)
    for file in tqdm(glob.glob(os.path.join(FOLDER, modification, 'macs2', '*.*Peak'))):
        if file.endswith('gappedPeak'):
            continue
        level = next((l for l in MACS2_LEVELS if f'_{l}' in file), None) # 
        if level:
            alpha = re.sub('.*hg19_|_[0-9]_[qb].*', '', os.path.basename(file))
            replicate = re.sub('.*hg19_[0-9\.]+_|_[qb].*', '', os.path.basename(file))
            out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
            if out[0].strip() != '':
                peaks, length = out[0].split(' ') 
            else:
                peaks, length = 0, 0
            dfm.loc[len(dfm)] = (modification, alpha, replicate, f'macs2 {level}', file, peaks, length)
        
# Fix types
dfm['peaks'] = dfm['peaks'].astype(int)
dfm['length'] = dfm['length'].astype(int)
# Sort
dfm.sort_values(by=['modification', 'alpha', 'level', 'replicate'], inplace=True)

In [None]:
# Preview the last rows of the MACS2 table
dfm.tail()

In [None]:
def show(df, exp=False):
    """Plot mean and standard deviation of peak counts per level.

    Rows whose ``alpha`` (converted to float) exceeds 9 are dropped. The
    remaining rows are grouped by ``<modification>_<alpha>`` and ``level``;
    for every level two Plotly traces are drawn over the group labels:
    the mean of ``peaks`` (solid) and its standard deviation (dotted,
    missing values shown as 0).

    :param df: table with 'modification', 'alpha', 'level' and 'peaks' columns
    :param exp: when true, the y axis is switched to a logarithmic scale
    """
    selected = df[df['alpha'].astype(float) <= 9].copy()
    selected['ma'] = selected['modification'] + '_' + selected['alpha']

    keys = ['ma', 'level']
    peaks_by_group = selected.groupby(keys)['peaks']
    means = peaks_by_group.mean().reset_index().sort_values(by=keys)
    stds = peaks_by_group.std().reset_index().fillna(0).sort_values(by=keys)

    fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
    for lvl in sorted(set(means['level'])):
        lvl_means = means[means['level'] == lvl]
        lvl_stds = stds[stds['level'] == lvl]
        fig.add_trace(go.Scatter(
            x=lvl_means['ma'], y=lvl_means['peaks'],
            name=f"{lvl} mean", line_shape='linear',
        ))
        fig.add_trace(go.Scatter(
            x=lvl_stds['ma'], y=lvl_stds['peaks'],
            name=f"{lvl} std", line_shape='linear',
            line=dict(dash='dot'),
        ))
    if exp:
        fig.update_layout(yaxis_type="log")
    fig.show()

In [None]:
# Mean / std of MACS2 peak counts per modification_alpha and level
show(dfm)

# SICER

In [None]:
SICER_LEVELS = ['FDR0.5', 'FDR0.2', 'FDR0.1', 'FDR0.05', 'FDR0.01', 'FDR1e-6', 'FDR1e-10']

dfsc = pd.DataFrame(columns=['modification', 'alpha', 'replicate', 'level', 'file', 'peaks', 'length'])
for modification in MODIFICATIONS:
    print(modification)
    for file in tqdm(glob.glob(os.path.join(FOLDER, modification, 'sicer', '*-FDR*'))):
        level = next((l for l in SICER_LEVELS if f'-{l}' in file), None) # 
        if level:
            alpha = re.sub('.*hg19_|_[0-9]-W.*', '', os.path.basename(file))
            replicate = re.sub('.*hg19_[0-9\.]+_|-W.*', '', os.path.basename(file))
            out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
            if out[0].strip() != '':
                peaks, length = out[0].split(' ') 
            else:
                peaks, length = 0, 0
            dfsc.loc[len(dfsc)] = (modification, alpha, replicate, f'sicer {level}', file, peaks, length)
# Fix types
dfsc['peaks'] = dfsc['peaks'].astype(int)
dfsc['length'] = dfsc['length'].astype(int)
# Sort
dfsc.sort_values(by=['modification', 'alpha', 'level', 'replicate'], inplace=True)

In [None]:
# Mean / std of SICER peak counts per modification_alpha and level
show(dfsc)

# SPAN

In [None]:
SPAN_LEVELS = ['0.5', '0.2', '0.1', '0.05', '0.01', '1E-6', '1E-10', 'tuned']

dfsp = pd.DataFrame(columns=['modification', 'alpha', 'replicate', 'level', 'file', 'peaks', 'length'])
for modification in MODIFICATIONS:
    print(modification)
    for file in tqdm(glob.glob(os.path.join(FOLDER, modification, 'span', '*.peak'))):
        level = next((l for l in SPAN_LEVELS if f'_{l}' in file), None) # 
        if level:
            alpha = re.sub('.*hg19_|_[0-9]_200.*', '', os.path.basename(file))
            replicate = re.sub('.*hg19_[0-9\.]+_|_200.*', '', os.path.basename(file))
            out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
            if out[0].strip() != '':
                peaks, length = out[0].split(' ') 
            else:
                peaks, length = 0, 0
            dfsp.loc[len(dfsp)] = (modification, alpha, replicate, f'span {level}', file, peaks, length)
# Fix types
dfsp['peaks'] = dfsp['peaks'].astype(int)
dfsp['length'] = dfsp['length'].astype(int)
# Sort
dfsp.sort_values(by=['modification', 'alpha', 'level', 'replicate'], inplace=True)

In [None]:
# Preview the last rows of the SPAN table
dfsp.tail()

In [None]:
# Mean / std of SPAN peak counts per modification_alpha and level
show(dfsp)

# SPAN replicated

In [None]:
dfspr = pd.DataFrame(columns=['modification', 'alpha', 'replicate', 'level', 'file', 'peaks', 'length'])
for modification in MODIFICATIONS:
    print(modification)
    for file in tqdm(glob.glob(os.path.join(FOLDER, modification, 'span_rep', '*.peak'))):
        level = next((l for l in SPAN_LEVELS if f'_{l}' in file), None) # 
        if level:
            alpha = re.sub('.*hg19_|_200.*', '', os.path.basename(file))
            out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
            if out[0].strip() != '':
                peaks, length = out[0].split(' ') 
            else:
                peaks, length = 0, 0
            dfspr.loc[len(dfspr)] = (modification, alpha, 'None', f'span rep {level}', file, peaks, length)
# Fix types
dfspr['peaks'] = dfspr['peaks'].astype(int)
dfspr['length'] = dfspr['length'].astype(int)
# Sort
dfspr.sort_values(by=['modification', 'alpha', 'level'], inplace=True)

In [None]:
# Mean / std of replicated-SPAN peak counts per modification_alpha and level
show(dfspr)

# Summary

In [None]:
# Merge the MACS2, SICER, SPAN and replicated-SPAN tables into one.
# ignore_index gives every row its own label; each source table numbers its
# rows from 0, so the labels would otherwise repeat.
dfa = pd.concat([dfm, dfsc, dfsp, dfspr], ignore_index=True)
dfa.sort_values(by=['modification', 'alpha', 'level'], inplace=True)
# Keep alpha <= 9 only. The explicit copy makes dfa an independent frame, so
# later cells can add columns to it without a SettingWithCopyWarning.
dfa = dfa.loc[dfa['alpha'].astype(float) <= 9].copy()
display(dfa)

In [None]:
# Mean / std of peak counts for all tools together
show(dfa)
# The commented-out code below is an inline variant of the same plot with a
# log-scaled y axis, kept for reference.
# dfa['ma'] = dfa['modification'] + '_' + dfa['alpha']
# dfa_mean = dfa.groupby(['ma', 'level'])['peaks'].mean().reset_index().sort_values(by=['ma', 'level'])
# dfa_std = dfa.groupby(['ma', 'level'])['peaks'].std().reset_index().fillna(0).sort_values(by=['ma', 'level'])

# fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
# for l in sorted(set(dfa_mean['level'])):
#     fig.add_trace(go.Scatter(x=dfa_mean.loc[dfa_mean['level']==l]['ma'], 
#                              y=dfa_mean.loc[dfa_mean['level']==l]['peaks'], 
#                              name=f"{l} mean", line_shape='linear'))
#     fig.add_trace(go.Scatter(x=dfa_std.loc[dfa_std['level']==l]['ma'], 
#                              y=dfa_std.loc[dfa_std['level']==l]['peaks'], 
#                              name=f"{l} std", line_shape='linear', 
#                              line=dict(dash='dot')))
# fig.update_layout(yaxis_type="log")
# fig.show()

In [None]:
# Average peak length per file; non-finite ratios (files with zero peaks)
# are replaced by 0.0.
length_per_peak = dfa['length'] / dfa['peaks']
dfa['avg_length'] = length_per_peak.where(np.isfinite(length_per_peak), 0.0)
dfa['ma'] = dfa['modification'] + '_' + dfa['alpha']

# Mean and standard deviation of the average length per (ma, level) group
group_keys = ['ma', 'level']
avg_length_groups = dfa.groupby(group_keys)['avg_length']
dfa_mean = avg_length_groups.mean().reset_index().sort_values(by=group_keys)
dfa_std = avg_length_groups.std().reset_index().fillna(0).sort_values(by=group_keys)

# One solid (mean) and one dotted (std) trace per level
fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Average length")))
for lvl in sorted(set(dfa_mean['level'])):
    lvl_mean = dfa_mean.loc[dfa_mean['level'] == lvl]
    lvl_std = dfa_std.loc[dfa_std['level'] == lvl]
    fig.add_trace(go.Scatter(
        x=lvl_mean['ma'], y=lvl_mean['avg_length'],
        name=f"{lvl} mean", line_shape='linear',
    ))
    fig.add_trace(go.Scatter(
        x=lvl_std['ma'], y=lvl_std['avg_length'],
        name=f"{lvl} std", line_shape='linear',
        line=dict(dash='dot'),
    ))
# Uncomment for a log-scaled y axis:
# fig.update_layout(yaxis_type="log")
fig.show()

# Precision / Recall

In [None]:
# Precision / recall of every alpha > 0 peak file against the alpha == 0 file
# of the same modification, level and replicate.
dfpr = pd.DataFrame(columns=['modification', 'replicate', 'level', 'alpha', 'precision', 'recall'])
mlrs = dfa['modification'] + '_' + dfa['level'] + '_' + dfa['replicate']
# sorted() makes the row order of dfpr reproducible between runs
for m in tqdm(sorted(set(mlrs))):
    dfmlrs = dfa.loc[mlrs == m]
    modification = dfmlrs.iloc[0]['modification']
    level = dfmlrs.iloc[0]['level']
    # Fix: this value was previously stored in `alpha`, so the rows below were
    # written with a stale `replicate` left over from an earlier cell.
    replicate = dfmlrs.iloc[0]['replicate']
    alphas = dfmlrs['alpha'].astype(float)
    alpha0 = dfmlrs.loc[alphas == 0]
    if alpha0.empty:
        # No alpha == 0 baseline for this group - nothing to compare against
        continue
    baseline = Path(alpha0.iloc[0]['file'])
    for _, r in dfmlrs.loc[alphas > 0].iterrows():
        alpha_other = r['alpha']
        paths = [baseline, Path(r['file'])]
        df_path = f'/tmp/pr_{m}_{alpha_other}.tsv'
        mt = bm.load_or_build_metrics_table(paths, paths, Path(df_path), jaccard=False, threads=30)
        # NOTE(review): presumably mt[i, j] is the share of file i covered by
        # file j, with rows/columns in `paths` order - confirm in bed_metrics.
        precision, recall = mt.iloc[1, 0], mt.iloc[0, 1]
        dfpr.loc[len(dfpr)] = (modification, replicate, level, alpha_other, precision, recall)

In [None]:
# Display the precision / recall table
dfpr

In [None]:
# for m in MODIFICATIONS:
#     print(m)
#     dfmod = dfpr.loc[dfpr['modification'] == m]
#     dfprecision = dfmod.groupby(['alpha', 'level'])['precision'].mean().reset_index().sort_values(
#         by=['alpha', 'level'], ascending=False)
#     dfprecision['al'] = dfprecision['alpha'] + '_' + dfprecision['level']
#     dfrecall = dfmod.groupby(['alpha', 'level'])['recall'].mean().reset_index().sort_values(
#         by=['alpha', 'level'], ascending=False)
#     dfrecall['al'] = dfrecall['alpha'] + '_' + dfrecall['level']
    
#     dfprecisionrecall = pd.merge(on='al', left=dfprecision, right=dfrecall).sort_values(
#         by=['alpha_x', 'level_x'])
# #     display(dfprecisionrecall)

#     fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="")))
#     for l in sorted(set(dfprecisionrecall['level_x'])):
#         dfprl = dfprecisionrecall.loc[dfprecisionrecall['level_x']==l]
#         fig.add_trace(go.Scatter(x=dfprl['recall'], 
#                                  y=dfprl['precision'],
#                                  name=f"{l} precision vs recall", line_shape='linear'))
#     # fig.update_layout(yaxis_type="log")
#     fig.show()

In [None]:
# One figure per modification: mean precision (solid) and mean recall (dotted)
# against alpha, one pair of traces per level.
for mod in MODIFICATIONS:
    print(mod)
    dfmod = dfpr.loc[dfpr['modification'] == mod]
    pr_keys = ['alpha', 'level']
    by_alpha_level = dfmod.groupby(pr_keys)
    dfprecision = by_alpha_level['precision'].mean().reset_index().sort_values(
        by=pr_keys, ascending=False)
    dfrecall = by_alpha_level['recall'].mean().reset_index().sort_values(
        by=pr_keys, ascending=False)

    fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="")))
    for lvl in sorted(set(dfprecision['level'])):
        precision_rows = dfprecision.loc[dfprecision['level'] == lvl]
        recall_rows = dfrecall.loc[dfrecall['level'] == lvl]
        fig.add_trace(go.Scatter(
            x=precision_rows['alpha'], y=precision_rows['precision'],
            name=f"{lvl} precision", line_shape='linear',
        ))
        fig.add_trace(go.Scatter(
            x=recall_rows['alpha'], y=recall_rows['recall'],
            name=f"{lvl} recall", line_shape='linear',
            line=dict(dash='dot'),
        ))
    # Uncomment for a log-scaled y axis:
    # fig.update_layout(yaxis_type="log")
    fig.show()

# Overlap

In [None]:
# Pairwise overlap values between all peak files that share the same
# modification, alpha and level; one dfo row per (row file, column file) pair.

dfo = pd.DataFrame(columns=['id', 'modification', 'alpha', 'level', 'overlap'])
mals = dfa['modification'] + '_' + dfa['alpha'] + '_' + dfa['level']
for m in tqdm(sorted(set(mals))):
    dfmod = dfa.loc[mals == m]
    head = dfmod.iloc[0]
    modification, alpha, level = head['modification'], head['alpha'], head['level']
    paths = [Path(f) for f in dfmod['file']]
    # Metrics table is loaded from / stored at this path by bed_metrics
    df_path = f'/tmp/overlap_{m}.tsv'
    mt = bm.load_or_build_metrics_table(
        paths, paths, Path(df_path), jaccard=False, threads=30,
    )
    for row in mt.index:
        row_values = mt.loc[row]
        for col in mt.columns:
            overlap = row_values[col]
            dfo.loc[len(dfo)] = (f'{row}@{col}', modification, alpha, level, overlap)


In [None]:
# Display the overlap table
dfo

In [None]:
# Overlap bar plots: one figure per modification, bars over alpha, coloured by level
# Figure width grows with the number of distinct levels
sizex = len(sorted(set(dfo['level'])))
for m in MODIFICATIONS:
    print(m)
    fig = plt.figure(figsize=(int(sizex * 4), 4))
    data = dfo.loc[dfo['modification'] == m]
    ax = plt.axes()
    # NOTE(review): `ci` and `errwidth` are deprecated in newer seaborn
    # releases - check the installed version before migrating them.
    sns.barplot(data=data,
                x='alpha', y='overlap', hue='level',
                ci="sd", capsize=.2, errwidth=2,
                edgecolor="black",
                ax=ax)
    # Put the legend out of the figure
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
    # plt.show() instead of fig.show(): Figure.show() warns about a non-GUI
    # backend under `%matplotlib inline`, and plt.show() renders each figure
    # right after its printed modification name.
    plt.show()

In [None]:
# Compute overlaps per modification, level
dfo = pd.DataFrame(columns=['id', 'modification', 'level', 'overlap'])
mls = dfa['modification'] + '_' + dfa['level']
for m in tqdm(sorted(set(mls))):
    dfmod = dfa.loc[mls == m]
    modification = dfmod.iloc[0]['modification']
    level = dfmod.iloc[0]['level']
    paths = [Path(f) for f in dfmod['file']] 
    df_path = f'/tmp/overlap_{m}.tsv'
    mt = bm.load_or_build_metrics_table(paths, paths, Path(df_path),
                                        jaccard=False,
                                        threads=30)
    for row in mt.index:
        for col in mt.columns:
            overlap = mt.loc[row][col]
            dfo.loc[len(dfo)] = (f'{row}@{col}', modification, level, overlap)


In [None]:
# Overlap bar plots: one figure per modification, one bar per level
# Figure width grows with the number of distinct levels
sizex = len(sorted(set(dfo['level'])))
for m in MODIFICATIONS:
    print(m)
    fig = plt.figure(figsize=(int(sizex * 4), 4))
    data = dfo.loc[dfo['modification'] == m]
    ax = plt.axes()
    # NOTE(review): `ci` and `errwidth` are deprecated in newer seaborn
    # releases - check the installed version before migrating them.
    sns.barplot(data=data,
                x='level', y='overlap',
                ci="sd", capsize=.2, errwidth=2,
                edgecolor="black",
                ax=ax)
    # No legend here: without `hue` there are no labelled artists, so
    # plt.legend() would only emit a warning.
    # plt.show() instead of fig.show(): Figure.show() warns about a non-GUI
    # backend under `%matplotlib inline`, and plt.show() renders each figure
    # right after its printed modification name.
    plt.show()

# Peak significance distribution

In [None]:
import plotly.figure_factory as ff

# Upper bound applied to every -log10(q) value before plotting
MLQ_CAP = 1000


def _minus_log10_q(path, level):
    """Read one peak file and return its capped -log10(q) values.

    :param path: tab-separated peak file without a header line
    :param level: level label; must contain 'macs2', 'sicer' or 'span'
    :return: array of values, each at most MLQ_CAP
    :raises ValueError: if the level names none of the known tools
    """
    if 'macs2' in level or 'span' in level:
        names = ['chr', 'start', 'end', 'name', 'score', 'strand', 'fc', 'p', 'q']
        # NOTE(review): 'q' is used as is, i.e. assumed to be -log10(q) already - confirm
        q_is_minus_log = True
    elif 'sicer' in level:
        names = ['chr', 'start', 'end', 'reads', 'creads', 'p', 'fc', 'q']
        q_is_minus_log = False
    else:
        raise ValueError(f'Unknown level {level}')

    # usecols pins the names to the leading columns. Without it pandas turns
    # surplus leading columns into the index when a file has more fields than
    # names (e.g. a 10-column narrowPeak), silently shifting 'q' by one column.
    pdf = pd.read_csv(path, sep='\t', names=names, usecols=list(range(len(names))))
    if q_is_minus_log:
        mlqs = pdf['q']
    else:
        qs = -np.log10(pdf['q'])
        # Non-finite results (e.g. q == 0) are set to the cap
        mlqs = np.where(~np.isfinite(qs), MLQ_CAP, qs)
    return np.where(mlqs > MLQ_CAP, MLQ_CAP, mlqs)


# One density plot per tool level and modification, one curve per alpha
levels = ['macs2 q0.05', 'sicer FDR0.05', 'span 0.05']
for l in levels:
    tl = dfa.loc[dfa['level'] == l]
    for m in sorted(set(dfm['modification'])):
        # Replicate '0' only, alpha <= 9, and files with at least one peak
        t = tl.loc[tl['modification'] == m]
        t = t.loc[t['replicate'] == '0']
        t = t.loc[t['alpha'].astype(float) <= 9]
        t = t.loc[t['peaks'].astype(int) > 0]
        hist_data = []
        alphas = []
        for _, row in tqdm(t.iterrows(), total=len(t)):
            hist_data.append(_minus_log10_q(row['file'], l))
            alphas.append(row['alpha'])

        if not hist_data:
            # Nothing selected for this level / modification - no plot to draw
            continue

        # Density curves only (histogram and rug disabled); curve_type is left at its default
        fig = ff.create_distplot(hist_data, alphas, show_hist=False, show_rug=False)

        # Add title
        fig.update_layout(title_text=f'Minus log q-value distribution {l} {m}')
        fig.show()