# Figure 6: Number of peaks, average peak length and signal-to-noise ratio

Comparison of peak calling with default parameters (MACS2, SICER) against tuned SPAN.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from collections import OrderedDict, Counter

# Colours of the donor age groups; used as the hue palette of the swarm plots.
AGE_PALETTE = dict(Old='#E41A1C', Young='#377EB8')

# Bar fill colour of each peak calling tool.
TOOLS_PALETTE = dict(macs2="#EDEDED", sicer="#EDEDED", span='#FBE5D6')

# Read the tab-separated peaks summary ('#' starts a comment) and keep only the
# rows whose status is not 'failed'.
df = pd.read_csv(
    '/mnt/stripe/bio/experiments/configs/Y20O20/benchmark/Y20O20_peaks_summary_uli.tsv',
    comment='#',
    sep='\t',
)
df = df.loc[df['status'].ne('failed')]

# Number of peaks across modifications and tools

In [None]:
# (tool, procedure) combinations to show for every modification.
toshow = OrderedDict([
    ("H3K27ac", [("MACS2", "default"), ("SPAN", "tuned")]),
    ("H3K27me3", [("MACS2", "default"), ("SICER", "default"), ("SPAN", "tuned")]),
    ("H3K36me3", [("MACS2", "default"), ("SICER", "default"), ("SPAN", "tuned")]),
    ("H3K4me1", [("MACS2", "default"), ("SPAN", "tuned")]),
    ("H3K4me3", [("MACS2", "default"), ("SPAN", "tuned")]),
])

# One slice of the summary table per requested combination, concatenated in the
# order in which the combinations are listed above.
dfs = [
    df.loc[(df['tool'] == tool)
           & (df['modification'] == m)
           & (df['procedure'] == procedure)]
    for m, tps in toshow.items()
    for tool, procedure in tps
]
dft = pd.concat(dfs, axis=0)


def plot_data(dft, value, description):
    """Draw one bar + swarm panel per modification with a shared y range.

    Every modification gets a panel whose width is proportional to the number
    of distinct "modification tool procedure" combinations it contains. Bars
    are drawn per tool with standard deviation error bars, and the individual
    rows are overlaid as a swarm coloured by donor age.

    Note: `dft` is modified in place. The helper columns 'mp' and 'age' are
    added to it, and another cell of the notebook reads 'mp' afterwards.

    :param dft: data frame with the columns 'modification', 'tool',
                'procedure', 'donor' and the column named by `value`
    :param value: name of the column plotted on the y axis
    :param description: y axis label, shown on the leftmost panel only
    :raises ValueError: if `dft` has no rows for any of the plotted modifications
    """
    dft['mp'] = dft['modification'] + " " + dft['tool'] + " " + dft['procedure']
    dft["age"] = "Young"
    # na=False leaves rows with a missing donor id at the "Young" default
    # instead of failing on a mask that contains NaN.
    dft.loc[dft['donor'].str.startswith("OD", na=False), "age"] = "Old"

    known_modifications = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3']
    # A modification without rows would request a zero-width column span,
    # so only the ones present in the data get a panel.
    present = set(dft['modification'])
    ms = [m for m in known_modifications if m in present]
    if not ms:
        raise ValueError(
            "plot_data: no rows for any of the modifications %s" % known_modifications)

    # Number of grid columns: one per combination that is actually plotted.
    plotted = dft.loc[dft['modification'].isin(ms)]
    mpl = len(set(plotted['mp']))
    # The width is truncated to whole inches; max() prevents a zero-width
    # figure when there is a single combination.
    fig = plt.figure(figsize=(max(1, int(mpl * .75)), 4))

    # Bar colour of tools that TOOLS_PALETTE does not know.
    fallback_color = "#EDEDED"

    axes = []
    ylims = []
    offset = 0
    for m in ms:
        data = dft.loc[dft['modification'] == m]
        # Tools in order of first appearance; the same order is passed to both
        # seaborn calls so that bars, points and tick labels always agree.
        xlabels = list(data['tool'].dropna().unique())
        w = len(set(data['mp']))
        ax = plt.subplot2grid((1, mpl), (0, offset), colspan=w)

        # TOOLS_PALETTE is keyed by lower-case tool names while the data may
        # spell them in upper case, so colours are looked up case-insensitively.
        palette = {t: TOOLS_PALETTE.get(str(t).lower(), fallback_color)
                   for t in xlabels}

        sns.barplot(data=data,
                    x="tool", y=value,
                    order=xlabels,
                    ci="sd", capsize=.2, errwidth=2,
                    palette=palette,
                    edgecolor="black",
                    ax=ax)

        sns.swarmplot(data=data,
                      x="tool", y=value,
                      order=xlabels,
                      size=3,
                      hue="age",
                      palette=AGE_PALETTE,
                      ax=ax)

        # Hide the per-panel hue legend.
        legend = ax.get_legend()
        if legend is not None:
            legend.set_visible(False)

        axes.append(ax)
        ylims.append(ax.get_ylim())

        if offset > 0:
            # Only the leftmost panel keeps y tick labels and the axis label.
            ax.get_yaxis().set_ticklabels([])
            ax.set_ylabel('')
        else:
            ax.set_ylabel(description)

        offset += w
        ax.set_xlabel('')
        ax.set_title(m)
        ax.set_xticks(range(len(xlabels)))
        ax.set_xticklabels(xlabels, rotation=90)

    # Give all panels the same y range so that they are directly comparable.
    ymin = min(low for low, _ in ylims)
    ymax = max(high for _, high in ylims)
    for ax in axes:
        ax.set_ylim(bottom=ymin, top=ymax)
    fig.tight_layout()
    
    
# Draw the number of peaks per modification and tool, then write the current
# figure to a PDF file.
with PdfPages('/mnt/stripe/figures/peaks_number.pdf') as pdf:
    plot_data(dft, value='peaks', description='Number of peaks')
    pdf.savefig()

# SPAN: number of peaks and average peak length

In [None]:
# Tuned SPAN is the only tool/procedure combination shown in this section.
# NOTE(review): the tool is spelled "span" here while the previous section filters
# on "SPAN" -- confirm which spelling the summary table uses.
span_toshow = OrderedDict([
    ("H3K27ac", [("span", "tuned")]),
    ("H3K27me3", [("span", "tuned")]),
    ("H3K36me3", [("span", "tuned")]),
    ("H3K4me1", [("span", "tuned")]),
    ("H3K4me3", [("span", "tuned")])
])

# Bar colour of each modification.
PALETTE = {'H3K27ac': "#FF0000",
           'H3K27me3': "#9900FF",
           'H3K36me3': '#0000CC',
           'H3K4me1': '#FF9900',
           'H3K4me3': '#33CC33'}

# Select the requested combinations into a frame of their own. The names `toshow`,
# `dfs` and `dft` are deliberately not reused: `dft` still holds the multi-tool
# selection that the following cells plot and inspect.
span_dfs = [
    df.loc[(df['tool'] == tool)
           & (df['modification'] == m)
           & (df['procedure'] == procedure)]
    for m, tps in span_toshow.items()
    for tool, procedure in tps
]
dft_span = pd.concat(span_dfs, axis=0)
display(dft_span.head())

# Plot the number of peaks per modification (error bars hidden).
with PdfPages('/mnt/stripe/peaks_number.pdf') as pdf:
    fig, ax = plt.subplots()
    sns.barplot(x="modification", y="peaks", data=dft_span,
                errwidth=0, palette=PALETTE, ax=ax)
    pdf.savefig(fig)
    plt.show()

# Average peak length is 'length' divided by 'peaks'. Non-finite ratios (e.g. rows
# with zero peaks) are set to 0, as in the average-length cell further below.
span_avg_length = dft_span['length'] / dft_span['peaks']
dft_span['avg_length'] = span_avg_length.where(np.isfinite(span_avg_length), 0.0)

# Plot the average peak length per modification (error bars hidden).
with PdfPages('/mnt/stripe/peaks_length.pdf') as pdf:
    fig, ax = plt.subplots()
    sns.barplot(x="modification", y="avg_length", data=dft_span,
                errwidth=0, palette=PALETTE, ax=ax)
    pdf.savefig(fig)
    plt.show()

# Average peak length by modification and tool

In [None]:
# Average peak length of every row: 'length' divided by 'peaks'.
# Ratios that are not finite (NaN or +/-inf) are replaced with 0.
len_avg = dft['length'] / dft['peaks']
dft['len_avg'] = len_avg.where(np.isfinite(len_avg), 0.0)

# Draw the panels and write the current figure to a PDF file.
with PdfPages('/mnt/stripe/figures/peaks_length.pdf') as pdf:
    plot_data(dft, value='len_avg', description='Average length of peak')
    pdf.savefig()

In [None]:
# OD14 has the highest average peak length: it has a small number of peaks with a
# large total length and one of the highest FRIP values.
# The rows are selected by their own columns rather than by the 'mp' helper column,
# which only exists after plot_data() has been run on this frame.
h3k36me3_span_tuned = (
    (dft['modification'] == 'H3K36me3')
    & (dft['tool'] == 'SPAN')
    & (dft['procedure'] == 'tuned')
)
display(dft.loc[h3k36me3_span_tuned])

# Signal to noise ratio

Computed by `ChipSeqSignalToNoise` experiment.

Input files: `/mnt/stripe/bio/experiments/signal_to_noise/signal_to_noise_Y20O20.tsv` and `/mnt/stripe/bio/experiments/signal_to_noise/signal_to_noise_cd14encode.tsv`

In [None]:
# Both signal-to-noise tables are tab-separated; the column names are supplied
# here (presumably the files have no header row -- TODO confirm).
sn_columns = ['modification', 'cell', 'donor', 'sn', 'file']

sn_df = pd.read_csv('/mnt/stripe/bio/experiments/signal_to_noise/signal_to_noise_Y20O20.tsv',
                    sep='\t', names=sn_columns)

# Donors whose id starts with "OD" are labelled old, all others young.
# na=False leaves rows with a missing donor id at the "Young" default.
sn_df["age"] = "Young"
sn_df.loc[sn_df['donor'].str.startswith("OD", na=False), "age"] = "Old"

sn_df_encode = pd.read_csv('/mnt/stripe/bio/experiments/signal_to_noise/signal_to_noise_cd14encode.tsv',
                           sep='\t', names=sn_columns)

# Each seaborn call derives its own category order from the frame it is given and
# resets the tick labels. A single shared order keeps the bars, both swarms and the
# tick labels aligned even if the two tables list the modifications differently.
# Modifications that occur only in the ENCODE table are appended at the end.
modification_order = list(sn_df['modification'].dropna().unique())
for m in sn_df_encode['modification'].dropna().unique():
    if m not in modification_order:
        modification_order.append(m)

fig = plt.figure(figsize=(5, 5))
sns.set_style("whitegrid")

# Bar with standard deviation error bars per modification.
ax = sns.barplot(data=sn_df,
                 x="modification", y='sn',
                 order=modification_order,
                 ci="sd", capsize=.2, errwidth=2,
                 color="lightgray",
                 edgecolor="black")

# Individual Y20O20 samples, coloured by donor age.
sns.swarmplot(data=sn_df,
              x="modification", y='sn',
              order=modification_order,
              size=3,
              hue="age",
              palette=AGE_PALETTE,
              ax=ax)

# Samples of the cd14encode table as larger black points.
sns.swarmplot(data=sn_df_encode,
              x="modification", y='sn',
              order=modification_order,
              size=7,
              color='black',
              ax=ax)

ax.set_xlabel('')
ax.set_ylabel('signal to noise')

# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.show()