# 2022 - ENCODE GSE26320 peaks coverage.ipynb


See logbook: https://docs.google.com/document/d/1hjkhjo6cB4_AykuQVSDTU07IPD0t6988ZfXwmBb8q0A/edit#

1. Process peaks

```
WORK_DIR=/data/2022_GSE26320
GENOME=hg38

echo "MACS2 narrow"
snakemake --printshellcmds -s ~/work/chipseq-smk-pipeline/Snakefile \
  all --cores 24 --use-conda --directory $WORK_DIR --config genome=$GENOME \
  fastq_dir=$WORK_DIR/fastq fastq_ext=fastq macs2_mode=narrow macs2_params="-q 0.05" macs2_suffix=q0.05 \
  --rerun-incomplete;

echo "MACS2 broad"
snakemake --printshellcmds -s ~/work/chipseq-smk-pipeline/Snakefile \
  all --cores 24 --use-conda --directory $WORK_DIR --config genome=$GENOME \
  fastq_dir=$WORK_DIR/fastq fastq_ext=fastq macs2_mode=broad macs2_params="--broad --broad-cutoff 0.1" macs2_suffix=broad0.1 \
  --rerun-incomplete;
```

2. Estimate the average fragment size from the MACS2 output (result: `241.381`)

```
T=$'\t'
for F in macs2/*q0.05*.xls; do 
    NAME=$(basename $F | sed 's/_q.*//g'); D=$(cat $F | grep '# d =' | sed 's/.*= //g'); 
    echo "$NAME$T$D" >> fragment.tsv; 
done
cat fragment.tsv | awk '{D+=$2; N+=1} END {print(D/N)}'
```

3. Compute tags for coverage using `bash GSE26320_analyze.sh`

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import pandas as pd
import seaborn as sns
from IPython.display import display

sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import numpy as np
import os, re
import glob
from tqdm.auto import tqdm
from itertools import product

In [None]:
def bedl(file):
    """Return the length (end - start) of every interval in a headerless TSV file.

    Column 1 is used as the interval start and column 2 as the interval end.

    :param file: path of the tab-separated file to read
    :return: pandas Series of lengths, or an empty numpy array when the file is empty
    :raises: any error other than an empty file (e.g. FileNotFoundError) is propagated
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only an empty file is an expected condition; other failures must stay visible
        return np.zeros(0)
    return tf[2] - tf[1]

def lines(file):
    """Return the number of rows in a headerless TSV file.

    :param file: path of the tab-separated file to read
    :return: number of parsed rows, 0 when the file is empty
    :raises: any error other than an empty file (e.g. FileNotFoundError) is propagated
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only an empty file is an expected condition; other failures must stay visible
        return 0
    return len(tf)

def d(a, b):
    """Divide a by b, returning 0 instead of failing when b equals zero."""
    if b != 0:
        return a / b
    return 0

# Peaks analysis

In [None]:
# Root directory of the processed GSE26320 data: the 'covs', 'tags', 'bw' and 'bed4' inputs are
# read from it and figures are written to its 'analysis' subdirectory
PATH = os.path.expanduser('~/data/2022_GSE26320')
# Known gaps in the dataset:
# Don't have H1 H3K27ac rep2
# Don't have Huvec H3K4me3 rep1 
# Don't have HepG2 H3K4me1 rep2
# CELLS = ['GM12878', 'HMEC', 'HSMM', 'K562', 'NHEK', 'NHLF']  # Ignore H1, Huvec, HepG2
# Cell lines to analyse; currently restricted to one (the full list is commented out above)
CELLS = ['GM12878']
# Modifications to analyse; file names are matched against these labels
MODIFICATIONS = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3']
# Replicate labels to analyse
REPS = ['rep1', 'rep2']

In [None]:
def load_peaks_fdr(path, suffix, fdrs):
    """Summarise the peak files found in a directory.

    A file is used when its name contains `suffix`, one of `fdrs`, one of CELLS,
    one of MODIFICATIONS and one of REPS; for each label the first match wins.

    :param path: directory with peak files
    :param suffix: substring that identifies peak files of the tool of interest
    :param fdrs: FDR labels (strings) to look for in the file names
    :return: DataFrame with columns file, modification, cell, replicate, fdr,
             peaks (number of rows) and avlength (mean interval length, 0 for empty files)
    """
    columns = ['file', 'modification', 'cell', 'replicate', 'fdr', 'peaks', 'avlength']
    records = []
    for f in tqdm(os.listdir(path)):
        if suffix not in f:
            continue
        fdr = next((fdr for fdr in fdrs if fdr in f), None)
        cell = next((cc for cc in CELLS if cc in f), None)
        mod = next((m for m in MODIFICATIONS if m in f), None)
        # Look the replicate up explicitly: defaulting to 'rep2' would silently
        # count every file without a replicate label as a second replicate
        rep = next((rr for rr in REPS if rr in f), None)
        if not (fdr and cell and rep and mod):
            continue
        peaks_file = os.path.join(path, f)
        ps, ls = lines(peaks_file), bedl(peaks_file)
        avls = 0 if ps == 0 else sum(ls) / ps
        records.append((f, mod, cell, rep, fdr, ps, avls))
    # Build the frame once instead of appending row by row
    return pd.DataFrame(records, columns=columns)

In [None]:
def plotdf(df, tool, title):
    """Draw one bar plot per modification of column `title` against FDR.

    The figure is saved as `fdr_{tool}_{title}.png` in the analysis directory and shown.

    :param df: peaks summary with 'modification', 'fdr' and `title` columns
    :param tool: tool label used in the output file name
    :param title: name of the column to plot, also used as the y axis label
    """
    n_panels = len(MODIFICATIONS)
    plt.figure(figsize=(3 * n_panels, 4))
    for panel, modification in enumerate(MODIFICATIONS, start=1):
        subset = df.loc[df['modification'] == modification].copy()
        # FDR labels are strings; convert them so that the x axis is ordered numerically
        subset['fdr'] = subset['fdr'].astype(float)
        ax = plt.subplot(1, n_panels, panel)
        ax.title.set_text(modification)
        sns.barplot(data=subset, x="fdr", y=title, ax=ax, capsize=.2, errwidth=2, edgecolor="black")
        ax.xaxis.set_tick_params(rotation=90)
        ax.set_ylabel(title)

    plt.tight_layout()
    plt.savefig(f'{PATH}/analysis/fdr_{tool}_{title}.png', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# Create the output directory for figures; exist_ok keeps the cell re-runnable
# (a plain `mkdir` reports an error once the directory exists)
os.makedirs(os.path.join(PATH, 'analysis'), exist_ok=True)

### SPAN

In [None]:
# SPAN_FDRS = ['0.1', '0.01', '0.05', '0.001', '0.0001', '1e-06', '1e-08', '1e-10']
SPAN_FDRS = ['0.05']
# SPAN peaks are stored in the 'span' subdirectory of the data root
df_fdr_span = load_peaks_fdr(os.path.join(PATH, 'span'), '.peak', SPAN_FDRS)

In [None]:
# Show the SPAN peaks summary table
df_fdr_span

In [None]:
# One figure per summary column: number of peaks and average peak length
for heading, column in (('Peaks', 'peaks'), ('Average length', 'avlength')):
    print(heading)
    plotdf(df_fdr_span, 'span', column)

In [None]:
# Troubleshooting
# NOTE(review): with SPAN_FDRS = ['0.05'] and CELLS = ['GM12878'] both selections are empty;
# they only return rows when the full FDR / cell lists are enabled
is_h3k36me3 = df_fdr_span['modification'] == 'H3K36me3'
many_peaks_at_1e06 = (df_fdr_span['fdr'] == '1e-06') & (df_fdr_span['peaks'] > 60000)
hsmm_rep1 = (df_fdr_span['replicate'] == 'rep1') & (df_fdr_span['cell'] == 'HSMM')
display(df_fdr_span[is_h3k36me3 & many_peaks_at_1e06])
display(df_fdr_span[is_h3k36me3 & hsmm_rep1])

### MACS2 narrow

In [None]:
# MACS2_FDRS = ['0.1', '0.01', '0.05', '1e-3', '1-e4', '1e-6', '1e-8', '1e-10']
MACS2_FDRS = ['0.05']
# MACS2 peaks are stored in the 'macs2' subdirectory of the data root
df_fdr_macs2 = load_peaks_fdr(os.path.join(PATH, 'macs2'), '.narrowPeak', MACS2_FDRS)

In [None]:
# One figure per summary column: number of peaks and average peak length
for heading, column in (('Peaks', 'peaks'), ('Average length', 'avlength')):
    print(heading)
    plotdf(df_fdr_macs2, 'macs2', column)

### MACS2 broad

In [None]:
# MACS2BROAD_FDRS = ['0.1', '0.05', '1e-2', '1e-3', '1e-4', '1e-6', '1e-8', '1e-10']
MACS2BROAD_FDRS = ['0.1']
# Broad peaks live next to the narrow ones in the 'macs2' subdirectory
df_fdr_macs2broad = load_peaks_fdr(os.path.join(PATH, 'macs2'), '.broadPeak', MACS2BROAD_FDRS)

In [None]:
# One figure per summary column: number of peaks and average peak length
for heading, column in (('Peaks', 'peaks'), ('Average length', 'avlength')):
    print(heading)
    plotdf(df_fdr_macs2broad, 'macs2broad', column)

### SICER

In [None]:
# SICER_FDRS = ['0.1', '0.05', '0.01', '0.001', '0.0001', '1e-06', '1e-08', '1e-10']
SICER_FDRS = ['0.01']
# SICER peaks are stored in the 'sicer' subdirectory of the data root
df_fdr_sicer = load_peaks_fdr(os.path.join(PATH, 'sicer'), 'summary-FDR', SICER_FDRS)

In [None]:
# One figure per summary column: number of peaks and average peak length
for heading, column in (('Peaks', 'peaks'), ('Average length', 'avlength')):
    print(heading)
    plotdf(df_fdr_sicer, 'sicer', column)

# Coverage analysis

In [None]:
print('Load coverages files')
FILES = os.listdir(os.path.join(PATH, 'covs'))

# File name marker -> peak caller; checked in this order, the first match wins
TOOL_MARKERS = [('.peak', 'SPAN'), ('FDR', 'SICER'), ('narrowPeak', 'MACS2'), ('broadPeak', 'MACS2 broad')]

cov_records = []
for f in FILES:
    # Look the replicate up explicitly: defaulting to 'rep2' would silently
    # count every file without a replicate label as a second replicate
    rep = next((rr for rr in REPS if rr in f), None)
    tool = next((name for marker, name in TOOL_MARKERS if marker in f), None)
    cell = next((cc for cc in CELLS if cc in f), None)
    mod = next((m for m in MODIFICATIONS if m in f), None)

    # Files that cannot be fully annotated are ignored
    if rep and tool and cell and mod:
        cov_records.append((cell, mod, rep, f'{PATH}/covs/{f}', tool))

df = pd.DataFrame(cov_records, columns=['cell', 'modification', 'rep', 'file', 'tool'])
# Number of coverage files per modification and tool
df.groupby(['modification', 'tool']).count()

In [None]:
# Per-file statistics: number of peaks, total length and average length
peaks, lengths, avlengths = [], [], []

for cov_file in df['file']:
    n_peaks = lines(cov_file)
    total_length = sum(bedl(cov_file))
    peaks.append(n_peaks)
    lengths.append(total_length)
    # Empty files get an average length of 0 instead of a division by zero
    avlengths.append(total_length / n_peaks if n_peaks != 0 else 0)

df['peaks'] = peaks
df['length'] = lengths
df['average length'] = avlengths
df.head()

## Load libraries tags coverage

In [None]:
# List the tags directory once instead of once per library
TAGS_DIR = os.path.join(PATH, 'tags')
TAGS_FILES = os.listdir(TAGS_DIR)


def _count_tags(sample):
    """Return the number of rows in the first tags file whose name contains `sample`."""
    match = next((f for f in TAGS_FILES if sample in f), None)
    if match is None:
        # Fail with a clear message instead of a TypeError from concatenating None to a path
        raise FileNotFoundError(f'No tags file matching {sample} in {TAGS_DIR}')
    return lines(os.path.join(TAGS_DIR, match))


coverage_records = []
for c, r in tqdm(list(product(CELLS, REPS))):
    coverage_records.append((c, r, 'Input', _count_tags(f'{c}_Input_{r}')))
    for m in MODIFICATIONS:
        coverage_records.append((c, r, m, _count_tags(f'{c}_{m}_{r}')))

# Build the frame once instead of appending row by row
coverage_df = pd.DataFrame(coverage_records, columns=['cell', 'replicate', 'modification', 'tags'])


def libcoverage_t(m, c, r):
    """Return the number of tags recorded in `coverage_df` for modification m, cell c, replicate r.

    Raises IndexError when `coverage_df` has no such library.
    """
    selected = ((coverage_df['modification'] == m) &
                (coverage_df['cell'] == c) &
                (coverage_df['replicate'] == r))
    tags = coverage_df[selected]['tags'].values
    return tags[0]
def libcoverage_c(c, r):
    """Return the number of tags of the 'Input' library of cell c, replicate r."""
    return libcoverage_t(m='Input', c=c, r=r)

In [None]:
print('Libraries tags x10mln')
# One panel per modification plus a final panel for the Input libraries
library_panels = MODIFICATIONS + ['Input']
fig = plt.figure(figsize=(3 * len(library_panels), 4))
for panel, m in enumerate(library_panels, start=1):
    t = coverage_df.loc[coverage_df['modification'] == m].copy()
    t['name'] = t['cell'] + ' ' + t['replicate']
    ax = plt.subplot(1, len(library_panels), panel)
    ax.title.set_text(m)
    sns.barplot(data=t, x='name', y='tags', ax=ax, color='blue')
    # Same fixed y range on every panel so that libraries are visually comparable
    ax.set_ylim(bottom = 0, top = 1.8e7)
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_ylabel('Tags')

plt.tight_layout()
plt.savefig(f'{PATH}/analysis/libraries.png', bbox_inches='tight', dpi=300)
plt.show()

# Loading detailed peaks information

In [None]:
def scores(file):
    """Return column 4 of a headerless TSV file.

    :param file: path of the tab-separated file to read
    :return: pandas Series with column 4, or an empty numpy array when the file is empty
    :raises: any error other than an empty file (e.g. FileNotFoundError) is propagated
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only an empty file is an expected condition; other failures must stay visible
        return np.zeros(0)
    return tf[4]

def positions(file):
    """Return a `chrom:start-end` label for every row of a headerless TSV file.

    Columns 0, 1 and 2 are used as chromosome, start and end.

    :param file: path of the tab-separated file to read
    :return: pandas Series of labels, or an empty list when the file is empty
    :raises: any error other than an empty file (e.g. FileNotFoundError) is propagated
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only an empty file is an expected condition; other failures must stay visible
        return []
    return tf[0] + ":" + tf[1].astype(str) + "-" + tf[2].astype(str)


def coverage_t(file):
    """Return column 4 of a headerless TSV file, used downstream as the 'coveraget' value.

    NOTE(review): this reads the same column (4) as `scores` - confirm that is intended.

    :param file: path of the tab-separated file to read
    :return: pandas Series with column 4, or an empty list when the file is empty
    :raises: any error other than an empty file (e.g. FileNotFoundError) is propagated
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only an empty file is an expected condition; other failures must stay visible
        return []
    return tf[4]

def coverage_c(file):
    """Return column 5 of a headerless TSV file, used downstream as the 'coveragec' value.

    :param file: path of the tab-separated file to read
    :return: pandas Series with column 5, or an empty list when the file is empty
    :raises: any error other than an empty file (e.g. FileNotFoundError) is propagated
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only an empty file is an expected condition; other failures must stay visible
        return []
    return tf[5]

In [None]:
# Show the coverage files table together with the per-file peak statistics
df

In [None]:
print('Processing peaks files')
! mkdir {PATH}/analysis/bed

TOOLS = list(sorted(set(df['tool'])))

peaks_info = []
for m, c, r in tqdm(product(MODIFICATIONS, CELLS, REPS)):
    print(m, c, r)

    # Processing single tools information
    for tool in TOOLS:
        t = df.loc[(df['cell'] == c) & (df['modification'] == m) & (df['rep'].str.contains(r)) & (df['tool']==tool)]
        file = t['file'].values[0]
        peaks_info.extend((m, c, r, tool, 'tool', p, s, l, ct, cc)
                          for p, s, l, ct, cc in zip(positions(file), scores(file), bedl(file), coverage_t(file), coverage_c(file)))

    # Processing SPAN vs others
    for tool in TOOLS:
        if tool == 'SPAN':
            continue

        t_s = df.loc[
            (df['cell'] == c) & (df['modification'] == m) & (df['rep'].str.contains(r)) & (df['tool'] == 'SPAN')]
        t_o = df.loc[(df['cell'] == c) & (df['modification'] == m) & (df['rep'].str.contains(r)) & (df['tool'] == tool)]
        file_span, file_other = t_s['file'].values[0], t_o['file'].values[0]

        # Overlap span_vs_other
        tf = f'{PATH}/analysis/bed/{m}_{c}_{r}_overlap_SPAN_vs_{tool}.bed'.replace(' ', '_')
        !bedtools intersect -a {file_span} -b {file_other} -wa -u > {tf}
        peaks_info.extend((m, c, r, tool, 'overlap_span_vs_other', p, s, l, ct, cc)
                          for p, s, l, ct, cc in zip(positions(tf), scores(tf), bedl(tf), coverage_t(tf), coverage_c(tf)))

        # Overlap other_vs_span
        tf = f'{PATH}/analysis/bed/{m}_{c}_{r}_overlap_{tool}_vs_SPAN.bed'.replace(' ', '_')
        !bedtools intersect -b {file_span} -a {file_other} -wa -u > {tf}
        peaks_info.extend((m, c, r, tool, 'overlap_other_vs_span', p, s, l, ct, cc)
                          for p, s, l, ct, cc in zip(positions(tf), scores(tf), bedl(tf), coverage_t(tf), coverage_c(tf)))

        # Diff span_vs_other
        tf = f'{PATH}/analysis/bed/{m}_{c}_{r}_diff_SPAN_vs_{tool}.bed'.replace(' ', '_')
        !bedtools intersect -a {file_span} -b {file_other} -v > {tf}
        peaks_info.extend((m, c, r, tool, 'diff_span_vs_other', p, s, l, ct, cc)
                          for p, s, l, ct, cc in zip(positions(tf), scores(tf), bedl(tf), coverage_t(tf), coverage_c(tf)))

        # Diff other_vs_span
        tf = f'{PATH}/analysis/bed/{m}_{c}_{r}_diff_{tool}_vs_SPAN.bed'.replace(' ', '_')
        !bedtools intersect -b {file_span} -a {file_other} -v > {tf}
        peaks_info.extend((m, c, r, tool, 'diff_other_vs_span', p, s, l, ct, cc)
                          for p, s, l, ct, cc in zip(positions(tf), scores(tf), bedl(tf), coverage_t(tf), coverage_c(tf)))
df_peaks = pd.DataFrame(peaks_info, columns=[
    'modification', 'cell', 'replicate', 'tool', 'type', 'position', 'score', 'length', 'coveraget', 'coveragec'
])
del peaks_info
display(df_peaks.head())

In [None]:
print('Compute RPM and RPKM for peaks')
for m, c, r in tqdm(product(MODIFICATIONS, CELLS, REPS)):
    print(m, c, r)
    library = (df_peaks['modification'] == m) & (df_peaks['cell']==c) & (df_peaks['replicate']==r)
    # Library sizes in millions of tags
    treatment_mln = libcoverage_t(m, c, r) * 1e-6
    control_mln = libcoverage_c(c, r) * 1e-6

    subset = df_peaks.loc[library]
    rpm_treatment = subset['coveraget'] / treatment_mln
    rpm_control = subset['coveragec'] / control_mln
    # RPKM additionally normalises RPM by the peak length in kilobases
    normalized = {
        'rpmt': rpm_treatment,
        'rpmc': rpm_control,
        'rpkmt': rpm_treatment / subset['length'] * 1e3,
        'rpkmc': rpm_control / subset['length'] * 1e3,
    }
    for col, values in normalized.items():
        df_peaks.loc[library, col] = list(values)
    del subset

display(df_peaks.sample(5))
df_peaks.describe()

## Aggregated for all tools

In [None]:
# Plot types accepted by plot_tools_summary_type
PLOT_TYPES = ['peaks', 'length', 'rpm', 'rpkm']
# Peak callers to compare; replaces the list derived earlier from the coverage files
TOOLS = ['MACS2', 'MACS2 broad', 'SICER', 'SPAN']

def process_type(df_peaks, m, peaks_type, tool, plot_type, name, show_control=False):
    """Aggregate peaks of one modification, peaks type and tool per (cell, replicate).

    'peaks' yields the number of peaks, 'score' / 'length' the mean of that column and
    'rpm' / 'rpkm' the mean of the treatment column (plus the control column when
    `show_control` is set), labelled in a 'type' column. Any other plot type returns
    the selected rows unchanged. The result always gets a 'variable' column set to `name`.
    """
    selected = df_peaks[(df_peaks['modification'] == m) &
                        (df_peaks['type'] == peaks_type) &
                        (df_peaks['tool'] == tool)]
    by_replicate = ['cell', 'replicate']

    def mean_per_replicate(column):
        # Mean of `column` per (cell, replicate), reduced to a single 'value' column
        return selected.groupby(by_replicate)[column].mean().reset_index(name='value')[['value']].copy()

    if plot_type == 'peaks':
        result = selected.groupby(by_replicate).size().reset_index(name='value')[['value']].copy()
    elif plot_type in ('score', 'length'):
        result = mean_per_replicate(plot_type)
    elif plot_type in ('rpm', 'rpkm'):
        sources = [('t', 'treatment')]
        if show_control:
            sources.append(('c', 'control'))
        parts = []
        for column_suffix, label in sources:
            part = mean_per_replicate(plot_type + column_suffix)
            part['type'] = label
            parts.append(part)
        result = pd.concat(parts)
    else:
        result = selected
    result['variable'] = name
    return result

def plot_tools_summary_type(df_peaks, title, plot_type=PLOT_TYPES[0], tools=TOOLS,
                            show_tool=True, show_overlap=True, show_diff=True,
                            show_control=True, use_bar=False):
    """Plot per-modification summaries for the tools, their overlaps and differences with SPAN.

    The figure is saved as `{plot_type}.png` in the analysis directory and shown.

    :param df_peaks: detailed peaks table
    :param title: y axis label of the first panel
    :param plot_type: one of PLOT_TYPES
    :param tools: tools to show; overlaps and differences are only drawn when 'SPAN' is included
    :param show_tool: show the peaks of each tool
    :param show_overlap: show the overlaps of SPAN with every other tool, in both directions
    :param show_diff: show the differences of SPAN and every other tool, in both directions
    :param show_control: also show control values for 'rpm' / 'rpkm'
    :param use_bar: draw bar plots instead of box plots
    :raises Exception: when plot_type is not one of PLOT_TYPES
    """
    if plot_type not in PLOT_TYPES:
        raise Exception(f'Unknown plot type {plot_type}, expected one of {PLOT_TYPES}')
    width = (1 if show_tool else 0) + (1 if show_overlap else 0) + (1 if show_diff else 0)
    plt.figure(figsize=(width * len(MODIFICATIONS) * 2, 5))

    # Overlaps and differences are defined relative to SPAN. Without SPAN only those parts
    # are skipped; a `continue` here would drop the whole panel, tools included.
    has_span = 'SPAN' in tools
    others = [tool for tool in tools if tool != 'SPAN']

    for k, m in enumerate(MODIFICATIONS):
        tts = []

        # Processing tool
        if show_tool:
            for tool in tools:
                tts.append(process_type(df_peaks, m, 'tool', tool, plot_type, tool, show_control))

        # Overlap processing
        if show_overlap and has_span:
            for tool in others:
                tts.append(process_type(df_peaks, m, 'overlap_span_vs_other', tool, plot_type, f'SPAN ∩ {tool}', show_control))
            for tool in others:
                tts.append(process_type(df_peaks, m, 'overlap_other_vs_span', tool, plot_type, f'{tool} ∩ SPAN', show_control))

        # Difference processing
        if show_diff and has_span:
            for tool in others:
                tts.append(process_type(df_peaks, m, 'diff_span_vs_other', tool, plot_type, f'SPAN - {tool}', show_control))
            for tool in others:
                tts.append(process_type(df_peaks, m, 'diff_other_vs_span', tool, plot_type, f'{tool} - SPAN', show_control))

        if not tts:
            continue  # Nothing selected for this panel; pd.concat fails on an empty list

        # Every part is indexed from 0; a fresh index avoids duplicate labels in the plot data
        t = pd.concat(tts, ignore_index=True)
        t['value'] = t['value'].astype(float)

        # Process variables
        for var in set(t['variable']):
            # Add mean variables
            tm = t[t['variable'] == var]['value'].mean()
            if plot_type in ['peaks', 'length']:
                t.loc[t['variable'] == var, 'variable'] = f'{var} ({int(tm)})'
            else:
                t.loc[t['variable'] == var, 'variable'] = f'{var} ({tm:.3f})'

        # Plot 
        ax = plt.subplot(1, len(MODIFICATIONS), k + 1)
        ax.title.set_text(m)
        if plot_type in ['rpm', 'rpkm']:
            if use_bar:
                sns.barplot(data=t, x='variable', y='value', hue='type', capsize=.2, errwidth=2, ax=ax)
            else:
                sns.boxplot(data=t, x='variable', y='value', hue='type', ax=ax)
        else:
            if use_bar:
                sns.barplot(data=t, x='variable', y='value', capsize=.2, errwidth=2, ax=ax)
            else:
                sns.boxplot(data=t, x='variable', y='value', ax=ax)
        ax.xaxis.set_tick_params(rotation=90)
        # Only the first panel carries the y axis label
        ax.set_ylabel(title if k == 0 else '')

    plt.tight_layout()
    plt.savefig(f'{PATH}/analysis/{plot_type}.png', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# exist_ok keeps the cell re-runnable (a plain `mkdir` reports an error once the directory exists)
os.makedirs(f'{PATH}/analysis', exist_ok=True)
print('Peaks number')
plot_tools_summary_type(df_peaks, 'Peaks number', 'peaks', show_tool=True, show_overlap=False, show_diff=True, use_bar=True)
print('Peaks length')
plot_tools_summary_type(df_peaks, 'Length', 'length', show_tool=True, show_overlap=False, show_diff=True, use_bar=False)
print('RPM')
plot_tools_summary_type(df_peaks, 'RPM', 'rpm', show_tool=True, show_overlap=False, show_diff=True, show_control=True, use_bar=False)
print('RPKM')
plot_tools_summary_type(df_peaks, 'RPKM', 'rpkm', show_tool=True, show_overlap=False, show_diff=True, show_control=True, use_bar=False)

# Save most significant peaks from difference with SPAN

In [None]:
# Directory with the bigWig files
BW_PATH = f'{PATH}/bw'
print('BW_PATH', BW_PATH)

# Directory with the peak files in .bed4 format
BED4_PATH = f'{PATH}/bed4'
print('BED4_PATH', BED4_PATH)

# JBR jar used to render the screenshots
JBR_PATH = f'{PATH}/jbr-1.0.beta.build.jar'
print('JBR_PATH', JBR_PATH)

# exist_ok keeps the cell re-runnable (a plain `mkdir` reports an error once the directory exists)
os.makedirs(f'{PATH}/analysis/diff', exist_ok=True)
print('DIFF_PATH', f'{PATH}/analysis/diff')

def find_bw_file(m, c, r):
    """Return the first `.bw` file in BW_PATH whose name contains `{c}_{m}_{r}`.

    Raises IndexError when no file matches.
    """
    matches = glob.glob(f'{BW_PATH}/*{c}_{m}_{r}*.bw')
    return matches[0]

def find_bw_input_file(c, r):
    """Return the first `.bw` file in BW_PATH whose name contains `{c}_Input_{r}`.

    Raises IndexError when no file matches.
    """
    matches = glob.glob(f'{BW_PATH}/*{c}_Input_{r}*.bw')
    return matches[0]


def find_peaks_file(m, c, r, tool):
    """Return the first `.bed4` peaks file of a tool for modification m, cell c, replicate r.

    :param tool: one of 'SPAN', 'MACS2', 'MACS2 broad', 'SICER'
    :raises ValueError: when the tool is not supported
    :raises IndexError: when no file in BED4_PATH matches
    """
    # File name marker that precedes the `.bed4` extension for each tool
    markers = {
        'SPAN': '.peak',
        'MACS2': '.narrowPeak',
        'MACS2 broad': '.broadPeak',
        'SICER': '-FDR0.01',
    }
    if tool not in markers:
        # Raising a plain string is itself a TypeError; raise a proper exception instead
        raise ValueError(f'Unexpected tool {tool}')
    ext = markers[tool]
    return glob.glob(f'{BED4_PATH}/*{c}_{m}_{r}*{ext}.bed4')[0]

def save_span_diff_screenshots(df_peaks, tools=TOOLS, metrics='rpkmt', n=1):
    """ Save most significant differential peaks with SPAN"""
    if 'SPAN' not in tools:
        raise Exception('Cannot show overlap without SPAN')

    for m in tqdm(MODIFICATIONS):
        print(f'Processing {m}')

        # Difference processing
        for tool in tools:
            if tool != 'SPAN':
                print(tool)
                t = df_peaks[(df_peaks['modification'] == m) & (df_peaks['type']=='diff_span_vs_other') & (df_peaks['tool'] == tool)]
                t = t[[re.match('^chr[0-9]+:[0-9]+\\-[0-9]+$', p) is not None for p in t['position']]].copy()
                t = t.sort_values(by=[metrics], ascending=False).head(n).copy().reset_index()
                for i, row in t.iterrows():
                    c, r, p, mt = row['cell'], row['replicate'], row['position'], row[metrics]
                    bw_file = find_bw_file(m, c, r)
                    bw_input_file = find_bw_input_file(c, r)
                    span_peaks_file = find_peaks_file(m, c, r, 'SPAN')
                    tool_peaks_file = find_peaks_file(m, c, r, tool)
                    other_peaks = ''
                    for t in [t for t in tools if t != tool and t != 'SPAN']:
                        other_peaks += f',{find_peaks_file(m, c, r, t)}'
                    screenshot = f'{metrics}_{m}_diff_span_vs_{tool}_{i}_{mt:.3f}_{c}_{r}_{p}'.replace(' ', '_').replace(':', '_').replace('-', '_')
                    ! java -jar {JBR_PATH} --screenshot {PATH}/analysis/diff/{screenshot}_1.png --genome hg38 --screenshot_tracks {bw_file},{bw_input_file},{span_peaks_file},{tool_peaks_file}{other_peaks} --screenshot_position {p} --screenshot_zoomout 30 --screenshot_scalegroup 1,2 --screenshot_retina --quiet
                    ! java -jar {JBR_PATH} --screenshot {PATH}/analysis/diff/{screenshot}_2.png --genome hg38 --screenshot_tracks {bw_file},{bw_input_file},{span_peaks_file},{tool_peaks_file}{other_peaks} --screenshot_position {p} --screenshot_zoomout 300 --screenshot_scalegroup 1,2 --screenshot_retina --quiet


In [None]:
# Screenshots for the 5 SPAN-only peaks with the highest 'rpkmt' per modification and tool
save_span_diff_screenshots(df_peaks, tools=TOOLS, metrics='rpkmt', n=5)

In [None]:
# save_span_diff_screenshots(df_peaks, tools=TOOLS, metrics='score')

# Peaks density vs length scatterplot

In [None]:
# Maximum number of peaks sampled per tool
N = 100000
# Every other tool gets its own panel next to SPAN
density_tools = [tool for tool in TOOLS if tool != 'SPAN']
for m in MODIFICATIONS:
    print(m)
    plt.figure(figsize=(14, 4))
    t_span = df_peaks[(df_peaks['modification'] == m) & (df_peaks['type'] == 'tool') & (df_peaks['tool'] == 'SPAN')]
    t_span = t_span.sample(min(len(t_span), N))[['rpkmt', 'length', 'tool']].copy()
    for k, tool in enumerate(tqdm(density_tools), start=1):
        # Use stratified sampling for performance reasons
        t = df_peaks[(df_peaks['modification'] == m) & (df_peaks['type'] == 'tool') & (df_peaks['tool'] == tool)]
        tst = pd.concat([t_span, t.sample(min(len(t), N))[['rpkmt', 'length', 'tool']].copy()])

        # Plot: the column count follows the number of compared tools instead of a fixed 3
        ax = plt.subplot(1, len(density_tools), k)
        # Clip for aesthetics
        sns.kdeplot(x=tst['length'].clip(upper=20000).values,
                    y=tst['rpkmt'].clip(upper=30.0).values,
                    hue=tst['tool'].values,
                    fill=True, alpha=0.4, ax=ax)
        ax.title.set_text(f'{m} SPAN vs {tool}')
        ax.set_xlabel('Length')
        ax.set_ylabel('RPKM')

    plt.tight_layout()
    plt.savefig(f'{PATH}/analysis/density_vs_len_{m}.png', bbox_inches='tight', dpi=300)
    # `plt.show` without parentheses is a no-op expression; call it so each figure is shown in turn
    plt.show()

# Pairwise Venn diagrams of peak numbers vs SPAN for each cell and replicate

In [None]:
from matplotlib_venn import venn2, venn2_circles


def venn_overlap(ax, peaks1, peaks2, overlap12, overlap21, other_label=None):
    """Draw a two-set Venn diagram of SPAN peaks against the peaks of another tool.

    :param ax: axes to draw on
    :param peaks1: number of SPAN peaks
    :param peaks2: number of peaks of the other tool
    :param overlap12: number of SPAN peaks overlapping the other tool
    :param overlap21: number of peaks of the other tool overlapping SPAN
    :param other_label: label of the second set; defaults to the global `tool`
                        variable for backward compatibility
    """
    if other_label is None:
        other_label = tool  # Legacy behaviour: label taken from the caller's loop variable
    # Tweak subsets to avoid empty areas
    subsets = (max(0.15 * max(peaks1, peaks2), peaks1 - overlap12),
               max(0.15 * max(peaks1, peaks2), peaks2 - overlap21),
               max(0.3 * max(peaks1, peaks2), overlap12, overlap21))
    v = venn2(subsets=subsets, set_labels=('SPAN', other_label), set_colors=('r', 'g'), alpha=0.5, ax=ax)
    # Draw the outlines on the same axes rather than on whatever axes is current
    venn2_circles(subsets=subsets, color='grey', ax=ax)
    # Labels show the real counts, not the tweaked areas
    v.get_label_by_id('10').set_text(str(peaks1 - overlap12))
    v.get_label_by_id('01').set_text(str(peaks2 - overlap21))
    if overlap12 != 0 and overlap21 != 0:
        prop = f'{overlap12 / overlap21:.1f} : 1' if overlap12 > overlap21 else f'1 : {overlap21 / overlap12:.1f}'
        prop = prop.replace('.0', '')  # Cosmetics
    else:
        prop = ''
    v.get_label_by_id('11').set_text(f'{overlap12} : {overlap21}\n{prop}')
    for text in v.set_labels:
        text.set_fontsize(7)
    for text in v.subset_labels:
        text.set_fontsize(5)


for m in MODIFICATIONS:
    print(m)
    # NOTE(review): the grid has len(CELLS) rows and n_columns columns, while figsize is
    # (width, height) - the two figure dimensions look swapped; confirm before changing
    n_columns = len(TOOLS) * (len(TOOLS) - 1)
    plt.figure(figsize=(15 * len(CELLS), 15 * n_columns))
    panel = 0
    for c, r in tqdm(product(CELLS, REPS)):
        t = df_peaks[(df_peaks['modification'] == m) & (df_peaks['cell'] == c) &(df_peaks['replicate'] == r)]
        is_tool_peak = t['type'] == 'tool'
        # `tool` must remain the loop variable name: venn_overlap reads it for the set label
        for tool in TOOLS:
            if tool == 'SPAN':
                continue
            of_tool = t['tool'] == tool
            peaks1 = len(t[is_tool_peak & (t['tool'] == 'SPAN')])
            peaks2 = len(t[is_tool_peak & of_tool])
            overlap12 = len(t[(t['type'] == 'overlap_span_vs_other') & of_tool])
            overlap21 = len(t[(t['type'] == 'overlap_other_vs_span') & of_tool])

            # Plot
            panel += 1
            ax = plt.subplot(len(CELLS), n_columns, panel)

            venn_overlap(ax, peaks1, peaks2, overlap12, overlap21)
            ax.title.set_text(f'{m} {c} {r}')

    plt.tight_layout()
    plt.savefig(f'{PATH}/analysis/venn_{m}.png', bbox_inches='tight', dpi=300)
    plt.show()

# Peaks length and density distributions analysis

In [None]:
# Colour used for each tool in the distribution histograms
PALETTE = {'MACS2': 'green', 'MACS2 broad': 'blue', 'SICER': 'orange', 'SPAN': 'brown'}
# Maximum number of peaks sampled per tool and modification
N = 100000

def plot_distribution(df_peaks, value, tools=TOOLS, threshold=None, log_scale_after_threshold=False):
    """Plot per-modification histograms of a peaks column, coloured by tool.

    With a threshold two rows are drawn: values up to the threshold and values above it.
    Without a threshold a single row is drawn. The figure is saved as
    `hist_{value}_{tools}_{threshold}.png` in the analysis directory and shown.

    :param df_peaks: detailed peaks table; only rows of type 'tool' are used
    :param value: column to plot
    :param tools: tools to include
    :param threshold: optional split point between the two rows
    :param log_scale_after_threshold: use a log scale in the row above the threshold
    """
    split = threshold is not None
    # A single row is enough without a threshold; a 2-row grid would leave half the figure empty
    n_rows = 2 if split else 1
    plt.figure(figsize=(16, 6) if split else (14, 4))
    for i, m in enumerate(MODIFICATIONS):
        print(m)
        ts = df_peaks[(df_peaks['modification'] == m) & (df_peaks['type'] == 'tool')][[value, 'tool']].copy()

        # Use stratified sampling for performance reasons
        tss = []
        for tool in tools:
            tst = ts[ts['tool'] == tool]
            tss.append(tst.sample(min(len(tst), N)))
        tst = pd.concat(tss)

        # Plot; only the first column carries y axis labels
        if split:
            ax = plt.subplot(n_rows, len(MODIFICATIONS), i + 1)
            sns.histplot(data=tst.loc[tst[value] <= threshold], x=value, hue='tool', palette=PALETTE, element="poly", ax=ax)
            ax.title.set_text(m)
            ax.set_ylabel(f'{value} <= {threshold}' if i == 0 else '')

            ax = plt.subplot(n_rows, len(MODIFICATIONS), i + 1 + len(MODIFICATIONS))
            sns.histplot(data=tst.loc[tst[value] > threshold], x=value, hue='tool', log_scale=log_scale_after_threshold, element="poly", palette=PALETTE, ax=ax)
            ax.set_ylabel(f'{value} > {threshold}' if i == 0 else '')
        else:
            ax = plt.subplot(n_rows, len(MODIFICATIONS), i + 1)
            sns.histplot(data=tst, x=value, hue='tool', palette=PALETTE, element="poly", ax=ax)
            ax.title.set_text(m)
            ax.set_ylabel(value if i == 0 else '')
    plt.tight_layout()
    plt.savefig(f'{PATH}/analysis/hist_{value}_{"_".join(tools)}_{threshold}.png', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# Peak lengths of the non-SPAN tools, split at 10000 with a log scale above the split
print('Length distribution with 10k separation without SPAN')
plot_distribution(df_peaks, 'length', tools=['MACS2', 'MACS2 broad', 'SICER'], threshold=10000, log_scale_after_threshold=True)

In [None]:
# Distribution of the 'rpkmt' column for all four tools, without a threshold split
print('RPKM distribution')
plot_distribution(df_peaks, 'rpkmt', tools=['MACS2', 'MACS2 broad', 'SICER', 'SPAN'], threshold=None)

In [None]:
# NOTE(review): `__BREAK___` is not defined anywhere in this notebook, so this cell raises NameError.
# Presumably a deliberate stop marker so that "Run All" halts before the detailed plots below - confirm.
__BREAK___

# Detailed peaks information for each modification, cell line and replicates

In [None]:
# Tools that are compared against SPAN
TOOLS_OTHER = [t for t in TOOLS if t != 'SPAN']

def process_file(df_peaks_mcr, m, peaks_type, tool, plot_type, name, show_control=False):
    """Collect the values to plot for one modification, peaks type and tool.

    :param df_peaks_mcr: detailed peaks table
    :param m: modification to select
    :param peaks_type: value of the 'type' column to select
    :param tool: tool to select
    :param plot_type: 'peaks' (number of peaks), 'score' / 'length' (raw column values)
                      or 'rpm' / 'rpkm' (treatment values, plus control values when
                      `show_control` is set, labelled in a 'type' column)
    :param name: label stored in the 'variable' column
    :return: DataFrame with 'variable' and 'value' columns
    :raises ValueError: when plot_type is not supported
    """
    t = df_peaks_mcr[(df_peaks_mcr['modification'] == m) & (df_peaks_mcr['type'] == peaks_type) & (df_peaks_mcr['tool'] == tool)]
    if plot_type == 'peaks':
        return pd.DataFrame(dict(variable=[name], value=[len(t)]))
    elif plot_type in ['score', 'length']:
        return pd.DataFrame(dict(variable=[name]*len(t), value=t[plot_type].values))
    elif plot_type in ['rpm', 'rpkm']:
        ts = [pd.DataFrame(dict(variable=[name] * len(t), value=t[plot_type + 't'].values, type=['treatment'] * len(t)))]
        if show_control:
            ts.append(pd.DataFrame(dict(variable=[name] * len(t), value=t[plot_type + 'c'].values, type=['control'] * len(t))))
        return pd.concat(ts)
    else:
        # Report the argument that is actually invalid (the message used to print peaks_type)
        raise ValueError(f'Unexpected plot_type {plot_type}')


# (peaks type, label template) of the comparisons with SPAN, in plotting order
DETAILS_COMPARISONS = [
    ('overlap_span_vs_other', 'Overlap SPAN vs {}'),
    ('overlap_other_vs_span', 'Overlap {} vs SPAN'),
    ('diff_span_vs_other', 'Diff SPAN vs {}'),
    ('diff_other_vs_span', 'Diff {} vs SPAN'),
]

for m, c, r in tqdm(product(MODIFICATIONS, CELLS, REPS)):
    print(m, c, r)
    t = df_peaks.loc[(df_peaks['modification'] == m) & (df_peaks['cell'] == c) & (df_peaks['replicate'] == r)]
    # Plot
    plt.figure(figsize=(15, 5))

    for i, plot_type in enumerate(['peaks', 'length', 'rpkm']):
        ts = []
        for tool in TOOLS:
            ts.append(process_file(t, m, 'tool', tool, plot_type, tool, True))
        for peaks_type, label in DETAILS_COMPARISONS:
            for tool in TOOLS_OTHER:
                ts.append(process_file(t, m, peaks_type, tool, plot_type, label.format(tool), True))
        # Every part is indexed from 0; a fresh index avoids duplicate labels in the plot data
        tst = pd.concat(ts, ignore_index=True)
        tst['value'] = tst['value'].astype(float)

        # Plot
        ax = plt.subplot(1, 3, i + 1)
        if plot_type == 'peaks':
            sns.barplot(data=tst, x='variable', y='value', capsize=.2, errwidth=2, ax=ax)
        elif plot_type == 'length':
            # Clip for aesthetics. Assign the result back: an in-place clip on a column
            # selection is not guaranteed to modify the frame.
            tst['value'] = tst['value'].clip(upper=20000)
            sns.boxplot(data=tst, x='variable', y='value', ax=ax)
        elif plot_type == 'rpkm':
            tst['value'] = tst['value'].clip(upper=20)  # Clip for aesthetics
            sns.boxplot(data=tst, x='variable', y='value', hue='type', ax=ax)
        ax.title.set_text(f'{plot_type} {m} {c} {r}')
        ax.set_ylabel(plot_type)
        ax.set_xlabel('type')
        ax.xaxis.set_tick_params(rotation=90)

    plt.tight_layout()
    plt.savefig(f'{PATH}/analysis/details_{m}_{c}_{r}.png', bbox_inches='tight', dpi=300)
    plt.show()
