# Control investigations


In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import os
import subprocess
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm

sns.set_style("whitegrid")

In [None]:
def bedl(file):
    """Return the length of every interval in a headerless tab-separated file.

    Column 1 is used as the start coordinate and column 2 as the end coordinate.

    :param file: path to the tab-separated file.
    :return: Series of ``column 2 - column 1`` per row, or an empty numpy array
        when the file contains no data.
    :raises: any error other than "empty file" (missing file, malformed content)
        propagates instead of being reported as zero intervals.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return np.zeros(0)  # Empty file
    return tf[2] - tf[1]


def lines(file):
    """Count the data rows of a headerless tab-separated file.

    :param file: path to the tab-separated file.
    :return: number of rows parsed by pandas, 0 when the file contains no data.
    :raises: any error other than "empty file" (missing file, malformed content)
        propagates instead of being reported as zero rows.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return 0  # Empty file
    return len(tf)


def d(a, b):
    """Divide ``a`` by ``b``, returning 0 instead of failing when ``b`` is zero."""
    if b == 0:
        return 0
    return a / b


def last_col(file):
    """Return the last column of a headerless tab-separated file.

    The first row is read to find the number of columns, then only the last
    column is loaded.

    :param file: path to the tab-separated file.
    :return: Series with the values of the last column, or an empty numpy array
        when the file contains no data.
    :raises: any error other than "empty file" (missing file, malformed content)
        propagates instead of being reported as an empty column.
    """
    try:
        cols = len(pd.read_csv(file, sep='\t', nrows=1, header=None).columns)
    except pd.errors.EmptyDataError:
        return np.zeros(0)  # Empty file
    return pd.read_csv(file, sep='\t', header=None, usecols=[cols - 1])[cols - 1]


def sorted_file(file):
    """Write a sorted copy of ``file`` into a new temporary file.

    Rows are ordered with ``sort -k1,1 -k2,2n -k3,3n``: by the first field, then
    numerically by the second and third fields.

    :param file: path to the file to sort.
    :return: path of the newly created temporary file; the caller is
        responsible for deleting it.
    :raises subprocess.CalledProcessError: when ``sort`` exits with an error.
    """
    # mkstemp creates the file atomically, unlike the deprecated, race-prone mktemp
    fd, ts = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as out:
        # Argument list instead of a shell string: paths with spaces are safe
        subprocess.run(['sort', '-k1,1', '-k2,2n', '-k3,3n', file], stdout=out, check=True)
    return ts

# Load hg38 peaks

1. Create dataset without control tracks
2. Launch peak calling

```
conda activate snakemake
snakemake -p -s ~/work/chipseq-smk-pipeline/Snakefile all --cores all  --use-conda  --directory $(pwd) --config genome=hg38 fastq_dir=$(pwd) start_with_bams=True bams_dir=bams macs2=True span=False sicer=False --rerun-trigger mtime;
snakemake -p -s ~/work/chipseq-smk-pipeline/Snakefile all --cores all  --use-conda  --directory $(pwd) --config genome=hg38 fastq_dir=$(pwd) start_with_bams=True bams_dir=bams macs2=True macs2_mode=broad macs2_params="--broad --broad-cutoff=0.1" macs2_suffix="broad0.1"  span=False sicer=False --rerun-trigger mtime; 
snakemake -p -s ~/work/chipseq-smk-pipeline/Snakefile all --cores all  --use-conda  --directory $(pwd) --config genome=hg38 fastq_dir=$(pwd) start_with_bams=True bams_dir=bams macs2=False span=True sicer=True --rerun-trigger mtime;
```


In [None]:
# Each dataset has two pipeline runs: one with control tracks and one without.
GSE26320_PATH = os.path.expanduser('~/data/2023_GSE26320')
GSE26320_NO_CONTROL_PATH = os.path.expanduser('~/data/2023_GSE26320_no_control')
GSE26320_CELLS = ['GM12878', 'HMEC', 'HSMM', 'K562', 'NHEK', 'NHLF', 'H1', 'Huvec', 'HepG2']
MODIFICATIONS = ['H3K4me3', 'H3K27ac', 'H3K4me1', 'H3K27me3', 'H3K36me3']
GSE26320_REPS = ['rep1', 'rep2']

IMMUNE_PATH = os.path.expanduser('~/data/2023_Immune')
IMMUNE_NO_CONTROL_PATH = os.path.expanduser('~/data/2023_Immune_no_control')
# Order matters: file names are matched by taking the FIRST cell name that is a
# substring of the file name, so a name containing another one
# (CD4ABT vs CD4, TCellBB vs TCell) has to come first.
IMMUNE_CELLS = ['CD4ABT', 'TCellBB', 'BCell', 'TCell', 'Monocyte', 'PBMC', 'NK', 'CD34', 'CD4', ]
# '' is a substring of every file name, so it acts as the fallback for files
# without a repN token and has to stay last.
IMMUNE_REPS = ['rep1', 'rep2', 'rep3', '']

# Folders for saved figures; makedirs avoids interpolating paths into a shell command
os.makedirs(os.path.join(GSE26320_PATH, 'pics'), exist_ok=True)
os.makedirs(os.path.join(IMMUNE_PATH, 'pics'), exist_ok=True)

In [None]:
# Peak callers in plotting order, each bound to one fixed colour of the tab10 colormap
TOOLS = ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']
palette = plt.get_cmap('tab10')
TOOLS_PALETTE = dict(zip(TOOLS, (palette(i) for i in range(len(TOOLS)))))

In [None]:
def load_peaks(path, suffix, modifications, cells, replicates):
    """Index the peak files of one tool found in a folder.

    A file is kept when its name ends with ``suffix`` and contains a known cell,
    modification and replicate. Each attribute is the FIRST entry of the
    corresponding list that is a substring of the file name, so the order of
    the lists matters (an empty replicate ``''`` matches every name).

    :param path: folder with peak files.
    :param suffix: required file name ending.
    :param modifications: candidate modification names.
    :param cells: candidate cell names.
    :param replicates: candidate replicate names.
    :return: DataFrame with columns ``file``, ``modification``, ``cell``,
        ``replicate``, ``peaks`` (row count of the file), all of dtype object.
    """
    rows = []
    # Sorted listing keeps the row order reproducible between runs
    for f in tqdm(sorted(os.listdir(path))):
        if not f.endswith(suffix):
            continue
        cell = next((c for c in cells if c in f), None)
        mod = next((m for m in modifications if m in f), None)
        rep = next((r for r in replicates if r in f), None)
        # rep may legitimately be '' (falsy), hence the explicit None check
        if cell and mod and rep is not None:
            peaks_path = os.path.join(path, f)
            rows.append((peaks_path, mod, cell, rep, lines(peaks_path)))
    # Build the frame once instead of appending row by row (quadratic)
    return pd.DataFrame(rows, columns=['file', 'modification', 'cell', 'replicate', 'peaks'],
                        dtype=object)

In [None]:
def load_peaks_path(path, modifications, cells, replicates):
    """Index the peak files of all four peak callers under one pipeline folder.

    Reads ``macs2`` (narrow and broad peaks), ``sicer`` and ``span`` subfolders
    of ``path``, tags every row with its tool and prints the number of files
    found per tool.

    :return: concatenated DataFrame of all tools with an extra ``tool`` column.
    """
    def load_tool(tool, folder, *suffixes):
        # Try the suffixes in order and stop at the first non-empty result;
        # when every suffix yields nothing, the last (empty) result is kept.
        for suffix in suffixes:
            found = load_peaks(os.path.join(path, folder), suffix, modifications, cells, replicates)
            if len(found) != 0:
                break
        found['tool'] = tool
        print(tool, len(found))
        return found

    per_tool = [
        load_tool('MACS2', 'macs2', '.narrowPeak'),
        load_tool('MACS2 broad', 'macs2', '.broadPeak'),
        load_tool('SICER', 'sicer', 'summary-FDR0.01', 'E100.scoreisland'),
        load_tool('SPAN', 'span', '.peak'),
    ]
    return pd.concat(per_tool).reset_index(drop=True)

In [None]:
# Load peaks of both datasets, each called with and without control tracks
df_encode_peaks = load_peaks_path(
    GSE26320_PATH, MODIFICATIONS, GSE26320_CELLS, GSE26320_REPS
).assign(dataset='ENCODE', control=True)

df_encode_no_control_peaks = load_peaks_path(
    GSE26320_NO_CONTROL_PATH, MODIFICATIONS, GSE26320_CELLS, GSE26320_REPS
).assign(dataset='ENCODE', control=False)

df_immune_peaks = load_peaks_path(
    IMMUNE_PATH, MODIFICATIONS, IMMUNE_CELLS, IMMUNE_REPS
).assign(dataset='Roadmap', control=True)

df_immune_no_control_peaks = load_peaks_path(
    IMMUNE_NO_CONTROL_PATH, MODIFICATIONS, IMMUNE_CELLS, IMMUNE_REPS
).assign(dataset='Roadmap', control=False)

df_peaks_full = pd.concat(
    [df_encode_peaks, df_encode_no_control_peaks, df_immune_peaks, df_immune_no_control_peaks],
    ignore_index=True,
)
df_peaks_full.sample(3)

In [None]:
# Number of peaks per tool, with vs. without control, one panel per modification
# (all samples, before outlier removal).
for ds in ['ENCODE', 'Roadmap']:
    print(ds)
    fig, axs = plt.subplots(1, 5, figsize=(14, 4))
    for i, (m, ax) in enumerate(zip(MODIFICATIONS, axs)):
        panel = df_peaks_full[(df_peaks_full['dataset'] == ds) &
                              (df_peaks_full['modification'] == m)]
        shared = dict(data=panel, x='tool', y='peaks', hue='control',
                      order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
                      hue_order=[True, False],
                      ax=ax)
        sns.barplot(capsize=.2, err_kws={'linewidth': 2}, edgecolor="black", **shared)
        # Individual samples drawn on top of the bars
        sns.stripplot(dodge=True, size=1.5, color="black", alpha=0.5, palette='dark:black',
                      legend=False, **shared)

        ax.tick_params(axis='x', rotation=90)
        ax.title.set_text(m)
        ax.set_ylabel('peaks' if i == 0 else None)
        if i == 4:
            # Only the last panel keeps its legend, placed to the right of the axis
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            ax.legend().set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# SPAN H3K27me3 samples with an unusually high number of peaks
df_peaks_full.loc[
    (df_peaks_full['tool'] == 'SPAN')
    & (df_peaks_full['modification'] == 'H3K27me3')
    & (df_peaks_full['peaks'] > 100_000)
]

In [None]:
# Remove outliers, see 2023 - figures.ipynb
# Each entry is a (modification, cell, replicate) triple to exclude.
OUTLIERS = [
    ('H3K4me3', 'NK', ''),
    ('H3K27me3', 'TCell', ''),
    ('H3K27me3', 'BCell', ''),
]
df_peaks = df_peaks_full.loc[[
    key not in OUTLIERS
    for key in zip(df_peaks_full['modification'], df_peaks_full['cell'], df_peaks_full['replicate'])
]].copy()
df_peaks.sample(3)

In [None]:
# Number of peaks per tool, with vs. without control, one panel per modification
# (outliers removed).
for ds in ['ENCODE', 'Roadmap']:
    print(ds)
    fig, axs = plt.subplots(1, 5, figsize=(14, 4))
    for i, (m, ax) in enumerate(zip(MODIFICATIONS, axs)):
        panel = df_peaks[(df_peaks['dataset'] == ds) &
                         (df_peaks['modification'] == m)]
        shared = dict(data=panel, x='tool', y='peaks', hue='control',
                      order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
                      hue_order=[True, False],
                      ax=ax)
        sns.barplot(capsize=.2, err_kws={'linewidth': 2}, edgecolor="black", **shared)
        # Individual samples drawn on top of the bars
        sns.stripplot(dodge=True, size=1.5, color="black", alpha=0.5, palette='dark:black',
                      legend=False, **shared)

        ax.tick_params(axis='x', rotation=90)
        ax.title.set_text(m)
        ax.set_ylabel('peaks' if i == 0 else None)
        if i == 4:
            # Only the last panel keeps its legend, placed to the right of the axis
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            ax.legend().set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# Number of peaks called without control, ENCODE vs. Roadmap, one panel per modification
print('NO CONTROL')
fig, axs = plt.subplots(1, 5, figsize=(12, 3.5))
for i, (m, ax) in enumerate(zip(MODIFICATIONS, axs)):
    panel = df_peaks[(df_peaks['modification'] == m) & (df_peaks['control'] == False)]
    shared = dict(data=panel, x='tool', y='peaks', hue='dataset',
                  order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
                  hue_order=['ENCODE', 'Roadmap'],
                  ax=ax)
    sns.barplot(capsize=.2, err_kws={'linewidth': 2}, edgecolor="black", **shared)
    # Individual samples drawn on top of the bars
    sns.stripplot(dodge=True, size=2.5, palette='dark:black', alpha=0.5, legend=False, **shared)
    ax.tick_params(axis='x', rotation=90)
    ax.title.set_text(m)
    ax.set_ylabel('peaks' if i == 0 else None)
    if i == len(axs) - 1:
        # Only the last panel keeps its legend, placed to the right of the axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    else:
        ax.legend().set_visible(False)
plt.tight_layout()
plt.show()

# Compute ratio no control vs control

In [None]:
# Relative difference (in %) between the number of peaks called without and
# with control, per (dataset, modification, cell, replicate, tool).
rows = []
for (ds, m, c, r, tool), dft in tqdm(df_peaks.groupby(['dataset', 'modification', 'cell', 'replicate', 'tool'])):
    if len(dft) != 2:
        continue
    dft_with = dft[dft['control'] == True]
    dft_without = dft[dft['control'] == False]
    # Exactly one call with control and one without is required
    if len(dft_with) != 1 or len(dft_without) != 1:
        continue
    # The 'peaks' column already holds the row count of each file, no need to re-read it
    peaks_with_control = int(dft_with['peaks'].values[0])
    peaks_without_control = int(dft_without['peaks'].values[0])
    rows.append((ds, m, c, r, tool, d(peaks_without_control, peaks_with_control)))
df_ratio = pd.DataFrame(rows, columns=['dataset', 'modification', 'cell', 'replicate', 'tool', 'ratio'])
# Round before casting: plain astype(int) truncates, e.g. 14.999999 -> 14
df_ratio['ratio'] = ((df_ratio['ratio'] - 1) * 100).astype(float).round().astype(int)
del rows
df_ratio.sample(min(3, len(df_ratio)))

In [None]:
# Peak callers in plotting order, each bound to one fixed colour of the tab10 colormap
TOOLS = ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']
palette = plt.get_cmap('tab10')
TOOLS_PALETTE = dict(zip(TOOLS, (palette(i) for i in range(len(TOOLS)))))

In [None]:
# Difference in the number of peaks without/with control, one figure per dataset
for ds in ['ENCODE', 'Roadmap']:
    print(ds)
    fig, ax = plt.subplots(figsize=(6, 3))
    shared = dict(data=df_ratio[df_ratio['dataset'] == ds],
                  x='modification', y='ratio', hue='tool',
                  order=['H3K4me3', 'H3K27ac', 'H3K4me1', 'H3K27me3', 'H3K36me3'],
                  hue_order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
                  ax=ax)
    sns.boxplot(showfliers=False, palette=TOOLS_PALETTE, **shared)
    # Individual samples drawn on top of the boxes
    sns.stripplot(dodge=True, size=2, color="black", alpha=0.5, palette='dark:black',
                  legend=False, **shared)
    ax.set_ylabel('Difference %')
    ax.tick_params(axis='x', rotation=45)
    ax.set_ylim(-110, 145)
    ax.set_title('Difference in peaks without/with control')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.show()

In [None]:
# MACS2 H3K4me3 samples gaining more than 50% peaks without control
df_ratio.loc[
    (df_ratio['tool'] == 'MACS2')
    & (df_ratio['modification'] == 'H3K4me3')
    & (df_ratio['ratio'] > 50)
]

In [None]:
# Difference in the number of peaks without/with control, both datasets pooled
print('ENCODE + Roadmap')
fig, ax = plt.subplots(figsize=(6, 4))
shared = dict(data=df_ratio,
              x='modification', y='ratio', hue='tool',
              order=['H3K4me3', 'H3K27ac', 'H3K4me1', 'H3K27me3', 'H3K36me3'],
              hue_order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
              ax=ax)
sns.boxplot(showfliers=False, palette=TOOLS_PALETTE, **shared)
# Individual samples drawn on top of the boxes
sns.stripplot(dodge=True, size=3, alpha=0.5, legend=False, palette='dark:black', **shared)
ax.set_ylabel('Difference %')
ax.tick_params(axis='x', rotation=45)
ax.set_ylim(-110, 145)
ax.set_title('Difference in peaks without/with control')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# H3K4me3 samples (any tool) gaining more than 50% peaks without control
df_ratio.loc[
    (df_ratio['modification'] == 'H3K4me3')
    & (df_ratio['ratio'] > 50)
]

# Jaccard without control vs with control

In [None]:
# Jaccard index (by covered length) between peaks called with and without
# control, per (dataset, modification, cell, replicate, tool).
rows = []
with tempfile.TemporaryDirectory() as tmp_dir:
    # Scratch file for bedtools output; removed together with the directory
    tf = os.path.join(tmp_dir, 'intersect.bed')
    for (ds, m, c, r, tool), dft in tqdm(df_peaks.groupby(['dataset', 'modification', 'cell', 'replicate', 'tool'])):
        if len(dft) != 2:
            continue
        dft_with = dft[dft['control'] == True]
        dft_without = dft[dft['control'] == False]
        # Exactly one call with control and one without is required
        if len(dft_with) != 1 or len(dft_without) != 1:
            continue
        with_control = sorted_file(dft_with['file'].values[0])
        without_control = sorted_file(dft_without['file'].values[0])
        try:
            peaks_with_control = lines(with_control)
            peaks_without_control = lines(without_control)
            len_with_control = bedl(with_control).sum()
            len_without_control = bedl(without_control).sum()
            with open(tf, 'wb') as out:
                # -wo appends the overlap size as the last column of every reported pair
                subprocess.run(['bedtools', 'intersect', '-a', with_control, '-b', without_control, '-wo'],
                               stdout=out, check=True)
            overlap_len = int(last_col(tf).sum())
        finally:
            # sorted_file creates a fresh temporary file on every call
            os.remove(with_control)
            os.remove(without_control)
        jaccard = d(overlap_len, len_with_control + len_without_control - overlap_len)
        rows.append((ds, m, c, r, tool, peaks_with_control, peaks_without_control, len_with_control, len_without_control,
                     d(len_with_control, peaks_with_control), d(len_without_control, peaks_without_control),
                     overlap_len, jaccard))
df_jaccard = pd.DataFrame(rows, columns=['dataset', 'modification', 'cell', 'replicate', 'tool',
                                         'peaks_with_control', 'peaks_without_control',
                                         'len_with_control', 'len_without_control',
                                         'avg_len_with_control', 'avg_len_without_control',
                                         'overlap_len', 'jaccard'])
del rows
df_jaccard.sample(min(3, len(df_jaccard)))

In [None]:
# Jaccard between peaks without/with control, one figure per dataset
for ds in ['ENCODE', 'Roadmap']:
    print(ds)
    fig, ax = plt.subplots(figsize=(6, 4))
    shared = dict(data=df_jaccard[df_jaccard['dataset'] == ds],
                  x='modification', y='jaccard', hue='tool',
                  order=['H3K4me3', 'H3K27ac', 'H3K4me1', 'H3K27me3', 'H3K36me3'],
                  hue_order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
                  ax=ax)
    sns.boxplot(showfliers=False, **shared)
    # Individual samples drawn on top of the boxes
    sns.stripplot(dodge=True, size=2, color="black", alpha=0.5, palette='dark:black',
                  legend=False, **shared)
    ax.set_ylabel('jaccard')
    ax.tick_params(axis='x', rotation=45)
    ax.set_ylim(-0.05, 1.05)
    ax.set_title('Jaccard between peaks without/with control')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.show()

In [None]:
# SPAN H3K27me3 samples with almost no agreement between the two calls
df_jaccard.loc[
    (df_jaccard['tool'] == 'SPAN')
    & (df_jaccard['modification'] == 'H3K27me3')
    & (df_jaccard['jaccard'] < 0.1)
]

In [None]:
# Jaccard between peaks without/with control, both datasets pooled
print('ENCODE + Roadmap')
fig, ax = plt.subplots(figsize=(6, 4))
shared = dict(data=df_jaccard,
              x='modification', y='jaccard', hue='tool',
              order=['H3K4me3', 'H3K27ac', 'H3K4me1', 'H3K27me3', 'H3K36me3'],
              hue_order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
              ax=ax)
sns.boxplot(showfliers=False, **shared)
# Individual samples drawn on top of the boxes
sns.stripplot(dodge=True, size=3, color="black", alpha=0.5, palette='dark:black',
              legend=False, **shared)
ax.set_ylabel('jaccard')
ax.tick_params(axis='x', rotation=45)
ax.set_ylim(-0.05, 1.05)
ax.set_title('Jaccard between peaks without/with control')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Roadmap H3K27me3 samples (any tool) with low agreement between the two calls
df_jaccard.loc[
    (df_jaccard['dataset'] == 'Roadmap')
    & (df_jaccard['modification'] == 'H3K27me3')
    & (df_jaccard['jaccard'] < 0.2)
]

In [None]:
# union_len: length covered by either call; overlap_diff: length covered by exactly one call
df_jaccard['union_len'] = (
    df_jaccard['len_with_control']
    .add(df_jaccard['len_without_control'])
    .sub(df_jaccard['overlap_len'])
)
df_jaccard['overlap_diff'] = (
    df_jaccard['len_with_control']
    .add(df_jaccard['len_without_control'])
    .sub(2 * df_jaccard['overlap_len'])
)

In [None]:
# Jaccard vs. non-shared length per tool, one panel per modification, one figure per dataset
for ds in ['ENCODE', 'Roadmap']:
    print(ds)
    fig, axs = plt.subplots(1, 5, figsize=(15, 3))
    for i, (m, ax) in enumerate(zip(MODIFICATIONS, axs)):
        panel = df_jaccard[(df_jaccard['dataset'] == ds) &
                           (df_jaccard['modification'] == m)]
        shared = dict(data=panel,
                      palette=TOOLS_PALETTE,
                      hue_order=TOOLS,
                      x='jaccard', y='overlap_diff', hue='tool',
                      ax=ax)
        # Density estimate as background, individual samples on top
        sns.kdeplot(alpha=0.5, thresh=0.1, fill=True, legend=False, **shared)
        sns.scatterplot(alpha=0.8, **shared)
        ax.set_title(m)
        if i > 0:
            ax.set_ylabel(None)
        if i == len(axs) - 1:
            # Only the last panel keeps its legend, placed to the right of the axis
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            ax.legend().set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# Jaccard vs. average peak length per tool, both datasets pooled, one panel per modification
print('ENCODE + Roadmap')
# Mean of the average peak lengths of the two calls
df_jaccard['avg_length'] = (df_jaccard['avg_len_with_control'] + df_jaccard['avg_len_without_control']) / 2
fig, axs = plt.subplots(1, 5, figsize=(15, 3))
for i, (m, ax) in enumerate(zip(MODIFICATIONS, axs)):
    shared = dict(data=df_jaccard[(df_jaccard['modification'] == m)],
                  palette=TOOLS_PALETTE,
                  hue_order=TOOLS,
                  x='jaccard', y='avg_length', hue='tool',
                  ax=ax)
    # Density estimate as background, individual samples on top
    sns.kdeplot(alpha=0.3, thresh=0.01, fill=True, legend=False, **shared)
    sns.scatterplot(style='dataset', alpha=0.8, **shared)
    ax.set_title(m)
    if i > 0:
        ax.set_ylabel(None)
    if i == len(axs) - 1:
        # Only the last panel keeps its legend, placed to the right of the axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    else:
        ax.legend().set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Long format: every df_jaccard row yields two rows, one with the average peak
# length with control and one with the average peak length without control.
df_jaccard_avg_len = pd.DataFrame(
    [
        (ds, modification, tool, jaccard, average_len)
        for ds, modification, tool, jaccard, avlc, avlwc in df_jaccard[
            ['dataset', 'modification', 'tool', 'jaccard', 'avg_len_with_control', 'avg_len_without_control']
        ].itertuples(index=False, name=None)
        for average_len in (avlc, avlwc)
    ],
    columns=['dataset', 'modification', 'tool', 'jaccard', 'average_len'],
)

In [None]:
# K27me3 outliers HepG2 rep1 and HMEC rep2
df_jaccard.loc[
    (df_jaccard['dataset'] == 'ENCODE')
    & (df_jaccard['modification'] == 'H3K27me3')
    & (df_jaccard['tool'] == 'SPAN')
]

# Load BAMS

In [None]:
def load_bams(path, modifications, cells, replicates, control='Input'):
    """Index the BAM files of a folder and count their mapped reads.

    A ``.bam`` file is kept when its name contains a known cell, replicate and
    modification. Each attribute is the FIRST entry of the corresponding list
    that is a substring of the file name. Files matching no modification but
    containing ``control`` are recorded with modification ``'Control'``.

    :param path: folder with BAM files.
    :param modifications: candidate modification names.
    :param cells: candidate cell names.
    :param replicates: candidate replicate names (``''`` matches every name).
    :param control: substring marking control files, or None to ignore controls.
    :return: DataFrame with columns ``bam_file``, ``modification``, ``cell``,
        ``replicate``, ``reads``, all of dtype object.
    :raises subprocess.CalledProcessError: when ``samtools`` exits with an error.
    """
    rows = []
    # Sorted listing keeps the row order reproducible between runs
    for f in tqdm(sorted(os.listdir(path))):
        if not f.endswith('.bam'):
            continue
        cell = next((c for c in cells if c in f), None)
        rep = next((r for r in replicates if r in f), None)
        mod = next((m for m in modifications if m in f), None)
        if mod is None and control is not None and control in f:
            mod = 'Control'
        # rep may legitimately be '' (falsy), hence the explicit None check
        if cell and mod and rep is not None:
            file = os.path.join(path, f)
            # -F 0x04 skips unmapped reads, -c prints only the number of records.
            # Argument list instead of a shell string: paths with spaces are safe.
            output = subprocess.check_output(['samtools', 'view', '-F', '0x04', '-c', file], text=True)
            reads = int(output.splitlines()[0])
            rows.append((file, mod, cell, rep, reads))
    # Build the frame once instead of appending row by row (quadratic)
    return pd.DataFrame(rows, columns=['bam_file', 'modification', 'cell', 'replicate', 'reads'],
                        dtype=object)

In [None]:
# BAM files of both datasets; control files are named 'Input' in ENCODE and 'Control' in Roadmap
df_encode_bams = load_bams(
    GSE26320_PATH + '/bams', MODIFICATIONS, GSE26320_CELLS, GSE26320_REPS, 'Input'
).assign(dataset='ENCODE')
df_immune_bams = load_bams(
    IMMUNE_PATH + '/bams', MODIFICATIONS, IMMUNE_CELLS, IMMUNE_REPS, 'Control'
).assign(dataset='Roadmap')

df_bams = pd.concat([df_encode_bams, df_immune_bams], ignore_index=True)


In [None]:
# Mapped reads per library, grouped by dataset and modification (controls included)
fig, ax = plt.subplots(figsize=(6, 4))
shared = dict(data=df_bams, hue='modification', x='dataset', y='reads',
              order=['ENCODE', 'Roadmap'],
              hue_order=MODIFICATIONS + ['Control'],
              ax=ax)
sns.barplot(capsize=.05, err_kws={'linewidth': 2}, edgecolor="black", **shared)
# Individual libraries drawn on top of the bars
sns.stripplot(dodge=True, size=3, palette='dark:black', alpha=0.5, legend=False, **shared)
ax.set_title('Library size')
# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.tight_layout()
plt.show()

In [None]:
def _find_control(dataset, cell, replicate):
    """Return (control_replicate, control_bam_file) for a sample, or None if no control exists.

    The control of the same replicate is preferred; otherwise the control
    without a replicate token ('') is used.
    """
    controls = df_bams[(df_bams['modification'] == 'Control') & (df_bams['cell'] == cell) &
                       (df_bams['dataset'] == dataset)]
    for candidate in (replicate, ''):  # Control can be different for Roadmap
        ct = controls[controls['replicate'] == candidate]
        if len(ct) > 0:
            return candidate, ct['bam_file'].values[0]
    return None


# Pair the two replicates of every (dataset, modification, cell) together with
# their control BAMs; only pairs with two DIFFERENT controls are kept.
rows = []
for (ds, m, c), group in tqdm(
        df_bams[(df_bams['modification'] != 'Control')].groupby(['dataset', 'modification', 'cell'])):
    if len(group) != 2:
        continue
    # Sort so that replicate1 / replicate2 do not depend on the file listing order
    group = group.sort_values(by='replicate')
    file1, file2 = group['bam_file'].values[0], group['bam_file'].values[1]
    r1, r2 = group['replicate'].values[0], group['replicate'].values[1]
    found1, found2 = _find_control(ds, c, r1), _find_control(ds, c, r2)
    if found1 is None or found2 is None:
        continue
    (cr1, control1), (cr2, control2) = found1, found2
    if control1 == control2:
        continue  # Ignore same control for replicates
    rows.append((ds, m, c, r1, file1, cr1, control1, r2, file2, cr2, control2))
df_controls = pd.DataFrame(rows, columns=['dataset', 'modification', 'cell',
                                          'replicate1', 'file1', 'control_replicate1', 'control_file1',
                                          'replicate2', 'file2', 'control_replicate2', 'control_file2'])
# sample(n) raises when fewer than n rows are available
df_controls.sample(min(5, len(df_controls)))

In [None]:
# Attach to every replicate pair: peaks per tool (called with control), signal
# reads and control reads of both replicates.
peaks_ctl = df_peaks[df_peaks['control'] == True][['dataset', 'modification', 'cell', 'replicate', 'tool', 'peaks']]
reads_signal = df_bams[['dataset', 'modification', 'cell', 'replicate', 'reads']]
reads_ctl = df_bams[df_bams['modification'] == 'Control'][['dataset', 'cell', 'replicate', 'reads']]

t = (
    df_controls
    # Peaks of replicate 1; this merge also introduces the 'tool' column
    .merge(peaks_ctl,
           left_on=['dataset', 'modification', 'cell', 'replicate1'],
           right_on=['dataset', 'modification', 'cell', 'replicate'])
    .rename(columns={'peaks': 'rep1_peaks'})
    .drop(columns='replicate')
    # Peaks of replicate 2 for the same tool
    .merge(peaks_ctl,
           left_on=['dataset', 'modification', 'cell', 'tool', 'replicate2'],
           right_on=['dataset', 'modification', 'cell', 'tool', 'replicate'])
    .rename(columns={'peaks': 'rep2_peaks'})
    .drop(columns='replicate')
    # Signal library sizes
    .merge(reads_signal,
           left_on=['dataset', 'modification', 'cell', 'replicate1'],
           right_on=['dataset', 'modification', 'cell', 'replicate'])
    .rename(columns={'reads': 'rep1_reads'})
    .drop(columns='replicate')
    .merge(reads_signal,
           left_on=['dataset', 'modification', 'cell', 'replicate2'],
           right_on=['dataset', 'modification', 'cell', 'replicate'])
    .rename(columns={'reads': 'rep2_reads'})
    .drop(columns='replicate')
    # Control library sizes
    .merge(reads_ctl,
           left_on=['dataset', 'cell', 'control_replicate1'],
           right_on=['dataset', 'cell', 'replicate'])
    .rename(columns={'reads': 'rep1_ctrl_reads'})
    .drop(columns='replicate')
    .merge(reads_ctl,
           left_on=['dataset', 'cell', 'control_replicate2'],
           right_on=['dataset', 'cell', 'replicate'])
    .rename(columns={'reads': 'rep2_ctrl_reads'})
    .drop(columns='replicate')
)

t.head(5)

In [None]:
# ENCODE only: signal reads vs. control reads of both replicates, one figure
# per tool and one panel per modification; marker size encodes the number of peaks.
sns.set_theme(style='white')
for tool in TOOLS:
    print(tool)
    plt.figure(figsize=(24, 4))
    axs = [plt.subplot(1, 5, i + 1) for i in range(5)]
    for i, m in enumerate(MODIFICATIONS):
        ax = axs[i]
        ax.set_title(m)
        tt = t[(t['dataset'] == 'ENCODE') & (t['modification'] == m) & (t['tool'] == tool)]
        if tt.empty:
            # np.min / np.max below raise on empty input; leave the panel blank
            continue
        rep1_peaks, rep1_reads, rep1_ctrl_reads = tt['rep1_peaks'], tt['rep1_reads'], tt['rep1_ctrl_reads']
        rep2_peaks, rep2_reads, rep2_ctrl_reads = tt['rep2_peaks'], tt['rep2_reads'], tt['rep2_ctrl_reads']
        # Common range for both axes, so the dotted diagonal marks signal == control
        minx = min(np.min(rep1_reads), np.min(rep2_reads))
        maxx = max(np.max(rep1_reads), np.max(rep2_reads))
        miny = min(np.min(rep1_ctrl_reads), np.min(rep2_ctrl_reads))
        maxy = max(np.max(rep1_ctrl_reads), np.max(rep2_ctrl_reads))
        minxy = min(minx, miny)
        maxxy = max(maxx, maxy)
        sns.lineplot(x=[minxy, maxxy], y=[minxy, maxxy], color='grey', ax=ax, alpha=0.5, linestyle='dotted')
        sns.scatterplot(x=rep1_reads.to_list() + rep2_reads.to_list(),
                        y=rep1_ctrl_reads.to_list() + rep2_ctrl_reads.to_list(),
                        alpha=0.8, ax=ax,
                        size=rep1_peaks.to_list() + rep2_peaks.to_list(),
                        color='blue')
        # Connect the two replicates of the same cell
        for r1r, r1cr, r2r, r2cr in zip(rep1_reads, rep1_ctrl_reads, rep2_reads, rep2_ctrl_reads):
            sns.lineplot(x=[r1r, r2r], y=[r1cr, r2cr], color='green', ax=ax, alpha=0.5, linestyle='dotted')
        # Label every point; cell and replicate must come from the filtered frame tt,
        # otherwise they are not aligned with the plotted points
        for reads, ctrl_reads, reps in ((rep1_reads, rep1_ctrl_reads, tt['replicate1']),
                                        (rep2_reads, rep2_ctrl_reads, tt['replicate2'])):
            for xv, yv, cell, rep in zip(reads, ctrl_reads, tt['cell'], reps):
                rep = '1' if rep == 'rep1' else '2' if rep == 'rep2' else ''
                ax.text(xv, yv, f'{cell} {rep}', fontsize=5)
        offset = 0.1 * (maxxy - minxy)
        ax.set_xlabel('Signal reads')
        ax.set_ylabel('Control reads')
        ax.set_xlim(minxy - offset, maxxy + offset)
        ax.set_ylim(minxy - offset, maxxy + offset)
        if i > 0:
            ax.set_ylabel(None)
        # Put a legend to the right of the current axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.show()
sns.set_theme(style='whitegrid')

# Compare replicates

In [None]:
def compute_reps_peaks(df_peaks, modifications, cells):
    """Pair the peak counts of two replicates per dataset, modification, cell and tool.

    Only groups with exactly two rows are reported; within a pair the rows are
    ordered by replicate name.

    :param df_peaks: DataFrame with columns ``dataset``, ``modification``,
        ``cell``, ``tool``, ``replicate`` and ``peaks``.
    :param modifications: modifications to keep.
    :param cells: cells to keep.
    :return: DataFrame with columns ``dataset``, ``modification``, ``cell``,
        ``tool``, ``replicate1``, ``replicate2``, ``peaks1``, ``peaks2``.
    """
    rows = []
    t = df_peaks[(df_peaks['modification'].isin(modifications)) & (df_peaks['cell'].isin(cells))]
    for (ds, m, c, to), group in t.groupby(['dataset', 'modification', 'cell', 'tool']):
        if len(group) != 2:
            continue
        # Sort a copy: in-place sorting of a groupby slice mutates a derived frame
        group = group.sort_values(by='replicate')
        rep1, rep2 = group['replicate'].values[0], group['replicate'].values[1]
        peaks1, peaks2 = group['peaks'].values[0], group['peaks'].values[1]
        rows.append((ds, m, c, to, rep1, rep2, peaks1, peaks2))
    return pd.DataFrame(rows, columns=['dataset', 'modification', 'cell', 'tool', 'replicate1', 'replicate2', 'peaks1', 'peaks2'])

In [None]:
# ENCODE peaks called with control (the original setup)
df_encode_peaks_original = df_peaks.loc[
    (df_peaks['dataset'] == 'ENCODE') & (df_peaks['control'] == True)
].copy()

In [None]:
# Peak counts of replicate pairs, ENCODE
df_encode_reps = compute_reps_peaks(
    df_peaks=df_encode_peaks_original,
    modifications=MODIFICATIONS,
    cells=GSE26320_CELLS,
)
df_encode_reps.sample(5)

In [None]:
# Relative difference in the number of peaks between replicates, ENCODE
print('ENCODE')
# |peaks1 - peaks2| / (peaks1 + peaks2)
df_encode_reps['reps_peaks_diff'] = (
    (df_encode_reps['peaks1'] - df_encode_reps['peaks2']) /
    (df_encode_reps['peaks1'] + df_encode_reps['peaks2'])
).abs()
fig, ax = plt.subplots(figsize=(8, 3))
shared = dict(data=df_encode_reps, x='modification', hue='tool', y='reps_peaks_diff',
              order=MODIFICATIONS,
              hue_order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
              ax=ax)
sns.boxplot(palette=TOOLS_PALETTE, showfliers=False, **shared)
# Individual replicate pairs drawn on top of the boxes
sns.stripplot(dodge=True, size=3, palette='dark:black', alpha=0.8, legend=False, **shared)
ax.set_title('Peaks difference between replicates')
ax.set_ylabel('Abs difference')
ax.set_xlabel('Modification')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Peak counts of replicate pairs, Roadmap (peaks called with control)
df_immune_peaks_original = df_peaks.loc[
    (df_peaks['dataset'] == 'Roadmap') & (df_peaks['control'] == True)
].copy()
df_immune_reps = compute_reps_peaks(
    df_peaks=df_immune_peaks_original,
    modifications=MODIFICATIONS,
    cells=IMMUNE_CELLS,
)
df_immune_reps.sample(5)

In [None]:
# Relative difference in the number of peaks between replicates, Roadmap
print('Roadmap')
# |peaks1 - peaks2| / (peaks1 + peaks2)
df_immune_reps['reps_peaks_diff'] = (
    (df_immune_reps['peaks1'] - df_immune_reps['peaks2']) /
    (df_immune_reps['peaks1'] + df_immune_reps['peaks2'])
).abs()
fig, ax = plt.subplots(figsize=(8, 3))
shared = dict(data=df_immune_reps, x='modification', hue='tool', y='reps_peaks_diff',
              order=MODIFICATIONS,
              hue_order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
              ax=ax)
sns.boxplot(palette=TOOLS_PALETTE, showfliers=False, **shared)
# Individual replicate pairs drawn on top of the boxes
sns.stripplot(dodge=True, size=3, palette='dark:black', alpha=0.8, legend=False, **shared)
ax.set_title('Peaks difference between replicates')
ax.set_ylabel('Abs difference')
ax.set_xlabel('Modification')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Relative difference in the number of peaks between replicates, both datasets pooled
print('ENCODE + Roadmap')
t = pd.concat([df_encode_reps, df_immune_reps], ignore_index=True)
fig, ax = plt.subplots(figsize=(6, 3))
shared = dict(data=t, x='modification', hue='tool', y='reps_peaks_diff',
              order=MODIFICATIONS,
              hue_order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
              ax=ax)
sns.boxplot(palette=TOOLS_PALETTE, showfliers=False, **shared)
# Individual replicate pairs drawn on top of the boxes
sns.stripplot(dodge=True, size=2, palette='dark:black', alpha=0.8, legend=False, **shared)
ax.set_title('Peaks difference between replicates')
ax.set_ylabel('Abs difference')
ax.set_xlabel('Modification')
ax.tick_params(axis='x', rotation=45)
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Same as the pooled replicate comparison, but without SPAN
print('ENCODE + Roadmap NO SPAN')
t = pd.concat([df_encode_reps, df_immune_reps], ignore_index=True)
fig, ax = plt.subplots(figsize=(6, 3))
shared = dict(data=t, x='modification', hue='tool', y='reps_peaks_diff',
              order=MODIFICATIONS,
              hue_order=['MACS2', 'MACS2 broad', 'SICER'],
              ax=ax)
sns.boxplot(palette=TOOLS_PALETTE, showfliers=False, **shared)
# Individual replicate pairs drawn on top of the boxes
sns.stripplot(dodge=True, size=2, palette='dark:black', alpha=0.8, legend=False, **shared)
ax.set_title('Peaks difference between replicates')
ax.set_ylabel('Abs difference')
ax.set_xlabel('Modification')
ax.tick_params(axis='x', rotation=45)
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

# END