# 2021 Comparative analysis

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os, re
import glob
from tqdm.auto import tqdm
import subprocess, tempfile

import plotly.graph_objects as go
import plotly.express as px

In [None]:
# H3K27ac is left out: single replicate, bad quality.
# NOTE(review): the original note named H3K27ac twice - confirm whether a second mark was meant.
MODIFICATIONS = [
    'H3K27me3',
    'H3K4me3',
    'H3K4me1',
    'H3K36me3',
    'H3K9me3',
    'H3K9ac',
    'H3K79me2',
    'H3K56ac',
]

# MACS2

In [None]:
MACS2_FOLDER='/mnt/stripe/shpynov/2021_comparative_analysis/macs2/'
MACS2_LEVELS = ['q0.05', 'broad0.05']

dfm = pd.DataFrame(columns=['modification', 'replicate', 'level', 'file', 'peaks', 'length'], dtype=object)
for file in tqdm(glob.glob(MACS2_FOLDER + '/*.*Peak')):
    if 'gapped' in file:
        continue
    level = next((l for l in MACS2_LEVELS if f'_{l}' in file), None)
    modification = next((m for m in MODIFICATIONS if f'_{m}' in file), None)
    if level and modification:
        replicate = os.path.basename(file).replace(f'H1_{modification}_', '')[:3]
        out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
        if out[0].strip() != '':
            peaks, length = out[0].split(' ') 
        else:
            peaks, length = 0, 0
        dfm.loc[len(dfm)] = (modification, replicate, f'macs2 {level}', file, peaks, length)
        
# Fix types
dfm['peaks'] = dfm['peaks'].astype(int)
dfm['length'] = dfm['length'].astype(int)

In [None]:
# Show the collected MACS2 statistics (one row per peak file)
dfm

In [None]:
# MACS2 level labels kept for the cross-tool summary
macs2levels2process = {'macs2 q0.05', 'macs2 broad0.05'}

# SICER

In [None]:
SICER_FOLDER='/mnt/stripe/shpynov/2021_comparative_analysis//sicer'
SICER_LEVELS = ['FDR0.05']

dfsc = pd.DataFrame(columns=['modification', 'replicate', 'level', 'file', 'peaks', 'length'], dtype=object)
for file in tqdm(glob.glob(SICER_FOLDER + '/*islands-summary*')):
    level = next((l for l in SICER_LEVELS if f'-{l}' in file), None)
    modification = next((m for m in MODIFICATIONS if f'_{m}' in file), None)
    if level and modification:
        replicate = os.path.basename(file).replace(f'H1_{modification}_', '')[:3]
        out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
        if out[0].strip() != '':
            peaks, length = out[0].split(' ') 
        else:
            peaks, length = 0, 0
        dfsc.loc[len(dfsc)] = (modification, replicate, f'sicer {level}', file, peaks, length)

# Fix types
dfsc['peaks'] = dfsc['peaks'].astype(int)
dfsc['length'] = dfsc['length'].astype(int)

In [None]:
# Show the collected SICER statistics (one row per islands-summary file)
dfsc

In [None]:
# SICER level labels kept for the cross-tool summary
sicerlevels2process = {'sicer FDR0.05'}

# SPAN

In [None]:
SPAN_FOLDER='/mnt/stripe/shpynov/2021_comparative_analysis/span'
GAPS = [5]
FDRS = ['0.05']
# SPAN_LEVELS = ['200_1E-6_5', '200_0.01_5']
SPAN_LEVELS = []
for gap in GAPS:
    for fdr in FDRS:
        SPAN_LEVELS.append(f'200_{fdr}_{gap}')

dfs = pd.DataFrame(columns=['modification', 'replicate', 'level', 'file', 'peaks', 'length'], dtype=object)
for file in tqdm(glob.glob(SPAN_FOLDER + '/*.peak')):
    if 'Input' in file:
        continue
    level = next((l for l in SPAN_LEVELS if f'_{l}' in file), None)
    modification = next((m for m in MODIFICATIONS if f'_{m}' in file), None)
    if level and modification:
        replicate = os.path.basename(file).replace(f'H1_{modification}_', '')[:3]
        out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
        if out[0].strip() != '':
            peaks, length = out[0].split(' ') 
        else:
            peaks, length = 0, 0
        dfs.loc[len(dfs)] = (modification, replicate, f'span {level}', file, peaks, length)
        
# Fix types
dfs['peaks'] = dfs['peaks'].astype(int)
dfs['length'] = dfs['length'].astype(int)

In [None]:
# Show the collected SPAN statistics (one row per peak file)
dfs

In [None]:
# Level labels have the form 'span 200_<fdr>_<gap>'. Splitting on '_' recovers
# both parameters for any gap value, without a hard-coded list of gaps.
span_level_parts = [l.split('_') for l in dfs['level']]
dfs['fdr'] = [float(parts[-2]) for parts in span_level_parts]
dfs['gap'] = [int(parts[-1]) for parts in span_level_parts]
# Rebind instead of sorting in place; the row order and index labels end up the same
dfs = dfs.sort_values(by=['fdr', 'gap'])
dfs.head()

In [None]:
# SPAN level labels kept for the cross-tool summary
spanlevels2process = {'span 200_0.05_5'}

# Summary

In [None]:
# Keep only the selected levels of every tool, then stack the three frames
selected_frames = [
    dfm.loc[dfm['level'].isin(macs2levels2process)],
    dfsc.loc[dfsc['level'].isin(sicerlevels2process)],
    dfs.loc[dfs['level'].isin(spanlevels2process)],
]
dfa = pd.concat(selected_frames)

In [None]:
# Mean peak length per file; a non-finite ratio (e.g. zero peaks) is reported as 0.0
length_per_peak = dfa['length'] / dfa['peaks']
dfa['avg_length'] = np.where(np.isfinite(length_per_peak), length_per_peak, 0.0)

In [None]:
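# # List files to create a session (disabled; it relies on CELLS and a 'cell' column that the visible part of this notebook does not define)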
# for m in MODIFICATIONS:
#     for c in CELLS:
#         bw = glob.glob(f'/mnt/stripe/shpynov/2020_roadmapepigenomics/bams_bws/*{c}.{m}.*.bw')[0]
#         print(bw)
#         dfcm = dfa.loc[np.logical_and(dfa['cell']==c, dfa['modification']==m)]
#         for l in sorted(set(dfa['level'])):
#             peaks = list(dfcm.loc[dfcm['level'] == l]['file'])
#             if peaks:
#                 peaks = peaks[0]
#                 print(f'{os.path.dirname(peaks)}/bb/{os.path.basename(peaks)}.bb')

# Group analysis

In [None]:
def plot_data_cells(df, cid, value, description):
    """Draw one bar + swarm panel per modification, side by side, on a shared y-range.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'modification' column plus the `cid` and `value` columns.
    cid : str
        Column whose distinct values become the bars inside every panel.
    value : str
        Column plotted on the y axis.
    description : str
        Y-axis label; only the leftmost panel shows it.

    A new pyplot figure is created and nothing is returned, so the caller
    follows up with plt.show() or plt.savefig().
    """
    cids = sorted(set(df[cid]))
    panel_width = len(cids)  # grid columns reserved for every panel
    total = panel_width * len(MODIFICATIONS)
    plt.figure(figsize=(int(total * .5), 4))

    panels = []
    ylims = []  # y-ranges of the panels that actually contain data
    for idx, m in enumerate(MODIFICATIONS):
        data = df.loc[df['modification'] == m].sort_values(by=[cid])
        # Distinct cid values in plotting order (data is sorted by cid)
        xlabels = list(dict.fromkeys(data[cid]))
        ax = plt.subplot2grid((1, total), (0, idx * panel_width), colspan=panel_width)
        panels.append(ax)

        # A modification without rows keeps an empty, titled panel; seaborn is
        # not asked to draw empty data.
        if not data.empty:
            # NOTE(review): errwidth is deprecated in recent seaborn releases in
            # favour of err_kws - confirm against the installed version.
            sns.barplot(data=data,
                        x=cid, y=value,
                        capsize=.2, errwidth=2,
                        edgecolor="black",
                        ax=ax)
            sns.swarmplot(data=data,
                          x=cid, y=value,
                          size=1,
                          color="black",
                          alpha=0.5,
                          ax=ax)
            ylims.append(ax.get_ylim())

        # Hide a legend only if one exists, rather than creating one to hide it
        legend = ax.get_legend()
        if legend is not None:
            legend.set_visible(False)

        if idx > 0:
            # Only the leftmost panel carries y tick labels and the axis label
            ax.get_yaxis().set_ticklabels([])
            ax.set_ylabel('')
        else:
            ax.set_ylabel(description)

        ax.set_xlabel('')
        ax.set_title(m)
        ax.set_xticks(range(len(xlabels)))
        ax.set_xticklabels(xlabels, rotation=45)

    if ylims:
        # Common y-range so bar heights are comparable across panels
        ymin = np.min([lim[0] for lim in ylims])
        ymax = np.max([lim[1] for lim in ylims])
        for ax in panels:
            ax.set_ylim(bottom=ymin, top=ymax)
    plt.tight_layout()

In [None]:
# One figure per metric: (column to plot, y-axis label)
for value, description in [('peaks', 'Peaks'), ('avg_length', 'Average peak length')]:
    plot_data_cells(dfa, 'level', value, description)
    plt.show()

# Consistency analysis

In [None]:
import tempfile

def bedl(file):
    """Return the summed interval length of a header-less, tab-separated peak file.

    The length of a row is column 2 minus column 1 (0-based positions, i.e.
    the third and second fields of the file).

    Returns 0. when the file is empty. Any other problem (missing file,
    malformed content, interrupt) propagates instead of being reported as a
    zero length.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return 0.  # Empty file
    return (tf[2] - tf[1]).sum()

tools_comparison = pd.DataFrame(columns=['modification', 'replicate', 'tool1', 'tool2',
                                         'peaks1', 'peaks2', 'length1', 'length2',
                                         'overlap1', 'overlap2', 'intersect'], dtype=object)

tools = list(sorted(set(dfa['level'])))
for m in tqdm(sorted(set(dfa['modification']))):
    print(m)
    t = dfa.loc[dfa['modification'] == m]
    for r in sorted(set(t['replicate'])):
        for i in range(len(tools)):
            for j in range(i + 1, len(tools)):
                t1 = t.loc[(t['replicate'] == r) & (t['level'] == tools[i])]
                t2 = t.loc[(t['replicate'] == r) & (t['level'] == tools[j])]
                m = t1['modification'].values[0]
                file1 = t1['file'].values[0]
                peaks1 = t1['peaks'].values[0]
                length1 = bedl(file1)
                file2 = t2['file'].values[0]
                peaks2 = t2['peaks'].values[0]
                length2 = bedl(file2)
                overlap1 = !bedtools intersect -a {file1} -b {file2} -wa -u | wc -l
                overlap1 = int(overlap1[0])
                overlap2 = !bedtools intersect -b {file1} -a {file2} -wa -u | wc -l            
                overlap2 = int(overlap2[0])            
                tf = tempfile.mktemp() 
                !bedtools intersect -b {file1} -a {file2} > {tf}
                intersectionl = bedl(tf)
                tools_comparison.loc[len(tools_comparison)] = \
                    (m, r, tools[i], tools[j], peaks1, peaks2, length1, length2, 
                     overlap1, overlap2, intersectionl)
            
tools_comparison

In [None]:
# Share of each tool's peaks that overlap the other tool, in both directions
overlap_rows = []
for _, row in tools_comparison.iterrows():
    m, r = row['modification'], row['replicate']
    t1, t2 = row['tool1'], row['tool2']
    directions = ((t1, t2, row['overlap1'], row['peaks1']),
                  (t2, t1, row['overlap2'], row['peaks2']))
    for first, second, n_overlap, n_peaks in directions:
        share = n_overlap / n_peaks if n_peaks != 0 else 0
        overlap_rows.append((m, r, f'Comparison {first} vs {second}', share))

to = pd.DataFrame(overlap_rows,
                  columns=['Modification', 'Replicate', 'Comparison', 'Overlap'],
                  dtype=object)

display(to)

# Plot only the comparisons that involve SPAN
span_mask = ['span' in c for c in to['Comparison']]
plt.figure(figsize=(18, 4))
sns.barplot(data=to.loc[span_mask],
            x='Modification', y='Overlap', hue='Comparison',
            capsize=.1, errwidth=2, edgecolor="black")
plt.show()

In [None]:
# Jaccard index of the covered length for every pair of tools:
# intersection / (length1 + length2 - intersection)
jaccard_rows = []
for _, row in tools_comparison.iterrows():
    m = row['modification']
    r = row['replicate']
    t1 = row['tool1']
    t2 = row['tool2']
    length1 = row['length1']
    length2 = row['length2']
    intersection = row['intersect']
    union = length1 + length2 - intersection
    # Explicit guard instead of a bare except: numpy scalars do not raise on a
    # zero division, so an empty union is reported as 0 in every case.
    jaccard = intersection / union if union > 0 else 0
    jaccard_rows.append((m, r, f'Comparison {t1} vs {t2}', jaccard))

tj = pd.DataFrame(jaccard_rows,
                  columns=['Modification', 'Replicate', 'Comparison', 'Jaccard'],
                  dtype=object)

display(tj)

# Plot only the comparisons that involve SPAN
plt.figure(figsize=(18, 4))
sns.barplot(data=tj.loc[['span' in c for c in tj['Comparison']]],
            x='Modification', y='Jaccard', hue='Comparison', capsize=.1, errwidth=2, edgecolor="black")
plt.show()