# SPAN Benchmark H3K4me3 H3K36me3 vs RNA-Seq

See `SPANBenchmarkH3K4me3H3K36me3Experiment` in `epigenome` project.
This experiment provides information per gene.
For each gene:
* `_TPM` / `_FPKM` - expression estimates provided by RSEM: TPM is transcripts per million, FPKM is fragments per kilobase of transcript per million mapped reads

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os

In [None]:
# Per-gene table of the experiment (tab-separated; lines starting with '#' are comments).
PATH = '/mnt/stripe/bio/experiments/span_benchmark_h3k36me3_h3K4me3/data.tsv'

df = pd.read_csv(PATH, sep='\t', comment='#')
# Fix strand records: a value of 1 becomes '+', anything else becomes '-'.
df['strand'] = np.where(df['strand'] == 1, '+', '-')
# Keep only protein coding genes. The explicit copy makes `df` an independent frame,
# so columns added to it by later cells do not raise SettingWithCopyWarning.
df = df.loc[df['coding'] == 1].copy()
print(f'Number of protein coding genes {len(df)}')
df.head()

In [None]:
# Location string "chr:start-end" and length (end - start) of every gene.
start_text = df['start'].astype(str)
end_text = df['end'].astype(str)
df['loc'] = df['chr'] + ':' + start_text + '-' + end_text
df['len'] = df['end'] - df['start']

# Reorder `df` itself from the longest gene to the shortest.
df.sort_values(by=['len'], ascending=False, inplace=True)

print('Longest genes')
longest_genes = df[['gene_symbol', 'loc', 'len']].reset_index(drop=True).head(200)
display(longest_genes)

print('Gene lengths')
print(df['len'].describe())

# Peaks statistics

```
for GAP in 0 5 10; do echo $GAP; for FDR in 0.1 0.01 1E-3 1E-6 1E-10 1E-15 1E-20; do echo $FDR; java -jar ../span-0.12.0.5096.jar analyze -m fit/wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1_wgEncodeBroadHistoneGm12878ControlStdAlnRep1_200#32f4e.span -cs /mnt/stripe/bio/genomes/hg19/hg19.chrom.sizes --fdr $FDR --gap $GAP --peaks wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1_${FDR}_${GAP}.peak | tee wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1_${FDR}_${GAP}.out;  done; done;
```

In [None]:
import glob
import re

DIR = '/mnt/stripe/shpynov/BenchmarkChIPseqPeakCallers_Code/H3K4me3DataFiltered'
MACS_PEAKS = DIR + "/H3K4me3_Rep2_EncodeAlign_TF_Calls/Test_peaks.narrowPeak"
SICER_PEAKS = DIR + "/wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1Filtered-W200-G600-islands-summary-FDR0.01"

# Collect the peaks of all callers into one long table `dfcallers`:
# one row per peak with location, length, caller label, significance and rpm.
peak_files = (
    [SICER_PEAKS]
    + glob.glob(DIR + "/H3K4me3_Rep2_EncodeAlign_TF_Calls*/*.narrowPeak")
    + glob.glob(DIR + "/H3K4me3_Rep2_EncodeAlign_TF_Calls*/*.broadPeak")
    + glob.glob(DIR + "/*.peak")
)

ts = []
for peaksfile in peak_files:
    name = os.path.basename(peaksfile)
    tf = pd.read_csv(peaksfile, sep='\t', header=None)

    # Detect the caller from the file name. The branches are mutually exclusive, and an
    # unknown file fails loudly instead of silently reusing `caller` from the previous file.
    if 'narrowPeak' in name or 'broadPeak' in name:
        # NOTE(review): narrow and broad MACS2 calls share one label here - confirm intended.
        caller = 'MACS2_q0.01'
        significance = tf[8]  # Minus log10 q
    elif 'islands-summary' in name:
        caller = 'SICER'
        significance = -np.log10(tf[7])  # Minus log10 p adj
    elif '.peak' in name:
        # Raw string: '\.' is a regex escape, not a Python string escape.
        caller = 'SPAN_' + re.sub(r'.*Rep1Filtered_|\.peak', '', name)
        significance = tf[8]  # Minus log10 q
    else:
        raise ValueError(f'Cannot detect peak caller for {peaksfile}')

    t = tf.iloc[:, :3].copy()
    t.columns = ['chr', 'start', 'end']
    t['len'] = t['end'] - t['start']
    t['name'] = name
    t['caller'] = caller
    t['significance'] = significance
    t.sort_values(by=['len'], ascending=False, inplace=True)
    t['loc'] = t['chr'] + ':' + t['start'].astype(str) + '-' + t['end'].astype(str)
    # After the descending sort the first row holds the longest peak.
    print(f'Longest peak {name} {caller}\t' + str(t['len'].iloc[0]))

    rpm = pd.read_csv(f'{peaksfile}.rpm', header=None)
    rpm.columns = ['rpm']
    # Assignment aligns on the row index, which sort_values kept from the peaks file.
    # Assumes the .rpm file lists one value per peak in peak-file order - TODO confirm.
    t['rpm'] = rpm['rpm']
    ts.append(t)

# ignore_index gives the combined table unique row labels (each file restarts at 0).
dfcallers = pd.concat(ts, ignore_index=True)
dfcallers.head()

In [None]:
# Peak length distribution per caller, one figure per SPAN gap setting.
# Callers whose label does not contain 'SPAN' are drawn on every figure.
for gap in [0, 5, 10]:
    fig, ax = plt.subplots(figsize=(10, 6))
    for caller in sorted(set(dfcallers['caller'])):
        # Of the SPAN runs keep only those whose label ends with the current gap.
        if 'SPAN' in caller and not re.match(f'SPAN.*_{gap}$', caller):
            continue
        tc = dfcallers.loc[dfcallers['caller'] == caller]
        sns.kdeplot(tc['len'], shade=True, label=caller, ax=ax)
    ax.set_xlim(0, 6000)
    # Explicit legend: without it the curves cannot be told apart when kdeplot adds none.
    ax.legend()
    # The gap in the title distinguishes the three otherwise identically titled figures.
    fig.suptitle(f'Peak lengths, SPAN gap={gap}')
    plt.show()

# Top peaks precision / recall vs expressed genes

In [None]:
# Promoter window: TSS_FLANK bp on both sides of the transcription start site (TSS).
# The TSS is taken as the gene start on the '+' strand and the gene end otherwise.
TSS_FLANK = 1000
# Genes with TPM above this threshold are treated as expressed.
MIN_EXPRESSED_TPM = 0.5

tss = np.where(df['strand'] == '+', df['start'], df['end'])
# Clip at 0 so that a gene close to the chromosome start cannot yield a negative BED start.
df['tss_start'] = np.clip(tss - TSS_FLANK, 0, None)
df['tss_end'] = tss + TSS_FLANK

# Sort into a new frame rather than in place: `df.loc[...]` is a slice of `df`, and
# sorting it in place raises SettingWithCopyWarning.
expressed = df.loc[df['TPM'] > MIN_EXPRESSED_TPM].sort_values(by=['chr', 'start', 'end'])
print(f'Expressed {len(expressed)} out of {len(df)} protein coding')

# Headerless BED file with the promoter windows, read by bedtools in a later cell.
expressed[['chr', 'tss_start', 'tss_end']].to_csv('/tmp/expressed_tss.bed', sep='\t', index=False, header=False)
expressed.head()

In [None]:
# Write the N most significant peaks of every caller to /tmp/{caller}_{N}.bed
# (sorted by chr, start, end) for N = 1000, 2000, ..., 15000.
for caller in tqdm(sorted(set(dfcallers['caller']))):
    # Rank the caller's peaks once; every cutoff below is a prefix of this ranking.
    ranked = dfcallers.loc[dfcallers['caller'] == caller].sort_values(
        by='significance', ascending=False
    )
    for i in range(1, 16):
        topn = ranked.head(1000 * i).sort_values(by=['chr', 'start', 'end'])
        topn[['chr', 'start', 'end']].to_csv(f'/tmp/{caller}_{1000*i}.bed', sep='\t', index=False, header=False)

In [None]:
overlapdf = pd.DataFrame(columns=['caller', 'xn', 'recovered_promoter_fraction', 'correct_peak_fraction'],
                        dtype=object)
for i in tqdm(range(1, 16)):
    for caller in sorted(set(dfcallers['caller'])):
        overlap = ! bedtools intersect -u -a /tmp/expressed_tss.bed -b /tmp/{caller}_{1000*i}.bed | wc -l
        overlap = int(overlap[0].strip())
        recovered_promoter_fraction = overlap / len(expressed)
        overlap = ! bedtools intersect -u -b /tmp/expressed_tss.bed -a /tmp/{caller}_{1000*i}.bed | wc -l
        overlap = int(overlap[0].strip())
        correct_peak_fraction = overlap / (1000 * i)
        overlapdf.loc[len(overlapdf)] = (caller, 1000*i, recovered_promoter_fraction, correct_peak_fraction)
overlapdf

In [None]:
import plotly.express as px

# One line per caller, one point per top-N cutoff (shown on hover).
fig = px.line(
    overlapdf,
    x="recovered_promoter_fraction",
    y="correct_peak_fraction",
    color="caller",
    hover_name="xn",
)
# Both axes hold fractions, so pin them to [0, 1].
for pin_axis in (fig.update_xaxes, fig.update_yaxes):
    pin_axis(range=[0, 1], row=1, col=1)
fig.show()

In [None]:
import plotly.graph_objects as go
import plotly.express as px

# Same curves as above, one figure per SPAN gap setting.
# Callers whose label does not contain 'SPAN' appear on every figure.
for gap in [0, 5, 10]:
    fig = go.Figure()
    for caller in sorted(set(overlapdf['caller'])):
        # Of the SPAN runs keep only those whose label ends with the current gap.
        if 'SPAN' in caller and not re.match(f'SPAN.*_{gap}$', caller):
            continue
        curve = overlapdf.loc[overlapdf['caller'] == caller]
        fig.add_trace(go.Scatter(x=curve["recovered_promoter_fraction"], y=curve["correct_peak_fraction"],
                                 mode='lines+markers',
                                 name=caller))
    # The gap in the title distinguishes the three otherwise untitled figures.
    fig.update_layout(title=f'Top peaks vs expressed genes, SPAN gap={gap}')
    fig.update_xaxes(range=[0, 1], title='recovered_promoter_fraction')
    fig.update_yaxes(range=[0, 1], title='correct_peak_fraction')
    fig.show()

# FRIP vs FDR

In [None]:
# FRIP and number of peaks for every SPAN run (FDR x gap).
# Files are read directly in Python, so a missing file raises FileNotFoundError with its path.
frip_records = []
for gap in [0, 5, 10]:
    for fdr in ['0.1', '0.01', '0.05', '1E-3', '1E-4', '1E-5', '1E-6',
                '1E-7', '1E-8', '1E-9', '1E-10', '1E-15', '1E-20', '1E-25', '1E-30', '1E-40']:
        peak_path = f'{DIR}/wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1Filtered_{fdr}_{gap}.peak'
        # The first line of the .frip file holds the FRIP value.
        with open(f'{peak_path}.frip') as frip_file:
            frip = float(frip_file.readline().strip())
        # One peak per line.
        with open(peak_path) as peak_file:
            peaks = sum(1 for _ in peak_file)
        frip_records.append((float(fdr), gap, frip, peaks))
# Build the frame once from the collected rows; columns get numeric dtypes.
fripdf = pd.DataFrame(frip_records, columns=['fdr', 'gap', 'frip', 'peaks'])
# fripdf

In [None]:
import plotly.graph_objects as go
import plotly.express as px

fripmacs2 = ! cat {MACS_PEAKS}.frip
fripmacs2 = float(fripmacs2[0].strip())
print(f'FRIP MACS2 {fripmacs2}')

fripsicer = ! cat {SICER_PEAKS}.frip
fripsicer = float(fripsicer[0].strip())
print(f'FRIP SICER {fripsicer}')

fig = go.Figure()
for gap in sorted(set(fripdf['gap'])):
    t = fripdf.loc[fripdf['gap']==gap]
    fig.add_trace(go.Scatter(x=np.log10(t["fdr"]), y=t["frip"], 
                             mode='lines+markers',
                             name=gap))

fig.add_shape(type="line", x0=-40, y0=fripmacs2, x1=0, y1=fripmacs2, line=dict(
                color="Black",
                width=1,
                dash="dot"))
fig.add_shape(type="line", x0=-40, y0=fripsicer, x1=0, y1=fripsicer, line=dict(
                color="Black",
                width=1,
                dash="dot"))

fig.update_xaxes(title='log10 fdr')
fig.update_yaxes(title='frip')
fig.show()

In [None]:
peaksmacs2 = ! cat {MACS_PEAKS} | wc -l
peaksmacs2 = int(peaksmacs2[0].strip())
print(f'Peaks MACS2 {peaksmacs2}')

peakssicer = ! cat {SICER_PEAKS} | wc -l
peakssicer = int(peakssicer[0].strip())
print(f'Peaks SICER {peakssicer}')

fig = go.Figure()
for gap in sorted(set(fripdf['gap'])):
    t = fripdf.loc[fripdf['gap']==gap]
    fig.add_trace(go.Scatter(x=np.log10(t["fdr"]), y=t["peaks"], 
                             mode='lines+markers',
                             name=gap))

fig.add_shape(type="line", x0=-40, y0=peaksmacs2, x1=0, y1=peaksmacs2, line=dict(
                color="Black",
                width=1,
                dash="dot"))
fig.add_shape(type="line", x0=-40, y0=peakssicer, x1=0, y1=peakssicer, line=dict(
                color="Black",
                width=1,
                dash="dot"))

fig.update_xaxes(title='log10 fdr')
fig.update_yaxes(title='peaks')
fig.show()

# Plot RPKM distribution vs Fdr and Gap

In [None]:
# rpkm: the peak's rpm value divided by the peak length expressed in kilobases.
peak_len_kb = dfcallers['len'].div(1000)
dfcallers['rpkm'] = dfcallers['rpm'].div(peak_len_kb)
dfcallers.head()

In [None]:
# RPKM distribution per caller, one figure per SPAN gap setting.
# Callers whose label does not contain 'SPAN' are drawn on every figure.
for gap in [0, 5, 10]:
    fig, ax = plt.subplots(figsize=(10, 5))
    for caller in sorted(set(dfcallers['caller'])):
        # Of the SPAN runs keep only those whose label ends with the current gap.
        if 'SPAN' in caller and not re.match(f'SPAN.*_{gap}$', caller):
            continue
        caller_peaks = dfcallers.loc[dfcallers['caller'] == caller]
        sns.kdeplot(caller_peaks['rpkm'], shade=True, label=caller, ax=ax)
    ax.set_xlim(0, 150)
    # Explicit legend: without it the curves cannot be told apart when kdeplot adds none.
    ax.legend()
    # The gap in the title distinguishes the three otherwise identically titled figures.
    fig.suptitle(f'RPKMs, SPAN gap={gap}')
    plt.show()

In [None]:
# Sorted so the listing is identical on every run (set iteration order of strings is not).
print(sorted(set(dfcallers['caller'])))

In [None]:
def _span_log10_fdr(label):
    """Return log10 of the number left after stripping 'SPAN_' and the trailing gap from a label."""
    return np.log10(float(re.sub('SPAN_|_(0|5|10)$', '', label)))


def _span_gap(label):
    """Return the text after the last underscore of a caller label."""
    return re.sub('.*_', '', label)


# SPAN rows: log10(FDR) and gap are parsed out of the caller label.
span_rows = ['SPAN' in label for label in dfcallers['caller']]
t = dfcallers.loc[span_rows].copy()
t['log10fdr'] = [_span_log10_fdr(label) for label in t['caller']]
t['gap'] = [_span_gap(label) for label in t['caller']]

# Reference callers get fixed `log10fdr` values and their own `gap` label.
# NOTE(review): 5 and 10 look like arbitrary positions for the x axis of the violin plot - confirm.
t2 = dfcallers.loc[dfcallers['caller'] == 'MACS2_q0.01'].assign(log10fdr=5, gap='MACS2')
t3 = dfcallers.loc[dfcallers['caller'] == 'SICER'].assign(log10fdr=10, gap='SICER')

In [None]:
# import plotly.express as px
# fig = px.box(pd.concat([t, t2, t3]), x="log10fdr", y="rpkm", color="gap")
# # fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
# fig.show()

In [None]:
# RPKM distribution at every log10fdr value, split by gap / reference caller.
violin_data = pd.concat([t, t2, t3])
fig, ax = plt.subplots(figsize=(12, 10))
sns.violinplot(data=violin_data, x="log10fdr", y="rpkm", hue="gap", ax=ax)
plt.show()

In [None]:
# Ten SPAN peaks (run 'SPAN_1E-20_0') with the highest RPKM
span_run = t.loc[t['caller'] == 'SPAN_1E-20_0']
span_top = span_run.sort_values(by=['rpkm'], ascending=False).head(10)
display(span_top[['loc', 'rpkm']])

In [None]:
# Ten MACS2 peaks with the highest RPKM
macs2_top = t2.sort_values(by=['rpkm'], ascending=False).head(10)
display(macs2_top[['loc', 'rpkm']])

In [None]:
# Ten SICER peaks with the highest RPKM
sicer_top = t3.sort_values(by=['rpkm'], ascending=False).head(10)
display(sicer_top[['loc', 'rpkm']])

# H3K36me3
```
for GAP in 0 5 10; do echo $GAP; for FDR in 0.1 0.01 1E-3 1E-6 1E-10 1E-15 1E-20; do echo $FDR; java -jar ../span-0.12.0.5096.jar analyze -m fit/wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1_wgEncodeBroadHistoneGm12878ControlStdAlnRep1_200#32f4e.span -cs /mnt/stripe/bio/genomes/hg19/hg19.chrom.sizes --fdr $FDR --gap $GAP --peaks wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1_${FDR}_${GAP}.peak | tee wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1_${FDR}_${GAP}.out;  done; done;
```

In [None]:
DIR = '/mnt/stripe/shpynov/BenchmarkChIPseqPeakCallers_Code/H3K36me3DataFiltered'
MACS_PEAKS = DIR + "/H3K36me3_Rep1_EncodeAlign_TF_Calls/Test_peaks.broadPeak"
SICER_PEAKS = DIR + "/wgEncodeBroadHistoneGm12878H3k36me3StdAlnRep1Filtered-W200-G600-islands-summary-FDR0.01"

# Collect the peaks of all callers into one long table `dfcallers`:
# one row per peak with location, length, caller label, significance, rpm and rpkm.
# Without recursive=True the '**' in the glob pattern behaves like '*' (one directory level).
peak_files = (
    [SICER_PEAKS]
    + glob.glob(DIR + "/**/*.broadPeak")
    + glob.glob(DIR + "/*.peak")
)

ts = []
for peaksfile in peak_files:
    name = os.path.basename(peaksfile)
    try:
        tf = pd.read_csv(peaksfile, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # An empty peaks file is kept as an empty table. Any other failure,
        # e.g. a missing file, is no longer swallowed.
        tf = pd.DataFrame(columns=range(10), dtype=object)

    # Detect the caller from the file name. The branches are mutually exclusive, and an
    # unknown file fails loudly instead of silently reusing `caller` from the previous file.
    if 'narrowPeak' in name:
        caller = 'MACS2_q0.01'
        significance = tf[8]  # Minus log10 q
    elif 'broadPeak' in name:
        caller = 'MACS2_q0.01broad'
        significance = tf[8]  # Minus log10 q
    elif 'islands-summary' in name:
        caller = 'SICER'
        # The small offset keeps log10 finite when the adjusted p-value is 0.
        significance = -np.log10(tf[7] + 10e-99)  # Minus log10 p adj
    elif '.peak' in name:
        # Raw string: '\.' is a regex escape, not a Python string escape.
        caller = 'SPAN_' + re.sub(r'.*Rep1Filtered_|\.peak', '', name)
        significance = tf[8]  # Minus log10 q
    else:
        raise ValueError(f'Cannot detect peak caller for {peaksfile}')

    t = tf.iloc[:, :3].copy()
    t.columns = ['chr', 'start', 'end']
    t['len'] = t['end'] - t['start']
    t['name'] = name
    t['caller'] = caller
    t['significance'] = significance
    t.sort_values(by=['len'], ascending=False, inplace=True)
    t['loc'] = t['chr'] + ':' + t['start'].astype(str) + '-' + t['end'].astype(str)
    if len(t) > 0:
        # After the descending sort the first row holds the longest peak.
        print(f'Longest peak {name} {caller}\t' + str(t['len'].iloc[0]))
        rpm = pd.read_csv(f'{peaksfile}.rpm', header=None)
        rpm.columns = ['rpm']
        # Assignment aligns on the row index, which sort_values kept from the peaks file.
        # Assumes the .rpm file lists one value per peak in peak-file order - TODO confirm.
        t['rpm'] = rpm['rpm']
    else:
        print(f'Longest peak {name} {caller}\t0')
        t['rpm'] = []
    ts.append(t)

# ignore_index gives the combined table unique row labels (each file restarts at 0).
dfcallers = pd.concat(ts, ignore_index=True)
# Empty files contribute object-typed columns; force numeric dtypes for sorting and plotting.
dfcallers['significance'] = dfcallers['significance'].astype(float)

# rpkm: rpm divided by (peak length + 1) expressed in kilobases.
dfcallers['rpkm'] = dfcallers['rpm'] / ((dfcallers['len'] + 1)/1000)
dfcallers['rpkm'] = dfcallers['rpkm'].astype(float)
dfcallers.head()

In [None]:
# Peak length distribution per caller, one figure per SPAN gap setting.
# Callers whose label does not contain 'SPAN' are drawn on every figure.
for gap in [0, 5, 10]:
    fig, ax = plt.subplots(figsize=(10, 6))
    for caller in sorted(set(dfcallers['caller'])):
        # Of the SPAN runs keep only those whose label ends with the current gap.
        if 'SPAN' in caller and not re.match(f'SPAN.*_{gap}$', caller):
            continue
        tc = dfcallers.loc[dfcallers['caller'] == caller]
        sns.kdeplot(tc['len'], shade=True, label=caller, ax=ax)
    ax.set_xlim(0, 6000)
    # Explicit legend: without it the curves cannot be told apart when kdeplot adds none.
    ax.legend()
    # The gap in the title distinguishes the three otherwise identically titled figures.
    fig.suptitle(f'Peak lengths, SPAN gap={gap}')
    plt.show()

In [None]:
# FRIP and number of peaks for every SPAN run (FDR x gap).
# Files are read directly in Python, so a missing file raises FileNotFoundError with its path.
frip_records = []
for gap in [0, 5, 10]:
    for fdr in ['0.1', '0.01', '0.05', '1E-3', '1E-4', '1E-5', '1E-6',
                '1E-7', '1E-8', '1E-9', '1E-10', '1E-15', '1E-20']:
        peak_path = f'{DIR}/wgEncodeBroadHistoneGm12878H3k36me3StdAlnRep1Filtered_{fdr}_{gap}.peak'
        # The first line of the .frip file holds the FRIP value.
        with open(f'{peak_path}.frip') as frip_file:
            frip = float(frip_file.readline().strip())
        # One peak per line.
        with open(peak_path) as peak_file:
            peaks = sum(1 for _ in peak_file)
        frip_records.append((float(fdr), gap, frip, peaks))
# Build the frame once from the collected rows; columns get numeric dtypes.
fripdf = pd.DataFrame(frip_records, columns=['fdr', 'gap', 'frip', 'peaks'])
# fripdf

In [None]:
# RPKM distribution per caller, one figure per SPAN gap setting.
# Callers whose label does not contain 'SPAN' are drawn on every figure.
for gap in [0, 5, 10]:
    fig, ax = plt.subplots(figsize=(10, 5))
    for caller in sorted(set(dfcallers['caller'])):
        # Of the SPAN runs keep only those whose label ends with the current gap.
        if 'SPAN' in caller and not re.match(f'SPAN.*_{gap}$', caller):
            continue
        caller_peaks = dfcallers.loc[dfcallers['caller'] == caller]
        sns.kdeplot(caller_peaks['rpkm'], shade=True, label=caller, ax=ax)
    ax.set_xlim(0, 150)
    # Explicit legend: without it the curves cannot be told apart when kdeplot adds none.
    ax.legend()
    # The gap in the title distinguishes the three otherwise identically titled figures.
    fig.suptitle(f'RPKMs, SPAN gap={gap}')
    plt.show()

In [None]:
import plotly.graph_objects as go
import plotly.express as px

fripmacs2 = ! cat {MACS_PEAKS}.frip
fripmacs2 = float(fripmacs2[0].strip())
print(f'FRIP MACS2 {fripmacs2}')

fripsicer = ! cat {SICER_PEAKS}.frip
fripsicer = float(fripsicer[0].strip())
print(f'FRIP SICER {fripsicer}')

fig = go.Figure()
for gap in sorted(set(fripdf['gap'])):
    t = fripdf.loc[fripdf['gap']==gap]
    fig.add_trace(go.Scatter(x=np.log10(t["fdr"]), y=t["frip"], 
                             mode='lines+markers',
                             name=gap))

fig.add_shape(type="line", x0=-20, y0=fripmacs2, x1=0, y1=fripmacs2, line=dict(
                color="Black",
                width=1,
                dash="dot"))
fig.add_shape(type="line", x0=-20, y0=fripsicer, x1=0, y1=fripsicer, line=dict(
                color="Black",
                width=1,
                dash="dot"))

fig.update_xaxes(title='log10 fdr')
fig.update_yaxes(title='frip')
fig.show()

In [None]:
peaksmacs2 = ! cat {MACS_PEAKS} | wc -l
peaksmacs2 = int(peaksmacs2[0].strip())
print(f'Peaks MACS2 {peaksmacs2}')

peakssicer = ! cat {SICER_PEAKS} | wc -l
peakssicer = int(peakssicer[0].strip())
print(f'Peaks SICER {peakssicer}')

fig = go.Figure()
for gap in sorted(set(fripdf['gap'])):
    t = fripdf.loc[fripdf['gap']==gap]
    fig.add_trace(go.Scatter(x=np.log10(t["fdr"]), y=t["peaks"], 
                             mode='lines+markers',
                             name=gap))

fig.add_shape(type="line", x0=-20, y0=peaksmacs2, x1=0, y1=peaksmacs2, line=dict(
                color="Black",
                width=1,
                dash="dot"))
fig.add_shape(type="line", x0=-20, y0=peakssicer, x1=0, y1=peakssicer, line=dict(
                color="Black",
                width=1,
                dash="dot"))

fig.update_xaxes(title='log10 fdr')
fig.update_yaxes(title='peaks')
fig.show()

In [None]:
def _span_log10_fdr(label):
    """Return log10 of the number left after stripping 'SPAN_' and the trailing gap from a label."""
    return np.log10(float(re.sub('SPAN_|_(0|5|10)$', '', label)))


def _span_gap(label):
    """Return the text after the last underscore of a caller label."""
    return re.sub('.*_', '', label)


# SPAN rows: log10(FDR) and gap are parsed out of the caller label.
span_rows = ['SPAN' in label for label in dfcallers['caller']]
t = dfcallers.loc[span_rows].copy()
t['log10fdr'] = [_span_log10_fdr(label) for label in t['caller']]
t['gap'] = [_span_gap(label) for label in t['caller']]

# Reference callers get fixed `log10fdr` values and their own `gap` label.
# NOTE(review): 5 and 10 look like arbitrary positions for the x axis of the violin plot - confirm.
t2 = dfcallers.loc[dfcallers['caller'] == 'MACS2_q0.01broad'].assign(log10fdr=5, gap='MACS2')
t3 = dfcallers.loc[dfcallers['caller'] == 'SICER'].assign(log10fdr=10, gap='SICER')

In [None]:
# RPKM distribution at every log10fdr value, split by gap / reference caller.
violin_data = pd.concat([t, t2, t3])
fig, ax = plt.subplots(figsize=(12, 10))
sns.violinplot(data=violin_data, x="log10fdr", y="rpkm", hue="gap", ax=ax)
plt.show()