# SPAN Benchmarks

1) H3K4me3 and H3K36me3 vs RNA-seq
See https://pubmed.ncbi.nlm.nih.gov/27169896/
Prepare expression information beforehand!
2) H3K27ac vs DHS

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os

# RNA-seq prepare genes positions

In [None]:
# Load the GTF annotation as a headerless 9-column tab-separated table.
# Lines starting with '#' are treated as comments and skipped.
gtf_df = pd.read_csv(
    os.path.expanduser('~/data/2022_GSE26320_GM12878_rnaseq/gencode.GRCh38.p13.v41.annotation.gtf'),
    sep='\t',
    comment='#',
    names=['chromosome', 'db', 'type', 'start', 'end', 'point1', 'strand', 'point2', 'aux'],
)
# Show a random handful of rows as a sanity check.
gtf_df.sample(10)

In [None]:
# Expand the free-form `aux` column (`key "value"; key "value"; ...`) into real
# DataFrame columns, one per attribute key that occurs exactly once in every row.
print('Parse GTF aux data')
# Attribute key -> values collected in row order.
auxes = {}
# Keys seen more than once inside a single record. Their value lists cannot be
# aligned with rows by length alone: a key repeated in some rows and missing in
# others could match the row count by coincidence and be attached misaligned.
repeated = set()
for aux in tqdm(gtf_df['aux']):
    seen = set()
    for pair in aux.split(';'):
        kv = pair.strip().split(' ')
        # Skip empty fragments (trailing ';') and anything that is not `key value`.
        if len(kv) != 2:
            continue
        k, v = kv
        if k in seen:
            repeated.add(k)
        seen.add(k)
        auxes.setdefault(k, []).append(v.strip('"'))

for k, vs in auxes.items():
    # At most one value per row plus one value per row in total means the key
    # is present exactly once in every row, so the list is row-aligned.
    if len(vs) == len(gtf_df) and k not in repeated:
        gtf_df[k] = vs
    else:
        print(f'Ignoring {k}')
# Free the intermediate lists; the raw attribute string is no longer needed.
del auxes
gtf_df.drop('aux', axis=1, inplace=True)

In [None]:
# Preview a few random gene-level records (rows whose `type` is 'gene').
gtf_df.loc[gtf_df['type'] == 'gene'].sample(5)

# Read RNA-seq expression

In [None]:
# Load the per-gene expression table (tab-separated, '#' lines skipped).
expression_df = pd.read_csv(
    os.path.expanduser('~/data/2022_GSE26320_GM12878_rnaseq/GM12878_SRR307008.genes.results'),
    sep='\t',
    comment='#',
)
# Show one random row to check the columns.
expression_df.sample()

In [None]:
# Left-join gene-level annotation rows onto the expression table by `gene_id`,
# so every expression row is kept even if it has no annotation match.
full_df = expression_df.merge(
    gtf_df[gtf_df['type'] == 'gene'],
    on='gene_id',
    how='left',
)
# Drop annotation columns that are not used downstream.
full_df.drop(columns=['point1', 'point2', 'level', 'type', 'db'], inplace=True)
full_df.sample(5)

In [None]:
# Add log(1 + x) expression columns, keep only the export columns (in this
# order), and sort rows by chromosome and start coordinate.
full_df = (
    full_df
    .assign(
        logTPM=np.log1p(full_df['TPM']),
        logFPKM=np.log1p(full_df['FPKM']),
    )
    [['chromosome', 'start', 'end', 'gene_id', 'strand', 'TPM', 'logTPM', 'FPKM', 'logFPKM']]
    .sort_values(by=['chromosome', 'start'])
)

In [None]:
# Write the gene table as a headerless, index-free tab-separated file.
full_df.to_csv(
    os.path.expanduser('~/data/2022_GSE26320_GM12878_rnaseq/genes_full.bed'),
    sep='\t',
    index=False,
    header=False,
)

# H3K4me3

In [None]:
# Load the headerless H3K4me3 report and name its six columns.
df = pd.read_csv(
    os.path.expanduser('~/data/2022_GSE26320_GM12878_chipseq/k4me3_report.tsv'),
    sep='\t',
    names=['n', 'file', 'peaks', 'tss_peaks', 'cp', 'rp'],
)
# cpf = cp / peaks and rpf = rp / tss_peaks; these are the two fractions
# plotted against each other below.
df = df.assign(
    cpf=df['cp'] / df['peaks'],
    rpf=df['rp'] / df['tss_peaks'],
)
df

In [None]:
def tool(file):
    """Return the tool label for a peak file name.

    The label is chosen by the first marker substring found in *file*, checked
    in this order: '.narrowPeak' -> 'Macs2', '.broadPeak' -> 'Macs2Broad',
    '.peak' -> 'Span'. Names containing none of the markers map to 'Sicer'.
    """
    # Order matters: earlier markers win when a name contains several of them.
    markers = (
        ('.narrowPeak', 'Macs2'),
        ('.broadPeak', 'Macs2Broad'),
        ('.peak', 'Span'),
    )
    for marker, label in markers:
        if marker in file:
            return label
    return 'Sicer'

# Label every report row with the tool inferred from its file name.
df['tool'] = df['file'].map(tool)

## Recovered promoter fraction / Correct peak fraction

In [None]:
# import plotly.express as px
#
# fig = px.line(df, x="rpf", y="cpf", color="tool", hover_name="n")
# fig.update_xaxes(range=[0, 1], row=1, col=1, title='recovered promoter fraction')
# fig.update_yaxes(range=[0, 1], row=1, col=1, title='correct peak fraction')
# fig.show()

In [None]:
import plotly.graph_objects as go

# One line+marker trace per tool: recovered promoter fraction (x) against
# correct peak fraction (y), with the report's `n` column as hover text.
fig = go.Figure()
# Iterate tools in sorted order. Iterating a set of strings gives an order that
# depends on per-process hash randomization, so legend order and trace colours
# would change between kernel restarts.
for t in sorted(df['tool'].unique()):
    dft = df[df['tool'] == t]
    fig.add_trace(go.Scatter(
        x=dft["rpf"], y=dft["cpf"], mode='lines+markers', name=t, hovertext=dft['n']))
fig.update_xaxes(range=[0, 0.8], title='Recovered Promoter Fraction')
fig.update_yaxes(range=[0, 1], title='Correct Peak Fraction')
fig.show()

# H3K36me3

In [None]:
# Load the per-gene expression table and add LTPM = log(1 + TPM).
genes_df = pd.read_csv(
    os.path.expanduser('~/data/2022_GSE26320_GM12878_rnaseq/GM12878_SRR307008.genes.results'),
    sep='\t',
).assign(LTPM=lambda frame: np.log1p(frame['TPM']))
genes_df

In [None]:
# Plot colours as RGB tuples; also used by the histogram cells further down.
ORANGE = (243/256, 135/256, 47/256)
BLUE = (35/256, 110/256, 150/256)

# Overlaid density histograms of log(1 + TPM) for expressed and non-expressed genes.
# The split is made on raw TPM so it matches the 'TPM>1' / 'TPM<=1' legend
# labels. Thresholding LTPM at 1 would put the cut at TPM = e - 1 (~1.72).
plt.figure(figsize=(6, 4))
expressed = genes_df.loc[genes_df['TPM'] > 1, 'LTPM']
plt.hist(expressed, bins=1000, density=True, color=ORANGE, edgecolor=ORANGE, linewidth=1,
         label=f'Expressed TPM>1 ({len(expressed)})')

# An explicit `<= 1` (rather than negating the mask above) keeps rows with
# missing TPM out of both groups.
nexpressed = genes_df.loc[genes_df['TPM'] <= 1, 'LTPM']
plt.hist(nexpressed, bins=50, density=True, color=BLUE, edgecolor=BLUE, linewidth=1,
         label=f'Not expressed TPM<=1 ({len(nexpressed)})')
plt.gca().set(title='Log TPM', ylabel='Frequency')
plt.legend()

plt.show()

In [None]:
# Add log(1 + x) versions of both expression measures.
genes_df = genes_df.assign(
    logTPM=np.log1p(genes_df['TPM']),
    logFPKM=np.log1p(genes_df['FPKM']),
)

In [None]:
# Density histogram of log(1 + TPM) over all genes.
plt.figure(figsize=(6, 4))
plt.hist(
    genes_df['logTPM'],
    bins=100,
    density=True,
    color=ORANGE,
    edgecolor=ORANGE,
    linewidth=1,
    label='TPM',
)
plt.title('Log TPM')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Density histogram of log(1 + FPKM) over all genes.
plt.figure(figsize=(6, 4))
plt.hist(
    genes_df['logFPKM'],
    bins=100,
    density=True,
    color=ORANGE,
    edgecolor=ORANGE,
    linewidth=1,
    label='FPKM',
)
plt.title('Log FPKM')
plt.ylabel('Frequency')
plt.legend()
plt.show()

We consider a peak positive if it overlaps an active gene (active genes are defined by varying the RPKM threshold from 0 to 2), and we compare methods by sensitivity, precision, and F-score.

In [None]:
# Load the headerless H3K36me3 report and name its seven columns.
df = pd.read_csv(
    os.path.expanduser('~/data/2022_GSE26320_GM12878_chipseq/k36me3_report.tsv'),
    sep='\t',
    names=['tpm', 'gene_file', 'genes', 'peaks_file', 'peaks', 'pg', 'gp'],
)
# Label each row with the tool inferred from its peaks file name.
df['tool'] = df['peaks_file'].map(tool)
df

In [None]:
# Mean of the two overlap counts.
df['p'] = (df['pg'] + df['gp']) / 2
# NOTE(review): presumably `pg` counts peaks overlapping genes and `gp` counts
# genes overlapping peaks -- confirm against the script that writes the report.
df['precision'] = df['pg'] / df['peaks']
df['sensitivity'] = df['gp'] / df['genes']
# F1 is the harmonic mean of precision and sensitivity, computed as 2ps / (p + s).
# The row-wise form 2 / (1/s + 1/p) raises ZeroDivisionError as soon as either
# metric is 0. Here a zero metric gives F1 = 0, and NaN inputs stay NaN.
pr_sum = df['precision'] + df['sensitivity']
df['f1'] = (2 * df['precision'] * df['sensitivity'] / pr_sum).where(pr_sum != 0, 0.0)
df

In [None]:
import plotly.express as px

# Precision (log-scaled x) against sensitivity, one line per tool.
fig = (
    px.line(df, x="precision", y="sensitivity", color="tool", log_x=True)
    .update_xaxes(title='Overlap peaks vs Active genes - Precision')
    .update_yaxes(title='Overlap Active genes vs peaks - Sensitivity')
)
fig.show()

In [None]:
import plotly.express as px

# Sensitivity as a function of the TPM threshold, one line per tool.
fig = (
    px.line(df, x="tpm", y="sensitivity", color="tool")
    .update_xaxes(title='TPM threshold')
    .update_yaxes(range=[0, 1], title='Sensitivity (Recall)')
)
fig.show()

In [None]:
import plotly.express as px

# Precision as a function of the TPM threshold, one line per tool.
fig = (
    px.line(df, x="tpm", y="precision", color="tool")
    .update_xaxes(title='TPM threshold')
    .update_yaxes(range=[0, 1], title='Precision')
)
fig.show()

In [None]:
import plotly.express as px

# F1 as a function of the TPM threshold, one line per tool.
fig = (
    px.line(df, x="tpm", y="f1", color="tool")
    .update_xaxes(title='TPM threshold')
    .update_yaxes(range=[0, 1], title='F1')
)
fig.show()

# H3K27ac vs DHS

In [None]:
# Load the headerless H3K27ac report and name its six columns.
df = pd.read_csv(
    os.path.expanduser('~/data/2022_GSE26320_k27ac/k27ac_report.tsv'),
    sep='\t',
    names=['q', 'file', 'peaks', 'dhs', 'dp', 'fp'],
)
# Label each row with the tool inferred from its file name.
df['tool'] = df['file'].map(tool)
df

In [None]:
# NOTE(review): presumably `fp` counts peaks overlapping DHS and `dp` counts
# DHS overlapping peaks -- confirm against the script that writes the report.
df['precision'] = df['fp'] / df['peaks']
df['sensitivity'] = df['dp'] / df['dhs']
# F1 is the harmonic mean of precision and sensitivity, computed as 2ps / (p + s).
# The row-wise form 2 / (1/s + 1/p) raises ZeroDivisionError as soon as either
# metric is 0. Here a zero metric gives F1 = 0, and NaN inputs stay NaN.
pr_sum = df['precision'] + df['sensitivity']
df['f1'] = (2 * df['precision'] * df['sensitivity'] / pr_sum).where(pr_sum != 0, 0.0)

In [None]:
import plotly.express as px

# Precision (log-scaled x) against sensitivity, one line per tool.
fig = (
    px.line(df, x="precision", y="sensitivity", color="tool", log_x=True)
    .update_xaxes(title='Overlap peaks vs DHS - Precision')
    .update_yaxes(title='Overlap DHS vs peaks - Sensitivity')
)
fig.show()

In [None]:
import plotly.express as px

# Sensitivity as a function of `q` (log-scaled x), one line per tool.
fig = (
    px.line(df, x="q", y="sensitivity", color="tool", log_x=True)
    .update_xaxes(title='Q')
    .update_yaxes(title='Sensitivity (Recall)')
)
fig.show()

In [None]:
import plotly.express as px

# Precision as a function of `q` (log-scaled x), one line per tool.
fig = (
    px.line(df, x="q", y="precision", color="tool", log_x=True)
    .update_xaxes(title='Q')
    .update_yaxes(title='Precision')
)
fig.show()

In [None]:
import plotly.express as px

# F1 as a function of `q` (log-scaled x), one line per tool.
fig = (
    px.line(df, x="q", y="f1", color="tool", log_x=True)
    .update_xaxes(title='Q')
    .update_yaxes(title='F1')
)
fig.show()