# Deep analysis of SPAN peak calling

Consider launching SPAN with the `--deep-analysis` command-line argument.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from io import StringIO

import pandas as pd
import seaborn as sns
from IPython.display import display
from tqdm.auto import tqdm

sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import math
from itertools import product
import glob

# Plots

In [None]:
def plot_sensitivity(t, name, x='CandidatesN', y='CandidatesAL', logx=True, logy=True):
    """Plot ``y`` against ``x`` for every gap and mark the detected sensitivity points.

    Rows with ``Gap > 0`` are drawn semi-transparent, rows with ``Gap == 0``
    are drawn on top, and the three points returned by
    ``detect_sensitivity_triangle`` for the ``Gap == 0`` rows are highlighted
    in green, red and blue.

    :param t: DataFrame with a ``Gap`` column plus the ``x`` and ``y`` columns.
    :param name: plot title.
    :param x: column used for the x axis.
    :param y: column used for the y axis.
    :param logx: use a logarithmic x axis.
    :param logy: use a logarithmic y axis.
    :return: the tuple returned by ``detect_sensitivity_triangle``.
    """
    plt.figure(figsize=(5, 4))
    ax = plt.axes()
    gap_order = sorted(t['Gap'].unique())
    zero_gap = t[t['Gap'] == 0]
    sns.lineplot(data=t[t['Gap'] > 0], x=x, y=y, hue='Gap', estimator=None,
                 palette='tab20',
                 hue_order=gap_order,
                 sort=False,
                 alpha=0.5,
                 ax=ax)
    sns.lineplot(data=zero_gap, x=x, y=y, hue='Gap', estimator=None,
                 palette='tab20',
                 hue_order=gap_order,
                 sort=False,
                 legend=False,
                 ax=ax)

    sp = detect_sensitivity_triangle(zero_gap)
    # The detected points are positions within the Gap == 0 rows, so they are
    # resolved positionally on that subset rather than by index label on `t`.
    for point, color, size in zip(sp, ('green', 'red', 'blue'), (20, 40, 20)):
        position = int(point)
        if 0 <= position < len(zero_gap):
            sns.scatterplot(data=zero_gap.iloc[[position]],
                            x=x, y=y, color=color, s=size,
                            legend=False, ax=ax)

    if logx:
        ax.set(xscale='log')
    if logy:
        ax.set(yscale='log')
    ax.set_title(name)
    if len(gap_order) > 1:
        sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.show()
    return sp

In [None]:
def plot_projections(t, name, sp, show_text=False):
    """Plot each available metric against the sensitivity position, one figure per metric.

    :param t: DataFrame with ``Gap``, ``Sensitivity`` and ``SensitivityN``
        columns plus any of the metric columns listed below.
    :param name: title used for every figure.
    :param sp: three positions within the ``Gap == 0`` rows to highlight
        (green, red, blue), or ``None`` to draw no markers.
    :param show_text: annotate the markers with their sensitivity value on the
        ``Candidates*`` plots.
    """
    gap_order = sorted(t['Gap'].unique())
    # (position in sp, marker colour, marker size, label y position, label prefix)
    marks = (
        (0, 'green', 20, 10, 't1'),
        (1, 'red', 40, 100, 't2'),
        (2, 'blue', 20, 10, 't3'),
    )
    for y in ['CandidatesN', 'CandidatesAL', 'CandidatesML', 'SignalNoiseRatio', 'SignalControlRatio']:
        if y not in t.columns:
            continue
        print(y)
        plt.figure(figsize=(5, 2))
        ax = plt.axes()
        tgap = t[t['Gap'] > 0]
        if len(tgap) > 0:
            sns.lineplot(data=tgap, x='SensitivityN', y=y, estimator=None,
                         hue='Gap',
                         hue_order=gap_order,
                         palette='tab20',
                         alpha=0.5,
                         sort=False,
                         ax=ax)
        tnogap = t[t['Gap'] == 0]
        sns.lineplot(data=tnogap, x='SensitivityN', y=y, estimator=None,
                     hue='Gap',
                     hue_order=gap_order,
                     palette='tab20',
                     alpha=1,
                     sort=False,
                     legend=False,
                     ax=ax)

        sens = np.array(tnogap['Sensitivity'])
        if sp is not None:
            positions = tnogap['SensitivityN'].values
            metric = tnogap[y].values
            annotate = show_text and y in ['CandidatesN', 'CandidatesAL', 'CandidatesML']
            for which, color, size, text_y, label in marks:
                # int() guards against a non-integer position in sp.
                idx = int(sp[which])
                sns.scatterplot(x=[positions[idx]], y=[metric[idx]],
                                color=color, s=size,
                                legend=False, ax=ax)
                if annotate:
                    # Each label shows the sensitivity of its own point.
                    ax.text(positions[idx], text_y,
                            f"{label}={sens[idx]:.2e}", fontsize=5)

        ax.set(yscale='log')
        ax.set_title(name)
        # First and last labels are out of the plot
        inner_ticks = ax.get_xticks()[1:-1]
        sens_ticks = np.array([sens[int(tick)] if 0 <= tick < len(sens) else np.nan
                               for tick in inner_ticks])
        if len(sens_ticks) > 0 and len(sens) > 0:
            sens_ticks[-1] = sens.max()
        labels = [''] + [f'{value:.1e}' for value in sens_ticks] + ['']
        ax.set_xticklabels(labels)
        ax.set_xlabel('Sensitivity')
        if len(gap_order) > 1:
            sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
        plt.tight_layout()
        plt.show()


# Area based triangles

In [None]:
def triangle_signed_square(x1, y1, x2, y2, x3, y3):
    """Return the signed area of the triangle (x1, y1), (x2, y2), (x3, y3).

    The value is half the sum of the cross products of consecutive vertices;
    its sign depends on the order in which the vertices are given.
    """
    doubled = x1 * y2
    doubled = doubled - x2 * y1
    doubled = doubled + x2 * y3
    doubled = doubled - x3 * y2
    doubled = doubled + x3 * y1
    doubled = doubled - x1 * y3
    return 0.5 * doubled


def detect_sensitivity_triangle(t):
    """Find three characteristic positions on the CandidatesN / CandidatesAL curve.

    The curve is taken in ``log1p`` space. Every position ``i`` in the middle
    80% of the rows is tried as a pivot; for each pivot the point of maximum
    triangle area is searched on both sides, and the pivot with the largest
    geometric mean of the two areas wins. The outer points are then moved
    closer to the pivot.

    :param t: DataFrame with ``CandidatesN`` and ``CandidatesAL`` columns.
    :return: tuple ``(i1, i2, i3)`` of integer row positions. When no triangle
        is found, a message is printed and the 10% position, the midpoint and
        the 90% position are returned.
    """
    im1 = int(len(t) * 0.1)
    im2 = int(len(t) * 0.9)
    ns = np.log1p(t['CandidatesN'].values)
    als = np.log1p(t['CandidatesAL'].values)
    last = len(t) - 1
    max_area = 0
    i1, i2, i3 = -1, -1, -1
    for i in range(im1, im2):
        left_idx, left_area = find_sensitivity_max_area_between(ns, als, 0, i, -1)
        right_idx, right_area = find_sensitivity_max_area_between(ns, als, i, last, -1)
        if left_idx == -1 or right_idx == -1:
            continue
        # we want area to be balanced between parts, geometric mean is better here
        area = math.sqrt(left_area * right_area)
        if area > max_area:
            max_area = area
            i1, i3, i2 = left_idx, right_idx, i
    if -1 in (i1, i2, i3):
        print("Failed to estimate sensitivity triangle")
        # Integer division keeps the midpoint usable as an array position.
        return im1, (im1 + im2) // 2, im2

    # Update i3, i1 points to be closer to i2 for more accurate pivot_gap estimation
    closer_i3, _ = find_sensitivity_max_area_between(ns, als, i3, i2, -1)
    if closer_i3 != -1:
        i3 = closer_i3
    closer_i1, _ = find_sensitivity_max_area_between(ns, als, i2, i1, -1)
    if closer_i1 != -1:
        i1 = closer_i1
    return i1, i2, i3


def find_sensitivity_max_area_between(ns, als, start, end, sign):
    """Find the interior point forming the largest triangle with the two end points.

    Points strictly between ``start`` and ``end`` are examined (the bounds are
    swapped if given in reverse). A point is skipped when its signed triangle
    area multiplied by ``sign`` is positive.

    :return: ``(index, area)`` of the best point, or ``(-1, 0)`` if none qualifies.
    """
    lo, hi = (end, start) if start > end else (start, end)
    lo_n, lo_al = ns[lo], als[lo]
    hi_n, hi_al = ns[hi], als[hi]
    best_idx, best_area = -1, 0
    for k in range(lo + 1, hi):
        signed = triangle_signed_square(lo_n, lo_al, ns[k], als[k], hi_n, hi_al)
        if signed * sign > 0:
            continue
        magnitude = math.fabs(signed)
        if magnitude > best_area:
            best_idx, best_area = k, magnitude
    return best_idx, best_area

# Bad quality example

In [None]:
t = StringIO('''Sensitivity	Gap	CandidatesN	CandidatesAL
-375.1737060546875	0	1	1.0
-280.0399200217636	0	9	1.6666666666666667
-209.029458995094	0	92	1.4891304347826086
-156.02530783606127	0	453	1.6291390728476822
-116.46155906622265	0	1239	1.883777239709443
-86.93009440741795	0	2916	2.0174897119341564
-64.8870011209932	0	5615	2.1814781834372217
-48.433433130109364	0	10588	2.3860974688326406
-36.15203976516999	0	16322	2.576277416983213
-26.984871703631814	0	24183	2.750568581234752
-20.142246622637973	0	34899	3.1237857818275594
-15.03472402844767	0	45585	3.3377426785126687
-11.222329407759858	0	53435	3.938748011602882
-8.376653744889191	0	47325	5.995456946645536
-6.252563564320888	0	27576	13.580395996518712
-4.66708453237974	0	24350	17.816632443531827
-3.483639599710967	0	23656	20.143599932363884
-2.600283919538606	0	23517	21.657481821660927
-1.9409230687273291	0	23611	22.653593663970184
-1.4487580876885016	0	23776	23.33983849259758
-1.0813926788036474	0	23999	23.867577815742322
-0.8071810854467272	0	24240	24.26526402640264
-0.602502048953911	0	24478	24.615859138818532
-0.4497240155135174	0	24753	24.881994101725045
-0.33568631091090884	0	25075	25.060019940179462
-0.25056544779870277	0	25416	25.208215297450426
-0.18702890642218395	0	25772	25.32826323141394
-0.1396034933977802	0	26168	25.388833690003057
-0.10420386742181362	0	26637	25.404062018996132
-0.07778061795863045	0	27109	25.409900770961674
-0.05805758154384941	0	27699	25.306509260262104
-0.04333576748533296	0	28378	25.138734230742124
-0.032347002641238944	0	29185	24.910604762720574
-0.024144687877662503	0	30126	24.61840270862378
-0.018022255699405163	0	31272	24.22521744691737
-0.013452304794349106	0	32592	23.79120643102602
-0.01004116839192559	0	34071	23.340700302309884
-0.007495002842736571	0	36080	22.680182926829268
-0.005594475206471149	0	39225	21.588833652007647
-0.004175869374906439	0	43757	20.204241607057156
-0.0031169831651252974	0	48401	19.266853990620028
-0.002326601523998155	0	53579	18.580413968159167
-0.001736639039965088	0	63313	17.234154123165858
-0.0012962748988267487	0	82491	15.135032912681384
-9.675750542623222E-4	0	110976	13.33552299596309
-7.222244961142835E-4	0	155013	11.45052350448027
-5.390881260216058E-4	0	212609	10.160722264814753
-4.023901282510646E-4	0	235773	10.838098510007507
-3.0035500226806847E-4	0	244449	12.541376728888235
-2.241931922623222E-4	0	292100	12.481119479630264
-1.6734393326970745E-4	0	968964	5.192648024075198
-1.2491009079976728E-4	0	617855	10.359380437157586
-9.32363096692581E-5	0	592402	12.330579572655056
-6.959413274846464E-5	0	569118	14.391375777958174
-5.1946964977399616E-5	0	549164	15.138854695500797
-3.8774636076238994E-5	0	530365	16.720228521867018
-2.8942449351927953E-5	0	507602	18.508398312063388
-2.1603436144233477E-5	0	479059	21.157412761267402
-1.6125395869679023E-5	0	25070	42.360430793777425
-1.2036436713946952E-5	0	25070	42.360430793777425
-8.984325714524885E-6	0	25070	42.360470682090146
-6.706146550095041E-6	0	25070	42.360470682090146
-5.005651284285609E-6	0	25070	42.360470682090146
-3.7363550874853966E-6	0	25070	42.360470682090146
-2.788917674629893E-6	0	25070	42.360470682090146
-2.0817244650849684E-6	0	25070	42.360470682090146
-1.553856102657593E-6	0	25070	42.360470682090146
-1.159840712957992E-6	0	25070	42.360470682090146
-8.657368447014698E-7	0	25070	42.360470682090146
-6.46209669914216E-7	0	25070	42.360470682090146
-4.823485797634483E-7	0	25070	42.360470682090146
-3.60038178368175E-7	0	25070	42.360470682090146
-2.68742348005349E-7	0	25070	42.360470682090146
-2.0059664210825222E-7	0	25070	42.360470682090146
-1.497308225658031E-7	0	25070	42.360470682090146
-1.1176318302543429E-7	0	25070	42.360470682090146
-8.34230979696061E-8	0	25070	42.360470682090146
-6.226928301838606E-8	0	25070	42.360470682090146
-4.6479496710090816E-8	0	25070	42.360470682090146
-3.469356815599526E-8	0	25070	42.360470682090146
-2.589622858660116E-8	0	25070	42.360470682090146
-1.9329653611705986E-8	0	25070	42.360470682090146
-1.4428182370225825E-8	0	25070	42.360470682090146
-1.0769590117353544E-8	0	25070	42.360470682090146
-8.03871675029177E-9	0	25070	42.360470682090146
-6.000318144633462E-9	0	25070	42.360470682090146
-4.478801648970997E-9	0	25070	42.360470682090146
-3.3431001035780287E-9	0	25070	42.360470682090146
-2.4953813940635528E-9	0	25070	42.360470682090146
-1.8626209532804875E-9	0	25070	42.360470682090146
-1.3903112461497955E-9	0	25070	42.360470682090146
-1.0377663570068893E-9	0	25070	42.360470682090146
-7.746172051170474E-10	0	25070	42.360470682090146
-5.781954776352038E-10	0	25070	42.360470682090146
-4.315809255841242E-10	0	25070	42.360470682090146
-3.22143812140927E-10	0	25070	42.360470682090146
-2.404569561554013E-10	0	25070	42.360470682090146
-1.7948365166246456E-10	0	25070	42.360470682090146
-1.339715087854371E-10	0	25070	42.360470682090146
-9.999999999999925E-11	0	25070	42.360470682090146''')
# Parse the embedded tab-separated text into a DataFrame (rebinds `t`).
t = pd.read_csv(t, sep='\t')
# Row position, used as the x axis by plot_projections.
t['SensitivityN'] = range(len(t))
# For log: shift both columns by one before they are drawn on log-scaled axes.
t['CandidatesN'] += 1
t['CandidatesAL'] += 1
# Show three random rows as the cell output.
t.sample(3)

In [None]:
# Plot the example table and keep the three detected positions for the next cell.
sp = plot_sensitivity(t, 'Mix noise bad quality')
display(sp)

In [None]:
# Per-metric projections of the same table, with the positions in `sp` highlighted.
plot_projections(t, 'Mix noise bad quality', sp)

# Peak infos

In [None]:
# Histone modification names: short labels and their full equivalents, in matching order.
MODIFICATIONS_ABF = ['k4me3', 'k27ac', 'k4me1', 'k27me3', 'k36me3']
MODIFICATIONS = ['H3K4me3', 'H3K27ac', 'H3K4me1', 'H3K27me3', 'H3K36me3']

# Each data collection has a root folder under the user's home directory,
# the cell names and the replicate labels looked for in its file names.
GSE26320_PATH = os.path.expanduser('~/data/2023_GSE26320')
GSE26320_CELLS = [
    'GM12878', 'HMEC', 'HSMM', 'K562', 'NHEK', 'NHLF', 'H1', 'Huvec', 'HepG2',
]
GSE26320_REPS = ['rep1', 'rep2']

IMMUNE_PATH = os.path.expanduser('~/data/2023_Immune')
# Longest first: the loaders take the first name found as a substring of the file name.
IMMUNE_CELLS = [
    'CD4ABT', 'TCellBB', 'BCell', 'TCell', 'Monocyte', 'PBMC', 'NK', 'CD34', 'CD4',
]
IMMUNE_REPS = ['rep1', 'rep2', 'rep3', '']

IMMGEN_PATH = os.path.expanduser('~/data/2024_Immgen')
# Part of the cell types, longest first; the trailing '' matches any file name.
IMMGEN_CELLS = [
    'Thymus_Thymic_epithelial_Cell',
    'Thymus_Double_Negative_Thymocytes',
    'Thymus_Immature_Single_Positive_Thymocytes',
    'Thymus_CD4_SP_Thymocytes_ATAC',
    'Thymus_CD8_SP_Thymocytes',
    'Spleen_CD4_Naive_T_Cell',
    'Spleen_CD8_Naive_T_Cell',
    'Spleen_Activated_T_Cell',
    'Colon_CD4_Treg_Cell',
    'Bone_marrow_Neutrophil',
    'Spleen_Neutrophil_ATAC',
    'Peritoneal_cavity_Neutrophil',
    'Spleen_NK_Cell',
    'Bone_marrow_NK_Cell',
    'Thymus_gdT_Cell',
    'Lymph_nodes_gdT_Cell',
    'Spleen_CD8_T_Cell',
    '',
]
IMMGEN_REPS = ['']

CTCF_PATH = os.path.expanduser('~/data/2024_TFs')
CTCF_CELLS = ['H1']  # Longest first
CTCF_REPS = ['']

Y20O20_PATH = os.path.expanduser('~/data/2018_chipseq_y20o20')
Y20O20_CELLS = ['']
# OD1..OD29 followed by YD1..YD29.
Y20O20_REPS = [f'{group}D{i}' for group in ('O', 'Y') for i in range(1, 30)]

In [None]:
def update_abf_modifications(df):
    """Rename short modification labels to their full H3K* names, in place.

    Only exact matches in the ``modification`` column are rewritten; all other
    values are left as they are. Nothing is returned.
    """
    full_names = {
        'k4me3': 'H3K4me3',
        'k27ac': 'H3K27ac',
        'k4me1': 'H3K4me1',
        'k27me3': 'H3K27me3',
        'k36me3': 'H3K36me3',
    }
    for short_name, full_name in full_names.items():
        matches = df['modification'] == short_name
        df.loc[matches, 'modification'] = full_name

In [None]:
# (column name, label in the info file) pairs for the numeric fields.
_PEAKS_INFO_FIELDS = (
    ('peaks', 'Count'),
    ('length_mean', 'Mean length'),
    ('length_median', 'Median length'),
    ('model_signal', 'Signal mean'),
    ('model_noise', 'Noise mean'),
    ('model_signal_to_noise', 'Signal to noise'),
    ('frip', 'FRIP'),
    ('log_null_pvals_mean', 'LogNullPVals mean'),
    ('log_null_pvals_std', 'LogNullPVals std'),
    ('coverage', 'Treatment coverage'),
    ('control_coverage', 'Control coverage'),
    ('control_scale', 'Control scale'),
    ('beta', 'Beta'),
    ('min_correlation', 'Min control correlation'),
    ('coverage_non_zero', 'Coverage >0 %'),
    ('coverage_max', 'Coverage >0 max'),
    ('coverage_mean', 'Coverage >0 mean'),
    ('coverage_median', 'Coverage >0 median'),
    ('coverage_std', 'Coverage >0 std'),
    ('roughness', 'Track roughness'),
    ('autocorrelation_average_score', 'Average autocorrelation score'),
    ('sensitivity_before_merge', 'Sensitivity beforeMerge'),
    ('sensitivity_before_merge_idx', 'Sensitivity beforeMerge index'),
    ('sensitivity_stable', 'Sensitivity stable'),
    ('sensitivity_stable_idx', 'Sensitivity stable index'),
    ('sensitivity_before_noise', 'Sensitivity beforeNoise'),
    ('sensitivity_before_noise_idx', 'Sensitivity beforeNoise index'),
    ('minimal_additional', 'Minimal additional'),
    ('minimal_additional_idx', 'Minimal additional index'),
    ('fragmentation_average_score', 'Average fragmentation score'),
    ('signal_density', 'Candidates signal density'),
    ('noise_density', 'Candidates noise density'),
    ('signal_to_noise', 'Coverage signal to noise'),
    ('signal_to_control', 'Coverage signal to control'),
)
_PEAKS_INFO_FLAGS = ('fit_snr_adjusted', 'fit_low_adjusted')


def _info_value(txt, label):
    """Return the text after the last ``'<label>: '`` up to the end of its line, or None."""
    marker = f'{label}: '
    pos = txt.rfind(marker)
    if pos == -1:
        return None
    return txt[pos + len(marker):].split('\n', 1)[0]


def _parse_peaks_info(txt):
    """Extract the numeric fields and the two boolean flags from one info file's text."""
    vals = {}
    for column, label in _PEAKS_INFO_FIELDS:
        raw = _info_value(txt, label)
        if raw is None:
            continue
        raw = raw.replace(',', '')  # thousands separators
        try:
            vals[column] = float(raw)
        except ValueError as e:
            print(f'Failed to process {column}: {label}: {raw}', e)
    # NOTE(review): the flag names look swapped relative to the text they are
    # parsed from ("snr" <- low noise level, "low" <- signal-to-noise range);
    # the original mapping is kept - confirm against the SPAN output format.
    vals['fit_snr_adjusted'] = (_info_value(txt, 'Out of low noise level down') or '').strip() == 'true'
    vals['fit_low_adjusted'] = (_info_value(txt, 'Out of signal-to-noise range down') or '').strip() == 'true'
    return vals


def load_peaks_infos(path, suffix, modifications, cells, replicates):
    """Collect per-track values from ``*<suffix>.txt`` files in ``path``.

    Modification, cell and replicate are recognised from the file name; files
    where any of them cannot be determined are skipped.

    :param path: folder to scan (not recursive).
    :param suffix: file name suffix that precedes ``.txt``.
    :param modifications: names matched case-insensitively as substrings.
    :param cells: cell names matched as substrings, first match wins.
    :param replicates: replicate labels matched as substrings, first match wins.
    :return: DataFrame with ``file``, ``modification``, ``cell``, ``replicate``,
        one column per numeric field found in at least one file (NaN where a
        file lacks it) and the two boolean flag columns. With no matching
        files only the first four columns are present.
    """
    base_columns = ['file', 'modification', 'cell', 'replicate']
    records = []
    for f in os.listdir(path):
        if not f.endswith(f'{suffix}.txt'):
            continue
        if 'ATAC_seq' in f:
            rep = re.sub('_.*', '', os.path.basename(f))
            cell = re.sub('(SRR[0-9]+_)|(_ATAC.*)', '', os.path.basename(f))
        else:
            cell = next((c for c in cells if c in f), None)
            rep = next((r for r in replicates if r in f), None)
        mod = next((m for m in modifications if m.lower() in f.lower()), None)
        # '' is a valid cell / replicate label, so only None means "not found".
        if not mod or cell is None or rep is None:
            continue
        txt_path = os.path.join(path, f)
        print(txt_path, mod, cell, rep)
        with open(txt_path, 'r') as file:
            txt = file.read().rstrip()
        record = {'file': txt_path, 'modification': mod, 'cell': cell, 'replicate': rep}
        record.update(_parse_peaks_info(txt))
        records.append(record)
    if not records:
        return pd.DataFrame(columns=base_columns)
    # Records are keyed by column, so files with different field sets stay aligned.
    present = [c for c, _ in _PEAKS_INFO_FIELDS if any(c in r for r in records)]
    return pd.DataFrame(records, columns=[*base_columns, *present, *_PEAKS_INFO_FLAGS])

In [None]:
def _span_infos(root, dataset, modifications, cells, replicates):
    """Load the '.peak' info files from ``<root>/span`` and tag them with a dataset label."""
    infos = load_peaks_infos(os.path.join(root, 'span'), '.peak', modifications, cells, replicates)
    infos['dataset'] = dataset
    return infos


gse26320_infos = _span_infos(GSE26320_PATH, 'ENCODE', MODIFICATIONS, GSE26320_CELLS, GSE26320_REPS)
immune_infos = _span_infos(IMMUNE_PATH, 'Roadmap', MODIFICATIONS, IMMUNE_CELLS, IMMUNE_REPS)
immgen_infos = _span_infos(IMMGEN_PATH, 'Immgen', ['ATAC'], IMMGEN_CELLS, IMMGEN_REPS)
ctcf_infos = _span_infos(CTCF_PATH, 'ENCODE', ['CTCF'], CTCF_CELLS, CTCF_REPS)
y20o20_infos = _span_infos(Y20O20_PATH, 'ABF', MODIFICATIONS_ABF, Y20O20_CELLS, Y20O20_REPS)

# One table for all collections; short modification labels are renamed in place afterwards.
full_infos = pd.concat(
    [gse26320_infos, immune_infos, immgen_infos, ctcf_infos, y20o20_infos]
).reset_index(drop=True)
update_abf_modifications(full_infos)
full_infos.sample(3)

In [None]:
# Show the distinct modification labels present in the combined table.
full_infos['modification'].unique()

In [None]:
# Remove outliers
# modification -> cell -> replicates to exclude; '' stands for the empty
# cell / replicate label assigned by the loader.
OUTLIERS = {
    # The NK entry used to sit under a mistyped 'H3K4me3:' key and never matched.
    'H3K4me3': {
        'NK': [''],
        '': ['OD6', 'OD7', 'OD14', 'YD2', 'YD3', 'YD10', 'YD14'],
    },
    'H3K27ac': {'': ['YD1', 'YD6']},
    'H3K27me3': {
        'TCell': [''],
        'BCell': [''],
        '': ['OD9', 'YD1', 'YD2', 'YD3', 'YD4', 'YD9', 'YD10', 'YD11'],
    },
    'H3K36me3': {'': ['OD3', 'OD6', 'OD12', 'OD18', 'OD20', 'YD1', 'YD3', 'YD4', 'YD5']},
}
# Flatten once so each row needs a single set lookup.
_outlier_keys = {
    (mo, co, ro)
    for mo, crlo in OUTLIERS.items()
    for co, rlo in crlo.items()
    for ro in rlo
}
full_infos['outlier'] = [
    key in _outlier_keys
    for key in zip(full_infos['modification'], full_infos['cell'], full_infos['replicate'])
]
print('Total outliers', sum(full_infos['outlier']))
df_peaks = full_infos[~full_infos['outlier']].copy()
print('Good peaks', len(df_peaks))

In [None]:
# Number of peaks per modification and dataset: bars with error caps,
# individual tracks overlaid as small dots.
sns.set_style('whitegrid')
ax = plt.figure(figsize=(6, 3)).add_subplot()
# To include the other assays use order=['CTCF', 'ATAC'] + MODIFICATIONS and
# hue_order=['ENCODE', 'Roadmap', 'ABF', 'Immgen'] in both calls.
sns.barplot(
    data=df_peaks, x='modification', y='peaks', hue='dataset',
    order=MODIFICATIONS,
    hue_order=['ENCODE', 'Roadmap', 'ABF'],
    capsize=.1, err_kws={'linewidth': 2}, edgecolor="black",
    ax=ax,
)
sns.swarmplot(
    data=df_peaks, x='modification', y='peaks', hue='dataset',
    order=MODIFICATIONS,
    hue_order=['ENCODE', 'Roadmap', 'ABF'],
    dodge=True, size=2, palette='dark:black', alpha=0.5,
    legend=False,
    ax=ax,
)
ax.xaxis.set_tick_params(rotation=90)
ax.title.set_text('Peaks')
# Legend goes to the right of the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Percentage of tracks per dataset and modification for which each flag is set.
for out in ['fit_snr_adjusted', 'fit_low_adjusted']:
    print(out)
    # Separate names so the example table `t` used by earlier cells is not overwritten.
    flag_counts = full_infos[['dataset', 'modification']].copy()
    flag_counts['number'] = full_infos[out].eq(True).astype(int)
    # The mean of a 0/1 column is the fraction of flagged tracks.
    flag_percent = (
        flag_counts.groupby(['dataset', 'modification'])['number'].mean() * 100
    ).reset_index()
    plt.figure(figsize=(3, 1))
    ax = plt.axes()
    sns.barplot(data=flag_percent, x='modification', y='number',
                hue='dataset',
                order=['CTCF', 'ATAC'] + MODIFICATIONS,
                hue_order=['ENCODE', 'Roadmap', 'Immgen'],
                ax=ax)
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_ylabel('%')
    sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
    # plt.tight_layout()
    ax.set_title(f'{out}')
    plt.show()

# Autocorrelation vs fragmentation

In [None]:
# Mean autocorrelation score per modification, with error caps.
plt.figure(figsize=(2.5, 3))
ax = plt.axes()
sns.barplot(data=df_peaks, x='modification', y='autocorrelation_average_score',
            hue='modification', legend=False,
            order=['CTCF', 'ATAC'] + MODIFICATIONS,
            capsize=.2, err_kws={'linewidth': 2},
            ax=ax)
ax.xaxis.set_tick_params(rotation=45)
ax.set_title('Average PEP autocorrelation')
ax.set_ylabel('Autocorrelation')
# 'Experiment' labels the x axis; it was previously set as a second y label,
# which replaced 'Autocorrelation'.
ax.set_xlabel('Experiment')
plt.tight_layout()
plt.show()

In [None]:
# One box plot per score, grouped by modification and split by dataset.
for y in ['autocorrelation_average_score', 'fragmentation_average_score']:
    ax = plt.figure(figsize=(5, 2.5)).add_subplot()
    sns.boxplot(
        df_peaks,
        x='modification',
        y=y,
        hue='dataset',
        order=['CTCF', 'ATAC', *MODIFICATIONS],
        hue_order=['ENCODE', 'Roadmap', 'ABF', 'Immgen'],
        ax=ax,
    )
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_title(y)
    # Legend goes to the right of the axes.
    sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.show()

In [None]:
# Mean peak length against the autocorrelation score, coloured by modification.
ax = plt.figure(figsize=(5, 3.5)).add_subplot()
sns.scatterplot(
    df_peaks,
    x='autocorrelation_average_score',
    y='length_mean',
    hue='modification',
    hue_order=['CTCF', 'ATAC', *MODIFICATIONS],
    style='dataset',
    alpha=0.8,
    ax=ax,
)
ax.set(xlabel='autocorrelation', ylabel='average length', title='Autocorrelation vs length')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Fragmentation score against autocorrelation score: a filled density per
# modification with the individual tracks drawn on top.
ax = plt.figure(figsize=(5, 3.5)).add_subplot()
sns.kdeplot(
    df_peaks,
    x='autocorrelation_average_score',
    y='fragmentation_average_score',
    hue='modification',
    hue_order=['CTCF', 'ATAC', *MODIFICATIONS],
    fill=True,
    alpha=0.2,
    thresh=.1,
    common_norm=False,
    common_grid=False,
    warn_singular=False,
    ax=ax,
)
sns.scatterplot(
    df_peaks,
    x='autocorrelation_average_score',
    y='fragmentation_average_score',
    hue='modification',
    hue_order=['CTCF', 'ATAC', *MODIFICATIONS],
    style='dataset',
    alpha=0.8,
    ax=ax,
)
ax.set(
    xlabel='narrow ← AC ~ width → broad',
    ylabel='high ← fragmentation → low',
    title='Autocorrelation vs Fragmentation',
    xlim=(-0.1, 1.1),
    ylim=(-0.1, 1.1),
)
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()



* Empirical threshold on the average fragmentation score; tracks at or above it are treated as having almost no fragmentation<br>
    `SPAN_DEFAULT_FRAGMENTATION_MAX_THRESHOLD = 0.8`

* Additional gap added for tracks with high fragmentation<br>
    `SPAN_DEFAULT_FRAGMENTATION_COMPENSATION_GAP = 50`  // in bins, i.e. × the default bin size of 100 bp

```
        if (avgFragmentation < SPAN_DEFAULT_FRAGMENTATION_MAX_THRESHOLD) {
            val fragmentationGap =
                ceil((SPAN_DEFAULT_FRAGMENTATION_MAX_THRESHOLD - avgFragmentation) /
                        SPAN_DEFAULT_FRAGMENTATION_MAX_THRESHOLD * SPAN_DEFAULT_FRAGMENTATION_COMPENSATION_GAP).toInt()
        }
```

In [None]:
# Same score plane as above, coloured by the number of peaks.
ax = plt.figure(figsize=(5, 3.5)).add_subplot()
sns.scatterplot(
    df_peaks,
    x='autocorrelation_average_score',
    y='fragmentation_average_score',
    hue='peaks',
    style='dataset',
    alpha=0.8,
    ax=ax,
)
ax.set(
    xlabel='narrow ← AC ~ width → broad',
    ylabel='high ← fragmentation → low',
    title='Peaks number',
)
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Same score plane as above, coloured by the mean peak length.
ax = plt.figure(figsize=(5, 3.5)).add_subplot()
sns.scatterplot(
    df_peaks,
    x='autocorrelation_average_score',
    y='fragmentation_average_score',
    hue='length_mean',
    style='dataset',
    alpha=0.8,
    ax=ax,
)
ax.set(
    xlabel='narrow ← AC ~ width → broad',
    ylabel='high ← fragmentation → low',
    title='Average length',
)
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

# PEP threshold sensitivity

In [None]:
def load_peaks_sensitivity(path, modifications, cells, replicates):
    """Load every ``*.peak.sensitivity.tsv`` table found directly in ``path``.

    The modification, cell and replicate of a file are recognized from its name:

    * files containing ``ATAC_seq``: the cell is the name with the ``SRR<digits>_``
      prefix and the ``_ATAC...`` suffix removed, the replicate is everything
      before the first underscore;
    * other files: the first entry of ``cells`` followed by ``_`` and the first
      entry of ``replicates`` followed by ``_`` or ``-`` found in the name;
    * the modification is the first entry of ``modifications`` contained in the
      name, compared case-insensitively.

    Files without a recognized cell or modification are skipped. An empty
    replicate string is accepted, only an unmatched replicate (``None``) is not.

    :param path: directory to scan (not recursive)
    :param modifications: candidate modification names
    :param cells: candidate cell names
    :param replicates: candidate replicate names
    :return: one DataFrame with the rows of all loaded tables plus the columns
        ``file``, ``modification``, ``cell``, ``replicate`` and ``SensitivityN``
        (rank of the row within its ``Gap`` group after sorting by
        ``Sensitivity``). A DataFrame without rows is returned when nothing
        could be loaded.
    """
    tables = []
    # Sorted listing makes the row order of the result reproducible.
    for f in tqdm(sorted(os.listdir(path))):
        if not f.endswith('.peak.sensitivity.tsv'):
            continue
        if 'ATAC_seq' in f:
            cell = re.sub('(SRR[0-9]+_)|(_ATAC.*)', '', f)
            rep = re.sub('_.*', '', f)
        else:
            cell = next((c for c in cells if f'{c}_' in f), None)
            rep = next((r for r in replicates if f'{r}_' in f or f'{r}-' in f), None)
        mod = next((m for m in modifications if m.lower() in f.lower()), None)
        if not cell or not mod or rep is None:
            continue
        tsv_path = os.path.join(path, f)
        try:
            print(tsv_path, mod, cell, rep)
            t = pd.read_csv(tsv_path, sep='\t')
            t['file'] = tsv_path
            t['modification'] = mod
            t['cell'] = cell
            t['replicate'] = rep
            for _, tg in t.groupby('Gap'):
                tg = tg.sort_values(by=['Sensitivity']).drop_duplicates().reset_index(drop=True)
                tg['SensitivityN'] = range(len(tg))
                tables.append(tg)
        except Exception as e:
            # Best effort: a broken table must not stop the loading of the others,
            # but report which file failed.
            print(f'Failed to load {tsv_path}: {e}')
    if not tables:
        # pd.concat raises on an empty list; give callers an empty table instead.
        return pd.DataFrame(columns=['file', 'modification', 'cell', 'replicate', 'SensitivityN'])
    return pd.concat(tables).reset_index(drop=True)


In [None]:
# Load the sensitivity tables of every data collection from its '<path>/span' folder
# and label each table with the dataset name used throughout the notebook.
gse26320_sensitivity = load_peaks_sensitivity(GSE26320_PATH + '/span', MODIFICATIONS, GSE26320_CELLS, GSE26320_REPS)
gse26320_sensitivity['dataset'] = 'ENCODE'
print('ENCODE')
# display(gse26320_sensitivity.head())

# The immune cells collection is labeled 'Roadmap'.
immune_sensitivity = load_peaks_sensitivity(IMMUNE_PATH + '/span', MODIFICATIONS, IMMUNE_CELLS, IMMUNE_REPS)
immune_sensitivity['dataset'] = 'Roadmap'
print('Immune')
# display(immune_sensitivity.head())

# Only 'ATAC' is searched for in the ImmGen file names.
immgen_sensitivity = load_peaks_sensitivity(IMMGEN_PATH + '/span', ['ATAC'], IMMGEN_CELLS, IMMGEN_REPS)
immgen_sensitivity['dataset'] = 'ImmGen'
print('ImmGen')
# display(immgen_sensitivity.head())

# CTCF tracks share the 'ENCODE' dataset label with GSE26320.
ctcf_sensitivity = load_peaks_sensitivity(CTCF_PATH + '/span', ['CTCF'], CTCF_CELLS, CTCF_REPS)
ctcf_sensitivity['dataset'] = 'ENCODE'
print('CTCF')
# display(ctcf_sensitivity.head())

# Single table with all datasets.
full_sensitivity = pd.concat([gse26320_sensitivity, immune_sensitivity, immgen_sensitivity, ctcf_sensitivity]).reset_index(drop=True)
# For log scale: missing values become 0 and both candidate columns are shifted by one,
# so that zeros stay drawable on logarithmic axes.
full_sensitivity.fillna(0, inplace=True)
full_sensitivity['CandidatesN'] += 1
full_sensitivity['CandidatesAL'] += 1
# Last expression of the cell: show three random rows.
full_sensitivity.sample(3)

## Examples

In [None]:
# Representative experiments, one per target.
# Each entry is (modification, cell, replicate, dataset); an empty replicate
# string matches rows whose 'replicate' value is ''.
EXAMPLES = [
    ('CTCF', 'H1', '', 'ENCODE'),
    ('ATAC', 'Blood_Monocyte', 'SRR5799491', 'ImmGen'),
    ('H3K4me1', 'CD34', '', 'Roadmap'),
    ('H3K4me3', 'K562', 'rep1', 'ENCODE'),
    ('H3K27ac', 'K562', 'rep1', 'ENCODE'),
    ('H3K27me3', 'Monocyte', '', 'Roadmap'),
    ('H3K36me3', 'CD4', '', 'Roadmap'),
]


In [None]:
# Sensitivity curve and projections for every example experiment that has data.
for m, c, r, ds in EXAMPLES:
    selected = ((full_sensitivity['dataset'] == ds) &
                (full_sensitivity['modification'] == m) &
                (full_sensitivity['cell'] == c) &
                (full_sensitivity['replicate'] == r))
    t = full_sensitivity[selected].copy().reset_index(drop=True)
    if t.empty:
        continue
    name = f'{ds} {m} {c} {r}'
    # Alternative views, disabled:
    # plot_sensitivity(t, name, 'CandidatesAL', 'CandidatesML', logx=True, logy=True)
    # plot_sensitivity(t, name, 'CandidatesN', 'SignalNoiseRatio', logx=True, logy=False)
    # plot_sensitivity(t, name, 'CandidatesN', 'SignalControlRatio', logx=True, logy=False)
    # plot_sensitivity(t, name, 'SignalControlRatio', 'SignalNoiseRatio', logx=False, logy=False)
    sp = plot_sensitivity(t, name)
    plot_projections(t, name, sp)

In [None]:
# Number of candidates along the PEP threshold rank for a single experiment,
# with x tick labels replaced by the actual 'Sensitivity' values.
for (m, c, r, ds) in [
    ('H3K27me3', 'Monocyte', '', 'Roadmap'),
]:
    t = full_sensitivity[(full_sensitivity['dataset'] == ds) &
                         (full_sensitivity['modification'] == m) &
                         (full_sensitivity['cell'] == c) &
                         (full_sensitivity['replicate'] == r) &
                         (full_sensitivity['Gap'] == 0)].copy().reset_index(drop=True)
    if len(t) == 0:
        # No rows for this experiment: nothing to draw, and sens.max() below
        # would fail on an empty array.
        continue
    name = f'{ds} {m} {c} {r}'
    plt.figure(figsize=(5, 2))
    ax = plt.axes()
    sns.lineplot(data=t, x='SensitivityN', y='CandidatesN',
                 sort=False,
                 estimator=None,
                 ax=ax)
    ax.set(yscale='log')
    sens = np.array(t['Sensitivity'])
    # First and last labels are out of the plot
    inner_ticks = ax.get_xticks()[1:-1]
    # A tick position is a rank; translate it to the sensitivity at that rank.
    sens_ticks = np.array([sens[int(tick)] if 0 <= tick < len(sens) else np.nan
                           for tick in inner_ticks])
    if len(sens_ticks) > 0:
        # The last visible tick is labeled with the largest sensitivity.
        sens_ticks[-1] = sens.max()
    labels = [''] + [f'{s:.0e}' for s in sens_ticks] + ['']
    # labels = ['', '-2e2', '-3', '-6e-2', '-1e-3', '-2e-5', '-4e-7']
    ax.set_xticklabels(labels)
    ax.set_title('Number of candidates')
    ax.set_xlabel('PEP threshold')
    ax.set_ylabel('Number')
    plt.tight_layout()
    plt.show()


## Per modification

In [None]:
# Candidates number vs average length at Gap == 0: one line per experiment,
# colored by modification.
# NOTE(review): 'CTCF' is not in hue_order here, unlike most other plots - confirm it is intended.
dfg = full_sensitivity[full_sensitivity['Gap'] == 0].sort_values(
    by=['dataset', 'modification', 'cell', 'replicate', 'SensitivityN'])
plt.figure(figsize=(5, 4))
ax = plt.axes()
for i, (_, t) in enumerate(dfg.groupby(['dataset', 'modification', 'cell', 'replicate'])):
    sns.lineplot(
        data=t, x='CandidatesN', y='CandidatesAL',
        hue='modification', hue_order=['ATAC'] + MODIFICATIONS,
        sort=False, estimator=None,
        alpha=0.3,
        legend=(i == 0),  # only the first line contributes a legend
        ax=ax,
    )
ax.set(xscale='log', yscale='log', title='Sensitivity')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# dfg = full_sensitivity[full_sensitivity['Gap'] == 0]
# for y in ['CandidatesN', 'CandidatesAL', 'SignalNoiseRatio', 'SignalControlRatio']:
#     print(y)
#     plt.figure(figsize=(8, 2))
#     ax = plt.axes()
#     sns.lineplot(data=dfg, x='Sensitivity', y=y, hue='modification',
#                  hue_order=['CTCF', 'ATAC'] + MODIFICATIONS,
#                  alpha=0.5,
#                  sort=False,
#                  errorbar=('se', .95),
#                  ax=ax)
#     ax.set(xscale='log')
#     ax.set(yscale='log')
#     ax.invert_xaxis()
#     ax.set_title(y)
#     sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
#     plt.tight_layout()
#     plt.show()

In [None]:
# plt.rc('font', size=5)
# dfg = full_sensitivity[full_sensitivity['Gap'] == 0]
# for y in ['CandidatesN', 'CandidatesAL', 'SignalNoiseRatio', 'SignalControlRatio']:
#     print(y)
#     plt.figure(figsize=(3, 6))
#     axs = [plt.subplot(7, 1, i + 1) for i in range(7)]
#     for i, m in enumerate(['CTCF', 'ATAC'] + MODIFICATIONS):
#         ax = axs[i]
#         sns.lineplot(data=dfg[dfg['modification'] == m], x='Sensitivity', y=y, hue='dataset',
#                      alpha=0.5,
#                      sort=False,
#                      errorbar=('se', .95),
#                      ax=ax)
#         ax.set(xscale='log')
#         ax.set(yscale='log')
#         ax.invert_xaxis()
#         ax.set_title(f'{m} {y}')
#         sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
#     plt.tight_layout()
#     plt.show()
# plt.rc('font', size=8)

## Individual

In [None]:
# import plotly.express as px
#
# full_sensitivity['experiment'] = full_sensitivity['dataset'] + ' ' + full_sensitivity['cell'] + ' ' + full_sensitivity['replicate']
# for m in ['ATAC'] + MODIFICATIONS:
#     dfm = full_sensitivity[(full_sensitivity['modification'] == m) &
#                            (full_sensitivity['Gap'] == 0)]
#     if len(dfm) == 0:
#         continue
#     fig = px.line(dfm, x='CandidatesN', y='CandidatesAL', color='experiment',
#                   hover_data=['experiment', 'Sensitivity', 'CandidatesN', 'CandidatesAL'],
#                   log_x=True, log_y=True,
#                   title=f'{m}',
#                   width=800, height=600)
#     fig.show()

In [None]:
# import plotly.express as px
#
# full_sensitivity['experiment'] = full_sensitivity['dataset'] + ' ' + full_sensitivity['cell'] + ' ' + full_sensitivity['replicate']
# plt.rc('font', size=4)
# for y in ['CandidatesN', 'CandidatesAL', 'SignalNoiseRatio']:
#     print(y)
#     for m in ['ATAC'] + MODIFICATIONS:
#         dfm = full_sensitivity[(full_sensitivity['modification'] == m) &
#                                (full_sensitivity['Gap'] == 0)]
#         if len(dfm) == 0:
#             continue
#         fig = px.line(dfm, x='Sensitivity', y=y, color='experiment',
#                       hover_data=['experiment', 'Sensitivity', y],
#                       log_x=True, log_y=True,
#                       title=f'{m} {y}',
#                       width=1000, height=400)
#         fig['layout']['xaxis']['autorange'] = "reversed"
#         fig.show()
# plt.rc('font', size=8)

# Triangles

In [None]:
# Sensitivity triangle of every experiment, keyed by '<dataset> <modification> <cell> <replicate>'.
sps = {}
for m, ds in product(['CTCF', 'ATAC'] + MODIFICATIONS, ['ENCODE', 'Roadmap', 'ImmGen']):
    in_dataset = full_sensitivity['dataset'] == ds
    of_modification = full_sensitivity['modification'] == m
    dfdsm = full_sensitivity[in_dataset & of_modification]
    for (cell, rep), t in dfdsm.groupby(['cell', 'replicate']):
        print(ds, m, cell, rep)
        # The triangle is detected on the Gap == 0 curve ordered by sensitivity.
        tt = t[t['Gap'] == 0].sort_values(by=['Sensitivity']).reset_index(drop=True)
        name = f'{ds} {m} {cell} {rep}'
        sp = detect_sensitivity_triangle(tt)
        sps[name] = sp

## All

In [None]:
# Grid of sensitivity curves: one figure per (modification, dataset), one panel per
# (cell, replicate), one line per Gap, with the triangle points stored in `sps` marked.
MAX_PANELS = 18  # upper bound on the number of panels per figure
for m in ['CTCF', 'ATAC'] + MODIFICATIONS:
    for ds in ['ENCODE', 'Roadmap', 'ImmGen']:
        print(ds, m)
        dfdsm = full_sensitivity[(full_sensitivity['dataset'] == ds) & (full_sensitivity['modification'] == m)]
        n = min(len(dfdsm[['cell', 'replicate']].drop_duplicates()), MAX_PANELS)
        print(n)
        if n == 0:
            # No experiments for this combination: do not draw an empty figure.
            continue
        # Axis limits shared by all panels: from 1 up to the maximum plus a 10% margin.
        minx, maxx = dfdsm['CandidatesN'].min(), dfdsm['CandidatesN'].max()
        minx, maxx = 1, maxx + (maxx - minx) * 0.1
        miny, maxy = dfdsm['CandidatesAL'].min(), dfdsm['CandidatesAL'].max()
        miny, maxy = 1, maxy + (maxy - miny) * 0.1
        if n <= 6:
            ncols, nrows = n, 1
        else:
            ncols = int(math.floor(math.sqrt(n) + 2))
            nrows = int(math.ceil(n / ncols))
        plt.figure(figsize=(3 * ncols + 1, 3 * nrows))
        axs = [plt.subplot(nrows, ncols, i + 1) for i in range(n)]
        for i, ((cell, rep), t) in enumerate(dfdsm.groupby(['cell', 'replicate'])):
            if i >= n:
                break
            ax = axs[i]
            # estimator=None draws the raw curve without aggregation,
            # the same way plot_sensitivity does.
            sns.lineplot(data=t, x="CandidatesN", y="CandidatesAL", hue='Gap',
                         palette='tab20',
                         sort=False,
                         estimator=None,
                         alpha=0.5,
                         ax=ax)

            # Triangle points index the Gap == 0 curve sorted by sensitivity.
            tt = t[t['Gap'] == 0].sort_values(by=['Sensitivity']).reset_index(drop=True)
            name = f'{ds} {m} {cell} {rep}'
            sp = sps.get(name)
            if sp is not None:
                for point, color, size in zip(sp, ['green', 'red', 'blue'], [20, 40, 20]):
                    sns.scatterplot(data=tt[tt.index.isin([point])],
                                    x="CandidatesN", y="CandidatesAL", color=color, s=size,
                                    legend=False, ax=ax)
            ax.set(yscale='log')
            ax.set(xscale='log')
            ax.set_title(f'{m} {cell} {rep}')
            ax.set_xlim(minx, maxx)
            ax.set_ylim(miny, maxy)
            if i == n - 1:
                # A single legend, next to the last panel.
                sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
            else:
                ax.legend().set_visible(False)
        plt.tight_layout()
        plt.show()


## Figures

In [None]:
# Summary figure: Gap == 0 curves of the example experiments with their three
# triangle points (columns S2CN<k> / S2CAL<k>, k = 0..2) drawn on top.
ts = []
for m, c, r, ds in EXAMPLES:
    selected = ((full_sensitivity['dataset'] == ds) &
                (full_sensitivity['modification'] == m) &
                (full_sensitivity['cell'] == c) &
                (full_sensitivity['replicate'] == r) &
                (full_sensitivity['Gap'] == 0))
    t = full_sensitivity[selected].copy().reset_index(drop=True)
    name = f'{ds} {m} {c} {r}'
    sp = sps.get(name)
    if sp is not None:
        # Store the coordinates of every triangle point as constant columns.
        for k in range(3):
            point = t[t.index == sp[k]]
            t[f'S2CN{k}'] = point['CandidatesN'].values[0]
            t[f'S2CAL{k}'] = point['CandidatesAL'].values[0]

    ts.append(t)

t = pd.concat(ts).reset_index(drop=True)
plt.figure(figsize=(5, 4))
ax = plt.axes()
sns.lineplot(
    data=t, x='CandidatesN', y='CandidatesAL',
    hue='modification', hue_order=['CTCF', 'ATAC'] + MODIFICATIONS,
    sort=False, estimator=None,
    alpha=0.5, ax=ax,
)

# NOTE(review): `size` is seaborn's semantic size variable, while other cells set the
# marker area with `s=` - confirm which one is intended here.
for k, color in enumerate(['green', 'red', 'blue']):
    sns.scatterplot(data=t, x=f'S2CN{k}', y=f'S2CAL{k}', color=color,
                    alpha=0.3, size=10, legend=False,
                    ax=ax)

ax.set(xscale='log', yscale='log', title='Candidates number vs length by sensitivity')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Table of the example experiments (Gap == 0) with the middle triangle point attached:
# S2N is its index, S2CN / S2CAL are the candidates number and average length at it.
sensitivities = sorted(full_sensitivity['Sensitivity'].unique())
ts = []
for m, c, r, ds in EXAMPLES:
    selected = ((full_sensitivity['dataset'] == ds) &
                (full_sensitivity['modification'] == m) &
                (full_sensitivity['cell'] == c) &
                (full_sensitivity['replicate'] == r) &
                (full_sensitivity['Gap'] == 0))
    t = full_sensitivity[selected].copy().reset_index(drop=True)
    name = f'{ds} {m} {c} {r}'
    sp = sps.get(name)
    if sp is not None:
        t['S2N'] = sp[1]
        middle = t[t.index == sp[1]]
        t['S2CN'] = middle['CandidatesN'].values[0]
        t['S2CAL'] = middle['CandidatesAL'].values[0]
        ts.append(t)
t = pd.concat(ts).reset_index(drop=True)
display(t.sample(10))


In [None]:
# One plot per metric along the sensitivity rank. For the two candidate metrics the
# stored middle triangle point (S2N, S2CN / S2CAL) is overlaid.
marker_columns = {'CandidatesN': 'S2CN', 'CandidatesAL': 'S2CAL'}
mod_order = ['CTCF', 'ATAC'] + MODIFICATIONS
for y in ['CandidatesN', 'CandidatesAL', 'SignalNoiseRatio', 'SignalControlRatio']:
    print(y)
    plt.figure(figsize=(5, 2))
    ax = plt.axes()
    sns.lineplot(
        data=t, x='SensitivityN', y=y,
        hue='modification', hue_order=mod_order,
        alpha=0.5, sort=False, estimator=None,
        ax=ax,
    )
    if y in marker_columns:
        sns.scatterplot(
            data=t, x='S2N', y=marker_columns[y],
            hue='modification', hue_order=mod_order,
            alpha=0.8, size=50, legend=False,
            ax=ax,
        )
    ax.set(yscale='log')
    ax.set_title(y)
    sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.show()


In [None]:
# Number of candidates along the PEP threshold rank, one line per modification.
plt.figure(figsize=(5, 2))
ax = plt.axes()
sns.lineplot(
    data=t, x='SensitivityN', y='CandidatesN',
    hue='modification', hue_order=['CTCF', 'ATAC'] + MODIFICATIONS,
    alpha=0.5, sort=False, estimator=None,
    ax=ax,
)
ax.set(yscale='log',
       title='Number of candidates',
       xlabel='PEP threshold rank',
       ylabel='Number')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()


# Fragmentation

In [None]:
# Collect the '.gaps.tsv' table stored next to every sensitivity file and add 'N',
# the candidates number relative to the maximum within the table.
ts = []
for (ds, m, c, r), t in full_sensitivity.groupby(['dataset', 'modification', 'cell', 'replicate']):
    files = t['file'].unique()
    # Experiments assembled from several files are ambiguous, skip them.
    if len(files) != 1:
        continue
    gaps = files[0].replace('.sensitivity.tsv', '.gaps.tsv')
    if not os.path.exists(gaps):
        continue
    print(ds, m, c, r, gaps)
    t = pd.read_csv(gaps, sep='\t').assign(dataset=ds, modification=m, cell=c, replicate=r)
    t['N'] = t['CandidatesN'] / np.max(t['CandidatesN'])
    ts.append(t)
gaps_df = pd.concat(ts).reset_index(drop=True)
del ts
gaps_df.sample(5)

## From files

In [None]:
# Experiments to draw individual plots for.
# Each entry is (modification, cell, replicate, dataset); commented-out entries
# are further candidates that are currently disabled.
FILE_EXAMPLES = [
    ('CTCF', 'H1', '', 'ENCODE'),
    ('ATAC', 'Blood_Monocyte', 'SRR5799491', 'ImmGen'),
    # ('ATAC', 'Blood_Monocyte', 'SRR5799492', 'ImmGen'),
    # ('ATAC', 'Small_Intestine_CD8_T_Cell', 'SRR5799444', 'ImmGen'),
    # ('ATAC', 'Small_Intestine_CD8_T_Cell', 'SRR5799445', 'ImmGen'),
    # ('H3K4me3', 'CD34', '', 'Roadmap'),
    # ('H3K4me3', 'K562', 'rep1', 'ENCODE'),
    # ('H3K4me3', 'CD34', 'rep2', 'Roadmap'),
    # ('H3K4me3', 'CD4', '', 'Roadmap'),
    # ('H3K27ac', 'K562', 'rep1', 'ENCODE'),
    # ('H3K4me1', 'H1', 'rep1', 'ENCODE'),
    # ('H3K4me1', 'H1', 'rep2', 'ENCODE'),
    # ('H3K27me3', 'GM12878', 'rep2', 'ENCODE'),
    # ('H3K27me3', 'NHLF', 'rep1', 'ENCODE'),
    # ('H3K27me3', 'CD4', '', 'Roadmap'),
    # ('H3K27me3', 'CD4', 'rep2', 'Roadmap'),
    ('H3K27me3', 'Monocyte', '', 'Roadmap'),
    # ('H3K27me3', 'BCell', '', 'Roadmap'),
    # ('H3K27me3', 'PBMC', '', 'Roadmap'),
    # ('H3K27me3', 'TCell', '', 'Roadmap'),
    # ('H3K27me3', 'H1', 'rep1', 'ENCODE'),
    # ('H3K27me3', 'HepG2', 'rep1', 'ENCODE'),
    # ('H3K27me3', 'HepG2', 'rep2', 'ENCODE'),
    # ('H3K36me3', 'CD4', '', 'Roadmap'),
    # ('H3K36me3', 'CD4', 'rep1', 'Roadmap'),
    # ('H3K36me3', 'GM12878', 'rep1', 'ENCODE'),
    # ('H3K36me3', 'CD34', '', 'Roadmap'),
    # ('H3K36me3', 'Monocyte', '', 'Roadmap'),
    # ('H3K36me3', 'BCell', '', 'Roadmap'),
    # ('H3K36me3', 'TCell', '', 'Roadmap'),
    # ('H3K36me3', 'PBMC', '', 'Roadmap'),
]

In [None]:
# Relative number of candidates ('N') against the gap for every selected experiment.
for (m, c, r, ds) in FILE_EXAMPLES:
    t = gaps_df[(gaps_df['dataset'] == ds) &
                (gaps_df['modification'] == m) &
                (gaps_df['cell'] == c) &
                (gaps_df['replicate'] == r)]
    if len(t) == 0:
        continue
    plt.figure(figsize=(5, 2))
    ax = plt.axes()
    sns.lineplot(data=t.reset_index(drop=True), x='Gap', y='N',
                 estimator=None,
                 alpha=0.5,
                 sort=False,
                 ax=ax)
    # 'N' is CandidatesN divided by its maximum, i.e. a 0..1 fraction and not a
    # percentage; the label matches the other fragmentation plots.
    ax.set_ylabel('Fraction')
    ax.set_title(f'{ds} {m} {c} {r}')
    plt.tight_layout()
    plt.show()

## Per modification

In [None]:
# Relative number of candidates against the gap, aggregated per modification.
# NOTE(review): errorbar=('se', .95) draws the standard error scaled by 0.95;
# if a 95% confidence interval was meant, that is ('ci', 95) - confirm.
plt.figure(figsize=(5, 2))
ax = plt.axes()
sns.lineplot(
    data=gaps_df, x='Gap', y='N',
    hue='modification', hue_order=['CTCF', 'ATAC'] + MODIFICATIONS,
    alpha=0.5, sort=False,
    errorbar=('se', .95),
    ax=ax,
)
# Disabled annotations and axis tweaks:
# ax.axvline(x=20, ymin=60, ymax=100, color='black', lw=1, ls='dotted')
# sns.scatterplot(x=[20], y=[75],
#                 color='red', s=20,
#                 legend=False, ax=ax)
# ax.set_ylim(20, 100)
# ax.set(xscale='log')
ax.set(ylabel='Fraction', title='Fragmentation')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Fragmentation curves, one stacked panel per modification, one line per dataset.
plt.rc('font', size=5)
plt.figure(figsize=(3, 6))
# One row per plotted target, so the grid follows the length of MODIFICATIONS
# instead of assuming exactly seven targets.
panel_mods = ['CTCF', 'ATAC'] + MODIFICATIONS
axs = [plt.subplot(len(panel_mods), 1, i + 1) for i in range(len(panel_mods))]
# Common lower y limit for all panels.
ymin = np.min(gaps_df['N'])
for i, m in enumerate(panel_mods):
    t = gaps_df[gaps_df['modification'] == m]
    if len(t) == 0:
        # No data for this target, its panel stays empty.
        continue
    ax = axs[i]
    sns.lineplot(t,
                 x='Gap', y='N',
                 hue='dataset',
                 alpha=0.5, errorbar=('se', .95),
                 ax=ax)
    # sns.scatterplot(x=[20], y=[70],
    #                 color='red', s=20,
    #                 legend=False, ax=ax)
    ax.set_ylim(bottom=ymin)
    ax.set_ylabel('Fraction')
    ax.set_title(f'{m} fragmentation')
    sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()
# Back to the font size used by the other plots.
plt.rc('font', size=8)

# Autocorrelations

## Coverage

In [None]:
# Collect the '.ac.coverage.tsv' table stored next to every sensitivity file.
ts = []
for m, ds in product(['CTCF', 'ATAC'] + MODIFICATIONS, ['ENCODE', 'Roadmap', 'ImmGen']):
    print(ds, m)
    selected = ((full_sensitivity['dataset'] == ds) &
                (full_sensitivity['modification'] == m) &
                (full_sensitivity['Gap'] == 0))
    dfdsm = full_sensitivity[selected]
    for (cell, rep), t in dfdsm.groupby(['cell', 'replicate']):
        files = t['file'].unique()
        # Experiments assembled from several files are ambiguous, skip them.
        if len(files) != 1:
            continue
        corrs = files[0].replace('.sensitivity.tsv', '.ac.coverage.tsv')
        if not os.path.exists(corrs):
            continue
        t = pd.read_csv(corrs, sep='\t').assign(dataset=ds, modification=m, cell=cell, replicate=rep)
        ts.append(t)
ac_cov_df = pd.concat(ts).reset_index(drop=True)
del ts
ac_cov_df.sample(5)

In [None]:
# Coverage autocorrelation for distances up to 200, aggregated per modification.
plt.figure(figsize=(5, 2))
ax = plt.axes()
near = ac_cov_df[ac_cov_df['D'] <= 200]
sns.lineplot(
    data=near, x='D', y='Correlation',
    hue='modification', hue_order=['CTCF', 'ATAC'] + MODIFICATIONS,
    alpha=0.5,  # errorbar=('se', .95),
    ax=ax,
)
ax.set(xlabel='Distance',
       ylabel='Autocorrelation',
       title='Raw signal coverage autocorrelation',
       xscale='log')
# ax.invert_xaxis()
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Coverage autocorrelation, one stacked panel per modification, one line per dataset.
plt.rc('font', size=5)
plt.figure(figsize=(3, 6))
# One row per plotted target, so the grid follows the length of MODIFICATIONS
# instead of assuming exactly seven targets.
panel_mods = ['CTCF', 'ATAC'] + MODIFICATIONS
axs = [plt.subplot(len(panel_mods), 1, i + 1) for i in range(len(panel_mods))]
# Common lower y limit for all panels.
ymin = np.min(ac_cov_df['Correlation'])
for i, m in enumerate(panel_mods):
    t = ac_cov_df[ac_cov_df['modification'] == m]
    if len(t) == 0:
        # No data for this target, its panel stays empty.
        continue
    ax = axs[i]
    sns.lineplot(t,
                 x='D', y='Correlation',
                 hue='dataset',
                 alpha=0.5,
                 ax=ax)
    ax.set_title(f'{m} Coverage autocorrelation')
    ax.set(xscale='log')
    ax.set_ylim(bottom=ymin)
    # ax.invert_xaxis()
    sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()
# Back to the font size used by the other plots.
plt.rc('font', size=8)

## PVals

In [None]:
# Collect the '.ac.pvals.tsv' table stored next to every sensitivity file.
ts = []
for m, ds in product(['CTCF', 'ATAC'] + MODIFICATIONS, ['ENCODE', 'Roadmap', 'ImmGen']):
    print(ds, m)
    selected = ((full_sensitivity['dataset'] == ds) &
                (full_sensitivity['modification'] == m) &
                (full_sensitivity['Gap'] == 0))
    dfdsm = full_sensitivity[selected]
    for (cell, rep), t in dfdsm.groupby(['cell', 'replicate']):
        files = t['file'].unique()
        # Experiments assembled from several files are ambiguous, skip them.
        if len(files) != 1:
            continue
        corrs = files[0].replace('.sensitivity.tsv', '.ac.pvals.tsv')
        if not os.path.exists(corrs):
            continue
        t = pd.read_csv(corrs, sep='\t').assign(dataset=ds, modification=m, cell=cell, replicate=rep)
        ts.append(t)
ac_pvals_df = pd.concat(ts).reset_index(drop=True)
del ts
ac_pvals_df.sample(5)

In [None]:
# Autocorrelation from the '.ac.pvals.tsv' tables for distances up to 200,
# aggregated per modification.
plt.figure(figsize=(5, 2))
ax = plt.axes()
near = ac_pvals_df[ac_pvals_df['D'] <= 200]
sns.lineplot(
    data=near, x='D', y='Correlation',
    hue='modification', hue_order=['CTCF', 'ATAC'] + MODIFICATIONS,
    alpha=0.5,  # errorbar=('se', .95),
    ax=ax,
)
# Disabled annotations:
# ax.axvline(x=10, ymin=0, ymax=1.0, color='black', lw=.5, ls='dotted')
# sns.scatterplot(x=[10], y=[0.7],
#                 color='red', s=20,
#                 legend=False, ax=ax)
ax.set(xlabel='Distance',
       ylabel='Autocorrelation',
       title='HMM PEP autocorrelation',
       xscale='log')
# ax.invert_xaxis()
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Autocorrelation from the '.ac.pvals.tsv' tables, one stacked panel per
# modification, one line per dataset.
plt.rc('font', size=5)
plt.figure(figsize=(3, 6))
# One row per plotted target, so the grid follows the length of MODIFICATIONS
# instead of assuming exactly seven targets.
panel_mods = ['CTCF', 'ATAC'] + MODIFICATIONS
axs = [plt.subplot(len(panel_mods), 1, i + 1) for i in range(len(panel_mods))]
# Common lower y limit for all panels.
ymin = np.min(ac_pvals_df['Correlation'])
for i, m in enumerate(panel_mods):
    t = ac_pvals_df[ac_pvals_df['modification'] == m]
    if len(t) == 0:
        # No data for this target, its panel stays empty.
        continue
    ax = axs[i]
    sns.lineplot(t,
                 x='D', y='Correlation',
                 hue='dataset',
                 alpha=0.5,  #errorbar=('se', .95),
                 ax=ax)
    # sns.scatterplot(x=[10], y=[0.7],
    #                 color='red', s=20,
    #                 legend=False, ax=ax)
    ax.set_ylim(bottom=ymin)
    ax.set_title(f'{m} Pvals autocorrelation')
    ax.set(xscale='log')
    # ax.invert_xaxis()
    sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()
# Back to the font size used by the other plots.
plt.rc('font', size=8)

## Per modification

In [None]:
# import plotly.express as px
#
# ac_pvals_df['experiment'] = ac_pvals_df['dataset'] + ' ' + ac_pvals_df['cell'] + ' ' + ac_pvals_df['replicate']
#
# plt.rc('font', size=4)
# # for m in ['ATAC'] + MODIFICATIONS:
# for m in ['H3K27me3']:
#     dfm = ac_pvals_df[(ac_pvals_df['modification'] == m) & (ac_pvals_df['D'] <= 50)]
#     if len(dfm) == 0:
#         continue
#     fig = px.line(dfm, x='D', y='Correlation', color='experiment',
#                   hover_data=['experiment'],
#                   title=f'{m} Pvals autocorrelation',
#                   width=1000, height=400)
#     fig.show()
# plt.rc('font', size=8)

# Candidates

In [None]:
# Candidates number and average length at the middle triangle point of every
# experiment, attached to full_infos as 'candidatesN' / 'candidatesAL'.
cns = {}
cals = {}
for (ds, m, c, r), t in full_sensitivity.groupby(['dataset', 'modification', 'cell', 'replicate']):
    name = f'{ds} {m} {c} {r}'
    sp = sps.get(name)
    if sp is None:
        # No triangle stored for this experiment; it gets the default 0 below
        # instead of failing with a KeyError.
        continue
    # Triangle indices refer to the Gap == 0 curve sorted by sensitivity.
    tt = t[t['Gap'] == 0].sort_values('Sensitivity').reset_index(drop=True)
    cns[(ds, m, c, r)] = tt['CandidatesN'].values[sp[1]]
    cals[(ds, m, c, r)] = tt['CandidatesAL'].values[sp[1]]

# One key per full_infos row, in row order.
info_keys = list(zip(full_infos['dataset'], full_infos['modification'],
                     full_infos['cell'], full_infos['replicate']))
# Experiments without sensitivity data get 0.
full_infos['candidatesN'] = [cns.get(key, 0) for key in info_keys]
full_infos['candidatesAL'] = [cals.get(key, 0) for key in info_keys]
full_infos.sample(3)

In [None]:
# Experiments excluded from the summary plots, as (modification, cell, replicate).
OUTLIERS = [
    ('H3K4me3', 'NK', ''),
    ('H3K27me3', 'TCell', ''),
    ('H3K27me3', 'BCell', ''),
]
# Row-wise flag: True for every experiment that is not listed as an outlier.
keep = [key not in OUTLIERS
        for key in zip(full_infos['modification'], full_infos['cell'], full_infos['replicate'])]
df_peaks = full_infos.loc[keep].copy()

In [None]:
# Candidates number per modification and dataset: bars with the individual
# experiments overlaid as dots.
sns.set_style('whitegrid')
plt.figure(figsize=(5, 3))
ax = plt.axes()
t = df_peaks
# Arguments common to the bars and to the dots.
shared = dict(data=t, x='modification', y='candidatesN', hue='dataset',
              order=['CTCF', 'ATAC'] + MODIFICATIONS,
              hue_order=['ENCODE', 'Roadmap', 'ABF', 'ImmGen'],
              ax=ax)
sns.barplot(capsize=.1, edgecolor="black", err_kws={'linewidth': 2}, **shared)
sns.stripplot(dodge=True, size=2, palette='dark:black', alpha=0.5, legend=False, **shared)
ax.xaxis.set_tick_params(rotation=90)
# ax.set_ylim(0, 2e5)
# ax.set(yscale='log')
# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Ten H3K27me3 experiments with the largest number of candidates.
k27me3_infos = full_infos[full_infos['modification'] == 'H3K27me3']
k27me3_ranked = k27me3_infos.sort_values(by=['candidatesN'], ascending=False)
k27me3_ranked[['cell', 'replicate', 'candidatesN']].head(10)

# Min signal-to-noise

In [None]:
# Minimal signal-to-noise ratio of every experiment, taken on the Gap == 0 curve
# between the first and the last triangle point.
min_sns = {}
for m in ['CTCF', 'ATAC'] + MODIFICATIONS:
    for ds in ['ENCODE', 'Roadmap', 'ImmGen']:
        print(ds, m)
        dfdsm = full_sensitivity[(full_sensitivity['dataset'] == ds) & (full_sensitivity['modification'] == m)]
        for (cell, rep), t in dfdsm.groupby(['cell', 'replicate']):
            tt = t[t['Gap'] == 0].sort_values(by=['Sensitivity']).reset_index(drop=True)
            if len(tt) == 0:
                # No Gap == 0 rows: np.argmin would fail on an empty array.
                continue
            sp = detect_sensitivity_triangle(tt)
            snr = tt['SignalNoiseRatio'].copy().values
            # Mask everything outside [sp[0], sp[2]) with a large sentinel so that
            # argmin only considers the range between the outer triangle points.
            snr[:sp[0]] = 1e6
            snr[sp[2]:] = 1e6
            mi = np.argmin(snr)
            min_snr = snr[mi]
            print(f'{ds} {m} {cell} {rep} {mi}: {min_snr}')
            min_sns[(ds, m, cell, rep)] = min_snr

# Experiments without sensitivity data get NaN instead of raising a KeyError.
full_infos['minimal_signal_noise_ratio'] = [
    min_sns.get(key, np.nan)
    for key in zip(full_infos['dataset'], full_infos['modification'],
                   full_infos['cell'], full_infos['replicate'])
]
full_infos.sample(3)

# Pvalues

## Examples

In [None]:
t = StringIO('''Q	LogNullP
0.0	-216.5814666748047
1.0E-5	-119.61390686035156
2.0E-5	-112.77011108398438
3.0000000000000004E-5	-105.98941040039062
4.0E-5	-102.6241455078125
5.0E-5	-102.6241455078125
6.0E-5	-99.27644348144531
7.000000000000001E-5	-99.27644348144531
8.0E-5	-95.94697570800781
9.0E-5	-92.6364517211914
1.0E-4	-92.6364517211914
1.1E-4	-92.6364517211914
1.2E-4	-89.34564208984375
1.3000000000000002E-4	-89.34564208984375
1.4000000000000001E-4	-89.34564208984375
1.5000000000000001E-4	-86.07534790039062
1.6E-4	-86.07534790039062
1.7E-4	-86.07534790039062
1.8E-4	-86.07534790039062
1.9E-4	-86.07534790039062
2.0E-4	-82.82644653320312
2.1E-4	-82.82644653320312
2.2E-4	-82.82644653320312
2.3E-4	-82.82644653320312
2.4E-4	-82.82644653320312
2.5E-4	-79.59986877441406
2.6000000000000003E-4	-79.59986877441406
2.7000000000000006E-4	-79.59986877441406
2.800000000000001E-4	-79.59986877441406
2.900000000000001E-4	-79.59986877441406
3.0000000000000014E-4	-79.59986877441406
3.1000000000000016E-4	-76.39661407470703
3.200000000000002E-4	-76.39661407470703
3.300000000000002E-4	-76.39661407470703
3.4000000000000024E-4	-76.39661407470703
3.5000000000000027E-4	-76.39661407470703
3.600000000000003E-4	-76.39661407470703
3.700000000000003E-4	-76.39661407470703
3.8000000000000035E-4	-76.3956298828125
3.9000000000000037E-4	-73.21774291992188
4.000000000000004E-4	-73.21774291992188
4.100000000000004E-4	-73.21774291992188
4.2000000000000045E-4	-73.21774291992188
4.300000000000005E-4	-73.21774291992188
4.400000000000005E-4	-73.21774291992188
4.5000000000000053E-4	-73.21774291992188
4.6000000000000056E-4	-73.21774291992188
4.700000000000006E-4	-70.0644302368164
4.800000000000006E-4	-70.0644302368164
4.900000000000006E-4	-70.0644302368164
5.000000000000007E-4	-70.0644302368164
5.100000000000007E-4	-70.0644302368164
5.200000000000007E-4	-70.0644302368164
5.300000000000007E-4	-70.0644302368164
5.400000000000008E-4	-70.0644302368164
5.500000000000008E-4	-70.0644302368164
5.600000000000008E-4	-70.0644302368164
5.700000000000008E-4	-70.0644302368164
5.800000000000009E-4	-68.97872924804688
5.900000000000009E-4	-66.93791198730469
6.000000000000009E-4	-66.93791198730469
6.10000000000001E-4	-66.93791198730469
6.20000000000001E-4	-66.93791198730469
6.30000000000001E-4	-66.93791198730469
6.40000000000001E-4	-66.93791198730469
6.500000000000011E-4	-66.93791198730469
6.600000000000011E-4	-66.93791198730469
6.700000000000011E-4	-66.93791198730469
6.800000000000011E-4	-66.93791198730469
6.900000000000012E-4	-66.93791198730469
7.000000000000012E-4	-66.93778228759766
7.100000000000012E-4	-63.83954620361328
7.200000000000012E-4	-63.83954620361328
7.300000000000013E-4	-63.83954620361328
7.400000000000013E-4	-63.83954620361328
7.500000000000013E-4	-63.83954620361328
7.600000000000013E-4	-63.83954620361328
7.700000000000014E-4	-63.83954620361328
7.800000000000014E-4	-63.83954620361328
7.900000000000014E-4	-63.83954620361328
8.000000000000014E-4	-63.83954620361328
8.100000000000015E-4	-63.83954620361328
8.200000000000015E-4	-63.83954620361328
8.300000000000015E-4	-63.83954620361328
8.400000000000016E-4	-63.83954620361328
8.500000000000016E-4	-63.8370475769043
8.600000000000016E-4	-60.77081298828125
8.700000000000016E-4	-60.77081298828125
8.800000000000017E-4	-60.77081298828125
8.900000000000017E-4	-60.77081298828125
9.000000000000017E-4	-60.77081298828125
9.100000000000017E-4	-60.77081298828125
9.200000000000018E-4	-60.77081298828125
9.300000000000018E-4	-60.77081298828125
9.400000000000018E-4	-60.77081298828125
9.500000000000018E-4	-60.77081298828125
9.600000000000019E-4	-60.77081298828125
9.700000000000019E-4	-60.77081298828125
9.80000000000002E-4	-60.77081298828125
9.90000000000002E-4	-60.77081298828125
''')
# The embedded table is tab-separated. A multi-character separator such as
# '\\t' is interpreted by pandas as a regular expression and forces the slower
# Python parsing engine (with a ParserWarning), so pass a real tab instead.
t = pd.read_csv(t, sep='\t')
plt.figure(figsize=(5, 2))
ax = plt.axes()
# Draw the rows in file order, without aggregation
sns.lineplot(data=t, x='Q', y='LogNullP', estimator=None,
             alpha=0.5,
             sort=False,
             ax=ax)
ax.set_title('K27me3 Monocyte')
plt.tight_layout()
plt.show()

## From files

In [None]:
# Gather the '.logps.tsv' tables stored next to every zero-gap sensitivity
# file and tag each row with the experiment it belongs to.
ts = []
for m, ds in product(['CTCF', 'ATAC'] + MODIFICATIONS, ['ENCODE', 'Roadmap', 'ImmGen']):
    print(ds, m)
    dfdsm = full_sensitivity[(full_sensitivity['dataset'] == ds) &
                             (full_sensitivity['modification'] == m) &
                             (full_sensitivity['Gap'] == 0)]
    for (cell, rep), group in dfdsm.groupby(['cell', 'replicate']):
        # Experiments backed by several source files are ambiguous: skip them
        if group['file'].nunique(dropna=False) != 1:
            continue
        pvals = group['file'].iloc[0].replace('.sensitivity.tsv', '.logps.tsv')
        if os.path.exists(pvals):
            ts.append(pd.read_csv(pvals, sep='\t').assign(
                dataset=ds, modification=m, cell=cell, replicate=rep))
pvals_df = pd.concat(ts).reset_index(drop=True)
del ts
pvals_df.sample(5)

In [None]:
# One p-value curve per example experiment; examples without data are skipped
for (m, c, r, ds) in FILE_EXAMPLES:
    is_example = ((pvals_df['dataset'] == ds) &
                  (pvals_df['modification'] == m) &
                  (pvals_df['cell'] == c) &
                  (pvals_df['replicate'] == r))
    t = pvals_df[is_example].copy().reset_index(drop=True)
    if t.empty:
        continue
    name = f'{ds} {m} {c} {r}'
    plt.figure(figsize=(4, 2))
    ax = plt.axes()
    sns.lineplot(data=t, x='Q', y='LogNullP', estimator=None, ax=ax)
    ax.set_title(f'Pvalues {name}')
    plt.tight_layout()
    plt.show()

## All

In [None]:
# Aggregated log null p-value curves per target, restricted to Q <= 0.005
pvals_df['MinusLogNullP'] = -pvals_df['LogNullP']
low_q = pvals_df[pvals_df['Q'] <= 0.005]
targets = ['CTCF', 'ATAC'] + MODIFICATIONS
plt.figure(figsize=(5, 2))
ax = plt.axes()
sns.lineplot(data=low_q,
             x='Q', y='LogNullP',
             hue='modification', hue_order=targets,
             alpha=0.5, errorbar=('se', .95),
             ax=ax)
ax.set_title('Pvalues')
# ax.set(yscale='log')
# ax.invert_xaxis()
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# One panel per target, lines split by dataset, restricted to Q <= 0.005.
plt.rc('font', size=5)
targets = ['CTCF', 'ATAC'] + MODIFICATIONS
# Derive the panel count from the target list instead of hard-coding 7, and keep
# the original per-panel height (6 inches for 7 panels)
plt.figure(figsize=(3, 6 * len(targets) / 7))
axs = [plt.subplot(len(targets), 1, i + 1) for i in range(len(targets))]
for i, m in enumerate(targets):
    ax = axs[i]
    sns.lineplot(pvals_df[(pvals_df['modification'] == m) & (pvals_df['dataset'] != '') & (pvals_df['Q'] <= 0.005)],
                 x='Q', y='LogNullP',
                 hue='dataset',
                 alpha=0.5, errorbar=('se', .95),
                 ax=ax)
    ax.set_title(f'{m} Pvalues')
    # move_legend raises ValueError when the axes has no legend, which happens
    # if there are no rows for this target
    if ax.get_legend() is not None:
        sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()
# Restore the font size used by the rest of the notebook
plt.rc('font', size=8)

# Summary

In [None]:
# Coverage minus the control coverage multiplied by beta and by the control scale
full_infos['effective_coverage'] = full_infos['coverage'].sub(
    full_infos['beta'] * full_infos['control_coverage'] * full_infos['control_scale'])

In [None]:
# Names of the full_infos columns that are summarised in the cells below
# (per-feature box plots and the feature correlation clustermap).
features = [
    'model_signal',
    'model_noise',
    'model_signal_to_noise',
    'log_null_pvals_mean',
    'log_null_pvals_std',
    'coverage',
    'control_coverage',
    'control_scale',
    'beta',
    'min_correlation',
    'coverage_non_zero',
    'coverage_max',
    'coverage_mean',
    'coverage_median',
    'coverage_std',
    # 'roughness',
    'signal_density',
    'noise_density',
    'signal_to_noise',
    'signal_to_control',
    'autocorrelation_average_score',
    'fragmentation_average_score'
]

# NOTE: full_features is the same list object as features (an alias, not a copy);
# the commented-out tail lists extra columns that are currently not included.
full_features = features # + ['candidatesN', 'candidatesAL', 'minimal_signal_noise_ratio', ]

In [None]:
# Transposed feature table of the H3K27ac CD4 experiments
h3k27ac_cd4 = (full_infos['modification'] == 'H3K27ac') & full_infos['cell'].isin(['CD4'])
full_infos.loc[h3k27ac_cd4, ['modification', 'cell'] + full_features].T

In [None]:
# Grid of box plots: one panel per feature, targets on the x axis, split by dataset
n_panels = len(full_features)
ncols = int(math.ceil(math.sqrt(n_panels + 3)))
nrows = int(math.ceil(n_panels / ncols))
plt.figure(figsize=(3.2 * ncols, 2.5 * nrows))
axs = [plt.subplot(nrows, ncols, k) for k in range(1, n_panels + 1)]
# Panels with an index below this value are not in the last grid row
first_last_row_panel = (nrows - 1) * ncols
for i, (ax, y) in enumerate(zip(axs, full_features)):
    sns.boxplot(data=full_infos, x='modification', y=y,
                hue='dataset',
                hue_order=['ENCODE', 'Roadmap', 'ABF', 'ImmGen'],
                order=['ATAC'] + MODIFICATIONS,
                showfliers=False,
                ax=ax)
    ax.xaxis.set_tick_params(rotation=90)
    if i < first_last_row_panel:
        # Only the last row keeps its x label and ticks
        ax.set_xlabel(None)
        ax.set_xticks([], minor=False)
        ax.set_xticklabels([])
    # Re-applied after the ticks may have been reset above
    ax.xaxis.set_tick_params(rotation=90)
    in_last_column = i % ncols == ncols - 1
    if not in_last_column:
        ax.legend().set_visible(False)
    else:
        sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Pearson correlations between the features, shown as a clustered heat map.
# Missing values are replaced by 0 in the table itself (in place) and in the
# resulting correlation matrix.
full_infos.fillna(0, inplace=True)
feature_table = full_infos[full_features]
corrs = feature_table.corr(method='pearson')
corrs = corrs.fillna(0)
plt.rc('font', size=5)
clustermap_options = dict(col_cluster=True, row_cluster=True,
                          figsize=(4.5, 4.5),
                          cmap=plt.cm.seismic,
                          vmin=-1, vmax=1)
sns.clustermap(corrs, **clustermap_options)
plt.show()
plt.rc('font', size=8)

In [None]:
# for m in MODIFICATIONS:
#     print(m)
#     corrs = full_infos[full_infos['modification'] == m][features].corr(method='spearman')
#     plt.rc('font', size=5)
#     sns.clustermap(corrs,
#                    col_cluster=True, row_cluster=True,
#                    figsize=(4, 4),
#                    cmap=plt.cm.seismic,
#                 vmin=-1, vmax=1)
#     plt.show()
#     plt.rc('font', size=8)

# Segments

In [None]:
import matplotlib.patches as mpatches


def d(a, b):
    """Divide ``a`` by ``b``; a zero divisor yields 0 instead of raising."""
    if b == 0:
        return 0
    return a / b


def plot_segments(sdf, df, name, sp=None, ax=None, legend=True):
    """Plot the percentage of new candidates for each sensitivity step.

    Bars show ``New / CandidatesN`` in percent (blue) over a constant 100 %
    background (orange). When ``df`` is given, log-scaled ``CandidatesN``
    (black) and ``CandidatesAL`` (brown) are overlaid, each rescaled so that
    its maximum is drawn at 90 on the percent axis.

    :param sdf: frame with 'New', 'CandidatesN' and 'Sensitivity' columns;
        it is not modified.
    :param df: optional frame with 'Sensitivity', 'CandidatesN' and
        'CandidatesAL' columns and as many rows as ``sdf``.
    :param name: plot title.
    :param sp: optional sequence of three x positions, marked with green (item 0),
        red (item 1) and blue (item 2) vertical lines.
    :param ax: axes to draw on; when omitted a new figure is created and shown.
    :param legend: whether to attach a legend to ``ax``.
    :raises ValueError: if ``df`` is given and its length differs from ``sdf``.
    """
    # Work on a copy: callers often pass a slice of a larger frame, and adding
    # columns to it would modify their data and trigger SettingWithCopyWarning
    sdf = sdf.copy()
    sdf['New %'] = [d(new, total) * 100 for new, total in zip(sdf['New'], sdf['CandidatesN'])]
    sdf['Total %'] = 100
    axisnone = ax is None
    if axisnone:
        plt.figure(figsize=(8, 2))
        ax = plt.axes()
    # Background
    sns.barplot(x=range(len(sdf)), y=sdf['Total %'],
                color='orange',
                alpha=0.5,
                ax=ax)
    # Foreground
    sns.barplot(x=range(len(sdf)), y=sdf['New %'],
                color='blue',
                alpha=0.5,
                ax=ax)
    if df is not None:
        if len(sdf) != len(df):
            raise ValueError(f'Length mismatch: {len(sdf)} segment rows vs {len(df)} sensitivity rows')
        df = df.sort_values(by=['Sensitivity'])
        # Rescale both curves so that they fit the percent axis
        ns = np.log1p(df['CandidatesN'])
        ns = ns * 80 / ns.max() + 10
        sns.lineplot(x=range(len(sdf)), y=ns,
                     color='black',
                     sort=False,
                     estimator=None,
                     ax=ax)
        als = np.log1p(df['CandidatesAL'])
        als = als * 80 / als.max() + 10
        sns.lineplot(x=range(len(sdf)), y=als,
                     color='brown',
                     sort=False,
                     estimator=None,
                     ax=ax)

    ax.set_title(name)
    ax.set_ylabel('peaks %')

    # Limit ticks number: label only the positions closest to round sensitivities
    sens = np.array(sorted(sdf['Sensitivity']))
    tens_closest = {np.argmin(np.fabs(sens - x)): x for x in [-100, -10, -1, -0.1, -1e-2, -1e-3, -1e-4, -1e-6]}
    labels = [str(tens_closest[i]) if i in tens_closest else '' for i in range(len(ax.get_xticklabels()))]
    ax.set_xticklabels(labels)
    ax.set_xlabel('Sensitivity')
    # axvline takes ymin/ymax as axes fractions in [0, 1]; the defaults span the
    # full height, which is what the previous out-of-range ymax=90 rendered as
    for i in tens_closest.keys():
        ax.axvline(x=i, color='grey', lw=0.5)

    # Points
    if sp is not None:
        ax.axvline(x=sp[2], color='blue', lw=2)
        ax.axvline(x=sp[1], color='red', lw=2)
        ax.axvline(x=sp[0], color='green', lw=2)

    # add legend
    if legend:
        handles = [mpatches.Patch(color='blue', label='new %'),
                   mpatches.Patch(color='orange', label='old %'),
                   mpatches.Patch(color='black', label='number')]
        if df is not None:
            handles.append(mpatches.Patch(color='brown', label='avglen'))
        # Attach the legend to the target axes explicitly: plt.legend() uses the
        # current axes, which is not necessarily ``ax``
        ax.legend(handles=handles)
        sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))

    # Make all bars exactly one unit wide, keeping them centered
    new_value = 1
    for patch in ax.patches:
        current_width = patch.get_width()
        diff = current_width - new_value
        # we change the bar width
        patch.set_width(new_value)
        # we recenter the bar
        patch.set_x(patch.get_x() + diff * .5)
    if axisnone:
        plt.tight_layout()
        plt.show()

## From files

In [None]:
# Segment plots for the FILE_EXAMPLES experiments (zero-gap rows only)
for (m, c, r, ds) in FILE_EXAMPLES:
    name = f'{ds} {m} {c} {r}'
    is_example = ((full_sensitivity['dataset'] == ds) &
                  (full_sensitivity['modification'] == m) &
                  (full_sensitivity['cell'] == c) &
                  (full_sensitivity['replicate'] == r) &
                  (full_sensitivity['Gap'] == 0))
    t = full_sensitivity[is_example]
    files = t['file'].unique()
    # Exactly one source file is required to locate the segments table
    if len(files) != 1:
        continue
    segments = files[0].replace('.sensitivity.tsv', '.segments.tsv')
    if not os.path.exists(segments):
        continue
    sdf = pd.read_csv(segments, sep='\t')
    sp = sps[name] if name in sps else None
    plot_segments(sdf, t, name, sp)

In [None]:
# Percentage of new candidates along the sensitivity steps for selected experiments
for (m, c, r, ds) in [
    ('H3K27me3', 'Monocyte', '', 'Roadmap'),
]:
    t = full_sensitivity[(full_sensitivity['dataset'] == ds) &
                         (full_sensitivity['modification'] == m) &
                         (full_sensitivity['cell'] == c) &
                         (full_sensitivity['replicate'] == r) &
                         (full_sensitivity['Gap'] == 0)].copy().reset_index(drop=True)
    # Without matching rows there is no file to look up (values[0] would raise)
    if t.empty:
        continue
    segments = t['file'].values[0].replace('.sensitivity.tsv', '.segments.tsv')
    if not os.path.exists(segments):
        continue
    sdf = pd.read_csv(segments, sep='\t')
    sdf['New %'] = [d(new, total) * 100 for new, total in zip(sdf['New'], sdf['CandidatesN'])]
    sdf['SensitivityN'] = range(len(sdf))
    name = f'{ds} {m} {c} {r}'
    plt.figure(figsize=(5, 2))
    ax = plt.axes()
    sns.lineplot(data=sdf, x='SensitivityN', y='New %',
                 sort=False,
                 estimator=None,
                 ax=ax)
    sens = np.array(t['Sensitivity'])
    # First and last labels are out of the plot
    inner_ticks = ax.get_xticks()[1:-1]
    sens_ticks = np.array([sens[int(x)] if 0 <= x < len(sens) else np.nan for x in inner_ticks])
    if len(sens_ticks) > 0:
        # The last visible tick is labelled with the largest sensitivity value
        sens_ticks[-1] = sens.max()
    labels = [''] + [f'{x:.0e}' for x in sens_ticks] + ['']
    ax.set_xticklabels(labels)
    ax.set_title('Percentage of new candidates')
    ax.set_xlabel('PEP threshold')
    ax.set_ylabel('New %    ')
    plt.tight_layout()
    plt.show()


In [None]:

# Segment plots for the EXAMPLES experiments (zero-gap rows only)
for (m, c, r, ds) in EXAMPLES:
    name = f'{ds} {m} {c} {r}'
    is_example = ((full_sensitivity['dataset'] == ds) &
                  (full_sensitivity['modification'] == m) &
                  (full_sensitivity['cell'] == c) &
                  (full_sensitivity['replicate'] == r) &
                  (full_sensitivity['Gap'] == 0))
    t = full_sensitivity[is_example]
    files = t['file'].unique()
    # Exactly one source file is required to locate the segments table
    if len(files) != 1:
        continue
    segments = files[0].replace('.sensitivity.tsv', '.segments.tsv')
    if not os.path.exists(segments):
        continue
    sdf = pd.read_csv(segments, sep='\t')
    sp = sps[name] if name in sps else None
    plot_segments(sdf, t, name, sp)

## Per modification

In [None]:
# Gather the '.segments.tsv' tables stored next to every zero-gap sensitivity
# file, tag each row with its experiment and number the rows within each table.
ts = []
for m, ds in product(['CTCF', 'ATAC'] + MODIFICATIONS, ['ENCODE', 'Roadmap', 'ImmGen']):
    print(ds, m)
    dfdsm = full_sensitivity[(full_sensitivity['dataset'] == ds) &
                             (full_sensitivity['modification'] == m) &
                             (full_sensitivity['Gap'] == 0)]
    for (cell, rep), t in dfdsm.groupby(['cell', 'replicate']):
        # Experiments backed by several source files are ambiguous: skip them
        if t['file'].nunique(dropna=False) != 1:
            continue
        segments = t['file'].iloc[0].replace('.sensitivity.tsv', '.segments.tsv')
        if os.path.exists(segments):
            sdf = pd.read_csv(segments, sep='\t')
            sdf = sdf.assign(dataset=ds, modification=m, cell=cell, replicate=rep,
                             SensitivityN=range(len(sdf)))
            ts.append(sdf)
segments_df = pd.concat(ts).reset_index(drop=True)
# Share of new candidates in percent; 0 when there are no candidates at all
segments_df['New %'] = [d(new, total) * 100
                        for new, total in zip(segments_df['New'], segments_df['CandidatesN'])]
del ts
segments_df.fillna(0, inplace=True)
segments_df.sample(5)

In [None]:
# Aggregated percentage of new candidates per sensitivity step, one line per target
targets = ['CTCF', 'ATAC'] + MODIFICATIONS
plt.figure(figsize=(5, 2))
ax = plt.axes()
sns.lineplot(data=segments_df,
             x='SensitivityN', y='New %',
             hue='modification', hue_order=targets,
             alpha=0.5, errorbar=('se', .95),
             ax=ax)
ax.set_title('Percentage of new candidates')
ax.set_xlabel('PEP threshold rank')
ax.set_ylabel('New %')
# ax.set(xscale='log')
# ax.invert_xaxis()
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))

plt.tight_layout()
plt.show()

In [None]:
# One panel per target, lines split by dataset: percentage of new candidates.
plt.rc('font', size=5)
targets = ['CTCF', 'ATAC'] + MODIFICATIONS
# Derive the panel count from the target list instead of hard-coding 7, and keep
# the original per-panel height (6 inches for 7 panels)
plt.figure(figsize=(3, 6 * len(targets) / 7))
axs = [plt.subplot(len(targets), 1, i + 1) for i in range(len(targets))]
for i, m in enumerate(targets):
    ax = axs[i]
    sns.lineplot(segments_df[segments_df['modification'] == m],
                 x='SensitivityN', y='New %',
                 hue='dataset',
                 alpha=0.5, errorbar=('se', .95),
                 ax=ax)
    ax.set_ylabel('peaks %')
    ax.set_title(f'{m} New %')
    # ax.set(xscale='log')
    # ax.invert_xaxis()
    # move_legend raises ValueError when the axes has no legend, which happens
    # if there are no rows for this target
    if ax.get_legend() is not None:
        sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))

plt.tight_layout()
plt.show()
# Restore the font size used by the rest of the notebook
plt.rc('font', size=8)

## Individual

In [None]:
# import plotly.express as px
#
# segments_df['experiment'] = segments_df['dataset'] + ' ' + segments_df['cell'] + ' ' + segments_df['replicate']
#
# plt.rc('font', size=4)
# # for m in ['ATAC'] + MODIFICATIONS:
# for m in ['ATAC']:
#     dfm = segments_df[segments_df['modification'] == m]
#     if len(dfm) == 0:
#         continue
#     fig = px.line(dfm, x='SensitivityN', y='New %', color='experiment',
#                   hover_data=['experiment', 'Sensitivity', 'New %'],
#                   log_x=True,
#                   title=f'{m} New %',
#                   width=1000, height=400)
#     fig.show()
# plt.rc('font', size=8)

## All

In [None]:
# Segment plots for every experiment, one figure per dataset/target pair.
MAX_PANELS = 18  # at most this many experiments are drawn per figure
for m in ['CTCF', 'ATAC'] + MODIFICATIONS:
    for ds in ['ENCODE', 'Roadmap', 'ImmGen']:
        print(ds, m)
        dfdsm = segments_df[(segments_df['dataset'] == ds) &
                            (segments_df['modification'] == m)]
        n = min(len(dfdsm[['cell', 'replicate']].copy().drop_duplicates()), MAX_PANELS)
        print(n)
        if n == 0:
            # Nothing to draw: do not create and show an empty figure
            continue
        if n <= 6:
            ncols, nrows = n, 1
        else:
            ncols = int(math.floor(math.sqrt(n) + 2))
            nrows = int(math.ceil(n / ncols))
        plt.figure(figsize=(5 * ncols + 1, 2 * nrows))
        axs = [plt.subplot(nrows, ncols, i + 1) for i in range(n)]
        for i, ((cell, rep), sdf) in enumerate(dfdsm.groupby(['cell', 'replicate'])):
            if i == MAX_PANELS:
                break
            t = full_sensitivity[(full_sensitivity['dataset'] == ds) &
                                 (full_sensitivity['modification'] == m) &
                                 (full_sensitivity['cell'] == cell) &
                                 (full_sensitivity['replicate'] == rep) &
                                 (full_sensitivity['Gap'] == 0)]
            ax = axs[i]
            name = f'{ds} {m} {cell} {rep}'
            sp = sps[name] if name in sps else None
            # Only the last panel carries the legend; plot_segments already
            # places it to the right of the axes
            plot_segments(sdf, t, name, sp, ax, i == n - 1)
        plt.tight_layout()
        plt.show()

# Coverage

## TCell H3K27me3 scores

In [None]:
out = StringIO('''Q	Coverage
1	1.101189395318887
2	1.101189395318887
3	1.171189395318887
4	1.171189395318887
5	1.171189395318887
6	1.171189395318887
7	1.171189395318887
8	1.2411893953188868
9	1.2411893953188868
10	1.2411893953188868
11	1.2411893953188868
12	1.2411893953188868
13	1.2411893953188868
14	1.2411893953188868
15	1.2411893953188868
16	1.2411893953188868
17	1.2411893953188868
18	1.2411893953188868
19	1.311189395318887
20	1.311189395318887
21	1.311189395318887
22	1.311189395318887
23	1.311189395318887
24	1.311189395318887
25	1.311189395318887
26	1.311189395318887
27	1.311189395318887
28	1.311189395318887
29	1.311189395318887
30	1.311189395318887
31	1.311189395318887
32	1.311189395318887
33	1.311189395318887
34	1.311189395318887
35	1.381189395318887
36	1.381189395318887
37	1.381189395318887
38	1.381189395318887
39	1.381189395318887
40	1.381189395318887
41	1.381189395318887
42	1.381189395318887
43	1.381189395318887
44	1.381189395318887
45	1.381189395318887
46	1.381189395318887
47	1.381189395318887
48	1.381189395318887
49	1.381189395318887
50	2.4823787906377737
51	2.552378790637774
52	2.552378790637774
53	2.622378790637774
54	2.622378790637774
55	2.622378790637774
56	2.622378790637774
57	2.692378790637774
58	2.692378790637774
59	2.692378790637774
60	2.692378790637774
61	2.692378790637774
62	2.692378790637774
63	2.692378790637774
64	2.762378790637774
65	2.762378790637774
66	2.762378790637774
67	2.762378790637774
68	2.762378790637774
69	2.762378790637774
70	2.762378790637774
71	3.933568185956661
72	4.003568185956661
73	4.003568185956661
74	4.003568185956661
75	4.073568185956661
76	4.073568185956661
77	4.073568185956661
78	4.073568185956661
79	4.143568185956661
80	4.143568185956661
81	4.143568185956661
82	5.244757581275548
83	5.384757581275548
84	5.384757581275548
85	5.4547575812755476
86	5.4547575812755476
87	5.524757581275548
88	5.524757581275548
89	6.695946976594435
90	6.8359469765944345
91	6.8359469765944345
92	6.905946976594435
93	8.147136371913321
94	8.217136371913321
95	8.287136371913322
96	9.598325767232208
97	10.909515162551095
98	12.360704557869983
99	16.294272743826642
100	144.1096440897587''')
# The embedded table is tab-separated; a real tab (rather than the two-character
# regex '\\t') keeps pandas on the default C parser and avoids a ParserWarning
t1 = pd.read_csv(out, sep='\t')
t1.sample(3)

In [None]:
# Coverage against Q for the embedded TCell table, log-scaled y axis
plt.figure(figsize=(4, 2))
ax = plt.axes()
sns.lineplot(data=t1, x='Q', y='Coverage', color='blue', ax=ax)
ax.set_yscale('log')
ax.set_title('H3K27me3 TCell')
plt.tight_layout()
plt.show()

## Monocyte H3K27me3 scores

In [None]:
out = StringIO('''Q	Coverage
1	0.3674968528850454
2	0.3674968528850454
3	0.3674968528850454
4	0.5783312352566969
5	0.5783312352566969
6	0.5783312352566969
7	0.5783312352566969
8	0.5783312352566969
9	0.5783312352566969
10	0.5783312352566969
11	0.5783312352566969
12	0.5783312352566969
13	0.7891656176283485
14	0.7891656176283485
15	0.7891656176283485
16	0.7891656176283485
17	0.7891656176283485
18	0.7891656176283485
19	0.7891656176283485
20	0.7891656176283485
21	0.7891656176283485
22	0.7891656176283485
23	0.7891656176283485
24	0.7891656176283485
25	0.7891656176283485
26	0.7891656176283485
27	0.7891656176283485
28	0.7891656176283485
29	0.7891656176283485
30	0.7891656176283485
31	0.7891656176283485
32	0.7891656176283485
33	1.0
34	1.0
35	1.0
36	1.0
37	1.0
38	1.0
39	1.0
40	1.0
41	1.0
42	1.0
43	1.0
44	1.0
45	1.0
46	1.0
47	1.0
48	1.0
49	1.0
50	1.0
51	1.0
52	1.0
53	1.0
54	1.0
55	1.0
56	1.0
57	1.0
58	1.0
59	1.0
60	1.0
61	1.0
62	1.0
63	1.3674968528850453
64	1.3674968528850453
65	1.578331235256697
66	1.578331235256697
67	1.578331235256697
68	1.7891656176283486
69	1.7891656176283486
70	1.7891656176283486
71	1.7891656176283486
72	1.7891656176283486
73	1.7891656176283486
74	1.7891656176283486
75	2.0
76	2.0
77	2.0
78	2.0
79	2.0
80	2.0
81	2.0
82	2.0
83	2.0
84	2.156662470513394
85	2.578331235256697
86	2.7891656176283486
87	2.7891656176283486
88	2.7891656176283486
89	3.0
90	3.0
91	3.0
92	3.578331235256697
93	3.7891656176283486
94	4.0
95	4.578331235256697
96	5.0
97	6.367496852885045
98	8.789165617628349
99	14.0
100	68.53735837982705''')
# The embedded table is tab-separated; a real tab (rather than the two-character
# regex '\\t') keeps pandas on the default C parser and avoids a ParserWarning
t1 = pd.read_csv(out, sep='\t')
t1.sample(3)

In [None]:
# Coverage against Q for the embedded Monocyte table, log-scaled y axis
plt.figure(figsize=(4, 2))
ax = plt.axes()
sns.lineplot(data=t1, x='Q', y='Coverage', color='blue', ax=ax)
ax.set_yscale('log')
ax.set_title('H3K27me3 Monocytes')
plt.tight_layout()
plt.show()

## CD4 H3K27me3 scores

In [None]:
out = StringIO('''Q	Coverage
1	0.8205303096321184
2	1.0171085913600988
3	1.213686873088079
4	1.213686873088079
5	1.213686873088079
6	1.213686873088079
7	1.213686873088079
8	1.4102651548160592
9	1.4102651548160592
10	1.4102651548160592
11	1.4102651548160592
12	1.4102651548160592
13	1.4102651548160592
14	1.4102651548160592
15	1.4102651548160592
16	1.4102651548160592
17	1.6068434365440396
18	1.6068434365440396
19	1.6068434365440396
20	1.6068434365440396
21	1.6068434365440396
22	1.6068434365440396
23	1.6068434365440396
24	1.6068434365440396
25	1.6068434365440396
26	1.6068434365440396
27	1.6068434365440396
28	1.6068434365440396
29	1.6068434365440396
30	1.6068434365440396
31	1.6068434365440396
32	1.8034217182720198
33	1.8034217182720198
34	1.8034217182720198
35	1.8034217182720198
36	1.8034217182720198
37	1.8034217182720198
38	1.8034217182720198
39	1.8034217182720198
40	1.8034217182720198
41	1.8034217182720198
42	1.8034217182720198
43	1.8034217182720198
44	1.8034217182720198
45	1.8034217182720198
46	1.8034217182720198
47	1.8034217182720198
48	1.8034217182720198
49	1.8034217182720198
50	1.8034217182720198
51	2.0
52	2.0
53	2.0
54	2.0
55	2.0
56	2.0
57	2.0
58	2.0
59	2.0
60	2.0
61	2.0
62	2.0
63	2.0
64	2.0
65	2.8205303096321184
66	3.213686873088079
67	3.213686873088079
68	3.4102651548160594
69	3.4102651548160594
70	3.4102651548160594
71	3.4102651548160594
72	3.6068434365440396
73	3.6068434365440396
74	3.6068434365440396
75	3.6068434365440396
76	3.6068434365440396
77	3.6068434365440396
78	3.6068434365440396
79	3.80342171827202
80	3.80342171827202
81	3.80342171827202
82	3.80342171827202
83	3.80342171827202
84	3.80342171827202
85	4.0
86	4.0
87	4.0
88	4.0
89	4.0
90	5.213686873088079
91	5.410265154816059
92	5.60684343654404
93	5.60684343654404
94	5.80342171827202
95	5.80342171827202
96	6.0
97	6.0
98	7.60684343654404
99	8.0
100	245.3337880504351''')
# The embedded table is tab-separated; a real tab (rather than the two-character
# regex '\\t') keeps pandas on the default C parser and avoids a ParserWarning
t2 = pd.read_csv(out, sep='\t')
t2.sample(3)

In [None]:
# Coverage against Q for the embedded CD4 table, log-scaled y axis
plt.figure(figsize=(4, 2))
ax = plt.axes()
sns.lineplot(data=t2, x='Q', y='Coverage', color='blue', ax=ax)
ax.set_yscale('log')
ax.set_title('H3K27me3 CD4')
plt.tight_layout()
plt.show()

## From files

In [None]:
# Gather the '.coverage.tsv' tables stored next to every zero-gap sensitivity
# file and tag each row with the experiment it belongs to.
ts = []
for m, ds in product(['CTCF', 'ATAC'] + MODIFICATIONS, ['ENCODE', 'Roadmap', 'ImmGen']):
    print(ds, m)
    dfdsm = full_sensitivity[(full_sensitivity['dataset'] == ds) &
                             (full_sensitivity['modification'] == m) &
                             (full_sensitivity['Gap'] == 0)]
    for (cell, rep), group in dfdsm.groupby(['cell', 'replicate']):
        # Experiments backed by several source files are ambiguous: skip them
        if group['file'].nunique(dropna=False) != 1:
            continue
        cov = group['file'].iloc[0].replace('.sensitivity.tsv', '.coverage.tsv')
        if os.path.exists(cov):
            ts.append(pd.read_csv(cov, sep='\t').assign(
                dataset=ds, modification=m, cell=cell, replicate=rep))
cov_df = pd.concat(ts).reset_index(drop=True)
del ts
# Integer-truncated coverage, used by the plots below
cov_df['CoverageI'] = cov_df['Coverage'].astype(int)
cov_df.sample(5)

In [None]:
# Integer coverage against Q for every EXAMPLES experiment, log-scaled y axis
for (m, c, r, ds) in EXAMPLES:
    is_example = ((cov_df['dataset'] == ds) &
                  (cov_df['modification'] == m) &
                  (cov_df['cell'] == c) &
                  (cov_df['replicate'] == r))
    cov = cov_df[is_example]
    plt.figure(figsize=(4, 2))
    ax = plt.axes()
    sns.lineplot(data=cov, x='Q', y='CoverageI', color='blue', ax=ax)
    ax.set_yscale('log')
    ax.set_title(f'{ds} {m} {c} {r}')
    plt.tight_layout()
    plt.show()

## QQs

In [None]:
# Coverage tables of three Roadmap H3K27me3 experiments, keyed by a display name
covs = []
for (m, c, r, ds) in [
    ('H3K27me3', 'Monocyte', '', 'Roadmap'),
    ('H3K27me3', 'CD4', '', 'Roadmap'),
    ('H3K27me3', 'TCell', '', 'Roadmap')
]:
    is_experiment = ((cov_df['dataset'] == ds) &
                     (cov_df['modification'] == m) &
                     (cov_df['cell'] == c) &
                     (cov_df['replicate'] == r))
    covs.append((f'{ds} {m} {c} {r}', cov_df[is_experiment]))

# Plot the coverage column of the first table against each of the other two
for i, j in [(0, 1), (0, 2)]:
    (n1, t1), (n2, t2) = covs[i], covs[j]
    minxy = min(t1['Coverage'].min(), t2['Coverage'].min())
    maxxy = max(t1['Coverage'].max(), t2['Coverage'].max())
    plt.figure(figsize=(3, 3))
    ax = plt.axes()
    # Dotted diagonal as the reference of identical values
    sns.lineplot(x=[minxy, maxxy], y=[minxy, maxxy], ax=ax, color='black', alpha=0.5, linestyle='dotted')
    sns.lineplot(x=t1['Coverage'].tolist(), y=t2['Coverage'].tolist(),
                 color='blue',
                 sort=False, estimator=None,
                 ax=ax)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(minxy, maxxy)
    ax.set_ylim(minxy, maxxy)
    ax.set_xlabel(n1)
    ax.set_ylabel(n2)
    ax.set_title('QQ')
    plt.tight_layout()
    plt.show()

## All

In [None]:
# Aggregated integer coverage for Q >= 50, one line per target, log-scaled y axis
upper_half = cov_df[cov_df['Q'] >= 50]
targets = ['CTCF', 'ATAC'] + MODIFICATIONS
plt.figure(figsize=(5, 2))
ax = plt.axes()
sns.lineplot(data=upper_half,
             x='Q', y='CoverageI',
             hue='modification', hue_order=targets,
             alpha=0.5, errorbar=('se', .95),
             ax=ax)
ax.set_title('Coverage')
ax.set_yscale('log')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# One panel per target, lines split by dataset: integer coverage against Q.
plt.rc('font', size=5)
targets = ['CTCF', 'ATAC'] + MODIFICATIONS
# Derive the panel count from the target list instead of hard-coding 7, and keep
# the original per-panel height (6 inches for 7 panels)
plt.figure(figsize=(3, 6 * len(targets) / 7))
axs = [plt.subplot(len(targets), 1, i + 1) for i in range(len(targets))]
for i, m in enumerate(targets):
    ax = axs[i]
    sns.lineplot(cov_df[cov_df['modification'] == m],
                 x='Q', y='CoverageI',
                 hue='dataset',
                 alpha=0.5, errorbar=('se', .95),
                 ax=ax)
    ax.set_title(f'{m} Coverage')
    ax.set(yscale='log')
    # move_legend raises ValueError when the axes has no legend, which happens
    # if there are no rows for this target
    if ax.get_legend() is not None:
        sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()
# Restore the font size used by the rest of the notebook
plt.rc('font', size=8)

# Candidates from bw coverage

In [None]:
# Chromosome name -> size, read from the two-column hg38.chrom.sizes table;
# names containing an underscore are left out
chrom_sizes_table = pd.read_csv(os.path.join(GSE26320_PATH, 'hg38.chrom.sizes'),
                                sep='\t', names=['chr', 'size'])
CHROM_SIZES = {
    name: size
    for name, size in zip(chrom_sizes_table['chr'], chrom_sizes_table['size'])
    if '_' not in name
}


def load_bws(path, modification, cells, replicates, control='Input'):
    """Build a table of the bigWig files found in a directory.

    Every directory entry whose name contains '.bw' is matched by substring
    against the given names; the first matching item of each collection wins.

    :param path: directory to scan.
    :param modification: iterable of modification names to look for in file names.
    :param cells: iterable of cell names to look for in file names.
    :param replicates: iterable of replicate names to look for in file names.
    :param control: marker of control files; a file matching no modification but
        containing this marker is recorded with modification 'Control'.
        Pass None to disable.
    :return: DataFrame with 'file', 'modification', 'cell' and 'replicate' columns,
        one row per file for which a modification, a cell and a replicate were found.
    """
    # Collect rows first and build the frame once: enlarging a DataFrame row by
    # row through .loc reallocates it on every insertion
    rows = []
    for f in tqdm(os.listdir(path)):
        if '.bw' not in f:
            continue
        mod = next((m for m in modification if m in f), None)
        if mod is None and control is not None and control in f:
            mod = 'Control'
        cell = next((c for c in cells if c in f), None)
        rep = next((r for r in replicates if r in f), None)
        # An empty-string replicate is a valid match, hence the explicit None check
        if mod and cell and rep is not None:
            rows.append((os.path.join(path, f), mod, cell, rep))
    return pd.DataFrame(rows, columns=['file', 'modification', 'cell', 'replicate'], dtype=object)

In [None]:
# bigWig files under GSE26320_PATH are labelled as the 'ENCODE' dataset
encode_bws = load_bws(GSE26320_PATH + '/bw', MODIFICATIONS, GSE26320_CELLS, GSE26320_REPS,
                      'Input').assign(dataset='ENCODE')
display(encode_bws.sample(3))

# bigWig files under IMMUNE_PATH are labelled as the 'Roadmap' dataset
immune_bws = load_bws(IMMUNE_PATH + '/bw', MODIFICATIONS, IMMUNE_CELLS, IMMUNE_REPS,
                      'Control').assign(dataset='Roadmap')
display(immune_bws.sample(3))

# Combined table; the per-dataset row numbers are kept as an 'index' column
all_bws = pd.concat([encode_bws, immune_bws]).reset_index(drop=False)

In [None]:
import pyBigWig

# Total signal of every bigWig file, summed over the chromosomes in CHROM_SIZES
total_coverages = {}
ts = []
for _, (ds, m, c, r) in tqdm(list(all_bws[['dataset', 'modification', 'cell', 'replicate']].iterrows())):
    print(ds, m, c, r)
    t = all_bws[(all_bws['modification'] == m) &
                (all_bws['cell'] == c) &
                (all_bws['replicate'] == r) &
                (all_bws['dataset'] == ds)]
    if len(t) == 0:
        continue
    bw_path = t['file'].values[0]

    try:
        with pyBigWig.open(bw_path) as bw:
            total_coverage = sum(
                bw.stats(chrom, exact=True, type='sum')[0] for chrom in CHROM_SIZES.keys() if '_' not in chrom)
            print('Total coverage', total_coverage)
            # Multiplier to align BAM coverage with BigWig estimation
            total_coverage *= 1.1e-2
            total_coverages[(ds, m, c, r)] = total_coverage
    except Exception as e:
        # Best effort: keep processing the remaining files, but report the
        # failure instead of silently dropping this experiment
        print(f'ERROR: failed to compute coverage for {bw_path}: {e}')

total_coverages_df = pd.DataFrame([(ds, m, c, r, n) for (ds, m, c, r), n in total_coverages.items()],
                                  columns=['dataset', 'modification', 'cell', 'replicate', 'reads'])
total_coverages_df.sample(5)

In [None]:
# Prepare BAM files
# https://github.com/Boyle-Lab/Blacklist/raw/refs/heads/master/lists/hg38-blacklist.v2.bed.gz
# Blacklisted regions; the first three columns are named chr, start and end.
# Source: https://github.com/Boyle-Lab/Blacklist/raw/refs/heads/master/lists/hg38-blacklist.v2.bed.gz
BLACKLIST_PATH = os.path.expanduser('~/data') + '/hg38-blacklist.v2.bed'
blacklist_df = pd.read_csv(BLACKLIST_PATH, sep='\t', header=None).rename(
    columns={0: 'chr', 1: 'start', 2: 'end'})
blacklist_df

In [None]:
def collect_bw_stats(bw_path, chrom_sizes, blacklist, region_len, top_regions, work_regions, resolution):
    """Select the highest-signal fixed-size regions of a bigWig and profile them.

    Chromosomes from ``chrom_sizes`` (names containing ``'_'`` are skipped) are cut
    into consecutive bins laid out one after another in a single flat array. Bins
    touched by ``blacklist`` are zeroed, the ``top_regions`` bins with the largest
    summed signal are picked, thinned to roughly ``work_regions`` when there are
    more, and every remaining region is read again in sub-bins of ``resolution`` bp.

    :param bw_path: path of the bigWig file, opened with ``pyBigWig``.
    :param chrom_sizes: mapping of chromosome name to chromosome length.
    :param blacklist: table with ``chr``, ``start`` and ``end`` columns.
    :param region_len: bin length used for the genome-wide ranking.
    :param top_regions: number of top-covered bins to keep.
    :param work_regions: maximum number of regions to profile.
    :param resolution: sub-bin length used when profiling a selected region.
    :return: ``(df, region_stats)`` where ``df`` has ``chromosome``, ``start`` and
        ``end`` columns and ``region_stats`` holds one array of per-sub-bin sums
        for each row of ``df``.
    :raises Exception: if ``top_regions`` exceeds the total number of bins.
    """
    print('Prepare chromosome indexes')
    # chromosome -> (first flat index, one-past-last flat index)
    chr_indx = {}
    indx = 0
    for chrom, chr_size in chrom_sizes.items():
        if '_' in chrom:
            continue
        bins = int(math.floor(chr_size / region_len))
        chr_indx[chrom] = (indx, indx + bins)
        indx += bins
    chromosomes = list(chr_indx)

    print('Compute coverage in regions')
    region_coverages = np.zeros(indx)
    with pyBigWig.open(bw_path) as bw:
        for chrom, (istart, iend) in chr_indx.items():
            if iend == istart:
                # Chromosome shorter than one region: nothing to query.
                continue
            # Sizes come from the chrom_sizes argument, the same mapping the bins
            # were derived from; empty bins are reported as None and become 0.
            region_coverages[istart:iend] = [
                x or 0
                for x in bw.stats(chrom, 1, chrom_sizes[chrom], nBins=iend - istart, exact=True, type='sum')]

    print('Processing blacklisted regions')
    blacklist_marked = 0
    for chrom, start, end in zip(blacklist['chr'], blacklist['start'], blacklist['end']):
        if chrom not in chr_indx:
            continue
        istart, iend = chr_indx[chrom]
        # Flat position = chromosome offset + bin number, clamped to the bins of
        # this chromosome so neighbouring chromosomes are never touched.
        # Extent kept from the original: bins floor(start / region_len) up to and
        # including ceil(end / region_len).
        lo = istart + max(int(math.floor(start / region_len)), 0)
        hi = min(istart + int(math.ceil(end / region_len)) + 1, iend)
        if hi > lo:
            region_coverages[lo:hi] = 0
            blacklist_marked += hi - lo
    print(f'Marked {blacklist_marked} / {len(region_coverages)} bins as blacklist')

    if top_regions > len(region_coverages):
        raise Exception(f'Too many top regions {top_regions} > {len(region_coverages)}')

    print('Split by chromosomes')
    chr_xs = [[] for _ in chromosomes]  # selected flat indexes, per chromosome
    ichr = 0
    for x in np.sort(np.argpartition(region_coverages, -top_regions)[-top_regions:]):
        # Index ranges are end-exclusive: an index equal to the end already
        # belongs to the next chromosome.
        while chr_indx[chromosomes[ichr]][1] <= x:
            ichr += 1
        chr_xs[ichr].append(int(x))

    print('Collect final regions')
    peaks = []
    for chrom, xs in zip(chromosomes, chr_xs):
        offset = chr_indx[chrom][0]
        for x in xs:
            peaks.append((chrom, (x - offset) * region_len, (x + 1 - offset) * region_len))
    df = pd.DataFrame(peaks, columns=['chromosome', 'start', 'end'])
    if len(df) > work_regions:
        print(f'Pick {work_regions} / {top_regions} uniform regions for computation speedup')
        step = int(math.ceil(len(df) / float(work_regions)))
        df = df.iloc[::step].copy()

    print('Collecting bigwig stats')
    region_stats = []
    with pyBigWig.open(bw_path) as bw:
        for chrom, start, end in zip(df['chromosome'], df['start'], df['end']):
            # Plain Python ints, as the row-wise iteration used to provide.
            start, end = int(start), int(end)
            stats = np.array(
                bw.stats(chrom, start, end, nBins=int(math.ceil((end - start) / resolution)), exact=True, type='sum'))
            region_stats.append(stats)
    return df, region_stats

In [None]:
# Arguments passed to collect_bw_stats by the cells below.
REGION_LEN = 20_000    # region_len
TOP_REGIONS = 10_000   # top_regions
WORK_REGIONS = 1000    # work_regions
RESOLUTION = 100       # resolution

# Compute on top chromosomes for speed
chrom_sizes_part = {name: CHROM_SIZES[name] for name in ('chr1', 'chr2', 'chr3')}

## Candidates calling from ModelToPeaks.kt

In [None]:
def next_clear_bit(arr, start):
    """Return the first index ``>= start`` whose element is falsy, or ``-1`` if none."""
    return next((pos for pos in range(start, len(arr)) if not arr[pos]), -1)


def next_set_bit(arr, start):
    """Return the first index ``>= start`` whose element is truthy, or ``-1`` if none."""
    return next((pos for pos in range(start, len(arr)) if arr[pos]), -1)


def set_bits(arr, start, end):
    """Set ``arr[start:end]`` to ``True`` in place, one element at a time.

    ``end`` is exclusive; nothing is returned.
    """
    for pos in range(start, end):
        arr[pos] = True

# Python port of BitList.aggregate
def aggregate(arr, gap=0):
    """Collapse a boolean sequence into half-open ``(start, end)`` runs of set bits.

    With ``gap > 0`` a run is joined to the previous one when at most ``gap``
    clear bits separate them. A run that reaches the end of ``arr`` is closed at
    ``len(arr)``; the earlier search-based version returned ``-1`` for "no clear
    bit left", which dropped such a run and could loop forever when ``gap > 0``.

    :param arr: indexable sequence of truthy/falsy values.
    :param gap: maximum number of clear bits bridged between two runs.
    :return: list of ``(start, end)`` tuples in ascending order, ``end`` exclusive.
    """
    size = len(arr)
    ranges = []
    pos = 0
    while pos < size:
        if not arr[pos]:
            pos += 1
            continue
        left = pos
        while pos < size and arr[pos]:
            pos += 1
        # pos is now the first clear bit after the run, or size.
        if gap > 0 and ranges and left - ranges[-1][1] <= gap:
            # Close enough to the previous run: extend it over the hole.
            ranges[-1] = (ranges[-1][0], pos)
        else:
            ranges.append((left, pos))
    return ranges

In [None]:
def call_candidates(bins, gap):
    """Merge set-bit runs separated by fewer than ``gap`` clear bits.

    ``bins`` is modified in place: every hole strictly shorter than ``gap``
    between two neighbouring runs is filled, then the runs are recomputed.

    :param bins: mutable boolean sequence.
    :param gap: holes shorter than this are filled.
    :return: list of ``(start, end)`` runs as produced by ``aggregate``.
    """
    runs = aggregate(bins)
    if not runs:
        return runs

    # Walk neighbouring runs pairwise and close the short holes between them.
    for (_, prev_end), (next_start, _) in zip(runs, runs[1:]):
        if next_start - prev_end < gap:
            set_bits(bins, prev_end, next_start)

    merged = aggregate(bins)
    for (start, end) in merged:
        assert start < end, f'{start} >= {end}\n{bins}'
    return merged


def analyze_candidates(coverages, fdr=0.05):
    """Count candidates and their average length over a range of sensitivities.

    For 100 sensitivities ``s`` spaced evenly on a log10 scale from 1e-8 to 10,
    values at or above the ``100 * (1 - s * fdr)`` percentile of ``coverages`` are
    marked and passed to ``call_candidates``.

    :param coverages: one-dimensional numeric sequence of per-bin coverage.
    :param fdr: factor applied to the sensitivity when choosing the percentile.
    :return: DataFrame with ``Sensitivity``, ``Gap``, ``CandidatesN`` (number of
        candidates) and ``CandidatesAL`` (average candidate length in bins,
        ``0`` when there are none).
    """
    rows = []
    for s in np.exp(np.linspace(-8, 1, 100) * math.log(10)):
        cq = np.percentile(coverages, 100 * (1.0 - s * fdr))
        for gap in [0]:
            # One vectorised comparison instead of a per-element Python loop; a
            # fresh array per call because call_candidates may modify it.
            bins = np.greater_equal(coverages, cq).astype(bool)
            candidates = call_candidates(bins, gap)
            if len(candidates) > 0:
                avg_len = sum((end - start) for (start, end) in candidates) / len(candidates)
            else:
                avg_len = 0
            rows.append((s, gap, len(candidates), avg_len))
    return pd.DataFrame(rows, columns=['Sensitivity', 'Gap', 'CandidatesN', 'CandidatesAL'])

## Visualization

In [None]:
def plot_candidates(t, title=None):
    """Plot average candidate length against the number of candidates.

    One line per ``Gap`` value is drawn on log-log axes in the row order of
    ``t``; the legend is placed to the right of the axes.

    :param t: table with ``CandidatesN``, ``CandidatesAL`` and ``Gap`` columns.
    :param title: optional figure title, skipped when empty or ``None``.
    """
    plt.figure(figsize=(5, 4))
    ax = plt.axes()
    sns.lineplot(
        data=t, x="CandidatesN", y="CandidatesAL", hue='Gap',
        estimator=None,  # draw every observation, no aggregation
        sort=False,
        palette='tab20',
        ax=ax,
    )
    ax.set(yscale='log', xscale='log')
    sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
    if title:
        plt.title(title)
    plt.tight_layout()
    plt.show()

## Examples

In [None]:
# Candidate curves for each example track.
for (m, c, r, ds) in EXAMPLES:
    selector = ((all_bws['modification'] == m) &
                (all_bws['cell'] == c) &
                (all_bws['replicate'] == r) &
                (all_bws['dataset'] == ds))
    bw_path = all_bws.loc[selector, 'file'].values[0]

    _, region_stats = collect_bw_stats(
        bw_path, chrom_sizes_part, blacklist_df,
        REGION_LEN, TOP_REGIONS, WORK_REGIONS, RESOLUTION,
    )
    # Flatten the per-region profiles into a single coverage vector.
    coverages = np.concatenate(region_stats, axis=0)
    t = analyze_candidates(coverages)
    display(t.head(10))
    display(t.tail(20))
    # Shift by one so that zero counts survive the log-scaled axes.
    t = t.fillna(0)
    t['CandidatesN'] = t['CandidatesN'] + 1
    t['CandidatesAL'] = t['CandidatesAL'] + 1
    t = t.sort_values(['Sensitivity', 'Gap'])
    plot_candidates(t, f'{m} {c} {r}')
    plot_projections(t, f'{m} {c} {r}', None)


## Segments

In [None]:
# For each example track: how many candidates at every sensitivity step are new,
# i.e. do not overlap any bin marked at the previous step.
for (m, c, r, ds) in EXAMPLES:
    bw_path = all_bws[(all_bws['modification'] == m) &
                      (all_bws['cell'] == c) &
                      (all_bws['replicate'] == r) &
                      (all_bws['dataset'] == ds)]['file'].values[0]

    _, region_stats = collect_bw_stats(bw_path, chrom_sizes_part, blacklist_df, REGION_LEN, TOP_REGIONS,
                                       WORK_REGIONS, RESOLUTION)
    coverages = np.concatenate(region_stats, axis=0)
    rows = []
    fdr = 0.05
    prev_bins = None
    for s in np.exp(np.linspace(-8, 1, 50) * math.log(10)):
        cq = np.percentile(coverages, 100 * (1.0 - s * fdr))
        # Vectorised threshold mask; a new array on every step.
        bins = np.greater_equal(coverages, cq).astype(bool)
        if prev_bins is None:
            # The first threshold only provides the reference mask.
            prev_bins = bins
            continue
        candidates = call_candidates(bins, 0)
        total = len(candidates)
        overlap_candidates = sum(1 for (start, end) in candidates if prev_bins[start:end].any())
        rows.append((s, total, total - overlap_candidates, overlap_candidates))
        prev_bins = bins

    sdf = pd.DataFrame(rows, columns=['Sensitivity', 'Total', 'New', 'Old'])
    # display(sdf)
    sdf['New %'] = [d(n, t) * 100 for n, t in zip(sdf['New'], sdf['Total'])]
    sdf['Total %'] = 100
    plt.figure(figsize=(10, 1))
    ax = plt.axes()
    # Background: full-height bars; whatever stays visible above the foreground is "old"
    sns.barplot(sdf, x='Sensitivity', y='Total %',
                color='orange',
                ax=ax)
    # Foreground: share of new candidates
    sns.barplot(sdf, x='Sensitivity', y='New %',
                color='blue',
                ax=ax)
    ax.set_ylabel('peaks %')
    name = f'{ds} {m} {c} {r}'
    ax.set_title(name)
    # Points
    # sp = sps[name]
    # ax.axvline(x=sp[2], ymin=0, ymax=100, color='blue', lw=2)
    # ax.axvline(x=sp[1], ymin=0, ymax=100, color='red', lw=2)
    # ax.axvline(x=sp[0], ymin=0, ymax=100, color='green', lw=2)
    # Limit ticks number: keep the first, the last and every 20th label
    for tick, label in enumerate(ax.get_xticklabels()):
        if 0 < tick < len(sdf) - 1 and tick % 20 != 0:
            label.set_visible(False)
    ax.invert_xaxis()
    # add legend
    top_bar = mpatches.Patch(color='orange', label='old %')
    bottom_bar = mpatches.Patch(color='blue', label='new %')
    plt.legend(handles=[bottom_bar, top_bar])
    sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()


## All

In [None]:
# for m in ['ATAC'] + MODIFICATIONS:
#     for ds in ['ENCODE', 'Roadmap', 'ImmGen']:
#         print(ds, m)
#         dft = all_bws[(all_bws['dataset'] == ds) & (all_bws['modification'] == m)].copy().reset_index(drop=True)
#
#         n = len(dft)
#         print(n)
#         if n <= 6:
#             ncols, nrows = n, 1
#         else:
#             ncols = int(math.floor(math.sqrt(n) + 4))
#             nrows = int(math.ceil(n / ncols))
#         plt.figure(figsize=(3 * ncols + 1, 3 * nrows))
#         axs = [plt.subplot(nrows, ncols, i + 1) for i in range(n)]
#         for i, dftr in dft.iterrows():
#             c, r, bw_path = dftr['cell'], dftr['replicate'], dftr['file']
#             print(ds, m, c, r, bw_path)
#             regions_df, region_stats = collect_bw_stats(
#                 bw_path, chrom_sizes_part, blacklist_df, REGION_LEN, TOP_REGIONS, WORK_REGIONS, RESOLUTION
#             )
#             coverages = np.concatenate(region_stats, axis=0)
#             tt = analyze_candidates(coverages)
#             ax = axs[i]
#             sns.lineplot(data=tt, x="CandidatesN", y="CandidatesAL", hue='Gap', estimator=None,
#                      marker='o', markersize=3,
#                      palette='tab20',
#                     sort=False,
#                     ax=ax)
#             for x, y, s in zip(tt['CandidatesN'], tt['CandidatesAL'], tt['Sensitivity']):
#                 if s in [1e-3, 0.1]:
#                     ax.text(x, y, str(s), fontsize=5)
#             # pivot = full_sensitivity[(full_sensitivity['dataset'] == ds) &
#             #                   (full_sensitivity['modification'] == m) &
#             #                   (full_sensitivity['cell'] == c) &
#             #                   (full_sensitivity['replicate'] == r)]['pivot'].values[0]
#             # ax.set_title(f'{ds} {m} {c} {r} {"" if pivot == "null" else ""}')
#             ax.set_title(f'{ds} {m} {c} {r}')
#             ax.set(yscale='log')
#             ax.set(xscale='log')
#             if i == n - 1:
#                 sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
#             else:
#                 ax.legend().set_visible(False)
#         plt.tight_layout()
#         plt.show()

# Chips quality analysis

In [None]:
# Root folder of the chips data set, relative to the user's home directory.
CHIPS_PATH = os.path.expanduser('~/data/2023_chips')
# load_chips_sensitivity looks for files numbered 1..N.
N = 5
# Multiplier values that appear in the sensitivity file names.
MULTIPLIERS = [1.0, 0.7, 0.5, 0.2, 0.1]

In [None]:
def load_chips_sensitivity(path):
    """Load the chips sensitivity tables found in ``path`` into one DataFrame.

    For every combination of modification, index ``1..N`` and value from
    ``MULTIPLIERS`` exactly one file matching
    ``{path}/{m}*_{n}_{mult}_*.sensitivity.tsv`` is expected. Combinations with
    no match, or with more than one, are reported and skipped.

    :param path: directory containing the ``*.sensitivity.tsv`` files.
    :return: concatenation of all loaded tables with the extra columns ``file``,
        ``modification``, ``n`` and ``multiplier``.
    :raises ValueError: if not a single file could be loaded.
    """
    ts = []
    for (m, n, mult) in product(
            ['H3K4me3', 'H3K27ac', 'H3K4me1', 'H3K27me3', 'H3K36me3', 'mixed'],
            range(1, N + 1),
            MULTIPLIERS):
        pattern = f'{path}/{m}*_{n}_{mult}_*.sensitivity.tsv'
        res = glob.glob(pattern)
        if len(res) == 0:
            print(f'Nothing found for {pattern}')
            continue
        if len(res) > 1:
            print(f'More than 1 found for {pattern}:\n{res}')
            continue
        tsv_path = res[0]
        print(tsv_path, m, n, mult)
        t = pd.read_csv(tsv_path, sep='\t')
        t['file'] = tsv_path
        t['modification'] = m
        t['n'] = n
        t['multiplier'] = mult
        ts.append(t)
    if not ts:
        # pd.concat would fail here as well, only with a less helpful message.
        raise ValueError(f'No sensitivity files found in {path}')
    return pd.concat(ts).reset_index(drop=True)

In [None]:
chips_sens = load_chips_sensitivity(f'{CHIPS_PATH}/span')
# Shift both metrics by one so that zeros can be shown on log-scaled axes.
for metric in ('CandidatesN', 'CandidatesAL'):
    chips_sens[metric] = chips_sens[metric] + 1
chips_sens.sample(3)

In [None]:
# One figure per modification: a grid with one panel per (multiplier, n) pair.
for m in MODIFICATIONS:
    print(m)
    dfdsm = chips_sens[chips_sens['modification'] == m].copy()
    if dfdsm.empty:
        # Nothing was loaded for this modification, so there is nothing to draw.
        continue
    dfdsm.sort_values(by=['multiplier', 'Sensitivity'], ascending=False, inplace=True)
    # Common axis limits so that all panels of the figure are comparable.
    minx, maxx = dfdsm['CandidatesN'].min(), dfdsm['CandidatesN'].max()
    miny, maxy = dfdsm['CandidatesAL'].min(), dfdsm['CandidatesAL'].max()
    # Panel count gets its own name: the loop below rebinds `n`, and the legend
    # check must compare against the number of panels.
    n_panels = len(dfdsm[['n', 'multiplier']].drop_duplicates())
    print(n_panels)
    if n_panels <= 6:
        ncols, nrows = n_panels, 1
    else:
        ncols = int(math.floor(math.sqrt(n_panels) + 4))
        nrows = int(math.ceil(n_panels / ncols))
    plt.figure(figsize=(3 * ncols + 1, 3 * nrows))
    axs = [plt.subplot(nrows, ncols, k + 1) for k in range(n_panels)]
    for i, ((mult, n), tt) in enumerate(dfdsm.groupby(['multiplier', 'n'])):
        ax = axs[i]
        sns.lineplot(data=tt, x="CandidatesN", y="CandidatesAL", hue='Gap',
                     estimator=None,  # raw observations, as in the other curve plots
                     palette='tab20',
                     sort=False,
                     alpha=0.5,
                     ax=ax)
        ax.set(yscale='log')
        ax.set(xscale='log')
        # pivot = tt['pivot'].unique()[0]
        # ax.set_title(f'{m} {mult} {n} {f"P {pivot}" if pivot != "null" else ""}')
        ax.set_title(f'{m} {mult} {n}')
        ax.set_xlim(minx, maxx)
        ax.set_ylim(miny, maxy)
        # Only the last panel keeps its legend, moved outside the axes.
        if i == n_panels - 1:
            sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            ax.legend().set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# Candidate curves of selected chips samples, one panel per multiplier, with the
# three points returned by detect_sensitivity_triangle highlighted.
for (m, n) in [
    ('H3K4me3', 1),
]:
    selected = (chips_sens['modification'] == m) & (chips_sens['n'] == n)
    t = chips_sens[selected].sort_values(by=['Sensitivity'], ascending=False)
    plt.figure(figsize=(15, 3))
    n_panels = len(MULTIPLIERS)
    axs = [plt.subplot(1, n_panels, k) for k in range(1, n_panels + 1)]
    for i, (ax, mult) in enumerate(zip(axs, sorted(MULTIPLIERS, reverse=True))):
        tt = t[t['multiplier'] == mult]
        sns.lineplot(
            data=tt, x="CandidatesN", y="CandidatesAL", hue='Gap',
            estimator=None, sort=False,
            palette='tab20', alpha=0.5,
            ax=ax,
        )
        # Gap == 0 curve, re-indexed 0..k-1 in ascending sensitivity order; the
        # values in sp are matched against this index.
        ttt = tt[tt['Gap'] == 0].sort_values(by=['Sensitivity']).copy().reset_index(drop=True)
        sp = detect_sensitivity_triangle(ttt)
        for point, color, size in ((sp[0], 'green', 20), (sp[1], 'red', 40), (sp[2], 'blue', 20)):
            sns.scatterplot(data=ttt[ttt.index == point],
                            x="CandidatesN", y="CandidatesAL", color=color, s=size,
                            legend=False, ax=ax)

        ax.set(yscale='log', xscale='log')
        # pivot = tt['pivot'].values[0]
        # ax.set_title(f'{m} {mult} {n} {f"P {pivot}" if pivot != "null" else ""}')
        ax.set_title(f'{m} {mult} {n}')
        # The legend is kept on the last panel only.
        if i != n_panels - 1:
            ax.legend().set_visible(False)
        else:
            sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))

    plt.tight_layout()
    plt.show()

# Mix control

In [None]:
# Root folder of the mix data set, relative to the user's home directory.
MIX_PATH = os.path.expanduser('~/data/2023_mix_chr1')
# Fraction values that appear in the mix sensitivity file names.
MIX_SIGNAL_FRACTIONS = [0.7, 0.5, 0.2, 0.1]

In [None]:
def load_mix_sensitivity(path):
    """Load the mix sensitivity tables found in ``path`` into one DataFrame.

    For every combination of modification, cell from ``IMMUNE_CELLS``, replicate
    from ``IMMUNE_REPS`` and fraction from ``MIX_SIGNAL_FRACTIONS`` exactly one
    file matching ``{path}/{m}_{c}_{r}*_{f}_*.sensitivity.tsv`` is expected.
    Combinations with no match, or with more than one, are reported and skipped.

    :param path: directory containing the ``*.sensitivity.tsv`` files.
    :return: concatenation of all loaded tables with the extra columns ``file``,
        ``modification``, ``cell``, ``replicate``, ``fraction`` and
        ``SensitivityN`` (row number within the source file).
    :raises ValueError: if not a single file could be loaded.
    """
    ts = []
    for (m, c, r, f) in product(
            ['H3K4me3', 'H3K27ac', 'H3K4me1', 'H3K27me3', 'H3K36me3'],
            IMMUNE_CELLS,
            IMMUNE_REPS,
            MIX_SIGNAL_FRACTIONS,
    ):
        pattern = f'{path}/{m}_{c}_{r}*_{f}_*.sensitivity.tsv'
        res = glob.glob(pattern)
        if r == '':
            # With an empty replicate the pattern also matches files of the named
            # replicates; drop every path that mentions one of them.
            other_reps = [ro for ro in IMMUNE_REPS if ro != r]
            res = [p for p in res if not any(ro in p for ro in other_reps)]
        if len(res) == 0:
            print(f'Nothing found for {pattern}')
            continue
        if len(res) > 1:
            print(f'More than 1 found for {pattern}:\n{res}')
            continue
        tsv_path = res[0]
        print(tsv_path, m, f)
        t = pd.read_csv(tsv_path, sep='\t')
        t['file'] = tsv_path
        t['modification'] = m
        t['cell'] = c
        t['replicate'] = r
        t['fraction'] = f
        t['SensitivityN'] = range(len(t))
        ts.append(t)
    if not ts:
        # pd.concat would fail here as well, only with a less helpful message.
        raise ValueError(f'No sensitivity files found in {path}')
    return pd.concat(ts).reset_index(drop=True)

In [None]:
# Missing values become 0, then both metrics are shifted by one so that zeros
# can be shown on log-scaled axes.
mix_sens = load_mix_sensitivity(f'{MIX_PATH}/span').fillna(0)
for metric in ('CandidatesN', 'CandidatesAL'):
    mix_sens[metric] = mix_sens[metric] + 1
mix_sens.sample(3)

In [None]:
# Examples
sps_mix = {}
for (m, c, r) in [
    ('H3K4me3', 'BCell', ''),
    # ('H3K4me3', 'CD4', ''),
    ('H3K27ac', 'CD34', ''),
    ('H3K4me1', 'CD4', 'rep1'),
    ('H3K27me3', 'NK', ''),
    ('H3K36me3', 'BCell', ''),
]:
    t = mix_sens[(mix_sens['modification'] == m) &
                 (mix_sens['cell'] == c) &
                 (mix_sens['replicate'] == r)].copy()
    t.sort_values(by=['fraction', 'Sensitivity'], ascending=False, inplace=True)
    plt.figure(figsize=(12, 3))
    axs = [plt.subplot(1, 4, i + 1) for i in range(4)]
    for i, f in enumerate(sorted(MIX_SIGNAL_FRACTIONS, reverse=True)):
        ax = axs[i]
        tt = t[t['fraction'] == f]
        assert len(tt['file'].unique()) == 1
        sns.lineplot(data=tt, x="CandidatesN", y="CandidatesAL", hue='Gap', estimator=None,
                     palette='tab20',
                     sort=False,
                     alpha=0.5,
                     ax=ax)
        ttt = tt[tt['Gap'] == 0].sort_values(by=['Sensitivity']).copy().reset_index(drop=True)
        sp = detect_sensitivity_triangle(ttt)
        sps_mix[(m, c, r, i)] = (tt['file'].values[0], ttt, sp)
        sns.scatterplot(data=ttt[ttt.index == sp[0]],
                        x="CandidatesN", y="CandidatesAL", color='green', s=20,
                        legend=False, ax=ax)
        sns.scatterplot(data=ttt[ttt.index == sp[1]],
                        x="CandidatesN", y="CandidatesAL", color='red', s=40,
                        legend=False, ax=ax)
        sns.scatterplot(data=ttt[ttt.index == sp[2]],
                        x="CandidatesN", y="CandidatesAL", color='blue', s=20,
                        legend=False, ax=ax)
        ax.set(yscale='log')
        ax.set(xscale='log')
        # pivot = tt['pivot'].unique()[0]
        # ax.set_title(f'{m} {c} {r} {f} {f"P {pivot}" if pivot != "null" else ""}')
        ax.set_title(f'{m} {c} {r} {f}')
        if i == len(axs) - 1:
            sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            ax.legend().set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# Stack the Gap == 0 curves stored in sps_mix into one table.
mfs = sorted(MIX_SIGNAL_FRACTIONS, reverse=True)
ts = []
for (m, c, r, i), (file, df, sp) in sps_mix.items():
    print(m, c, r, mfs[i])
    # The stored frame itself is labelled (in place) before being collected.
    for column, value in (('modification', m), ('cell', c), ('replicate', r)):
        df[column] = value
    ts.append(df)
sps_mix_df = pd.concat(ts).reset_index(drop=True)
del ts

In [None]:
# Examples
for (m, c, r) in [
    ('H3K4me3', 'BCell', ''),
    # ('H3K4me3', 'CD4', ''),
    ('H3K27ac', 'CD34', ''),
    ('H3K4me1', 'CD4', 'rep1'),
    ('H3K27me3', 'NK', ''),
    ('H3K36me3', 'BCell', ''),
]:
    plt.figure(figsize=(15, 1.5))
    axs = [plt.subplot(1, 5, i + 1) for i in range(5)]
    t = sps_mix_df[(sps_mix_df['modification'] == m) & (sps_mix_df['cell'] == c) & (sps_mix_df['replicate'] == r)]
    for i, y in enumerate(['CandidatesN', 'CandidatesAL', 'CandidatesML', 'SignalNoiseRatio', 'SignalControlRatio']):
        ax = axs[i]
        sns.lineplot(data=t, x='SensitivityN', y=y, estimator=None,
                     hue='fraction',
                     hue_order=[0.7, 0.5, 0.2, 0.1],
                     # palette='tab20',
                     alpha=0.8,
                     sort=False,
                     ax=ax)
        # ax.set(xscale='log')
        ax.set(yscale='log')
        # ax.invert_xaxis()
        ax.set_title(f'{m} {c} {r}')
        if i == len(axs) - 1:
            sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            ax.legend().set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# Share of new candidates per sensitivity step for the mix examples, one panel
# per signal fraction, with the points stored in sps_mix marked as vertical lines.
for (m, c, r) in [
    ('H3K4me3', 'BCell', ''),
    ('H3K27ac', 'CD34', ''),
    ('H3K4me1', 'CD4', 'rep1'),
    ('H3K27me3', 'NK', ''),
    ('H3K36me3', 'BCell', ''),
]:
    fractions = sorted(MIX_SIGNAL_FRACTIONS, reverse=True)
    plt.figure(figsize=(20, 2))
    # One panel per fraction; the count follows MIX_SIGNAL_FRACTIONS.
    axs = [plt.subplot(1, len(fractions), k + 1) for k in range(len(fractions))]
    for i, f in enumerate(fractions):
        ax = axs[i]
        file, _, sp = sps_mix[(m, c, r, i)]
        # The segments table is stored next to the sensitivity table.
        segments = file.replace('.sensitivity.tsv', '.segments.tsv')
        sdf = pd.read_csv(segments, sep='\t')
        sdf['New %'] = [d(n, t) * 100 for n, t in zip(sdf['New'], sdf['CandidatesN'])]
        sdf['Total %'] = 100
        sdf['SensitivityN'] = range(len(sdf))
        # Background: full-height bars; whatever stays visible above the foreground is "old"
        sns.barplot(sdf, x='SensitivityN', y='Total %',
                    color='orange',
                    ax=ax)
        # Foreground: share of new candidates
        sns.barplot(sdf, x='SensitivityN', y='New %',
                    color='blue',
                    ax=ax)
        ax.set_ylabel('peaks %')
        # Built only from this loop's variables (no dataset name is available here).
        name = f'{m} {c} {r} {f}'
        ax.set_title(name)
        # Points; ymin / ymax are fractions of the axes height, so 0..1 spans it fully
        for x, color in ((sp[2], 'blue'), (sp[1], 'red'), (sp[0], 'green')):
            ax.axvline(x=x, ymin=0, ymax=1, color=color, lw=2)
        # Hide all tick labels; the loop must not reuse `i`, which is the panel index
        for label in ax.get_xticklabels():
            label.set_visible(False)
        # add legend to the last panel only
        if i == len(axs) - 1:
            top_bar = mpatches.Patch(color='orange', label='old %')
            bottom_bar = mpatches.Patch(color='blue', label='new %')
            ax.legend(handles=[bottom_bar, top_bar])
            sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
        new_value = 1
        for patch in ax.patches:
            current_width = patch.get_width()
            diff = current_width - new_value
            # we change the bar width
            patch.set_width(new_value)
            # we recenter the bar
            patch.set_x(patch.get_x() + diff * .5)
    plt.tight_layout()
    plt.show()