# Peaks length

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import pandas as pd
import seaborn as sns
from IPython.display import display

# sns.set_style("whitegrid")
sns.set_style("white")
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm.auto import tqdm

In [None]:
def bedl(file):
    """Return the length (end - start) of every record in a tab-separated peak file.

    The file is read without a header row, so columns 1 and 2 are used as the
    start and end coordinates.

    Parameters
    ----------
    file : anything accepted by ``pandas.read_csv`` (path or file-like object).

    Returns
    -------
    A pandas Series holding ``column 2 - column 1`` per row, or an empty NumPy
    array when the file cannot be read or parsed (e.g. it is empty).
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[2] - tf[1]
    except Exception:
        # Best-effort: unreadable/empty files count as "no peaks". Catching Exception
        # (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        return np.zeros(0)  # Empty file


def lines(file):
    """Return the number of records in a tab-separated peak file.

    Parameters
    ----------
    file : anything accepted by ``pandas.read_csv`` (path or file-like object).

    Returns
    -------
    Number of rows parsed without a header row, or 0 when the file cannot be
    read or parsed (e.g. it is empty).
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return len(tf)
    except Exception:
        # Best-effort: unreadable/empty files count as zero records. Catching Exception
        # (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        return 0  # Empty file


def scores(file):
    """Return column 4 (the fifth column) of a tab-separated peak file.

    Parameters
    ----------
    file : anything accepted by ``pandas.read_csv`` (path or file-like object).

    Returns
    -------
    A pandas Series with the values of column 4, or an empty NumPy array when
    the file cannot be read or parsed, or has fewer than five columns.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[4]
    except Exception:
        # Best-effort: unreadable/empty files yield no scores. Catching Exception
        # (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        return np.zeros(0)  # Empty file

def positions(file):
    """Return the first three columns of a tab-separated peak file.

    Parameters
    ----------
    file : anything accepted by ``pandas.read_csv`` (path or file-like object).

    Returns
    -------
    A 3-tuple of pandas Series (columns 0, 1 and 2 of the header-less file), or
    three empty lists when the file cannot be read or parsed.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[0], tf[1], tf[2]
    except Exception:
        # Best-effort: unreadable/empty files yield no positions. Catching Exception
        # (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        return [], [], []  # Empty file

In [None]:
def find_peak_files(path, suffix, cells, modifications, replicates, fdrs):
    """List peak files in a directory and annotate them from their file names.

    A directory entry is kept when `suffix` occurs anywhere in its name and at
    least one (non-empty) entry of `modifications` occurs in its name. For each
    of `modifications`, `cells`, `replicates` and `fdrs` the first candidate that
    is a substring of the name is recorded; when none matches, None is stored.
    Note that an empty string is a substring of every name, so passing ``['']``
    makes that column always ''.

    Parameters
    ----------
    path : directory to scan (non-recursive, via ``os.listdir``).
    suffix : substring a file name must contain.
    cells, modifications, replicates, fdrs : iterables of candidate substrings.

    Returns
    -------
    DataFrame with object-typed columns 'modification', 'cell', 'replicate',
    'fdr' and 'file' (the joined path), one row per kept file.
    """
    columns = ['modification', 'cell', 'replicate', 'fdr', 'file']
    # Rows are gathered in a list and the frame is built once; growing a DataFrame
    # row by row re-allocates on every insert.
    rows = []
    for f in tqdm(os.listdir(path)):
        if suffix not in f:
            continue
        mod = next((m for m in modifications if m in f), None)
        if not mod:
            # Files without a recognised modification are skipped.
            continue
        cell = next((cc for cc in cells if cc in f), None)
        rep = next((r for r in replicates if r in f), None)
        fdr = next((candidate for candidate in fdrs if candidate in f), None)
        rows.append((mod, cell, rep, fdr, os.path.join(path, f)))
    return pd.DataFrame(rows, columns=columns, dtype=object)

In [None]:
def load_peaks(df, lowq=.01, highq=.99):
    """Load the peaks of every file listed in `df` into one long DataFrame.

    For each row of `df` the peak file is parsed as a header-less tab-separated
    table; columns 0, 1, 2 and 4 become 'chromosome', 'start', 'end' and 'score',
    and 'length' is ``end - start``. Per file, at most 10,000 peaks are kept
    (random sample) and peaks whose length lies outside the [`lowq`, `highq`]
    quantile range of that sample are dropped.

    Parameters
    ----------
    df : DataFrame with at least the columns 'modification', 'tool' and 'file'.
    lowq, highq : lower / upper length quantiles used to discard extreme peaks.

    Returns
    -------
    DataFrame with columns 'modification', 'tool', 'file', 'chromosome', 'start',
    'end', 'score', 'length'. Files that cannot be read or parsed contribute no
    rows; an empty `df` yields an empty DataFrame with those columns.
    """
    columns = ['modification', 'tool', 'file', 'chromosome', 'start', 'end', 'score', 'length']
    dfs = []
    for _, (m, tool, file) in tqdm(df[['modification', 'tool', 'file']].iterrows()):
        try:
            # Parse the file once and derive every column from the same table.
            tf = pd.read_csv(file, sep='\t', header=None)
            peaks = pd.DataFrame({
                'modification': m,
                'tool': tool,
                'file': file,
                'chromosome': tf[0],
                'start': tf[1],
                'end': tf[2],
                'score': tf[4],
                'length': tf[2] - tf[1],
            })
        except Exception:
            # Best-effort: an empty, unreadable or too-narrow file yields no peaks.
            peaks = pd.DataFrame(columns=columns)
        peaks['length'] = peaks['length'].astype(int)
        peaks['score'] = peaks['score'].astype(int)
        # Cap the number of peaks per file so that large files do not dominate the plots.
        peaks = peaks.sample(min(len(peaks), 10_000))
        # Ignore extreme peaks
        lengths = peaks['length']
        peaks = peaks[(lengths.quantile(lowq) <= lengths) &
                      (lengths <= lengths.quantile(highq))].copy()
        dfs.append(peaks)
    if not dfs:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(dfs, ignore_index=True)

# Encodeproject.org ATAC/TFs/ChIPseq

Download the files with `xargs -L 1 curl -O -J -L < files.txt`, then rename them using `metadata.tsv`:
```
cd ~/data/2022_atacseq
for F in *.bed; do echo $F; mv $F $(grep "^${F/.bed/}" metadata.tsv | awk  -v FS='\t' '{printf("%s_%s_%s", $1, $11, "ATAC")}'  | sed 's# #_#g').bed; done

cd ~/data/2022_chipseq_narrow
for F in *.bed; do echo $F; mv $F $(grep "^${F/.bed/}" metadata.tsv | awk  -v FS='\t' '{printf("%s_%s_%s", $1, $11, $23)}'  | sed -E 's#[^a-zA-Z0-9]+#_#g').bed; done

cd ~/data/2022_chipseq_broad
for F in *.bed; do echo $F; mv $F $(grep "^${F/.bed/}" metadata.tsv | awk  -v FS='\t' '{printf("%s_%s_%s", $1, $11, $23)}'  | sed -E 's#[^a-zA-Z0-9]+#_#g').bed; done

# Simple analysis
for F in *.bed; do echo $(cat $F | awk '{N+=1;L+=$3-$2} END {printf("%d\n", L/N)}') $F; done | sort -k1,1n
```

In [None]:
def plot_boxes(df, value, title, hue='modification', violin=True, log=False, minx=50, maxx=1e4, order=None):
    """Draw horizontal violin or box plots of `value`, one row per `hue` category.

    A new figure is created whose height grows with the number of distinct `hue`
    values. With `log` set, the x axis is logarithmic and ticked at the powers of
    ten (10 .. 1,000,000) that fall inside [`minx`, `maxx`]. The x range is always
    clipped to [`minx`, `maxx`]. The figure is left open; the caller shows or saves it.
    """
    n_categories = len(set(df[hue]))
    plt.figure(figsize=(8, int(2 + 0.15 * n_categories)))
    ax = plt.axes()
    # NOTE(review): newer seaborn releases deprecate `scale` in favour of `density_norm` — confirm.
    if not violin:
        drawn = sns.boxplot(data=df, y=hue, x=value, ax=ax, order=order, showfliers=False)
    else:
        drawn = sns.violinplot(data=df, y=hue, x=value, ax=ax, scale='width', order=order)
    # Powers of ten inside the visible range, used as explicit tick positions and labels.
    ticks = [tick for tick in [10, 100, 1000, 10_000, 100_000, 1_000_000] if minx <= tick <= maxx]
    if log:
        drawn.set(xscale='log')
        drawn.set(xticks=ticks)
        drawn.set(xticklabels=ticks)
    ax.set_xlim(minx, maxx)
    ax.title.set_text(title)
    plt.tight_layout()

In [None]:
def plot_length_per_file(df, value):
    """Show one violin plot of `value` per file for every (tool, modification) pair.

    For each combination of the distinct 'tool' and 'modification' values that
    has at least one row, prints the tool, the modification and the number of
    distinct files, then draws and shows a figure with one violin per file.

    Parameters
    ----------
    df : DataFrame with columns 'tool', 'modification', 'file' and `value`.
    value : name of the column plotted on the y axis.
    """
    # Imported locally: the notebook imports `product` only in a later cell, so
    # calling this function before that cell ran raised NameError.
    from itertools import product

    for t, m in product(set(df['tool']), set(df['modification'])):
        ts = df[(df['tool'] == t) & (df['modification'] == m)]
        if len(ts) == 0:
            continue
        print(t, m, len(set(ts['file'])))
        plt.figure(figsize=(12, 8))
        # Plot
        ax = plt.axes()
        sns.violinplot(data=ts, x='file', y=value, ax=ax, scale='width')
        ax.title.set_text(f'{m} {t}')
        # File names are long; rotate and shrink them to keep the labels readable.
        plt.xticks(rotation=90, fontsize=7)
        plt.xlabel('file')
        plt.tight_layout()
        plt.show()

# Reprocessed with MACS2

In [None]:
# Directory holding the MACS2 peak files of the re-processed ENCODE data.
ENCODE_REPROCESSED_PATH = os.path.expanduser('~/data/2022_atac_chipseq_encode/macs2')
# '' is a substring of every file name, so the cell lookup in find_peak_files always yields ''.
ENCODE_REPROCESSED_CELLS = ['']
ENCODE_REPROCESSED_MODIFICATIONS = [
    'ATAC', 'CTCF', 'H2AFZ', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'
]

# Likewise, the FDR lookup always yields ''.
FDRS = ['']
# Collect the .narrowPeak and .broadPeak files of the same directory into one table.
df_encodeprojectorg_reprocessed = pd.concat(
    [
        find_peak_files(
            ENCODE_REPROCESSED_PATH, '.narrowPeak', ENCODE_REPROCESSED_CELLS, ENCODE_REPROCESSED_MODIFICATIONS,
            ['rep1', 'rep2'], FDRS
        ),
        find_peak_files(

            ENCODE_REPROCESSED_PATH, '.broadPeak', ENCODE_REPROCESSED_CELLS, ENCODE_REPROCESSED_MODIFICATIONS,
            ['rep1', 'rep2'], FDRS
        )]
)
# Both file kinds are labelled with the same tool name.
df_encodeprojectorg_reprocessed['tool'] = 'Macs2'
# df_encodeprojectorg_reprocessed.sample(3)

In [None]:
# Load the sampled, quantile-trimmed peaks of every listed file and preview five random rows.
df_encodeprojectorg_reprocessed_peaks = load_peaks(df_encodeprojectorg_reprocessed)
display(df_encodeprojectorg_reprocessed_peaks.sample(5))

In [None]:
print('Encodeproject.org ATAC/TFs/ChIPseq SELECTED REPROCESSED MACS2')
# Box plots (violin=False) of peak length per modification, log x axis limited to [30, 2000].
plot_boxes(df_encodeprojectorg_reprocessed_peaks, 'length', 'Length by modification',
           violin=False, log=True, minx=30, maxx=2e3,
           order=['CTCF', 'ATAC', 'H2AFZ', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

# Encode reprocessed with SPAN

In [None]:
# Directory holding the SPAN peak files of the re-processed ENCODE data.
ENCODE_REPROCESSED_SPAN_PATH = os.path.expanduser('~/data/2022_atac_chipseq_encode/span')
# Same cell / modification / FDR candidates as for the MACS2 files; only '.peak' files are kept.
df_encodeprojectorg_reprocessed_span = find_peak_files(
    ENCODE_REPROCESSED_SPAN_PATH, '.peak', ENCODE_REPROCESSED_CELLS, ENCODE_REPROCESSED_MODIFICATIONS,
    ['rep1', 'rep2'],
    FDRS
)
df_encodeprojectorg_reprocessed_span['tool'] = 'Span'
# Preview three random rows (raises if fewer than three files were found).
df_encodeprojectorg_reprocessed_span.sample(3)

In [None]:
# Load the sampled, quantile-trimmed SPAN peaks and preview five random rows.
df_encodeprojectorg_reprocessed_span_peaks = load_peaks(df_encodeprojectorg_reprocessed_span)
display(df_encodeprojectorg_reprocessed_span_peaks.sample(5))

In [None]:
print('Encodeproject.org ATAC/TFs/ChIPseq SELECTED REPROCESSED SPAN')
# Box plots of SPAN peak length per modification, log x axis limited to [100, 100000].
plot_boxes(df_encodeprojectorg_reprocessed_span_peaks, 'length', 'Length by modification',
           violin=False, log=True, minx=100, maxx=1e5,
           order=['CTCF', 'ATAC', 'H2AFZ', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

# GSE26320_RAW reprocessed

In [None]:
# Root directory of the GSE26320 data; tool outputs and the pics/ folder are resolved against it.
GSE26320_PATH = os.path.expanduser('~/data/2023_GSE26320')
# Substrings looked up in file names by find_peak_files to annotate cell, modification and replicate.
GSE26320_CELLS = ['GM12878', 'HMEC', 'HSMM', 'K562', 'NHEK', 'NHLF', 'H1', 'Huvec', 'HepG2']
GSE26320_MODIFICATIONS = ['CTCF', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3']
GSE26320_REPS = ['rep1', 'rep2']

In [None]:
# Each peak caller wrote its output into its own sub-directory of GSE26320_PATH.
# (os.path.expanduser was dropped here: it is a no-op on a relative name such as 'macs2'.)
GSE26320_MACS2_DIR = os.path.join(GSE26320_PATH, 'macs2')

# MACS2 narrow and broad peaks share one directory and one FDR tag; they differ by file suffix.
MACS2_FDRS = ['0.05']
df_gse26320_macs2 = find_peak_files(
    GSE26320_MACS2_DIR, '.narrowPeak', GSE26320_CELLS, GSE26320_MODIFICATIONS, GSE26320_REPS, MACS2_FDRS
)
df_gse26320_macs2['tool'] = 'MACS2'

df_gse26320_macs2broad = find_peak_files(
    GSE26320_MACS2_DIR, '.broadPeak', GSE26320_CELLS, GSE26320_MODIFICATIONS, GSE26320_REPS, MACS2_FDRS
)
df_gse26320_macs2broad['tool'] = 'MACS2 broad'

# SICER result files are recognised by 'summary-FDR' in their name.
GSE26320_SICER_DIR = os.path.join(GSE26320_PATH, 'sicer')
GSE26320_SICER_FDRS = ['0.01']
df_gse26320_sicer = find_peak_files(
    GSE26320_SICER_DIR, 'summary-FDR', GSE26320_CELLS, GSE26320_MODIFICATIONS,
    GSE26320_REPS, GSE26320_SICER_FDRS
)
df_gse26320_sicer['tool'] = 'SICER'

GSE26320_SPAN_DIR = os.path.join(GSE26320_PATH, 'span')
GSE26320_SPAN_FDRS = ['0.05']
df_gse26320_span = find_peak_files(
    GSE26320_SPAN_DIR, '.peak', GSE26320_CELLS, GSE26320_MODIFICATIONS, GSE26320_REPS, GSE26320_SPAN_FDRS
)
df_gse26320_span['tool'] = 'SPAN'

# One table of peak files for all four tools, with a fresh 0..n-1 index.
df_gse26320 = pd.concat(
    [df_gse26320_macs2, df_gse26320_macs2broad, df_gse26320_sicer, df_gse26320_span]
).reset_index(drop=True)
df_gse26320.sample(10)

In [None]:
# Load the sampled, quantile-trimmed peaks of every GSE26320 file; the bare name displays the table.
df_gse26320_peaks = load_peaks(df_gse26320)
df_gse26320_peaks

In [None]:
def plot_kde(te, value, title, hue='modification', order=None, figx=8, figy=3, minx=1, maxx=1e5, maxy=2.5):
    """Draw kernel density estimates of `value` on a log x axis, one curve per `hue` value.

    A new `figx` x `figy` figure is created. Each hue group is normalised on its
    own (``common_norm=False``). The visible range is [`minx`, `maxx`] on x and
    [0, `maxy`] on y. The figure is left open; the caller shows or saves it.
    """
    plt.figure(figsize=(figx, figy))
    ax = plt.axes()
    kde_options = dict(
        data=te,
        x=value,
        hue=hue,
        common_norm=False,
        log_scale=True,
        hue_order=order,
    )
    # No `ax=` is passed, so seaborn draws on the current axes created just above.
    drawn = sns.kdeplot(**kde_options)
    drawn.set(xscale='log')
    drawn.set_ylim(0, maxy)
    drawn.set_xlim(minx, maxx)
    ax.title.set_text(title)
    plt.tight_layout()

In [None]:
print('GSE26320 REPROCESSED  Length of peaks')
# KDE of peak lengths over all tools together, one curve per modification.
plot_kde(df_gse26320_peaks, 'length', 'Peaks length',
         order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'],
         minx=5e1, maxx=1e5)
# Assumes the pics/ directory already exists under GSE26320_PATH — TODO confirm.
plt.savefig(f'{GSE26320_PATH}/pics/modifications_lengths.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Same KDE as for all tools together, but drawn and saved separately for each peak caller.
for tool in ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']:
    print(tool)
    plot_kde(df_gse26320_peaks[df_gse26320_peaks['tool'] == tool], 'length', f'{tool} Peaks length',
             order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'],
             minx=5e1, maxx=1e5)
    # The tool name (including the space in 'MACS2 broad') becomes part of the file name.
    plt.savefig(f'{GSE26320_PATH}/pics/modifications_{tool}_lengths.pdf', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
print('GSE26320 REPROCESSED Length of peaks')
# Box plots of peak length per modification over all tools, log x axis limited to [50, 100000].
plot_boxes(df_gse26320_peaks, 'length', 'Length by modification',
           log=True, violin=False, minx=5e1, maxx=1e5,
           order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'],
           )
plt.show()

In [None]:
print('GSE26320 REPROCESSED Length of peaks')
# One box-plot figure per peak caller, restricted to that caller's peaks.
for tool in ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']:
    print(tool)
    plot_boxes(df_gse26320_peaks[df_gse26320_peaks['tool'] == tool], 'length', f'{tool} Peaks length',
               log=True, violin=False, minx=5e1, maxx=1e5,
               order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
    plt.show()

In [None]:
from math import floor

def compute_length_percentiles(ts, steps=10):
    """Assign every peak length to a percentile bin, separately for each modification.

    For each modification the lengths are sorted ascending and split into `steps`
    consecutive bins of (nearly) equal size; bin ``i`` is labelled
    ``int(100 / steps * (i + 1))``. Every length is kept: when the number of
    lengths is not a multiple of `steps` the leading bins receive one extra value,
    rather than the largest values being dropped.

    Parameters
    ----------
    ts : DataFrame with columns 'modification' and 'length'.
    steps : number of percentile bins per modification.

    Returns
    -------
    DataFrame with columns 'modification', 'percentile' and 'length', one row
    per input length.
    """
    percentiles = []
    for m in ts['modification'].unique():
        lengths = ts.loc[ts['modification'] == m, 'length'].sort_values().values
        # array_split tolerates lengths that do not divide evenly into `steps` bins.
        for i, chunk in enumerate(np.array_split(lengths, steps)):
            p = int(100 / steps * (i + 1))
            percentiles.extend((m, p, v) for v in chunk)

    return pd.DataFrame(columns=['modification', 'percentile', 'length'], data=percentiles)

def plot_percentiles(df, title, miny=5e1, maxy=1e5):
    """Box-plot peak lengths per percentile bin, coloured by modification.

    The lengths in `df` are binned with compute_length_percentiles (default number
    of bins) and drawn on a logarithmic y axis limited to [`miny`, `maxy`], ticked
    at the 1-2-5 values that fall inside that range. The figure is left open; the
    caller shows or saves it.
    """
    binned = compute_length_percentiles(df)
    plt.figure(figsize=(8, 6))
    ax = plt.axes()
    # Fixed hue order; modifications not listed here are not drawn.
    modification_order = ['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3']
    drawn = sns.boxplot(data=binned, x="percentile", y="length", hue="modification",
                        hue_order=modification_order)
    candidate_ticks = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
                       10_000, 20_000, 50_000, 100_000, 200_000, 500_000, 1_000_000]
    ticks = [tick for tick in candidate_ticks if miny <= tick <= maxy]
    drawn.set(yscale='log')
    drawn.set(yticks=ticks)
    drawn.set(yticklabels=ticks)
    ax.set_ylim(miny, maxy)
    ax.title.set_text(title)

In [None]:
# Length distribution per percentile bin for the MACS2 peaks only.
plot_percentiles(df_gse26320_peaks[df_gse26320_peaks['tool'] == 'MACS2'], 'MACS2 percentile average lengths')
plt.show()

In [None]:
# Length distribution per percentile bin for the MACS2 broad peaks only.
plot_percentiles(df_gse26320_peaks[df_gse26320_peaks['tool'] == 'MACS2 broad'], 'MACS2 broad percentile average lengths')
plt.show()

In [None]:
# Length distribution per percentile bin for the SICER peaks only.
plot_percentiles(df_gse26320_peaks[df_gse26320_peaks['tool'] == 'SICER'], 'SICER percentile average lengths')
plt.show()

In [None]:
# Length distribution per percentile bin for the SPAN peaks only.
plot_percentiles(df_gse26320_peaks[df_gse26320_peaks['tool'] == 'SPAN'], 'SPAN percentile average lengths')
plt.show()

# Distance between peaks

In [None]:
from itertools import product

def distance_between_peaks(df, hue='modification'):
    """Compute gaps between consecutive intervals, per (hue, tool, file) and chromosome.

    Within every group and chromosome the rows are ordered by 'start' and the gap
    of a row is its 'start' minus the 'end' of the preceding row. Gaps can be
    negative when intervals overlap.

    Parameters
    ----------
    df : DataFrame with columns `hue`, 'tool', 'file', 'chromosome', 'start', 'end'.
    hue : name of the grouping column that is carried over into the result.

    Returns
    -------
    DataFrame with columns `hue`, 'tool', 'file', 'chr' and 'distance'.
    """
    rows = []
    for (group, tool, path), per_file in tqdm(df.groupby([hue, 'tool', 'file'])):
        for chrom in sorted(set(per_file['chromosome'])):
            on_chrom = per_file[per_file['chromosome'] == chrom].copy().sort_values(by=['start'])
            # Shift the ends by one so that each start lines up with the previous end;
            # position 0 then pairs the first start with the LAST end and is meaningless.
            previous_ends = np.roll(on_chrom['end'], 1)
            gaps = on_chrom['start'] - previous_ends
            # Ignore first and last
            # NOTE(review): the last gap looks like a valid distance; dropping it may be unintended — confirm.
            for gap in gaps[1: gaps.size - 1]:
                rows.append((group, tool, path, chrom, gap))
    return pd.DataFrame(columns=[hue, 'tool', 'file', 'chr', 'distance'], data=rows)

In [None]:
# Gaps between neighbouring peaks, computed on the sampled peaks returned by load_peaks.
df_gse26320_distances = distance_between_peaks(df_gse26320_peaks)
df_gse26320_distances

In [None]:
# The label was copy-pasted from the length plots; this cell plots distances.
print('GSE26320 REPROCESSED Distance between peaks')
# One box-plot figure per peak caller, log x axis limited to [10, 1000000].
for tool in ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']:
    print(tool)
    plot_boxes(df_gse26320_distances[df_gse26320_distances['tool'] == tool],
               'distance', 'Distance between peaks by modification',
               log=True, violin=False, minx=10, maxx=1e6,
               order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
    plt.show()

In [None]:
# The label was copy-pasted from the length plots; this cell plots distances.
print('GSE26320 REPROCESSED Distance between peaks')
# One KDE figure per peak caller, x range [50, 2000000], density axis capped at 0.75.
for tool in ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']:
    print(tool)
    plot_kde(df_gse26320_distances[df_gse26320_distances['tool'] == tool],
             'distance', f'{tool} distance between peaks',
             order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'],
             minx=50, maxx=2e6, maxy=0.75)
    plt.show()

# hg38 genes length and distance between genes

In [None]:
# Gene annotation file; expanduser already resolves it to a full path.
GTF = os.path.expanduser(
    '~/data/2023_Immune/gencode.GRCh38.p13.v41.annotation.gtf')
# Read GTF directly. It used to be joined onto GSE26320_PATH, which misleadingly suggested the
# file lives under that directory: os.path.join discards the earlier component when the later
# one is absolute.
# Lines starting with '#' are skipped; the ninth column ('aux') holds the free-form attributes.
gtf_df = pd.read_csv(GTF, sep='\t', comment='#',
                     names=['chromosome', 'db', 'type', 'start', 'end', 'point1', 'strand', 'point2', 'aux'])
gtf_df.sample(10)

In [None]:
print('Parse GTF aux data')
# Attribute values collected column-wise: key -> list of values in row order.
auxes = {}
# Keys that were absent from some row or repeated within a row. Their value lists are not
# aligned with the GTF rows even if their total length happens to equal the row count.
misaligned = set()
for i, aux in enumerate(tqdm(gtf_df['aux'])):
    for pair in aux.split(';'):
        kv = pair.strip().split(' ')
        # Only plain `key value` pairs are used; anything else (e.g. empty trailing piece) is skipped.
        if len(kv) != 2:
            continue
        k, v = kv
        vs = auxes.setdefault(k, [])
        # An aligned key has exactly one value for each of the i rows seen before this one.
        if len(vs) != i:
            misaligned.add(k)
        vs.append(v.strip('"'))

# Only attributes present exactly once in every row become columns.
for k, vs in auxes.items():
    if len(vs) == len(gtf_df) and k not in misaligned:
        gtf_df[k] = vs
    else:
        print(f'Ignoring {k}')
del auxes
gtf_df.drop('aux', axis=1, inplace=True)
gtf_df.sample(3)

In [None]:
# Feature length as end - start, mirroring how peak lengths are computed from the peak files.
# NOTE(review): GTF coordinates are normally 1-based and inclusive, in which case the true
# length would be end - start + 1 — confirm whether the off-by-one matters here.
gtf_df['length'] = gtf_df['end'] - gtf_df['start']
gtf_df[gtf_df['type'] == 'gene'].sample(5)

In [None]:
print('GTF all genes lengths')
# Keep only the 'gene' records; copied so that later column additions do not touch gtf_df.
gtf_genes_df = gtf_df[gtf_df['type'] == 'gene'].copy()
# One box per gene_type, log x axis limited to [5, 2000000].
plot_boxes(gtf_genes_df, 'length', 'Length of genes',
           hue='gene_type',
           log=True, violin=False, minx=5, maxx=2_000_000)
plt.show()

In [None]:
print('GTF all genes lengths')
# KDE of gene length with one curve per gene_type, on a larger figure (15 x 10).
plot_kde(gtf_genes_df, 'length', 'Length of genes', hue='gene_type', figx=15, figy=10, minx=5, maxx=2e6, maxy=23)
plt.show()

In [None]:
# Gene types compared against peaks in the cells below.
GENES_TYPES = ['protein_coding', 'pseudogene']
# GENES_TYPES = ['protein_coding']
print('GTF genes lengths')
# Restrict the gene table to the selected gene types.
gtf_genes_df2 = gtf_genes_df[gtf_genes_df['gene_type'].isin(GENES_TYPES)].copy()
plot_boxes(gtf_genes_df2, 'length', 'Length of genes', hue='gene_type',
           log=True, violin=False, minx=10, maxx=2_000_000,
           order=['protein_coding', 'pseudogene'])
# Constant 'file' / 'tool' columns give the gene table the grouping columns that
# distance_between_peaks expects.
gtf_genes_df2['file'] = 'gtf'
gtf_genes_df2['tool'] = 'gtf'
plt.show()

In [None]:
# This plots gtf_genes_df2, i.e. only the gene types in GENES_TYPES, so the label
# no longer claims "all genes".
print('GTF genes lengths')
plot_kde(gtf_genes_df2, 'length', 'Length of genes', hue='gene_type', order=GENES_TYPES,
         minx=10, maxx=2e6, maxy=2)
plt.show()

In [None]:
print('GSE26320 Lengths of peaks and genes')
for tool in ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']:
    print(tool)
    ts = df_gse26320_peaks[df_gse26320_peaks['tool'] == tool]
    # Genes are appended as extra "modifications" (gene_type renamed) so that peaks and genes
    # share one hue column; reset_index() without drop=True also adds an 'index' column.
    tss = pd.concat([ts, gtf_genes_df2.rename({'gene_type': 'modification'}, axis=1)]).reset_index()
    plot_kde(tss,
             'length', f'{tool} peaks lengths',
             order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'] + GENES_TYPES,
             minx=100, maxx=2e6, maxy=2)
    plt.show()

In [None]:
# Gaps between neighbouring genes, grouped by gene_type instead of modification.
gtf_genes_df_distances = distance_between_peaks(gtf_genes_df2, hue='gene_type')
gtf_genes_df_distances

In [None]:
print('Distance between genes')
# One box per gene type, log x axis limited to [1, 5000000].
plot_boxes(gtf_genes_df_distances, 'distance', 'Distance between genes', hue='gene_type',
           log=True, violin=False, minx=1, maxx=5e6,
           order=GENES_TYPES)
plt.show()

In [None]:
print('Distance between genes')
# Only distances above 10 are kept, which also removes zero and negative gaps that a
# log-scaled axis cannot represent.
plot_kde(gtf_genes_df_distances[gtf_genes_df_distances['distance'] > 10],
         'distance', 'Distance between genes', hue='gene_type',
         order=GENES_TYPES,
         minx=10, maxx=5e6, maxy=0.6)
plt.show()

In [None]:
print('GSE26320 Distances between peaks and genes')
for tool in ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']:
    print(tool)
    ts = df_gse26320_distances[df_gse26320_distances['tool'] == tool]
    # Gene distances are appended as extra "modifications" so peaks and genes share one hue column.
    tss = pd.concat([ts, gtf_genes_df_distances.rename({'gene_type': 'modification'}, axis=1)]).reset_index()
    # Drop zero, negative and tiny gaps before plotting on a log axis.
    tss = tss[tss['distance'] > 10]
    plot_kde(tss,
             'distance', f'{tool} distance',
             order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'] + GENES_TYPES,
             minx=50, maxx=2e6, maxy=0.8)
    # Assumes the pics/ directory already exists under GSE26320_PATH — TODO confirm.
    plt.savefig(f'{GSE26320_PATH}/pics/distances_{tool}.pdf', bbox_inches='tight', dpi=300)
    plt.show()

# Immgen ATAC-seq reprocessed

In [None]:
# Root directory of the Immgen ATAC-seq data and the name fragments used to annotate its files.
IMMGEN_PATH = os.path.expanduser('~/data/2022_Immgen')
IMMGEN_CELLS = ['Monocyte']
IMMGEN_MODIFICATIONS = ['ATAC']

# Plain sub-directory join, matching how the SPAN directory is built;
# os.path.expanduser was a no-op on the relative name 'macs2'.
IMMGEN_MACS2_DIR = os.path.join(IMMGEN_PATH, 'macs2')
# MACS2_FDRS = ['0.1', '0.01', '0.05', '1e-3', '1-e4', '1e-6', '1e-8', '1e-10']
MACS2_FDRS = ['0.05']
# Replicates are given as ['']: '' is a substring of every name, so the replicate column is always ''.
df_immgen = find_peak_files(IMMGEN_MACS2_DIR, '.narrowPeak', IMMGEN_CELLS, IMMGEN_MODIFICATIONS, [''], MACS2_FDRS)
df_immgen['tool'] = 'Macs2'
df_immgen['dir'] = IMMGEN_MACS2_DIR
df_immgen.sample(3)

In [None]:
# Load the Immgen MACS2 peaks; `ts` is a scratch name that later cells rebind.
ts = load_peaks(df_immgen)
ts

In [None]:
print('Immgen MACS2 REPROCESSED Length of peaks')
# Box plot of peak length per modification, log x axis limited to [50, 3000].
plot_boxes(ts, 'length', 'Length by modification',
           log=True, violin=False, minx=5e1, maxx=3e3)
plt.show()

In [None]:
print('Immgen MACS2 REPROCESSED Length of peaks')
# KDE of the same lengths, x range [50, 3000], density axis capped at 3.2.
plot_kde(ts, 'length', 'MACS2 Peaks lengths', minx=5e1, maxx=3e3, maxy=3.2)
plt.show()

In [None]:
IMMGEN_SPAN_DIR = os.path.join(IMMGEN_PATH, 'span')
# MACS2_FDRS = ['0.1', '0.01', '0.05', '1e-3', '1-e4', '1e-6', '1e-8', '1e-10']
# The SPAN files are annotated with the MACS2_FDRS list defined for the MACS2 files.
df_immgen_span = find_peak_files(IMMGEN_SPAN_DIR, '.peak', IMMGEN_CELLS, IMMGEN_MODIFICATIONS, [''], MACS2_FDRS)
df_immgen_span['tool'] = 'SPAN'
df_immgen_span['dir'] = IMMGEN_SPAN_DIR
df_immgen_span

In [None]:
# Rebinds `ts`: from here on it holds the Immgen SPAN peaks instead of the MACS2 ones.
ts = load_peaks(df_immgen_span)
ts

In [None]:
print('Immgen SPAN REPROCESSED Length of peaks')
# Box plot of SPAN peak length per modification, log x axis limited to [50, 3000].
plot_boxes(ts, 'length', 'Length by modification',
           log=True, violin=False, minx=5e1, maxx=3e3)
plt.show()

In [None]:
print('Immgen SPAN REPROCESSED Length of peaks')
# KDE of the same lengths, x range [50, 3000], density axis capped at 3.2.
plot_kde(ts, 'length', 'SPAN Peaks lengths', minx=5e1, maxx=3e3, maxy=3.2)
plt.show()

# Theoretical peaks length

In [None]:
# Synthetic length samples per assay class, drawn from shifted / scaled negative binomial
# distributions. No random seed is set, so every run produces a slightly different figure.
data = {
    'TF': np.random.negative_binomial(2000, 0.8, size=300) + 200,
    'ATAC-seq': np.random.negative_binomial(1000, 0.6, size=250) + 500,
    'H3K27ac / H3K4me3':  np.random.negative_binomial(100, 0.2, size=200) + 2000,
    # NOTE(review): the subtraction can in principle produce non-positive values, which a
    # log-scaled axis cannot represent — confirm this is acceptable.
    'H3K36me3 / H3K27me3': np.random.negative_binomial(5000, 0.4, size=2000)  * 50 - 350000,
}
# Long-format table with one row per sampled length: columns 'name' and 'length'.
t = pd.concat([
    pd.DataFrame(data=dict(name=[name] * len(vals), length=vals))
    for name, vals in data.items()
]).reset_index(drop=True)

plt.figure(figsize=(8, 3))
ax = plt.axes()
# common_norm=True scales the curves jointly, so groups with more samples get more area.
sns.kdeplot(data=t, x='length', hue='name',
            common_norm=True,
            log_scale=True,
            ax=ax
            )

# g_results.set(xscale='log')
# g_results.set_ylim(0, 3)
# g_results.set_xlim(1e2, 2e4)
plt.title('Peaks length')
# Put a legend to the right of the current axis
# ax.legend(loc='center left', bbox_to_anchor=(0.62, 0.82))

plt.tight_layout()
# ax.legend(loc='upper right')
# Assumes the pics/ directory already exists under GSE26320_PATH — TODO confirm.
plt.savefig(f'{GSE26320_PATH}/pics/theoretical_lengths.pdf', bbox_inches='tight', dpi=300)
plt.show()

# Peaks length for RoadmapEpigenomics

In [None]:
# Root directory of the data analysed in this section; tool outputs and pics/ are resolved against it.
PATH = os.path.expanduser('~/data/2023_Immune')

# Substrings looked up in file names by find_peak_files to annotate cell type and modification.
IMMUNE_CELLS = ['BCell', 'TCell', 'Monocyte']
MODIFICATIONS = ['H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K36me3']


In [None]:
# Peak files per tool. Replicates are given as [''], so the replicate column is always '';
# the FDR tag looked up in the file names differs per tool.
df_roadmap_macs2 = find_peak_files(
    os.path.join(PATH, 'macs2'), '.narrowPeak', IMMUNE_CELLS, MODIFICATIONS, [''], ['0.05']
)
df_roadmap_macs2['tool'] = 'MACS2'
print('MACS2', len(df_roadmap_macs2))

df_roadmap_macs2broad = find_peak_files(
    os.path.join(PATH, 'macs2'), '.broadPeak', IMMUNE_CELLS, MODIFICATIONS,  [''], ['0.1']
)
df_roadmap_macs2broad['tool'] = 'MACS2 broad'
print('MACS2 broad', len(df_roadmap_macs2broad))

df_roadmap_sicer = find_peak_files(
    os.path.join(PATH, 'sicer'), 'summary-FDR', IMMUNE_CELLS, MODIFICATIONS, [''], ['0.01']
)
df_roadmap_sicer['tool'] = 'SICER'
print('SICER', len(df_roadmap_sicer))

df_roadmap_span = find_peak_files(
    os.path.join(PATH, 'span'), '.peak', IMMUNE_CELLS, MODIFICATIONS, [''], ['0.05']
)
df_roadmap_span['tool'] = 'SPAN'
print('SPAN', len(df_roadmap_span))

# One table of peak files for all four tools, with a fresh 0..n-1 index.
df_roadmap = pd.concat(
    [df_roadmap_macs2, df_roadmap_macs2broad, df_roadmap_sicer, df_roadmap_span]
).reset_index(drop=True)
df_roadmap.sample(5)

In [None]:
# Load the sampled, quantile-trimmed peaks of every Roadmap file; the bare name displays the table.
df_roadmap_peaks = load_peaks(df_roadmap)
df_roadmap_peaks

In [None]:
print('RoadmapEpigenomics REPROCESSED  Length of peaks')
# KDE of peak lengths over all tools together, one curve per modification.
plot_kde(df_roadmap_peaks, 'length', 'Peaks length',
         order=['H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K36me3'],
         minx=1e2, maxx=1e5)
# Assumes the pics/ directory already exists under PATH — TODO confirm.
plt.savefig(f'{PATH}/pics/modifications_lengths.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Per-tool KDEs for the Roadmap peaks. This loop previously filtered df_gse26320_peaks
# (copy-paste from the GSE26320 section), so the figures saved under PATH showed the wrong dataset.
for tool in ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']:
    print(tool)
    plot_kde(df_roadmap_peaks[df_roadmap_peaks['tool'] == tool], 'length', f'{tool} Peaks length',
             order=['H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K36me3'],
             minx=1e2, maxx=1e5)
    # Assumes the pics/ directory already exists under PATH — TODO confirm.
    plt.savefig(f'{PATH}/pics/modifications_{tool}_lengths.pdf', bbox_inches='tight', dpi=300)
    plt.show()

# Merged Encode and RoadmapEpigenomics dataset

In [None]:
# Adds an all-None 'replicate' column to the Roadmap peaks in place before merging.
# NOTE(review): load_peaks does not produce a 'replicate' column, so the GSE26320 rows will
# presumably have missing values there after the concat — confirm this is intended.
df_roadmap_peaks['replicate'] = None
# Rebinds `t` to the merged GSE26320 + Roadmap peak table.
t = pd.concat([df_gse26320_peaks, df_roadmap_peaks]).reset_index(drop=True)

In [None]:
print('Merged REPROCESSED  Length of peaks')
# Only the four modifications listed in `order` are drawn.
plot_kde(t, 'length', 'Peaks length',
         order=['H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K36me3'],
         minx=1e2, maxx=1e5)
# Assumes the pics/ directory already exists under GSE26320_PATH — TODO confirm.
plt.savefig(f'{GSE26320_PATH}/pics/merged_modifications_lengths.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Same merged KDE, drawn and saved separately for each peak caller.
for tool in ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']:
    print(tool)
    plot_kde(t[t['tool'] == tool], 'length', f'{tool} Peaks length',
             order=['H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K36me3'],
             minx=1e2, maxx=1e5)
    # The tool name (including the space in 'MACS2 broad') becomes part of the file name.
    plt.savefig(f'{GSE26320_PATH}/pics/merged_{tool}_modifications_lengths.pdf', bbox_inches='tight', dpi=300)
    plt.show()

# END