# Peaks length

In [None]:
# % matplotlib inline
# % config InlineBackend.figure_format='retina'

import pandas as pd
import seaborn as sns
from IPython.display import display

sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm.auto import tqdm

In [None]:
def bedl(file):
    """Return the length (end - start) of every record in a BED-like file.

    The file is read as tab-separated text without a header row; column 1 is
    used as the start and column 2 as the end (0-based column labels).

    :param file: path or file-like object accepted by ``pd.read_csv``.
    :return: ``pd.Series`` with one length per row, or an empty ``np.ndarray``
        when the file is empty or cannot be read/parsed.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[2] - tf[1]
    except Exception:
        # Best effort: an empty (or unreadable) file contributes no peaks.
        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still abort a long loading loop.
        return np.zeros(0)


def lines(file):
    """Return the number of records in a tab-separated, header-less file.

    :param file: path or file-like object accepted by ``pd.read_csv``.
    :return: number of rows parsed, or ``0`` when the file is empty or cannot
        be read/parsed.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return len(tf)
    except Exception:
        # Best effort: an empty (or unreadable) file counts as zero records.
        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still propagate.
        return 0


def scores(file):
    """Return column 4 (0-based label) of a tab-separated, header-less file.

    :param file: path or file-like object accepted by ``pd.read_csv``.
    :return: ``pd.Series`` holding column 4, or an empty ``np.ndarray`` when
        the file is empty, cannot be read/parsed, or has no column 4.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[4]
    except Exception:
        # Best effort: an empty (or unreadable) file contributes no scores.
        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still propagate.
        return np.zeros(0)

def positions(file):
    """Return the first three columns of a tab-separated, header-less file.

    :param file: path or file-like object accepted by ``pd.read_csv``.
    :return: tuple ``(column 0, column 1, column 2)`` as ``pd.Series``, or
        three empty lists when the file is empty or cannot be read/parsed.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[0], tf[1], tf[2]
    except Exception:
        # Best effort: an empty (or unreadable) file contributes no positions.
        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still propagate.
        return [], [], []

In [None]:
def find_peak_files(path, suffix, cells, modifications, fdrs):
    """Build a table describing the peak files found directly inside ``path``.

    A file is listed when its name contains ``suffix`` and one of
    ``modifications``. All matching is by substring, and the first candidate
    (in list order) that occurs in the file name wins.

    :param path: directory to scan (not recursive).
    :param suffix: substring a file name must contain to be considered.
    :param cells: candidate cell names; ``None`` is recorded if none matches.
    :param modifications: candidate modification names; files matching none
        are skipped.
    :param fdrs: candidate FDR strings; ``None`` is recorded if none matches.
    :return: object-dtype ``pd.DataFrame`` with columns ``file``,
        ``modification``, ``cell``, ``replicate`` (``'rep1'``, ``'rep2'`` or
        ``None``), ``fdr``, ``peaks`` (row count) and ``avlength`` (mean
        end - start, ``0`` for files without rows).
    """
    columns = ['file', 'modification', 'cell', 'replicate', 'fdr', 'peaks', 'avlength']
    # Rows are collected in a list and the frame is built once: growing a
    # DataFrame with ``df.loc[len(df)] = ...`` copies data on every append.
    rows = []
    for f in tqdm(os.listdir(path)):
        if suffix not in f:
            continue
        mod = next((m for m in modifications if m in f), None)
        if not mod:
            continue
        fdr = next((candidate for candidate in fdrs if candidate in f), None)
        cell = next((cc for cc in cells if cc in f), None)
        rep = 'rep1' if 'rep1' in f else 'rep2' if 'rep2' in f else None
        peak_file = os.path.join(path, f)
        ps, ls = lines(peak_file), bedl(peak_file)
        # np.asarray(...).sum() adds exactly like the builtin sum() (NaN
        # propagates) but without a Python-level loop over every peak.
        avls = 0 if ps == 0 else np.asarray(ls).sum() / ps
        rows.append((f, mod, cell, rep, fdr, ps, avls))
    return pd.DataFrame(rows, columns=columns, dtype=object)

In [None]:
def load_peaks(df, lowq=.01, highq=.99):
    """Load every peak of every file listed in ``df`` into one long table.

    For each file, peaks whose length lies outside the per-file
    ``[lowq, highq]`` length quantiles are discarded.

    :param df: file table with at least the columns ``file``,
        ``modification``, ``tool`` and ``dir``.
    :param lowq: lower length quantile (inclusive) kept per file.
    :param highq: upper length quantile (inclusive) kept per file.
    :return: ``pd.DataFrame`` with columns ``modification``, ``tool``,
        ``file``, ``chromosome``, ``start``, ``end``, ``score`` and
        ``length``; empty (columns only) when ``df`` lists no files.
    """
    columns = ['modification', 'tool', 'file', 'chromosome', 'start', 'end', 'score', 'length']
    file_table = df[['file', 'modification', 'tool', 'dir']]
    frames = []
    # total= lets tqdm render a real progress bar; a bare generator has no length.
    for f, m, tool, d in tqdm(file_table.itertuples(index=False, name=None), total=len(file_table)):
        file = os.path.join(d, f)
        chromosomes, starts, ends = positions(file)
        # zip() stops at the shortest input, so a file for which any of the
        # readers returned nothing yields no rows at all.
        peaks = pd.DataFrame(
            [(m, tool, f, chrom, start, end, score, length)
             for chrom, start, end, score, length
             in zip(chromosomes, starts, ends, scores(file), bedl(file))],
            columns=columns
        )
        peaks['length'] = peaks['length'].astype(int)
        peaks['score'] = peaks['score'].astype(int)
        # Ignore extreme peaks
        lengths = peaks['length']
        keep = (lengths.quantile(lowq) <= lengths) & (lengths <= lengths.quantile(highq))
        frames.append(peaks[keep].copy())
    if not frames:
        # pd.concat([]) raises ValueError; an empty file table gives an empty result.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

# Encodeproject.org ATAC/TFs/ChIPseq

Download with `xargs -L 1 curl -O -J -L < files.txt`, then rename the files using `metadata.tsv`:
```
cd ~/data/2022_atacseq
for F in *.bed; do echo $F; mv $F $(grep "^${F/.bed/}" metadata.tsv | awk  -v FS='\t' '{printf("%s_%s_%s", $1, $11, "ATAC")}'  | sed 's# #_#g').bed; done

cd ~/data/2022_chipseq_narrow
for F in *.bed; do echo $F; mv $F $(grep "^${F/.bed/}" metadata.tsv | awk  -v FS='\t' '{printf("%s_%s_%s", $1, $11, $23)}'  | sed -E 's#[^a-zA-Z0-9]+#_#g').bed; done

cd ~/data/2022_chipseq_broad
for F in *.bed; do echo $F; mv $F $(grep "^${F/.bed/}" metadata.tsv | awk  -v FS='\t' '{printf("%s_%s_%s", $1, $11, $23)}'  | sed -E 's#[^a-zA-Z0-9]+#_#g').bed; done

# Simple analysis
for F in *.bed; do echo $(cat $F | awk '{N+=1;L+=$3-$2} END {printf("%d\n", L/N)}') $F; done | sort -k1,1n
```

In [None]:
# Index the ENCODE ATAC-seq '.bed' files and label them with tool and source directory.
print('Encodeproject.org ATAC/TFs/ChIPseq ATAC')
ATAC_PATH = os.path.expanduser('~/data/2022_atacseq')
# '' is a substring of every file name, so find_peak_files records '' as the cell.
ATAC_CELLS = ['']
ATAC_MODIFICATIONS = ['ATAC']

# Likewise '' always matches, so the fdr column is '' for every file.
FDRS = ['']
df_encodeprojectorg_atacseq = find_peak_files(ATAC_PATH, '.bed', ATAC_CELLS, ATAC_MODIFICATIONS, FDRS)
df_encodeprojectorg_atacseq['tool'] = 'Macs2'
df_encodeprojectorg_atacseq['dir'] = ATAC_PATH
# df_encodeprojectorg_atacseq.sample(3)

In [None]:
# Index the ENCODE '.bed' files whose names contain H3K27ac, H3K4me3, CTCF or H2AFZ.
print('Encodeproject.org ATAC/TFs/ChIPseq Narrow Chipseq')
CHIP_NARROW_PATH = os.path.expanduser('~/data/2022_chipseq_narrow')
CHIP_NARROW_MODIFICATIONS = ['H3K27ac', 'H3K4me3', 'CTCF', 'H2AFZ']
# '' matches every file name, so the cell column is always ''.
CHIP_CELLS = ['']

# '' matches every file name, so the fdr column is always ''.
FDRS = ['']
df_encodeprojectorg_narrow = find_peak_files(CHIP_NARROW_PATH, '.bed', CHIP_CELLS, CHIP_NARROW_MODIFICATIONS, FDRS)
df_encodeprojectorg_narrow['tool'] = 'Macs2'
df_encodeprojectorg_narrow['dir'] = CHIP_NARROW_PATH
# df_encodeprojectorg_narrow.sample(3)

In [None]:
# Index the ENCODE '.bed' files whose names contain H3K27me3, H3K36me3 or H3K4me1.
print('Encodeproject.org ATAC/TFs/ChIPseq Broad Chipseq')
CHIP_BROAD_PATH = os.path.expanduser('~/data/2022_chipseq_broad')
CHIP_BROAD_MODIFICATIONS = ['H3K27me3', 'H3K36me3', 'H3K4me1']
# '' matches every file name, so the cell column is always ''.
CHIP_CELLS = ['']

# '' matches every file name, so the fdr column is always ''.
FDRS = ['']
df_encodeprojectorg_broad = find_peak_files(CHIP_BROAD_PATH, '.bed', CHIP_CELLS, CHIP_BROAD_MODIFICATIONS, FDRS)
# These files get their own tool label ('Macs2Broad'), unlike the ATAC and narrow tables ('Macs2').
df_encodeprojectorg_broad['tool'] = 'Macs2Broad'
df_encodeprojectorg_broad['dir'] = CHIP_BROAD_PATH
# df_encodeprojectorg_broad.sample(3)

In [None]:
# Load the peaks of the ATAC, narrow and broad ENCODE file tables into one frame and preview 5 random rows.
df_encodeprojectorg_peaks = load_peaks(
    pd.concat([df_encodeprojectorg_atacseq, df_encodeprojectorg_narrow, df_encodeprojectorg_broad]))
display(df_encodeprojectorg_peaks.sample(5))

In [None]:
print('Encodeproject.org ATAC/TFs/ChIPseq downloaded peaks')
# NOTE(review): disabled call; neither plot_distribution_by_tool nor df_encode_peaks
# is defined in the visible code, so only the header line above is printed.
# plot_distribution_by_tool(df_encode_peaks, value='length')

In [None]:
print('Encodeproject.org ATAC/TFs/ChIPseq downloaded peaks')
# NOTE(review): disabled code; plot_distribution_by_tool and df_encode_peaks are not
# defined in the visible code, so only the header line above is printed.
# t = df_encode_peaks.copy()
# t['tool'] = 'all'
# plot_distribution_by_tool(t, value='length', maxx=1e4)

In [None]:
# Count, per (modification, tool), the loaded ENCODE peaks whose length exceeds 1e4.
print('Encodeproject.org ATAC/TFs/ChIPseq long peaks')
t = df_encodeprojectorg_peaks[(df_encodeprojectorg_peaks['length'] > 1e4)].copy()
# Helper column: groupby(...).count() on it yields the number of rows per group.
t['count'] = 1
t[['modification', 'tool', 'count']].groupby(['modification', 'tool']).count()

In [None]:
def plot_boxes(df, value, title, hue='modification', violin=True, log=False, minx=50, maxx=1e4, order=None):
    """Draw one horizontal violin or box per ``hue`` category for ``value``.

    A new figure is created whose height grows with the number of distinct
    ``hue`` values; the caller is expected to call ``plt.show()``.

    :param df: data frame holding the ``value`` and ``hue`` columns.
    :param value: column plotted on the x-axis.
    :param title: axes title.
    :param hue: categorical column plotted on the y-axis.
    :param violin: violin plot when true, box plot otherwise.
    :param log: use a log x-axis with fixed 1-2-5 ticks inside ``[minx, maxx]``.
    :param minx: lower x-axis limit.
    :param maxx: upper x-axis limit.
    :param order: explicit order of the ``hue`` categories, or ``None``.
    """
    n_categories = len(set(df[hue]))
    plt.figure(figsize=(20, int(2 + 0.3 * n_categories)))
    ax = plt.axes()
    if not violin:
        plot = sns.boxplot(data=df, y=hue, x=value, ax=ax, order=order)
    else:
        plot = sns.violinplot(data=df, y=hue, x=value, ax=ax, scale='width', order=order)
    if log:
        tick_candidates = (10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
                           10_000, 20_000, 50_000, 100_000, 200_000, 500_000, 1_000_000)
        # Only ticks that fall inside the requested x-range are shown.
        ticks = [tick for tick in tick_candidates if minx <= tick <= maxx]
        plot.set(xscale='log')
        plot.set(xticks=ticks)
        plot.set(xticklabels=ticks)
    ax.set_xlim(minx, maxx)
    ax.title.set_text(title)
    plt.tight_layout()

In [None]:
# Box plot of ENCODE peak lengths per modification on a log x-axis limited to [50, 5e5].
print('Encodeproject.org ATAC/TFs/ChIPseq Length of modifications')
plot_boxes(df_encodeprojectorg_peaks, 'length', 'Length by modification',
           violin=False, log=True, minx=50, maxx=5e5,
           order=['CTCF', 'ATAC', 'H2AFZ', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
def plot_length_per_file(df, value):
    """Show one figure per (tool, modification) pair with a violin per file.

    Pairs without any rows are skipped. For every plotted pair the tool, the
    modification and the number of distinct files are printed first.

    :param df: data frame with ``tool``, ``modification``, ``file`` and
        ``value`` columns.
    :param value: column plotted on the y-axis.
    """
    # Imported locally: the notebook only imports ``product`` in a later cell,
    # so relying on the global name fails when cells are run top to bottom.
    from itertools import product

    for t, m in product(set(df['tool']), set(df['modification'])):
        ts = df[(df['tool'] == t) & (df['modification'] == m)]
        if len(ts) == 0:
            continue
        print(t, m, len(set(ts['file'])))
        plt.figure(figsize=(12, 8))
        # Plot
        ax = plt.axes()
        sns.violinplot(data=ts, x='file', y=value, ax=ax, scale='width')
        ax.title.set_text(f'{m} {t}')
        # File names are long: rotate and shrink the tick labels.
        plt.xticks(rotation=90, fontsize=7)
        plt.xlabel('file')
        plt.tight_layout()
        plt.show()

In [None]:
print('Encodeproject.org ATAC/TFs/ChIPseq Length of modifications per file')
# Disabled: would open one figure per (tool, modification) pair.
# plot_length_per_file(df_encodeprojectorg_peaks, 'length')

# Filtered peaks

In [None]:
# Hand-picked file names; their union (ENCODE_GOOD) is used to filter the loaded peaks by 'file'.
# ATAC-seq files.
ATAC_GOOD = [
    'ENCFF883YXV_GM19468_ATAC.bed',
    'ENCFF089MEL_HG03575_ATAC.bed',
    'ENCFF861UUN_HG03558_ATAC.bed'
]

# Three files each for CTCF, H3K4me3, H2AFZ and H3K27ac.
CHIP_NARROW_GOOD = [
    'ENCFF664CQH_endothelial_cell_of_umbilical_vein_CTCF_human.bed',
    'ENCFF028IIR_keratinocyte_CTCF_human.bed',
    'ENCFF833DGJ_keratinocyte_CTCF_human.bed',

    'ENCFF298YTQ_peripheral_blood_mononuclear_cell_H3K4me3_human.bed',
    'ENCFF641FUA_CD4_positive_alpha_beta_memory_T_cell_H3K4me3_human.bed',
    'ENCFF207ZEY_B_cell_H3K4me3_human.bed',

    'ENCFF332IIF_fibroblast_of_lung_H2AFZ_human.bed',
    'ENCFF637SMB_keratinocyte_H2AFZ_human.bed',
    'ENCFF155PLT_skeletal_muscle_myoblast_H2AFZ_human.bed',

    'ENCFF200EDZ_foreskin_melanocyte_H3K27ac_human.bed',
    'ENCFF962SXN_foreskin_fibroblast_H3K27ac_human.bed',
    'ENCFF832RWT_peripheral_blood_mononuclear_cell_H3K27ac_human.bed'
]

# Three files each for H3K4me1, H3K36me3 and H3K27me3.
CHIP_BROAD_GOOD = [
    'ENCFF001TBK_fibroblast_of_dermis_H3K4me1_human.bed',
    'ENCFF001SXL_skeletal_muscle_myoblast_H3K4me1_human.bed',
    'ENCFF001TCP_fibroblast_of_lung_H3K4me1_human.bed',

    'ENCFF001TBN_fibroblast_of_dermis_H3K36me3_human.bed',
    'ENCFF001SXK_skeletal_muscle_myoblast_H3K36me3_human.bed',
    'ENCFF001SWY_mammary_epithelial_cell_H3K36me3_human.bed',

    'ENCFF001TCN_fibroblast_of_lung_H3K27me3_human.bed',
    'ENCFF001SWX_mammary_epithelial_cell_H3K27me3_human.bed',
    'ENCFF001SXJ_skeletal_muscle_myoblast_H3K27me3_human.bed'
]
# A set gives constant-time membership tests.
ENCODE_GOOD = set(ATAC_GOOD + CHIP_NARROW_GOOD + CHIP_BROAD_GOOD)

In [None]:
# Keep only the peaks that come from the hand-picked files in ENCODE_GOOD.
df_encodeprojectorg_good = df_encodeprojectorg_peaks[df_encodeprojectorg_peaks['file'].isin(ENCODE_GOOD)]

In [None]:
# Same box plot as for all ENCODE files, restricted to the selected files.
print('Encodeproject.org ATAC/TFs/ChIPseq SELECTED Length of selected modifications')
plot_boxes(df_encodeprojectorg_good, 'length', 'Length by modification',
           violin=False, log=True, minx=50, maxx=5e5,
           order=['CTCF', 'ATAC', 'H2AFZ', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
print('Encodeproject.org ATAC/TFs/ChIPseq SELECTED length of modifications')
# NOTE(review): disabled code; plot_distribution_by_tool and df_encode_peaks_good are
# not defined in the visible code, so only the header line above is printed.
# t = df_encode_peaks_good.copy()
# t['tool'] = 'all'
# plot_distribution_by_tool(t, value='length', maxx=1e5)

# Reprocessed with MACS2

In [None]:
# Index the '.narrowPeak' and '.broadPeak' files of the reprocessed ENCODE data in one table.
ENCODE_REPROCESSED_PATH = os.path.expanduser('~/data/2022_atac_chipseq_encode/macs2')
# '' matches every file name, so the cell column is always ''.
ENCODE_REPROCESSED_CELLS = ['']
ENCODE_REPROCESSED_MODIFICATIONS = [
    'ATAC', 'CTCF', 'H2AFZ', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'
]

# '' matches every file name, so the fdr column is always ''.
FDRS = ['']
df_encodeprojectorg_reprocessed = pd.concat(
    [
        find_peak_files(
            ENCODE_REPROCESSED_PATH, '.narrowPeak', ENCODE_REPROCESSED_CELLS, ENCODE_REPROCESSED_MODIFICATIONS, FDRS
        ),
        find_peak_files(
            ENCODE_REPROCESSED_PATH, '.broadPeak', ENCODE_REPROCESSED_CELLS, ENCODE_REPROCESSED_MODIFICATIONS, FDRS
        )]
)
# Narrow and broad files share the single tool label 'Macs2' here.
df_encodeprojectorg_reprocessed['tool'] = 'Macs2'
df_encodeprojectorg_reprocessed['dir'] = ENCODE_REPROCESSED_PATH
# df_encodeprojectorg_reprocessed.sample(3)

In [None]:
# Load all peaks of the reprocessed MACS2 files and preview 5 random rows.
df_encodeprojectorg_reprocessed_peaks = load_peaks(df_encodeprojectorg_reprocessed)
display(df_encodeprojectorg_reprocessed_peaks.sample(5))

In [None]:
# Box plot of reprocessed MACS2 peak lengths per modification, log x-axis limited to [30, 2e5].
print('Encodeproject.org ATAC/TFs/ChIPseq SELECTED REPROCESSED MACS2')
plot_boxes(df_encodeprojectorg_reprocessed_peaks, 'length', 'Length by modification',
           violin=False, log=True, minx=30, maxx=2e5,
           order=['CTCF', 'ATAC', 'H2AFZ', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

# Encode reprocessed with SPAN

In [None]:
# Index the '.peak' files of the 'span' directory; cells, modifications and FDRS are
# the lists defined for the reprocessed MACS2 data.
ENCODE_REPROCESSED_SPAN_PATH = os.path.expanduser('~/data/2022_atac_chipseq_encode/span')
df_encodeprojectorg_reprocessed_span = find_peak_files(
    ENCODE_REPROCESSED_SPAN_PATH, '.peak', ENCODE_REPROCESSED_CELLS, ENCODE_REPROCESSED_MODIFICATIONS, FDRS
)
df_encodeprojectorg_reprocessed_span['tool'] = 'Span'
df_encodeprojectorg_reprocessed_span['dir'] = ENCODE_REPROCESSED_SPAN_PATH
df_encodeprojectorg_reprocessed_span.sample(3)

In [None]:
# Load all peaks of the reprocessed SPAN files and preview 5 random rows.
df_encodeprojectorg_reprocessed_span_peaks = load_peaks(df_encodeprojectorg_reprocessed_span)
display(df_encodeprojectorg_reprocessed_span_peaks.sample(5))

In [None]:
# Box plot of reprocessed SPAN peak lengths per modification, log x-axis limited to [30, 2e5].
print('Encodeproject.org ATAC/TFs/ChIPseq SELECTED REPROCESSED SPAN')
plot_boxes(df_encodeprojectorg_reprocessed_span_peaks, 'length', 'Length by modification',
           violin=False, log=True, minx=30, maxx=2e5,
           order=['CTCF', 'ATAC', 'H2AFZ', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

# GSE26320_RAW reprocessed

In [None]:
# Name fragments used to parse the GSE26320 file names.
# Known gaps in the data set:
# Don't have H1 H3K27ac rep2
# Don't have Huvec H3K4me3 rep1
# Don't have HepG2 H3K4me1 rep2
GSE26320_PATH = os.path.expanduser('~/data/GSE26320_RAW')
GSE26320_CELLS = ['GM12878', 'HMEC', 'HSMM', 'K562', 'NHEK', 'NHLF', 'H1', 'Huvec', 'HepG2']
GSE26320_MODIFICATIONS = ['CTCF', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3']
# NOTE: not referenced by the visible code; find_peak_files hard-codes 'rep1'/'rep2'.
GSE26320_REPS = ['rep1', 'rep2']

In [None]:
# Index the MACS2 narrowPeak and broadPeak files of GSE26320 whose names contain the FDR '0.05'.
# os.path.expanduser leaves 'macs2' unchanged (it has no leading '~').
GSE26320_MACS2_DIR = os.path.join(GSE26320_PATH, os.path.expanduser('macs2'))

MACS2_FDRS = ['0.05']
df_gse26320_macs2 = pd.concat([
    find_peak_files(GSE26320_MACS2_DIR, '.narrowPeak', GSE26320_CELLS, GSE26320_MODIFICATIONS, MACS2_FDRS),
    find_peak_files(GSE26320_MACS2_DIR, '.broadPeak', GSE26320_CELLS, GSE26320_MODIFICATIONS, MACS2_FDRS)
])
df_gse26320_macs2['tool'] = 'Macs2'
df_gse26320_macs2['dir'] = GSE26320_MACS2_DIR
df_gse26320_macs2.sample(5)

In [None]:
# GSE26320_SICER_DIR = os.path.join(GSE26320_PATH, os.path.expanduser('sicer'))
# # GSE26320_SICER_FDRS = ['0.1', '0.05', '0.01', '0.001', '0.0001', '1e-06', '1e-08', '1e-10']
# GSE26320_SICER_FDRS = ['0.01']
# df_gse26320_sicer = find_peak_files(
#     GSE26320_SICER_DIR, 'summary-FDR', GSE26320_CELLS, GSE26320_MODIFICATIONS, GSE26320_SICER_FDRS
# )
# df_gse26320_sicer['tool'] = 'Sicer'
# df_gse26320_sicer['dir'] = GSE26320_SICER_DIR
# df_gse26320_sicer.sample(5)

In [None]:
# Index the SPAN '.peak' files of GSE26320.
# os.path.expanduser leaves 'span' unchanged (it has no leading '~').
GSE26320_SPAN_DIR = os.path.join(GSE26320_PATH, os.path.expanduser('span'))
GSE26320_SPAN_FDRS = ['0.05']
df_gse26320_span = find_peak_files(
    GSE26320_SPAN_DIR, '.peak', GSE26320_CELLS, GSE26320_MODIFICATIONS, GSE26320_SPAN_FDRS
)
df_gse26320_span['tool'] = 'Span'
df_gse26320_span['dir'] = GSE26320_SPAN_DIR
df_gse26320_span.sample(5)

In [None]:
# Combine the MACS2 and SPAN file tables; the 'tool' column tells them apart.
df_gse26320 = pd.concat([df_gse26320_macs2, df_gse26320_span])
df_gse26320.sample(10)

In [None]:
# Load every peak of every GSE26320 file (MACS2 and SPAN) into one frame.
df_gse26320_peaks = load_peaks(df_gse26320)
df_gse26320_peaks

In [None]:
# Box plot of MACS2 peak lengths per modification over all GSE26320 files.
print('GSE26320 REPROCESSED Macs2 Length of peaks')
plot_boxes(df_gse26320_peaks[df_gse26320_peaks['tool'] == 'Macs2'], 'length', 'Length by modification',
           log=True, violin=False, minx=20, maxx=2e5,
           order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
# Box plot of SPAN peak lengths per modification over all GSE26320 files.
print('GSE26320 REPROCESSED SPAN Length of peaks')
plot_boxes(df_gse26320_peaks[df_gse26320_peaks['tool'] == 'Span'], 'length', 'Length by modification',
           log=True, violin=False, minx=20, maxx=2e5,
           order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
# Select the MACS2 peaks of files whose names contain both 'K562' and 'rep1'.
# `ts` is a notebook-global that the following plotting cells read.
print('GSE26320 K562 rep1 REPROCESSED Macs2 Length of peaks')
ts = df_gse26320_peaks[(df_gse26320_peaks['tool'] == 'Macs2') &
                       (df_gse26320_peaks['file'].str.contains('K562')) &
                       (df_gse26320_peaks['file'].str.contains('rep1'))]
plot_boxes(ts, 'length', 'Length by modification',
           log=True, violin=False, minx=150, maxx=2e5,
           order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
def plot_log(df, value, title, hue='modification', minx=100, maxx=5e4, order=None):
    """Draw a density histogram with a KDE curve of ``value`` on a log x-axis.

    A new 20x5 figure is created; the caller is expected to call
    ``plt.show()``.

    :param df: data frame holding the ``value`` (and ``hue``) column.
    :param value: column whose distribution is plotted.
    :param title: axes title.
    :param hue: grouping column. It is only applied when ``order`` is given;
        with ``order=None`` a single histogram over all rows is drawn.
    :param minx: lower x-axis limit.
    :param maxx: upper x-axis limit.
    :param order: hue categories to draw, in this order; each one is then
        normalised on its own (``common_norm=False``).
    """
    plt.figure(figsize=(20, 5))
    ax = plt.axes()
    # Keyword arguments common to the grouped and the ungrouped histogram.
    shared = dict(data=df, x=value, ax=ax, stat='density', log_scale=True, kde=True)
    if order is None:
        hist = sns.histplot(alpha=0.5, **shared)
    else:
        hist = sns.histplot(hue=hue, hue_order=order, common_norm=False, alpha=0.2, **shared)
    tick_candidates = (50, 100, 200, 500, 1000, 2000, 5000,
                       10_000, 20_000, 50_000, 100_000, 200_000, 500_000, 1_000_000)
    # Only ticks that fall inside the requested x-range are shown.
    ticks = [tick for tick in tick_candidates if minx <= tick <= maxx]
    hist.set(xscale='log')
    hist.set(xticks=ticks)
    hist.set(xticklabels=ticks)
    ax.set_xlim(minx, maxx)
    ax.title.set_text(title)

In [None]:
# Length distribution (log x-axis) of the K562 rep1 MACS2 peaks currently held in `ts`.
print('GSE26320 K562 rep1 REPROCESSED Macs2 Length of peaks')
plot_log(ts, 'length', 'MACS2 Peaks lengths',
         order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
from math import floor

def compute_length_percentiles(ts, steps=10):
    """Assign every peak length to an equal-count percentile bin, per modification.

    Within each modification the lengths are sorted ascending and split into
    ``steps`` consecutive bins of (almost) equal size. Each length is labelled
    with the upper percentile of its bin, e.g. 10, 20, ..., 100 for the
    default ``steps=10``. Every length is assigned to a bin, including when
    the group size is not a multiple of ``steps`` or is smaller than ``steps``.

    :param ts: data frame with ``modification`` and ``length`` columns.
    :param steps: number of percentile bins, at least 1.
    :return: ``pd.DataFrame`` with columns ``modification``, ``percentile``
        and ``length``; one row per input row.
    :raises ValueError: if ``steps`` is smaller than 1.
    """
    if steps < 1:
        raise ValueError(f'steps must be >= 1, got {steps}')
    columns = ['modification', 'percentile', 'length']
    frames = []
    # groupby(sort=False) visits modifications in order of first appearance,
    # so the row order of the result is reproducible between runs.
    for m, group in ts.groupby('modification', sort=False):
        lengths = np.sort(group['length'].values)
        n = len(lengths)
        if n == 0:
            continue
        # The element at sorted rank j falls into bin j * steps // n; integer
        # arithmetic keeps the bins exact and leaves no remainder behind.
        bins = np.arange(n) * steps // n
        frames.append(pd.DataFrame({
            'modification': m,
            'percentile': 100 * (bins + 1) // steps,
            'length': lengths,
        }))
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)[columns]

def plot_percentiles(df, title, miny=10, maxy=1e5, order=None):
    """Box-plot peak lengths per length-percentile bin, grouped by modification.

    The bins come from ``compute_length_percentiles(df)``. A new 10x8 figure
    is created; the caller is expected to call ``plt.show()``.

    :param df: data frame with ``modification`` and ``length`` columns.
    :param title: axes title.
    :param miny: lower limit of the log y-axis.
    :param maxy: upper limit of the log y-axis.
    :param order: modifications to draw, in this order; defaults to CTCF,
        H3K27ac, H3K4me3, H3K4me1, H3K27me3, H3K36me3.
    """
    if order is None:
        order = ['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3']
    percentiles_df = compute_length_percentiles(df)
    plt.figure(figsize=(10, 8))
    ax = plt.axes()
    g_results = sns.boxplot(data=percentiles_df, x="percentile", y="length", hue="modification",
                            hue_order=order, ax=ax)
    # Only ticks that fall inside the requested y-range are shown.
    ticks = [
        tick for tick in (10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
                          10_000, 20_000, 50_000, 100_000, 200_000, 500_000, 1_000_000)
        if miny <= tick <= maxy
    ]
    g_results.set(yscale='log')
    g_results.set(yticks=ticks)
    g_results.set(yticklabels=ticks)
    ax.set_ylim(miny, maxy)
    ax.title.set_text(title)

In [None]:
# Length per percentile bin for the K562 rep1 MACS2 peaks currently held in `ts`.
plot_percentiles(ts, 'MACS2 Percentile average lengths')
plt.show()

In [None]:
# Rebind `ts` to the SPAN peaks of files whose names contain both 'K562' and 'rep1'.
print('GSE26320 K562 rep1 REPROCESSED SPAN Length of peaks')
ts = df_gse26320_peaks[(df_gse26320_peaks['tool'] == 'Span') &
                       (df_gse26320_peaks['file'].str.contains('K562')) &
                       (df_gse26320_peaks['file'].str.contains('rep1'))]
plot_boxes(ts, 'length', 'Length by modification',
           log=True, violin=False, minx=150, maxx=2e5,
           order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
# Length distribution (log x-axis) of the K562 rep1 SPAN peaks currently held in `ts`.
print('GSE26320 K562 rep1 REPROCESSED SPAN Length of peaks')
plot_log(ts, 'length', 'SPAN Peaks lengths',
         order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
# Length per percentile bin for the K562 rep1 SPAN peaks currently held in `ts`.
plot_percentiles(ts, 'SPAN Percentile average lengths')
plt.show()

# Distance between peaks

In [None]:
from itertools import product

def distance_between_peaks(df, hue='modification'):
    """Compute the gaps between consecutive intervals on each chromosome.

    Intervals are grouped by (``hue``, ``tool``, ``file``) and by chromosome,
    then sorted by ``start``. The distance is ``start`` of an interval minus
    ``end`` of the previous one, so it is negative when the two overlap. A
    chromosome with ``n`` intervals contributes ``n - 1`` distances.

    :param df: data frame with ``hue``, ``tool``, ``file``, ``chromosome``,
        ``start`` and ``end`` columns.
    :param hue: name of the grouping column carried into the result.
    :return: ``pd.DataFrame`` with columns ``hue``, ``tool``, ``file``,
        ``chr`` and ``distance``.
    """
    distances = []
    for (m, t, f), dft in tqdm(df.groupby([hue, 'tool', 'file'])):
        # groupby visits the chromosomes in sorted order.
        for chrom, dftc in dft.groupby('chromosome'):
            dftc = dftc.sort_values(by=['start'])
            starts = dftc['start'].to_numpy()
            ends = dftc['end'].to_numpy()
            # Pair each interval with its predecessor: start[i] - end[i - 1].
            # Only the first interval has no predecessor; the last gap is a
            # real one and must be kept.
            gaps = starts[1:] - ends[:-1]
            distances.extend((m, t, f, chrom, d) for d in gaps)
    return pd.DataFrame(columns=[hue, 'tool', 'file', 'chr', 'distance'], data=distances)

In [None]:
# Distances between consecutive GSE26320 peaks, per modification, tool, file and chromosome.
df_gse26320_distances = distance_between_peaks(df_gse26320_peaks)
df_gse26320_distances

In [None]:
# Rebind `ts` to the MACS2 distances of files whose names contain both 'K562' and 'rep1'.
print('GSE26320 K562 rep1 REPROCESSED MACS2 Distance of peaks')
ts = df_gse26320_distances[(df_gse26320_distances['tool'] == 'Macs2') &
                           (df_gse26320_distances['file'].str.contains('K562')) &
                           (df_gse26320_distances['file'].str.contains('rep1'))].copy()
plot_boxes(ts, 'distance', 'Distance between peaks by modification',
           log=True, violin=False, minx=100, maxx=1e6,
           order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
# Distribution (log x-axis) of the MACS2 distances currently held in `ts`.
print('GSE26320 K562 rep1 REPROCESSED Macs2 Distance between of peaks')
plot_log(ts, 'distance', 'MACS2 Peaks distance', maxx=1e6,
         order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
# Report the share of distances <= 5e4 (truncated to a whole percent), then show
# only those distances as violins on a linear axis.
print(f'Filter distance <= 5e4 {int(100 * sum(ts["distance"] <= 5e4) / len(ts))}%')
plot_boxes(ts.loc[ts['distance'] <= 5e4], 'distance', 'Distance between peaks by modification',
           log=False, violin=True,
           minx=-1e4, maxx=6e4,
           order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
# Rebind `ts` to the SPAN distances of files whose names contain both 'K562' and 'rep1'.
print('GSE26320 K562 rep1 REPROCESSED SPAN Distance between peaks')
ts = df_gse26320_distances[(df_gse26320_distances['tool'] == 'Span') &
                           (df_gse26320_distances['file'].str.contains('K562')) &
                           (df_gse26320_distances['file'].str.contains('rep1'))]
plot_boxes(ts, 'distance', 'Distance between peaks by modification',
           log=True, violin=False, minx=100, maxx=1e6,
           order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
# Distribution (log x-axis) of the SPAN distances currently held in `ts`.
print('GSE26320 K562 rep1 REPROCESSED SPAN Distance between of peaks')
plot_log(ts, 'distance', 'SPAN Peaks distance', maxx=1e6,
         order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

In [None]:
# Report the share of SPAN distances <= 5e4 (truncated to a whole percent), then
# show only those distances as violins on a linear axis.
print(f'Filter distance <= 5e4 {int(100 * sum(ts["distance"] <= 5e4) / len(ts))}%')
plot_boxes(ts.loc[ts['distance'] <= 5e4], 'distance', 'Distance between peaks by modification',
           log=False, violin=True,
           minx=-1e4, maxx=6e4,
           order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'])
plt.show()

# hg19 genes length and distance between genes

In [None]:
# Read the annotation file stored next to the GSE26320 data. Lines starting with '#'
# are skipped and the nine tab-separated columns are named explicitly.
GTF = 'gencode.v19.annotation.gtf.gz'
gtf_df = pd.read_csv(os.path.join(GSE26320_PATH, GTF), sep='\t', comment='#',
                     names=['chromosome', 'db', 'type', 'start', 'end', 'point1', 'strand', 'point2', 'aux'])
gtf_df.sample(10)

In [None]:
# Expand the ';'-separated `key "value"` pairs of the 'aux' column into separate columns.
print('Parse GTF aux data')
# key -> list of values in row order (a row only contributes to keys it contains).
auxes = {}
for i, aux in enumerate(tqdm(gtf_df['aux'])):
    for pair in aux.split(';'):
        kv = pair.strip().split(' ')
        # Skip fragments that are not exactly "key value" (e.g. the empty tail after the last ';').
        if len(kv) != 2:
            continue
        k, v = kv
        if k not in auxes:
            auxes[k] = vs = []
        else:
            vs = auxes[k]
        vs.append(v.strip('"'))

# Only keys with exactly one collected value per row become columns.
# NOTE(review): the length check alone does not prove row alignment (a key repeated
# in one row and missing in another would also pass) - confirm for this file.
for k, vs in auxes.items():
    if len(vs) == len(gtf_df):
        gtf_df[k] = vs
    else:
        print(f'Ignoring {k}')
# Release the intermediate value lists.
del auxes
gtf_df.drop('aux', axis=1, inplace=True)
gtf_df.sample(3)

In [None]:
# Feature length as end - start.
# NOTE(review): if the GTF coordinates are 1-based and end-inclusive, this is one
# less than the number of covered bases - confirm whether +1 is wanted.
gtf_df['length'] = gtf_df['end'] - gtf_df['start']
gtf_df[gtf_df['type'] == 'gene'].sample(5)

In [None]:
# Keep only rows of type 'gene' and box-plot their lengths per gene_type.
print('GTF all genes lengths')
gtf_genes_df = gtf_df[gtf_df['type'] == 'gene'].copy()
plot_boxes(gtf_genes_df, 'length', 'Length of genes',
           hue='gene_type',
           log=True, violin=False, minx=5, maxx=2_000_000)
plt.show()

In [None]:
print('GTF all genes lengths')
# NOTE(review): plot_log only applies `hue` when `order` is given, so this draws a
# single histogram over all gene types - confirm that is intended.
plot_log(gtf_genes_df, 'length', 'Length of genes', hue='gene_type', minx=5, maxx=2_000_000)
plt.show()

In [None]:
# Restrict the genes to the gene types listed in GENES_TYPES and box-plot their lengths.
GENES_TYPES = ['protein_coding', 'pseudogene']
# GENES_TYPES = ['protein_coding']
print('GTF genes lengths')
gtf_genes_df2 = gtf_genes_df[gtf_genes_df['gene_type'].isin(GENES_TYPES)].copy()
plot_boxes(gtf_genes_df2, 'length', 'Length of genes', hue='gene_type',
           log=True, violin=False, minx=10, maxx=2_000_000,
           order=['protein_coding', 'pseudogene'])
# Constant 'file' and 'tool' columns: distance_between_peaks groups by them.
gtf_genes_df2['file'] = 'gtf'
gtf_genes_df2['tool'] = 'gtf'
plt.show()

In [None]:
# Length distribution per gene type (log x-axis) for the selected gene types.
print('GTF genes lengths')
plot_log(gtf_genes_df2, 'length', 'Length of genes', hue='gene_type', minx=100, maxx=1_000_000,
         order=GENES_TYPES)
plt.show()

In [None]:
# Overlay the length distributions of K562 rep1 MACS2 peaks and of the selected genes.
print('GSE26320 K562 rep1 Lengths of MACS2 peaks and genes')
ts = df_gse26320_peaks[(df_gse26320_peaks['tool'] == 'Macs2') &
                       (df_gse26320_peaks['file'].str.contains('K562')) &
                       (df_gse26320_peaks['file'].str.contains('rep1'))]
# Rename gene_type -> modification so that genes share the hue column with the peaks.
tss = pd.concat([ts, gtf_genes_df2.rename({'gene_type': 'modification'}, axis=1)]).reset_index()
plot_log(tss, 'length', 'Lengths of MACS2 peaks and genes',
         minx=100, maxx=1_000_000,
         order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'] + GENES_TYPES)
plt.show()

In [None]:
# Overlay the length distributions of K562 rep1 SPAN peaks and of the selected genes.
print('GSE26320 K562 rep1 Lengths of SPAN peaks and genes')
ts = df_gse26320_peaks[(df_gse26320_peaks['tool'] == 'Span') &
                       (df_gse26320_peaks['file'].str.contains('K562')) &
                       (df_gse26320_peaks['file'].str.contains('rep1'))]
# Rename gene_type -> modification so that genes share the hue column with the peaks.
tss = pd.concat([ts, gtf_genes_df2.rename({'gene_type': 'modification'}, axis=1)]).reset_index()
plot_log(tss, 'length', 'Lengths of SPAN peaks and genes',
         minx=100, maxx=1_000_000,
         order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'] + GENES_TYPES)
plt.show()

In [None]:
# Distances between consecutive genes, computed separately per gene type and chromosome.
gtf_genes_df_distances = distance_between_peaks(gtf_genes_df2, hue='gene_type')
gtf_genes_df_distances

In [None]:
# Box plot of the distances between genes per gene type, log x-axis limited to [1, 5e6].
print('Distance between genes')
plot_boxes(gtf_genes_df_distances, 'distance', 'Distance between genes', hue='gene_type',
           log=True, violin=False, minx=1, maxx=5e6,
           order=GENES_TYPES)
plt.show()

In [None]:
# Only distances > 10 are plotted; this also removes the non-positive values, which
# cannot be shown on a log axis.
print('Distance between genes')
plot_log(gtf_genes_df_distances[gtf_genes_df_distances['distance'] > 10],
         'distance', 'Distance between genes', hue='gene_type',
         order=GENES_TYPES,
         minx=10, maxx=5e6)
plt.show()

In [None]:
# Overlay the distance distributions of K562 rep1 MACS2 peaks and of the selected genes.
print('GSE26320 K562 rep1 Distance between MACS2 peaks and genes')
ts = df_gse26320_distances[(df_gse26320_distances['tool'] == 'Macs2') &
                           (df_gse26320_distances['file'].str.contains('K562')) &
                           (df_gse26320_distances['file'].str.contains('rep1'))].copy()
# Rename gene_type -> modification so that genes share the hue column with the peaks.
tss = pd.concat([ts, gtf_genes_df_distances.rename({'gene_type': 'modification'}, axis=1)]).reset_index()
# Keep distances > 10 only (drops non-positive values before the log-scale plot).
tss = tss[tss['distance'] > 10]
plot_log(tss, 'distance', 'Distance between MACS2 peaks and genes', maxx=5e6,
         order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'] + GENES_TYPES)
plt.show()

In [None]:
# Overlay the distance distributions of K562 rep1 SPAN peaks and of the selected genes.
print('GSE26320 K562 rep1 Distance between SPAN peaks and genes')
ts = df_gse26320_distances[(df_gse26320_distances['tool'] == 'Span') &
                           (df_gse26320_distances['file'].str.contains('K562')) &
                           (df_gse26320_distances['file'].str.contains('rep1'))].copy()
# Rename gene_type -> modification so that genes share the hue column with the peaks.
tss = pd.concat([ts, gtf_genes_df_distances.rename({'gene_type': 'modification'}, axis=1)]).reset_index()
# Keep distances > 10 only (drops non-positive values before the log-scale plot).
tss = tss[tss['distance'] > 10]
plot_log(tss, 'distance', 'Distance between SPAN peaks and genes', maxx=5e6,
         order=['CTCF', 'H3K27ac', 'H3K4me3', 'H3K4me1', 'H3K27me3', 'H3K36me3'] + GENES_TYPES)
plt.show()

# Immgen ATAC-seq reprocessed

In [None]:
# Index the Immgen MACS2 '.narrowPeak' files whose names contain 'ATAC' and the FDR '0.05'.
IMMGEN_PATH = os.path.expanduser('~/data/2022_Immgen')
IMMGEN_CELLS = ['Monocyte']
IMMGEN_MODIFICATIONS = ['ATAC']

# os.path.expanduser leaves 'macs2' unchanged (it has no leading '~').
IMMGEN_MACS2_DIR = os.path.join(IMMGEN_PATH, os.path.expanduser('macs2'))
# MACS2_FDRS = ['0.1', '0.01', '0.05', '1e-3', '1-e4', '1e-6', '1e-8', '1e-10']
MACS2_FDRS = ['0.05']
df_immgen = find_peak_files(IMMGEN_MACS2_DIR, '.narrowPeak', IMMGEN_CELLS, IMMGEN_MODIFICATIONS, MACS2_FDRS)
df_immgen['tool'] = 'Macs2'
df_immgen['dir'] = IMMGEN_MACS2_DIR
df_immgen.sample(3)

In [None]:
# Rebind `ts` to the Immgen MACS2 peaks; the next plotting cells read it.
ts = load_peaks(df_immgen)
ts

In [None]:
# Box plot of the Immgen MACS2 peak lengths, log x-axis limited to [10, 2000].
print('Immgen MACS2 REPROCESSED Length of peaks')
plot_boxes(ts, 'length', 'Length by modification',
           log=True, violin=False, minx=10, maxx=2000)
plt.show()

In [None]:
# Length distribution of the Immgen MACS2 peaks; no `order` is passed, so plot_log
# draws a single histogram without hue.
print('Immgen MACS2 REPROCESSED Length of peaks')
plot_log(ts, 'length', 'MACS2 Peaks lengths', minx=10, maxx=2000)
plt.show()

In [None]:
# Index the SPAN '.peak' files of the Immgen ATAC-seq data.
IMMGEN_SPAN_DIR = os.path.join(IMMGEN_PATH, os.path.expanduser('span50'))
# MACS2_FDRS (['0.05'], set for the Immgen MACS2 files) is reused to match the FDR in the file names.
df_immgen_span = find_peak_files(IMMGEN_SPAN_DIR, '.peak', IMMGEN_CELLS, IMMGEN_MODIFICATIONS, MACS2_FDRS)
# Same spelling as every other SPAN table in this notebook ('Span'), so that
# filters on the 'tool' column treat all SPAN peaks alike.
df_immgen_span['tool'] = 'Span'
df_immgen_span['dir'] = IMMGEN_SPAN_DIR
df_immgen_span

In [None]:
# Rebind `ts` to the Immgen SPAN peaks; the next plotting cells read it.
ts = load_peaks(df_immgen_span)
ts

In [None]:
# Box plot of the Immgen SPAN peak lengths, log x-axis limited to [10, 2000].
# NOTE(review): the message says "SPAN 100" while the peaks are read from the
# 'span50' directory - confirm which label is correct.
print('Immgen SPAN 100 REPROCESSED Length of peaks')
plot_boxes(ts, 'length', 'Length by modification',
           log=True, violin=False, minx=10, maxx=2000)
plt.show()

In [None]:
print('Immgen SPAN 100 REPROCESSED Length of peaks')
# `ts` was last assigned from load_peaks(df_immgen_span), so the plot title names SPAN.
plot_log(ts, 'length', 'SPAN Peaks lengths', minx=10, maxx=2000)
plt.show()

# 2022_GSE35583_RAW downloaded

In [None]:
# Index the '.narrowPeak' and '.broadPeak' files of GSE35583.
GSE35583_PATH = os.path.expanduser('~/data/2022_GSE35583_RAW')
# '' matches every file name, so the cell column is always ''.
GSE35583_CELLS = ['']
# Matching is case-sensitive, hence the lower-case 'k' spelling used here.
GSE35583_MODIFICATIONS = ['H3k4me3', 'H3k36me3', 'H3k27ac', 'H3k27me3', 'H3k09me3']

# FDRS is the notebook-global list assigned in an earlier cell.
df_gse35583 = pd.concat([
    find_peak_files(GSE35583_PATH, '.narrowPeak', GSE35583_CELLS, GSE35583_MODIFICATIONS, FDRS),
    find_peak_files(GSE35583_PATH, '.broadPeak', GSE35583_CELLS, GSE35583_MODIFICATIONS, FDRS)
])
df_gse35583['tool'] = 'Macs2'
df_gse35583['dir'] = GSE35583_PATH
df_gse35583.sample(3)

In [None]:
# Load all GSE35583 peaks and preview 5 random rows.
df_gse35583_peaks = load_peaks(df_gse35583)
display(df_gse35583_peaks.sample(5))

In [None]:
# Box plot of GSE35583 peak lengths per modification, log x-axis limited to [10, 1e4].
print('2022_GSE35583_RAW DOWNLOADED Length of peaks')
plot_boxes(df_gse35583_peaks, 'length', 'Length by modification',
           violin=False, log=True, minx=10, maxx=1e4)
plt.show()

# GSE29611 downloaded

In [None]:
# Index and load the '.broadPeak' files of this data set.
# NOTE(review): the variables are prefixed GSE29911 while the directory is named
# GSE29611_RAW - confirm which accession is correct.
GSE29911_PATH = os.path.expanduser('~/data/GSE29611_RAW')
# '' matches every file name, so the cell column is always ''.
GSE29911_CELLS = ['']
GSE29911_MODIFICATIONS = ['Ctcf', 'Pol2b', 'H2az', 'H3k09me3', 'H3k79me2', 'H4k20me1', 'H3k4me1', 'H3k4me2', 'H3k4me3',
                          'H3k36me3', 'H3k27ac', 'H3k27me3', 'H3k9me1', 'H3k9ac']

# FDRS is the notebook-global list assigned in an earlier cell.
df_gse29911 = find_peak_files(GSE29911_PATH, '.broadPeak', GSE29911_CELLS, GSE29911_MODIFICATIONS, FDRS)
df_gse29911['tool'] = 'Macs2'
df_gse29911['dir'] = GSE29911_PATH

df_gse29911_peaks = load_peaks(df_gse29911)

In [None]:
# Box plot of peak lengths per modification, log x-axis limited to [80, 1e6].
print('GSE29911_RAW DOWNLOADED Length of modifications')
plot_boxes(df_gse29911_peaks, 'length', 'Length by modification',
           violin=False, log=True, minx=80, maxx=1e6,
           order=['Ctcf', 'H2az', 'Pol2b', 'H3k27ac', 'H3k4me3', 'H3k4me2', 'H3k9ac',
                    'H3k4me1', 'H3k9me1', 'H3k09me3', 'H3k27me3', 'H3k36me3', 'H3k79me2', 'H4k20me1'])
plt.show()

# END