# ChromHMM

Compare peak calling results with ChromHMM chromatin state annotations.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import pandas as pd
import seaborn as sns
from IPython.display import display
from tqdm.auto import tqdm

sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from itertools import product
import tempfile

In [None]:
def bedl(file):
    """Return the length (end - start) of every interval in a tab-separated BED-like file.

    The file is read without a header; column 1 is taken as the start and
    column 2 as the end coordinate.

    Returns a pandas Series of lengths, or an empty numpy array when the file
    has no content. Any other problem (missing file, malformed rows, fewer than
    three columns) is propagated instead of being silently reported as
    "no intervals".
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return np.zeros(0)  # Empty file
    return tf[2] - tf[1]


def lines(file):
    """Return the number of records in a tab-separated file read without a header.

    An empty file yields 0. Other failures (missing file, parse errors) are
    raised rather than being reported as zero records, so a broken input is
    not mistaken for "no peaks".
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return 0  # Empty file
    return len(tf)


def d(a, b):
    """Safe division: return ``a / b``, or 0 when ``b`` equals zero."""
    if b != 0:
        return a / b
    return 0


def last_col(file):
    """Return the last column of a tab-separated file read without a header.

    The column count is taken from the first row only, then just that column
    is loaded so that large files are not fully materialised.

    Returns a pandas Series, or an empty numpy array when the file has no
    content. Errors other than an empty file are propagated.
    """
    try:
        cols = len(pd.read_csv(file, sep='\t', nrows=1, header=None).columns)
        return pd.read_csv(file, sep='\t', header=None, usecols=[cols - 1])[cols - 1]
    except pd.errors.EmptyDataError:
        return np.zeros(0)  # Empty file


def sorted_file(file):
    """Write a copy of ``file`` sorted by column 1, then numerically by columns 2 and 3.

    The sorted copy goes to a new temporary file whose path is returned; the
    caller is responsible for deleting it.

    The external ``sort`` utility is invoked without a shell, so paths
    containing spaces or shell metacharacters are handled correctly. As with
    the previous shell pipeline, a failing ``sort`` does not raise: the
    returned file is then simply empty or incomplete.
    """
    import subprocess

    # mkstemp creates the file atomically; mktemp only returns a name and is
    # deprecated because another process can claim that name first.
    fd, ts = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as out:
        subprocess.run(['sort', '-k1,1', '-k2,2n', '-k3,3n', '--', str(file)],
                       stdout=out, check=False)
    return ts

# Load hg38 peaks

Run peak calling with the Snakemake pipeline:

```
conda activate snakemake
snakemake -p -s ~/work/chipseq-smk-pipeline/Snakefile all --cores all  --use-conda  --directory $(pwd) --config genome=hg38 fastq_dir=$(pwd) start_with_bams=True bams_dir=bams macs2=True span=False sicer=False --rerun-trigger mtime;
snakemake -p -s ~/work/chipseq-smk-pipeline/Snakefile all --cores all  --use-conda  --directory $(pwd) --config genome=hg38 fastq_dir=$(pwd) start_with_bams=True bams_dir=bams macs2=True macs2_mode=broad macs2_params="--broad --broad-cutoff=0.1" macs2_suffix="broad0.1"  span=False sicer=False --rerun-trigger mtime; 
snakemake -p -s ~/work/chipseq-smk-pipeline/Snakefile all --cores all  --use-conda  --directory $(pwd) --config genome=hg38 fastq_dir=$(pwd) start_with_bams=True bams_dir=bams macs2=False span=True sicer=True --rerun-trigger mtime;
```


In [None]:
# Dataset roots, cell types, histone modifications and replicate labels.
# Cell / modification / replicate names are matched as substrings of the peak file names.
GSE26320_PATH = os.path.expanduser('~/data/2023_GSE26320')
GSE26320_CELLS = ['GM12878', 'HMEC', 'HSMM', 'K562', 'NHEK', 'NHLF', 'H1', 'Huvec', 'HepG2']
MODIFICATIONS = ['H3K4me3', 'H3K27ac', 'H3K4me1', 'H3K27me3', 'H3K36me3']
GSE26320_REPS = ['rep1', 'rep2']

IMMUNE_PATH = os.path.expanduser('~/data/2023_Immune')
# Longest first: the first substring hit wins, so 'CD4ABT' must precede 'CD4'
# and 'TCellBB' must precede 'TCell'.
IMMUNE_CELLS = ['CD4ABT', 'TCellBB', 'BCell', 'TCell', 'Monocyte', 'PBMC', 'NK', 'CD34', 'CD4', ]
# The trailing '' is a substring of every file name, i.e. the fallback for
# files that carry no explicit replicate label.
IMMUNE_REPS = ['rep1', 'rep2', 'rep3', '']

# Output folders for figures; created without a shell so that paths with
# spaces or special characters work.
os.makedirs(os.path.join(GSE26320_PATH, 'pics'), exist_ok=True)
os.makedirs(os.path.join(IMMUNE_PATH, 'pics'), exist_ok=True)

In [None]:
# Peak callers compared in this notebook; the position in the list selects
# the colour taken from the 'tab10' colormap.
TOOLS = ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']
palette = plt.get_cmap('tab10')
TOOLS_PALETTE = dict(zip(TOOLS, map(palette, range(len(TOOLS)))))

In [None]:
def load_peaks(path, suffix, modifications, cells, replicates):
    """List peak files in ``path`` and annotate them from their file names.

    A file is kept when its name ends with ``suffix`` and contains one of
    ``cells`` and one of ``modifications``. For each of cell, modification and
    replicate the first list entry that occurs as a substring of the file name
    is used, so the order of the lists matters (an empty-string replicate
    matches every name).

    Returns a DataFrame with columns file, modification, cell, replicate and
    peaks (number of records in the file), one row per kept file.
    """
    columns = ['file', 'modification', 'cell', 'replicate', 'peaks']
    records = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore "first file per group" choices) reproducible.
    for f in tqdm(sorted(os.listdir(path))):
        if not f.endswith(suffix):
            continue
        cell = next((c for c in cells if c in f), None)
        mod = next((m for m in modifications if m in f), None)
        rep = next((r for r in replicates if r in f), None)
        # rep may legitimately be '' (falsy), hence the explicit None check
        if cell and mod and rep is not None:
            peaks_path = os.path.join(path, f)
            records.append((peaks_path, mod, cell, rep, lines(peaks_path)))
    # Build the frame once instead of enlarging it row by row
    return pd.DataFrame(records, columns=columns, dtype=object)

In [None]:
def load_peaks_path(path, modifications, cells, replicates):
    """Gather peak files of every caller stored under ``path``.

    Looks into the ``macs2`` (narrow and broad peaks), ``sicer`` and ``span``
    sub-folders, tags each table with the caller name in a ``tool`` column,
    prints the number of files found per caller and returns the concatenated
    table with a fresh index.
    """

    def from_dir(subdir, suffix):
        return load_peaks(os.path.join(path, subdir), suffix, modifications, cells, replicates)

    per_tool = []

    narrow = from_dir('macs2', '.narrowPeak')
    narrow['tool'] = 'MACS2'
    print('MACS2', len(narrow))
    per_tool.append(narrow)

    broad = from_dir('macs2', '.broadPeak')
    broad['tool'] = 'MACS2 broad'
    print('MACS2 broad', len(broad))
    per_tool.append(broad)

    sicer = from_dir('sicer', 'summary-FDR0.01')
    if len(sicer) == 0:
        # No FDR-filtered summaries found: fall back to the .scoreisland files
        sicer = from_dir('sicer', '.scoreisland')
    sicer['tool'] = 'SICER'
    print('SICER', len(sicer))
    per_tool.append(sicer)

    span = from_dir('span', '.peak')
    span['tool'] = 'SPAN'
    print('SPAN', len(span))

    # Concatenation order is kept as MACS2, MACS2 broad, SICER, SPAN
    per_tool.append(span)
    return pd.concat(per_tool).reset_index(drop=True)

In [None]:
# Load peak file tables for both datasets and label each with its dataset name
df_encode_peaks = load_peaks_path(
    GSE26320_PATH, MODIFICATIONS, GSE26320_CELLS, GSE26320_REPS
).assign(dataset='ENCODE')
df_immune_peaks = load_peaks_path(
    IMMUNE_PATH, MODIFICATIONS, IMMUNE_CELLS, IMMUNE_REPS
).assign(dataset='Roadmap')
# Combined table with a fresh 0..n-1 index
df_peaks = pd.concat([df_encode_peaks, df_immune_peaks], ignore_index=True)
df_peaks.sample(3)

In [None]:
# Remove outliers
# Each entry is (modification, cell, replicate); '' is the replicate label of
# files without an explicit replicate in their name.
OUTLIERS = [('H3K4me3', 'NK', ''),
            ('H3K4me1', 'NK', ''),
            ('H3K27me3', 'TCell', ''),
            ('H3K27me3', 'NHLF', 'rep2'),
            ('H3K27me3', 'GM12878', 'rep1')]


def _drop_outliers(peaks):
    """Return the rows of ``peaks`` whose (modification, cell, replicate) is not in OUTLIERS."""
    keep = [key not in OUTLIERS
            for key in zip(peaks['modification'], peaks['cell'], peaks['replicate'])]
    return peaks.loc[keep]


df_peaks = _drop_outliers(df_peaks)
# The ChromHMM comparison below is run on the per-dataset tables, not on
# df_peaks, so they have to be filtered too; otherwise the outliers still end
# up in the analysis. Filtering is idempotent, so re-running this cell is safe.
df_encode_peaks = _drop_outliers(df_encode_peaks)
df_immune_peaks = _drop_outliers(df_immune_peaks)

# ChromHMM

In [None]:
# ChromHMM annotation file per ENCODE cell line
ENCODE_CHROMHMM_MAP = {
    'GM12878': GSE26320_PATH + '/chromhmm/GM12878_chromhmm_hg38_ENCFF338RIC.bed',
    'H1': GSE26320_PATH + '/chromhmm/H1_chromhmm_hg38_ENCFF323HNB.bed',
    'HepG2': GSE26320_PATH + '/chromhmm/HepG2_chromhmm_hg38_ENCFF808IZE.bed',
    'K562': GSE26320_PATH + '/chromhmm/K562_chromhmm_hg38_ENCFF649FCE.bed',
}


print('Split chromHMM into separate files')
CHROMM_STATES = set()
for c, chromhmm in ENCODE_CHROMHMM_MAP.items():
    print(c)
    # Read every field as text so the rows are written back unchanged
    annotation = pd.read_csv(chromhmm, sep='\t', header=None, dtype=str, keep_default_na=False)
    # Select rows by exact match on the state column (column 3). A plain
    # `grep state` matches substrings anywhere in the line, so a state whose
    # name is a prefix of another one would also collect that other state's
    # rows, and grepping for the sanitised name finds nothing for states
    # that contain '/'.
    for state, state_rows in annotation.groupby(3, sort=False):
        safe_state = state.replace('/', '_')  # '/' cannot be used in a file name
        CHROMM_STATES.add(safe_state)
        f = chromhmm.replace('.bed', f'_{safe_state}.bed')
        state_rows.to_csv(f, sep='\t', header=False, index=False)

In [None]:
# ChromHMM annotation file per immune cell type
IMMUNE_CHROMHMM_MAP = {
    'BCell': IMMUNE_PATH + '/chromhmm/BCell_Chromhmm_hg38_ENCFF885QXH.bed',
    'TCell': IMMUNE_PATH + '/chromhmm/TCell_Chromhmm_hg38_ENCFF037LJR.bed',
    'Monocyte': IMMUNE_PATH + '/chromhmm/Monocyte_Chromhmm_hg38_ENCFF560DUU.bed',
    'NK': IMMUNE_PATH + '/chromhmm/NK_Chromhmm_hg38_ENCFF489XBL.bed',
    'CD4ABT': IMMUNE_PATH + '/chromhmm/CD4ABT_Chromhmm_hg38_ENCFF632AEU.bed',
    'CD34': IMMUNE_PATH + '/chromhmm/CD34_Chromhmm_hg38_ENCFF639FAH.bed',
    'PBMC': IMMUNE_PATH + '/chromhmm/PBMC_Chromhmm_hg38_ENCFF863IVL.bed'
}

print('Split chromHMM into separate files')
# Work on a set copy: CHROMM_STATES is a set after the previous cell but a
# list once this cell has run, so the cell can be re-executed safely.
chromm_states = set(CHROMM_STATES)
for c, chromhmm in IMMUNE_CHROMHMM_MAP.items():
    print(c)
    # Read every field as text so the rows are written back unchanged
    annotation = pd.read_csv(chromhmm, sep='\t', header=None, dtype=str, keep_default_na=False)
    # Select rows by exact match on the state column (column 3). A plain
    # `grep state` matches substrings anywhere in the line, so a state whose
    # name is a prefix of another one would also collect that other state's
    # rows, and grepping for the sanitised name finds nothing for states
    # that contain '/'.
    for state, state_rows in annotation.groupby(3, sort=False):
        safe_state = state.replace('/', '_')  # '/' cannot be used in a file name
        chromm_states.add(safe_state)
        f = chromhmm.replace('.bed', f'_{safe_state}.bed')
        state_rows.to_csv(f, sep='\t', header=False, index=False)

CHROMM_STATES = sorted(chromm_states)
print(CHROMM_STATES)

In [None]:
def compare_with_chromhmm(df_peaks, chromm_map):
    tf = tempfile.mktemp()
    tf2 = tempfile.mktemp()

    rows = []

    for (m, c, r), dft in tqdm(df_peaks.groupby(['modification', 'cell', 'replicate'])):
        print(m, c, r)
        if c not in chromm_map:
            continue
        chromhmm = chromm_map[c]
        for tool in TOOLS:
            t = dft[dft['tool'] == tool]
            if len(t) == 0: 
                continue
            peaks_file = sorted_file(t['file'].values[0])
            peaks = lines(peaks_file)
            row = [m, c, r, tool, peaks]
            for state in CHROMM_STATES:
                state_file = sorted_file(chromhmm.replace('.bed', f'_{state}.bed'))
                !bedtools intersect -a {peaks_file} -b {state_file} -wa -u > {tf}
                state_peaks = lines(tf)
                row.append(state_peaks)
            rows.append(row)

            if tool == 'SPAN':
                span_file = peaks_file

        # Processing single tools information
        for tool in TOOLS:
            if tool == 'SPAN':
                continue
            t = dft[dft['tool'] == tool]
            if len(t) == 0:
                continue
            peaks_file = sorted_file(t['file'].values[0])
            for name, args in [
                (f'SPAN - {tool}', f' -a {span_file} -b {peaks_file} '),
                (f'{tool} - SPAN', f' -b {span_file} -a {peaks_file} ')
            ]:
                !bedtools intersect {args} -wa -v > {tf}
                peaks = lines(tf)
                row = [m, c, r, name, peaks]
                for state in CHROMM_STATES:
                    state_file = sorted_file(chromhmm.replace('.bed', f'_{state}.bed'))
                    !bedtools intersect -a {tf} -b {state_file} -wa -u > {tf2}
                    state_peaks = lines(tf2)
                    row.append(state_peaks)
                rows.append(row)

    return pd.DataFrame(rows, columns=['modification', 'cell', 'replicate', 'name', 'peaks'] + CHROMM_STATES)

In [None]:
# ChromHMM overlap statistics for the ENCODE cell lines
encode_chromm = compare_with_chromhmm(df_encode_peaks, ENCODE_CHROMHMM_MAP).assign(dataset='ENCODE')
encode_chromm.sample(3)

In [None]:
# ChromHMM overlap statistics for the immune cell types
immune_chromm = compare_with_chromhmm(df_immune_peaks, IMMUNE_CHROMHMM_MAP).assign(dataset='Roadmap')
immune_chromm.sample(3)

In [None]:
# Label both result tables and stack them into one frame with a fresh index
encode_chromm['dataset'] = 'ENCODE'
immune_chromm['dataset'] = 'Roadmap'
chromhmm_df = pd.concat([encode_chromm, immune_chromm], ignore_index=True)
chromhmm_df.sample(3)

In [None]:
# Mean number of peaks per ChromHMM state for every dataset / modification / comparison
chromhmm_df.groupby(['dataset', 'modification', 'name'])[CHROMM_STATES].mean().reset_index()

In [None]:
# Average the per-replicate counts within each dataset / modification / comparison
chromhmm_df_mean = (
    chromhmm_df
    .groupby(['dataset', 'modification', 'name'])[CHROMM_STATES]
    .mean()
    .reset_index()
)
chromhmm_df_mean.sample(3)

In [None]:
# One heatmap per dataset and modification: rows are tools / SPAN differences,
# columns are ChromHMM states, values are mean peak counts.
for (ds, m), dft in chromhmm_df_mean.groupby(['dataset', 'modification']):
    print(ds, m)
    fig, ax = plt.subplots(figsize=(6, 3))
    # set_index returns a new frame (no in-place change of a slice), and
    # reindex, unlike .loc, does not raise when a comparison is absent for
    # this group: such rows are left empty in the heatmap.
    t = dft.set_index('name')[CHROMM_STATES].reindex(
        ['MACS2', 'MACS2 broad', 'SPAN', 'SICER',
         'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
         'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
         ])
    sns.heatmap(t, cmap='coolwarm', ax=ax)
    ax.set_title('Peaks number in ChromHMM states')
    fig.tight_layout()
    plt.show()

In [None]:
# Convert per-state overlap counts into fractions of the total number of peaks
# (d() yields 0 when a set has no peaks), then average per group.
chromhmm_rel_df = chromhmm_df[['dataset', 'modification', 'name']].copy()
for state in CHROMM_STATES:
    chromhmm_rel_df[state] = list(map(d, chromhmm_df[state], chromhmm_df['peaks']))
chromhmm_rel_df_mean = (
    chromhmm_rel_df
    .groupby(['dataset', 'modification', 'name'])[CHROMM_STATES]
    .mean()
    .reset_index()
)

In [None]:
# One heatmap per dataset and modification: rows are tools / SPAN differences,
# columns are ChromHMM states, values are mean fractions of peaks.
for (ds, m), dft in chromhmm_rel_df_mean.groupby(['dataset', 'modification']):
    print(ds, m)
    fig, ax = plt.subplots(figsize=(6, 3))
    # set_index returns a new frame (no in-place change of a slice), and
    # reindex, unlike .loc, does not raise when a comparison is absent for
    # this group: such rows are left empty in the heatmap.
    t = dft.set_index('name')[CHROMM_STATES].reindex(
        ['MACS2', 'MACS2 broad', 'SPAN', 'SICER',
         'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
         'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
         ])
    sns.heatmap(t, cmap='coolwarm', ax=ax)
    ax.set_title('Peaks fraction in ChromHMM states')
    fig.tight_layout()
    plt.show()