# GSE53643 - Replicated H3K4me2 consistency, SPAN automarkup

Logbook: https://docs.google.com/document/d/1VGH4fA20LbhGGKWvBg28E7G1JBm1rLlmCEtD2X239Eg/edit#heading=h.44kd47qfiiva


In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os, re
import glob
from tqdm.auto import tqdm
import subprocess, tempfile

import plotly.graph_objects as go
import plotly.express as px

# MACS2

For different Q values:

```
# Narrow
snakemake all_macs2_results --use-conda --cores 28  --config work_dir=/mnt/stripe/bio/raw-data/geo-samples/GSE53643 fastq_dir=/mnt/stripe/bio/raw-data/geo-samples/GSE53643/fastq genome=hg38 macs2_mode=narrow macs2_suffix="q0.01" macs2_params="-q 0.01"

# 50k strongest peaks
for F in *peaks.narrowPeak; do echo $F; cat $F | sort -k9,9nr | head -n 50000 | sort -k1,1 -k2,2n > ${F/.narrowPeak/_50k.narrowPeak}; done

# Broad
snakemake all_macs2_results --use-conda --cores 28  --config work_dir=/mnt/stripe/bio/raw-data/geo-samples/GSE53643 fastq_dir=/mnt/stripe/bio/raw-data/geo-samples/GSE53643/fastq genome=hg38 macs2_mode=broad macs2_suffix="broad_0.1" macs2_params="--broad --broad-cutoff 0.1"

```

In [None]:
MACS2_FOLDER='/mnt/stripe/bio/raw-data/geo-samples/GSE53643/macs2'
MACS2_LEVELS = ['50k', 'q0.05', 'q0.01', 'q1e-4', 'q1e-6', 
                'broad_0.1', 'broad_0.05', 'broad_0.01', 'broad_1e-4', 'broad_1e-6']

dfm = pd.DataFrame(columns=['gsm', 'name', 'replicate', 'level', 'file', 'peaks', 'length'])
for file in tqdm(glob.glob(MACS2_FOLDER + '/*.*Peak')):
    level = next((l for l in MACS2_LEVELS if f'_{l}' in file), None) # 
    if level:
        gsm = re.sub('_H3K4me2.*', '', os.path.basename(file))
        name = re.sub('(GSM[0-9]+_)|(-rep[0-9].*)', '', os.path.basename(file))
        replicate = re.match('rep[0-9]+', os.path.basename(file))
        out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
        peaks, length = out[0].split(' ')
        dfm.loc[len(dfm)] = (gsm, name, replicate, f'macs2 {level}', file, peaks, length)

# Fix types
dfm['peaks'] = dfm['peaks'].astype(int)
dfm['length'] = dfm['length'].astype(int)

In [None]:
# Mean and standard deviation of the peak count for every (name, level) pair;
# a missing std (NaN) is shown as 0.
dfm_mean = (
    dfm.groupby(['name', 'level'])['peaks']
    .mean()
    .reset_index()
    .sort_values(by=['name', 'level'])
)
dfm_std = (
    dfm.groupby(['name', 'level'])['peaks']
    .std()
    .reset_index()
    .fillna(0)
    .sort_values(by=['name', 'level'])
)

# One solid "mean" line and one dotted "std" line per level.
fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
for l in sorted(set(dfm_mean['level'])):
    for stat, table, extra in (('mean', dfm_mean, {}),
                               ('std', dfm_std, {'line': dict(dash='dot')})):
        rows = table[table['level'] == l]
        fig.add_trace(go.Scatter(x=rows['name'], y=rows['peaks'],
                                 name=f"{l} {stat}", line_shape='linear', **extra))
fig.show()

## Overlaps

In [None]:
from pathlib import Path
import downstream.bed_metrics as bm

def overlap_with_name_level(overlaps, n, l):
    """Return the table stored under ``(n, l)`` in long format.

    The table is melted so that its values land in an ``overlap`` column,
    and constant ``name`` (= n) and ``level`` (= l) columns are appended.
    """
    long_table = overlaps[(n, l)].melt(value_name='overlap')
    return long_table.assign(name=n, level=l)

def show_overlap(df):
    """Plot mean and std of the overlap metric per sample name and level.

    For every (name, level) pair the matching files are passed, as both
    arguments, to ``bm.load_or_build_metrics_table`` together with a
    ``/tmp/overlap_<name>_<level>.tsv`` path. The resulting tables are melted,
    aggregated per (name, level) and drawn as one solid "mean" and one dotted
    "std" line per level.

    :param df: table with at least ``name``, ``level`` and ``file`` columns
    """
    levels = sorted(set(df['level']))
    overlaps = {}
    for n in set(df['name']):
        for l in levels:
            print('Processing', n, l)
            selected = (df['name'] == n) & (df['level'] == l)
            paths = [Path(f) for f in df.loc[selected, 'file']]
            table_path = Path(f'/tmp/overlap_{n}_{l}.tsv')
            overlaps[(n, l)] = bm.load_or_build_metrics_table(paths, paths, table_path, jaccard=False)

    dfo = pd.concat([overlap_with_name_level(overlaps, n, l) for (n, l) in overlaps])
    dfo_mean = (
        dfo.groupby(['name', 'level'])['overlap']
        .mean()
        .reset_index()
        .sort_values(by=['name'])
    )
    dfo_std = (
        dfo.groupby(['name', 'level'])['overlap']
        .std()
        .reset_index()
        .fillna(0)
        .sort_values(by=['name'])
    )

    fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Overlap")))
    for l in levels:
        mean_rows = dfo_mean[dfo_mean['level'] == l]
        std_rows = dfo_std[dfo_std['level'] == l]
        fig.add_trace(go.Scatter(x=mean_rows['name'], y=mean_rows['overlap'],
                                 name=f"{l} mean", line_shape='linear'))
        fig.add_trace(go.Scatter(x=std_rows['name'], y=std_rows['overlap'],
                                 name=f"{l} std", line_shape='linear',
                                 line=dict(dash='dot')))
    fig.show()

In [None]:
# MACS2 levels included in the overlap comparison (and in the summary below).
macs2levels2process = {
    'macs2 q0.05', 'macs2 q0.01', 'macs2 q1e-4',
    'macs2 broad_0.05', 'macs2 broad_0.01', 'macs2 broad_1e-4',
}
show_overlap(dfm.loc[dfm['level'].isin(macs2levels2process)])

# SPAN automated markup

In [None]:
%%bash
# Bash commands to create markup by Immgen
# Outputs, all under ${DIR}:
#   intersect.tsv            - merged intervals with one column per narrowPeak file
#   intersect_all.bed        - intervals whose row contains a "1" for every file
#   intersect_all.narrowPeak - records of one narrowPeak file overlapping those intervals
DIR=/mnt/stripe/bio/raw-data/geo-samples/GSE53643
OUT=${DIR}/intersect.tsv;
T=$'\t'; 
# Header row: chr, start, end, then the path of every narrowPeak file found
# (the same order is kept in FILES and passed to bedtools below).
printf %s "chr${T}start${T}end" > ${OUT}; 
FILES=(); 
for F in $(find ${DIR}/macs2/ -name "*.narrowPeak"); do 
    FILES+=("$F"); 
    printf %s "${T}${F}" >> ${OUT}; 
done; 
echo >> ${OUT};
# multiinter over all files, then merge taking the max of columns 6..(N+5),
# i.e. one column per input file (presumably the per-file presence flags of
# the multiinter output - confirm against the bedtools documentation).
# awk prints the three coordinates plus the remaining fields as integers and
# puts the newline before each row except the first, so the file ends
# without a trailing newline.
bedtools multiinter -i "${FILES[@]}" |\
    bedtools merge -c $(seq -s, 6 1 $((${#FILES[@]} + 5))) -o max |\
    awk '{if (NR > 1) printf("\n"); printf("%s\t%s\t%s", $1, $2, $3); for (i=4; i<=NF; i++) printf("\t%d", int($i)); }' >> ${OUT};

# Find the regions where peaks from all the files are present:
# ALL is "<TAB>1" repeated once per file, grep keeps the rows containing it,
# and only the three coordinate columns are written out.
ALL=""; 
for F in $(seq 1 1 ${#FILES[@]}); do 
    ALL="${ALL}${T}1"; 
done; 
cat ${OUT} | grep "${ALL}" | awk -v OFS='\t' '{print $1,$2,$3}' > ${DIR}/intersect_all.bed

# Take any one of the peak files (the first that find returns) to get scores:
# its records overlapping intersect_all.bed are written in full narrowPeak form.
F=$(find ${DIR}/macs2/ -name "*.narrowPeak" | head -n 1);
bedtools intersect -a ${F} -b ${DIR}/intersect_all.bed -wa > ${DIR}/intersect_all.narrowPeak

In [None]:
from sklearn.utils import shuffle

# Peaks shared by all files, strongest (largest mlogq) first.
idf = pd.read_csv('/mnt/stripe/bio/raw-data/geo-samples/GSE53643/intersect_all.narrowPeak',
                  names=['chr', 'start', 'end', 'name', 'score', 'strand', 'summitfc', 'mlogp', 'mlogq', 'summit'], 
                  sep='\t')
idf.sort_values(by=['mlogq'], ascending=False, inplace=True)

markup_size = 2000
peaks_file = f'/mnt/stripe/bio/raw-data/geo-samples/GSE53643/peaks_{markup_size}.bed'
# Take every step-th peak along the ranking. With fewer than markup_size
# peaks the integer division gives 0, which used to raise ZeroDivisionError
# in the modulo below; fall back to keeping every peak.
step = max(1, len(idf) // markup_size)
markup_df = idf.iloc[::step]
# Rows are shuffled so that any leading slice of the file spans the whole ranking.
shuffle(markup_df[['chr', 'start', 'end']]).to_csv(peaks_file, sep='\t', header=None, index=False)
# Report the real number of rows written: it is at least markup_size when
# enough peaks exist (the step is rounded down) and smaller otherwise.
print(f'Saved {len(markup_df)} peaks stratified by q-value (mlogq) to {peaks_file}')

In [None]:
%%bash
DIR=/mnt/stripe/bio/raw-data/geo-samples/GSE53643

# Total 2000 peaks 1000,500,500
# Every output row is tab-separated, and start coordinates are clamped at 0
# so that flanks of peaks close to a contig start never become negative.
# awk reads the piped rows directly; no per-line shell loop is needed.

# peaks: rows 1-1000 of the shuffled list, coordinates unchanged
head -n 1000 ${DIR}/peaks_2000.bed | \
    awk -v OFS='\t' '{print $1,$2,$3,"peaks"}' > ${DIR}/markup.bed

# peakStart: rows 1001-1500, from 1000bp before the peak start to just before its midpoint
head -n 1500 ${DIR}/peaks_2000.bed | tail -n 500 | \
    awk '{ s = $2 - 1000; if (s < 0) s = 0; printf("%s\t%d\t%d\t%s\n", $1,s,($2+$3)/2 - 1,"peakStart")}' >> ${DIR}/markup.bed

# peakEnd: rows 1501-2000, from just after the midpoint to 1000bp past the peak end
head -n 2000 ${DIR}/peaks_2000.bed | tail -n 500 | \
    awk '{printf("%s\t%d\t%d\t%s\n", $1,($2+$3)/2 + 1,$3+1000,"peakEnd")}' >> ${DIR}/markup.bed

# extended markup: every markup region widened by 2000bp on both sides
# (written with tabs, like markup.bed, instead of awk's default space separator)
awk -v OFS='\t' '{ s = $2 - 2000; if (s < 0) s = 0; print $1,s,$3+2000 }' ${DIR}/markup.bed > ${DIR}/markup_ext.bed

## SPAN tuning
```
cd /mnt/stripe/shpynov/chipseq-smk-pipeline
snakemake all_span_tuned --use-conda --cores 28  --config work_dir=/mnt/stripe/bio/raw-data/geo-samples/GSE53643 fastq_dir=/mnt/stripe/bio/raw-data/geo-samples/GSE53643/fastq genome=hg38 span_bin=200 span_markup=/mnt/stripe/bio/raw-data/geo-samples/GSE53643/markup.bed -n

## Rename tuned
mkdir /mnt/stripe/bio/raw-data/geo-samples/GSE53643/span_tuned
for F in /mnt/stripe/bio/raw-data/geo-samples/GSE53643/span/*tuned.peak; do echo $F; P=$(head -n 1 $F | sed -E 's/(^.*_200_)|(_1\t.*$)//g'); cp -f $F "/mnt/stripe/bio/raw-data/geo-samples/GSE53643/span_tuned/$(echo $F | sed "s/tuned/$P/g" | sed 's#.*/##g')"; done
```

# SPAN

In [None]:
SPAN_FOLDER='/mnt/stripe/bio/raw-data/geo-samples/GSE53643/span'
SPAN_LEVELS = ['100_0.01_0', '100_1e-06_0', '100_tuned', '200_0.01_0', '200_1e-06_5', '200_tuned']

dfs = pd.DataFrame(columns=['gsm', 'name', 'replicate', 'level', 'file', 'peaks', 'length'])
for file in tqdm(glob.glob(SPAN_FOLDER + '/*.peak')):
    level = next((l for l in SPAN_LEVELS if f'_{l}.' in file), None) # 
    if level:
        gsm = re.sub('_H3K4me2.*', '', os.path.basename(file))
        name = re.sub('(GSM[0-9]+_)|(-rep[0-9].*)', '', os.path.basename(file))
        replicate = re.match('rep[0-9]+', os.path.basename(file))
        out = ! awk '{{N+=1;L+=($$3-$$2)}} END{{print(N,L)}}' {file}
        peaks, length = out[0].split(' ')
        dfs.loc[len(dfs)] = (gsm, name, replicate, f'span {level}', file, peaks, length)

# Fix types
dfs['peaks'] = dfs['peaks'].astype(int)
dfs['length'] = dfs['length'].astype(int)

In [None]:
import plotly.graph_objects as go
import plotly.express as px

# Mean and standard deviation of the SPAN peak count for every (name, level)
# pair; a missing std (NaN) is shown as 0.
dfs_mean = (
    dfs.groupby(['name', 'level'])['peaks']
    .mean()
    .reset_index()
    .sort_values(by=['name', 'level'])
)
dfs_std = (
    dfs.groupby(['name', 'level'])['peaks']
    .std()
    .reset_index()
    .fillna(0)
    .sort_values(by=['name', 'level'])
)

# One solid "mean" line and one dotted "std" line per level.
fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
for l in sorted(set(dfs_mean['level'])):
    for stat, table, extra in (('mean', dfs_mean, {}),
                               ('std', dfs_std, {'line': dict(dash='dot')})):
        rows = table[table['level'] == l]
        fig.add_trace(go.Scatter(x=rows['name'], y=rows['peaks'],
                                 name=f"{l} {stat}", line_shape='linear', **extra))
fig.show()

In [None]:
# SPAN levels included in the overlap comparison (and in the summary below).
spanlevels2process = {
    'span 200_1e-06_5', 'span 200_tuned',
    'span 100_1e-06_0', 'span 100_tuned',
}
show_overlap(dfs.loc[dfs['level'].isin(spanlevels2process)])

# Summary

In [None]:
# Summary table: the selected MACS2 rows followed by the selected SPAN rows.
macs2_selected = dfm.loc[dfm['level'].isin(macs2levels2process)]
span_selected = dfs.loc[dfs['level'].isin(spanlevels2process)]
dfa = pd.concat([macs2_selected, span_selected])

In [None]:
# Mean and standard deviation of the peak count for every (name, level) pair
# of the summary table; a missing std (NaN) becomes 0.
dfa_mean = (
    dfa.groupby(['name', 'level'])['peaks']
    .mean()
    .reset_index()
    .sort_values(by=['name', 'level'])
)
dfa_std = (
    dfa.groupby(['name', 'level'])['peaks']
    .std()
    .reset_index()
    .fillna(0)
    .sort_values(by=['name', 'level'])
)

In [None]:
# One solid "mean" line and one dotted "std" line per level of the summary table.
fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
for l in sorted(set(dfa_mean['level'])):
    mean_rows = dfa_mean[dfa_mean['level'] == l]
    std_rows = dfa_std[dfa_std['level'] == l]
    fig.add_trace(go.Scatter(x=mean_rows['name'], y=mean_rows['peaks'],
                             name=f"{l} mean", line_shape='linear'))
    fig.add_trace(go.Scatter(x=std_rows['name'], y=std_rows['peaks'],
                             name=f"{l} std", line_shape='linear',
                             line=dict(dash='dot')))
fig.show()

In [None]:
# Overlap plot for the combined table of selected MACS2 and SPAN levels.
show_overlap(dfa)

In [None]:
# Average peak length per file; non-finite ratios (division by zero peaks)
# are replaced by 0.0.
length_per_peak = dfa['length'] / dfa['peaks']
dfa['avg_length'] = np.where(np.isfinite(length_per_peak), length_per_peak, 0.0)

In [None]:
def plot_data(df, value):
    """Draw ``value`` per level as bars (error bars: sd) with the points on top.

    A new figure is created whose width in inches equals the number of
    distinct levels; x tick labels are rotated by 45 degrees. The caller is
    expected to show or save the figure.

    :param df: table with a ``level`` column and the ``value`` column
    :param value: name of the column plotted on the y axis
    """
    n_levels = len(set(df['level']))
    plt.figure(figsize=(n_levels, 5))
    axes = plt.gca()

    # Arguments shared by the bar layer and the point layer.
    shared = dict(data=df, x="level", y=value, ax=axes)
    sns.barplot(ci="sd", capsize=.2, errwidth=2, edgecolor="black", **shared)
    sns.swarmplot(color="black", size=2, alpha=0.5, **shared)

    plt.xticks(rotation=45)
    plt.tight_layout()

In [None]:
# Number of peaks per file, one bar per level.
plot_data(dfa, 'peaks')
plt.show()

In [None]:
# Average peak length (total length / number of peaks) per file, one bar per level.
plot_data(dfa, 'avg_length')
plt.show()

# Groups analysis CCR4pos / CCR4Neg / Naive T-Cells

In [None]:
def cell(sample):
    """Return the cell-type tag contained in ``sample``.

    The tags 'CCR4Neg', 'CCR4pos' and 'Naive' are tried in this order and the
    first one occurring in ``sample`` is returned.

    :raises Exception: if ``sample`` contains none of the tags
    """
    known_tags = ('CCR4Neg', 'CCR4pos', 'Naive')
    found = next((tag for tag in known_tags if tag in sample), None)
    if found is None:
        raise Exception(f'Unknown sample cell {sample}')
    return found

# Cell type of every row, derived from the sample name.
dfa['cell'] = [cell(sample) for sample in dfa['name']]

In [None]:
def plot_data_cells(df, value, description):
    """Plot ``value`` per level in one panel per cell type, side by side.

    Each panel shows bars (error bars: sd) with the individual points on top
    and is titled with the cell type. Only the leftmost panel keeps its y tick
    labels and gets ``description`` as y label; afterwards all panels are set
    to the same y range (smallest lower / largest upper limit of any panel).

    :param df: table with ``cell`` and ``level`` columns and the ``value`` column
    :param value: name of the column plotted on the y axis
    :param description: y axis label of the leftmost panel
    """
    cells = sorted(set(df['cell']))
    n_levels = len(set(df['level']))
    total = n_levels * len(cells)
    plt.figure(figsize=(int(total * .75), 4))

    ylims = {}
    for position, c in enumerate(cells):
        # Every panel spans n_levels grid columns, so panels sit back to back.
        offset = position * n_levels
        data = df[df['cell'] == c].sort_values(by=['level'])
        # Distinct levels of this cell type, in their sorted plotting order.
        xlabels = list(dict.fromkeys(data['level']))
        ax = plt.subplot2grid((1, total), (0, offset), colspan=n_levels)

        shared = dict(data=data, x="level", y=value, ax=ax)
        sns.barplot(ci="sd", capsize=.2, errwidth=2, edgecolor="black", **shared)
        sns.swarmplot(size=1, color="black", alpha=0.5, **shared)
        ax.legend().set_visible(False)
        # Remember the automatic y range of this panel for the final alignment.
        ylims[ax] = plt.ylim()

        if offset > 0:
            ax.get_yaxis().set_ticklabels([])
        ax.set_ylabel('' if offset > 0 else description)
        ax.set_xlabel('')
        ax.set_title(c)
        plt.xticks(range(0, len(xlabels)), xlabels, rotation=45)

    ymin = np.min([low for low, _ in ylims.values()])
    ymax = np.max([high for _, high in ylims.values()])
    for ax in ylims:
        ax.set_ylim(bottom=ymin, top=ymax)
    plt.tight_layout()

In [None]:
# Per cell type: number of peaks, then average peak length, for every level.
plot_data_cells(dfa, 'peaks', 'Peaks')
plt.show()
plot_data_cells(dfa, 'avg_length', 'Average peak length')
plt.show()

# Overlap

In [None]:
def compute_overlap(df):
    """Collect the pairwise overlap values of the files of every (cell, level) pair.

    For each cell type and level the matching files are passed, as both
    arguments, to ``bm.load_or_build_metrics_table`` together with a
    ``/tmp/overlap_<cell>_<level>.tsv`` path; every entry of the returned
    table becomes one output row.

    :param df: table with ``cell``, ``level`` and ``file`` columns
    :return: DataFrame with columns ``id`` ("<row label>@<column label>"),
        ``cell``, ``level`` and ``overlap``
    """
    # NOTE(review): every row/column combination of the table is kept, which
    # presumably includes each file against itself - confirm that this is
    # wanted in the averages plotted from this table.
    cells = sorted(set(df['cell']))
    levels = sorted(set(df['level']))
    # Rows are gathered in a list and the frame is built once at the end:
    # appending with .loc[len(dft)] re-allocates the frame on every row and
    # leaves the numeric overlap column without a numeric dtype.
    records = []
    for c in cells:
        for l in levels:
            selected = (df['cell'] == c) & (df['level'] == l)
            paths = [Path(f) for f in df.loc[selected, 'file']]
            df_path = f'/tmp/overlap_{c}_{l}.tsv'
            mt = bm.load_or_build_metrics_table(paths, paths, Path(df_path),
                                                jaccard=False,
                                                threads=30)
            for row in mt.index:
                for col in mt.columns:
                    records.append((f'{row}@{col}', c, l, mt.loc[row, col]))
    return pd.DataFrame(records, columns=['id', 'cell', 'level', 'overlap'])

# Overlap values for the combined MACS2 + SPAN table, grouped by cell type.
df_overlap = compute_overlap(dfa)

In [None]:
# Overlap values per cell type and level, drawn with the same panel layout as above.
plot_data_cells(df_overlap, 'overlap', 'Overlaps')