# GSE53643 - Replicated H3K4me2 consistency

Logbook: https://docs.google.com/document/d/1VGH4fA20LbhGGKWvBg28E7G1JBm1rLlmCEtD2X239Eg/edit#heading=h.44kd47qfiiva


In [9]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os, re
import glob
from pybedtools import BedTool
from tqdm.auto import tqdm

# MACS2

In [121]:
MACS2_FOLDER = '/mnt/stripe/bio/raw-data/geo-samples/GSE53643/macs2'
MACS2_LEVELS = ['q0.05']

# Build one row per MACS2 narrowPeak file whose name carries a known level.
# Judging by the regexes below, file names presumably look like
# '<GSM id>_H3K4me2..._<sample>-Donor<N>-rep<R>_<level>_...narrowPeak'
# -- TODO confirm against the folder contents.
records = []
for file in tqdm(glob.glob(MACS2_FOLDER + '/*.narrowPeak')):
    basename = os.path.basename(file)
    # Look for the level in the file name only, so that a directory component
    # of the path can never select (or mis-select) a level.
    level = next((l for l in MACS2_LEVELS if f'_{l}_' in basename), None)
    if not level:
        continue
    gsm = re.sub(r'_H3K4me2.*', '', basename)
    name = re.sub(r'(GSM[0-9]+_)|(-rep[0-9].*)', '', basename)
    # The level contains '.', which has to be matched literally -> re.escape.
    replicate = re.sub(rf'(.*-Donor[0-9]+-rep)|(_{re.escape(level)}.*)', '', basename)
    # int() either yields an integer or raises, so no NaN check is needed here.
    peaks = int(BedTool(file).count())
    records.append((gsm, name, replicate, f'macs2 {level}', file, peaks))

# Create the frame in one go instead of growing it row by row.
dfm = pd.DataFrame(records, columns=['gsm', 'name', 'replicate', 'level', 'file', 'peaks'])

# Fix types
dfm['replicate'] = dfm['replicate'].astype(int)
dfm['peaks'] = dfm['peaks'].astype(int)

HBox(children=(IntProgress(value=0, max=120), HTML(value='')))




In [122]:
# Mean and standard deviation of the peak count over the rows sharing a
# (name, level) pair; std is NaN for single-row groups and is replaced by 0.
_dfm_peaks = dfm.groupby(['name', 'level'])['peaks']
dfm_mean = (
    _dfm_peaks.mean()
    .reset_index()
    .sort_values(by=['name', 'level'])
)
dfm_std = (
    _dfm_peaks.std()
    .reset_index()
    .fillna(0)
    .sort_values(by=['name', 'level'])
)

In [123]:
import plotly.graph_objects as go
import plotly.express as px

# Line plot of the per-sample mean and std of the MACS2 peak counts.
fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="MACS2 peaks")))
for trace_name, stats in (("macs2 mean", dfm_mean), ("macs2 std", dfm_std)):
    fig.add_trace(
        go.Scatter(x=stats['name'], y=stats['peaks'], name=trace_name, line_shape='linear')
    )
fig.show()

In [100]:
# Bar plot of the mean MACS2 peak count per sample, std drawn as error bars.
# dfm_mean and dfm_std are sorted by the same keys, so their rows line up.
macs2_bar = go.Bar(
    name='macs2 q0.05',
    x=dfm_mean['name'],
    y=dfm_mean['peaks'],
    error_y={'type': 'data', 'array': dfm_std['peaks']},
)
fig = go.Figure()
fig.add_trace(macs2_bar)
fig.update_layout(barmode='group')
fig.show()

## Overlaps

In [129]:
from pathlib import Path
import downstream.bed_metrics as bm

def overlap_with_name_level(overlaps, n, l):
    """Return the ``(n, l)`` overlap table in long form, tagged with its key.

    Args:
        overlaps: mapping from ``(name, level)`` tuples to DataFrames.
        n: name part of the key; also written to the ``name`` column.
        l: level part of the key; also written to the ``level`` column.

    Returns:
        A new DataFrame: the selected table melted so that every cell value
        ends up in an ``overlap`` column, with constant ``name`` and
        ``level`` columns appended. The table stored in ``overlaps`` is not
        modified.
    """
    return overlaps[(n, l)].melt(value_name='overlap').assign(name=n, level=l)

def show_overlap(df):
    """Plot the mean and std of the overlap between peak files per sample and level.

    For every combination of a ``name`` and a ``level`` present in ``df`` the
    matching files are handed to ``bm.load_or_build_metrics_table`` as both
    the first and the second file list (``jaccard=False``), with the table
    path ``/tmp/overlap_{name}_{level}.tsv``. All tables are melted to long
    form, summarised per (name, level) and drawn as one solid "mean" and one
    dotted "std" line per level.

    NOTE(review): the helper's printed "[Skipped]: Already exists" messages
    suggest it reuses a table already present at that path -- confirm before
    relying on fresh results after the input files changed.

    Args:
        df: DataFrame with ``name``, ``level`` and ``file`` columns.

    Returns:
        None. Progress is printed and the figure is shown.
    """
    levels = sorted(set(df['level']))

    # One overlap table per (name, level) pair.
    overlaps = {}
    for sample in set(df['name']):
        for lvl in levels:
            print('Processing', sample, lvl)
            selected = (df['name'] == sample) & (df['level'] == lvl)
            paths = [Path(f) for f in df.loc[selected, 'file']]
            table_path = Path(f'/tmp/overlap_{sample}_{lvl}.tsv')
            overlaps[(sample, lvl)] = bm.load_or_build_metrics_table(
                paths, paths, table_path, jaccard=False
            )

    # Long-form table of all overlap values, then per-group statistics.
    long_tables = [overlap_with_name_level(overlaps, sample, lvl) for sample, lvl in overlaps]
    dfo = pd.concat(long_tables)
    overlap_groups = dfo.groupby(['name', 'level'])['overlap']
    dfo_mean = overlap_groups.mean().reset_index().sort_values(by=['name'])
    dfo_std = overlap_groups.std().reset_index().fillna(0).sort_values(by=['name'])

    fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Overlap")))
    for lvl in levels:
        mean_rows = dfo_mean[dfo_mean['level'] == lvl]
        std_rows = dfo_std[dfo_std['level'] == lvl]
        fig.add_trace(go.Scatter(x=mean_rows['name'],
                                 y=mean_rows['overlap'],
                                 name=f"{lvl} mean", line_shape='linear'))
        fig.add_trace(go.Scatter(x=std_rows['name'],
                                 y=std_rows['overlap'],
                                 name=f"{lvl} std", line_shape='linear',
                                 line=dict(dash='dot')))
    fig.show()
#     Barplots
#     fig = go.Figure()
#     for l in levels:
#         fig.add_trace(go.Bar(
#             name=str(l),
#             x=dfo_mean.loc[dfo_mean['level']==l]['name'], 
#             y=dfo_mean.loc[dfo_mean['level']==l]['overlap'],
#             error_y=dict(type='data', array=dfo_std.loc[dfo_std['level']==l]['overlap'])
#         ))
#     fig.update_layout(barmode='group')
#     fig.show()

In [130]:
# Plot mean/std of the overlap between the MACS2 peak files listed in dfm.
show_overlap(dfm)

Processing H3K4me2_ChIPSeq_Naive-Donor2 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor2_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor12 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor12_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4pos-Donor17 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor17_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor22 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor22_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor22 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor22_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4pos-Donor14 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor14_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4pos-Donor8 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor8_macs2 q0.05.tsv
Processing H3K4me2

Processing H3K4me2_ChIPSeq_CCR4Neg-Donor11 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor11_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4pos-Donor20 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor20_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor23 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor23_macs2 q0.05.tsv


# SPAN

In [131]:
SPAN_FOLDER = '/mnt/stripe/bio/raw-data/geo-samples/GSE53643/span'
SPAN_LEVELS = ['0.05_0', '0.05_5', '1e-06_5']

# Build one row per SPAN peak file whose name carries a known level.
# Judging by the regexes below, file names presumably look like
# '<GSM id>_H3K4me2..._<sample>-Donor<N>-rep<R>_<level>.peak'
# -- TODO confirm against the folder contents.
records = []
for file in tqdm(glob.glob(SPAN_FOLDER + '/*.peak')):
    basename = os.path.basename(file)
    # Look for the level in the file name only, so that a directory component
    # of the path can never select (or mis-select) a level.
    level = next((l for l in SPAN_LEVELS if f'_{l}.' in basename), None)
    if not level:
        continue
    gsm = re.sub(r'_H3K4me2.*', '', basename)
    name = re.sub(r'(GSM[0-9]+_)|(-rep[0-9].*)', '', basename)
    # Every level contains '.', which has to be matched literally -> re.escape.
    replicate = re.sub(rf'(.*-Donor[0-9]+-rep)|(_{re.escape(level)}.*)', '', basename)
    # int() either yields an integer or raises, so no NaN check is needed here.
    peaks = int(BedTool(file).count())
    records.append((gsm, name, replicate, f'span {level}', file, peaks))

# Create the frame in one go instead of growing it row by row.
dfs = pd.DataFrame(records, columns=['gsm', 'name', 'replicate', 'level', 'file', 'peaks'])

# Fix types
dfs['replicate'] = dfs['replicate'].astype(int)
dfs['peaks'] = dfs['peaks'].astype(int)

HBox(children=(IntProgress(value=0, max=360), HTML(value='')))




In [None]:
# Mean and standard deviation of the peak count over the rows sharing a
# (name, level) pair; std is NaN for single-row groups and is replaced by 0.
_dfs_peaks = dfs.groupby(['name', 'level'])['peaks']
dfs_mean = (
    _dfs_peaks.mean()
    .reset_index()
    .sort_values(by=['name', 'level'])
)
dfs_std = (
    _dfs_peaks.std()
    .reset_index()
    .fillna(0)
    .sort_values(by=['name', 'level'])
)

In [134]:
import plotly.graph_objects as go
import plotly.express as px

# Line plot of the SPAN peak counts: per level one solid "mean" line and one
# dotted "std" line across the sample names.
fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
for lvl in sorted(set(dfs_mean['level'])):
    mean_rows = dfs_mean[dfs_mean['level'] == lvl]
    std_rows = dfs_std[dfs_std['level'] == lvl]
    fig.add_trace(go.Scatter(x=mean_rows['name'],
                             y=mean_rows['peaks'],
                             name=f"{lvl} mean", line_shape='linear'))
    fig.add_trace(go.Scatter(x=std_rows['name'],
                             y=std_rows['peaks'],
                             name=f"{lvl} std", line_shape='linear',
                             line=dict(dash='dot')))
fig.show()

In [136]:
# Grouped bar plot of the mean SPAN peak count per sample, one bar series per
# level, with the std drawn as error bars.
fig = go.Figure()
for lvl in sorted(set(dfs['level'])):
    mean_rows = dfs_mean[dfs_mean['level'] == lvl]
    std_rows = dfs_std[dfs_std['level'] == lvl]
    level_bar = go.Bar(
        name=lvl,
        x=mean_rows['name'],
        y=mean_rows['peaks'],
        error_y={'type': 'data', 'array': std_rows['peaks']},
    )
    fig.add_trace(level_bar)
fig.update_layout(barmode='group')
fig.show()

## Overlaps

In [137]:
# Plot mean/std of the overlap between the SPAN peak files listed in dfs, per level.
show_overlap(dfs)

Processing H3K4me2_ChIPSeq_Naive-Donor2 span 0.05_0
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor2_span 0.05_0.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor2_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor2 span 0.05_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor2_span 0.05_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor2_span 0.05_5.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor2 span 1e-06_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor2_span 1e-06_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor2_span 1e-06_5.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor12 span 0.05_0
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor12_span 0.05_0.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor12_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor12 span 0.05_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor12_span 0.05_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Don

  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor21_span 0.05_5.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor21 span 1e-06_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor21_span 1e-06_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor21_span 1e-06_5.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor21 span 0.05_0
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor21_span 0.05_0.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor21_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor21 span 0.05_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor21_span 0.05_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor21_span 0.05_5.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor21 span 1e-06_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor21_span 1e-06_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor21_span 1e-06_5.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor7 span 0.05_0
Calculating metrics:  /tmp/overlap_H3K4me

  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor1_span 1e-06_5.tsv
Processing H3K4me2_ChIPSeq_CCR4pos-Donor21 span 0.05_0
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor21_span 0.05_0.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor21_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_CCR4pos-Donor21 span 0.05_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor21_span 0.05_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor21_span 0.05_5.tsv
Processing H3K4me2_ChIPSeq_CCR4pos-Donor21 span 1e-06_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor21_span 1e-06_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor21_span 1e-06_5.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor11 span 0.05_0
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor11_span 0.05_0.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor11_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor11 span 0.05_5
Calculating metrics:  /tmp/overlap_H3K4me2_Ch

  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor19_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor19 span 0.05_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor19_span 0.05_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor19_span 0.05_5.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor19 span 1e-06_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor19_span 1e-06_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor19_span 1e-06_5.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor14 span 0.05_0
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor14_span 0.05_0.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor14_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor14 span 0.05_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor14_span 0.05_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor14_span 0.05_5.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor14 span 1e-06_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor

  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor16_span 0.05_5.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor16 span 1e-06_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor16_span 1e-06_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor16_span 1e-06_5.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor6 span 0.05_0
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor6_span 0.05_0.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor6_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor6 span 0.05_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor6_span 0.05_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor6_span 0.05_5.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor6 span 1e-06_5
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor6_span 1e-06_5.tsv
  [Saved] /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor6_span 1e-06_5.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor13 span 0.05_0
Calculating metrics:  /tmp/overlap_H3K4me2_ChIPSeq_

# Summary

In [139]:
# Combined table: all MACS2 rows plus the SPAN rows of level 'span 0.05_0'.
# The original row labels of both frames are kept (the index is not reset).
_span_selected = dfs[dfs['level'] == 'span 0.05_0']
dfa = pd.concat([dfm, _span_selected])

In [140]:
# Mean and standard deviation of the peak count over the rows sharing a
# (name, level) pair; std is NaN for single-row groups and is replaced by 0.
_dfa_peaks = dfa.groupby(['name', 'level'])['peaks']
dfa_mean = (
    _dfa_peaks.mean()
    .reset_index()
    .sort_values(by=['name', 'level'])
)
dfa_std = (
    _dfa_peaks.std()
    .reset_index()
    .fillna(0)
    .sort_values(by=['name', 'level'])
)

In [142]:
# Line plot of the combined peak counts: per level one solid "mean" line and
# one dotted "std" line across the sample names.
fig = go.Figure(layout=go.Layout(title=go.layout.Title(text="Peaks")))
for lvl in sorted(set(dfa_mean['level'])):
    mean_rows = dfa_mean[dfa_mean['level'] == lvl]
    std_rows = dfa_std[dfa_std['level'] == lvl]
    fig.add_trace(go.Scatter(x=mean_rows['name'],
                             y=mean_rows['peaks'],
                             name=f"{lvl} mean", line_shape='linear'))
    fig.add_trace(go.Scatter(x=std_rows['name'],
                             y=std_rows['peaks'],
                             name=f"{lvl} std", line_shape='linear',
                             line=dict(dash='dot')))
fig.show()

In [143]:
# Plot mean/std of the overlap for the combined table dfa, per level.
show_overlap(dfa)

Processing H3K4me2_ChIPSeq_Naive-Donor2 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor2_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor2 span 0.05_0
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor2_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor12 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor12_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor12 span 0.05_0
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor12_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_CCR4pos-Donor17 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor17_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4pos-Donor17 span 0.05_0
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4pos-Donor17_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor22 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor22_macs2 q0.05.tsv
Processing H3K4me2_ChI

Processing H3K4me2_ChIPSeq_CCR4Neg-Donor15 span 0.05_0
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor15_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor16 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor16_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor16 span 0.05_0
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor16_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor6 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor6_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_CCR4Neg-Donor6 span 0.05_0
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_CCR4Neg-Donor6_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor13 macs2 q0.05
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor13_macs2 q0.05.tsv
Processing H3K4me2_ChIPSeq_Naive-Donor13 span 0.05_0
[Skipped]: Already exists /tmp/overlap_H3K4me2_ChIPSeq_Naive-Donor13_span 0.05_0.tsv
Processing H3K4me2_ChIPSeq