Hunter Bennett | Glass Lab | Brain Aging Project | 19 Feb 2021

This notebook finds peaks and annotates them with H3K27Ac reads for downstream analysis. The steps accomplished are as follows:
1. Call both variable width and nucleosome free regions using HOMER.
2. Merge peaks into timepoint specific merged peak sets and overall merged peak sets.
3. Annotate overall merged peak sets with H3K27Ac reads from all tag directories.

In [2]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 200
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
import sys
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from plotting_scripts import label_point, pca_rpkm_mat
from homer_preprocessing import read_annotated_peaks

# Set working paths

In [3]:
dataDirectory = '/data/mm10/Brain_MPSIIIA/ChIP/H3K27AC/NeuN/WT/'
inputDirectory = '/data/mm10/Brain_MPSIIIA/ChIP/input/NeuN/'
workingDirectory = '/home/h1bennet/brain_aging/results/01_NeuN_H3K27Ac/'
if not os.path.isdir(workingDirectory):
    os.mkdir(workingDirectory)
os.chdir(workingDirectory)

# Call peaks using matched inputs.

## Set list of tag directories

In [9]:
tagdirs = ['00_mouse_BL6_M_10day_NeuN_ChIP_H3K27ac_1_AL_20200610_GCCTAGCC_TTGGTCTG',
           '00_mouse_BL6_M_12day_NeuN_ChIP_H3K27ac_1_AL_20201121_GTCGGAGC_GGTTATAA',
           '00_mouse_BL6_M_12day_NeuN_ChIP_H3K27ac_2_AL_20201121_ATGAGGCC_GTTAATTG',
           '02_mouse_BL6_M_3week_NeuN_ChIP_H3K27ac_1_AL_l20200925_TGGCCGGT_TAGAGCGC',
           '02_mouse_BL6_M_3week_NeuN_ChIP_H3K27ac_2_AL_20201121_TCCAACGC_TTGGACTT',
           '03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_1_JOS_20190801_ACTGAT',
           '03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_2_JOS_20191009_GTAGAG',
           '03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_3_AL_20191226_AGTCAA',
           '03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_4_AL_20191226_GTCCGC',
           '03_mouse_BL6_M_9week_NeuN_ChIP_H3K27ac_1_JOS_20190809_CGATGT',
           '04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_1_AL_l20200925_ACAGGCGC_CTCTGCCT',
           '04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_2_AL_l20200925_GAACCGCG_TGACCTTA',
           '04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_3_AL_l20200925_TGCGAGAC_CATTGTTG',
           '04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_4_AL_20201111_AACTGTAG_ACGCCGCA',
           '04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_5_AL_20201121_TACCGAGG_CCTGAACT',
           '05_mouse_MPSIIIAhet_M_P240_NeuN_ChIP_H3K27ac_1_AL_20191226_GAGTGG',
           '05_mouse_MPSIIIAhet_M_P240_NeuN_ChIP_H3K27ac_2_AL_20191122_CGATGT',
           '06_mouse_BL6_M_23month_NeuN_ChIP_H3K27ac_1_AL_20201121_TAATACAG_ATATTCAC',
           '06_mouse_BL6_M_28month_NeuN_ChIP_H3K27ac_1_AL_20201121_GCACGGAC_GTCTCGCA',
           '06_mouse_BL6_M_31month_NeuN_ChIP_H3K27ac_1_AL_20201111_TCATCCTT_AGCGAGCT']

## Set list of corresponding input directories

In [10]:
inputdirs = ['01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC',
           '01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC',
           '01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC',
           '01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC',
           '01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC',
           '02_mouse_BL6_M_8week_NeuN_input_1_JOS_20190801_CATTTT',
           '02_mouse_BL6_M_8week_NeuN_input_1_JOS_20190801_CATTTT',
           '02_mouse_BL6_M_8week_NeuN_input_1_JOS_20190801_CATTTT',
           '02_mouse_BL6_M_8week_NeuN_input_1_JOS_20190801_CATTTT',
           '02_mouse_BL6_M_8week_NeuN_input_1_JOS_20190801_CATTTT',
           '03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
           '03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
           '03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
           '03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
           '03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
           '05_mouse_MPSIIIAhet_M_P240_NeuN_ChIP_H3K27ac_1_AL_20191226_GAGTGG',
           '05_mouse_MPSIIIAhet_M_P240_NeuN_ChIP_H3K27ac_2_AL_20191122_CGATGT',
           '04_mouse_MPSIIIAhet_M_P240_NeuN_input_2_AL_20191122_CCAACA',
           '04_mouse_MPSIIIAhet_M_P240_NeuN_input_2_AL_20191122_CCAACA',
           '04_mouse_MPSIIIAhet_M_P240_NeuN_input_2_AL_20191122_CCAACA']

# Call Peaks with matched inputs

In [12]:

with open('./peakCalling_homer.sh', 'w') as f:
    for tagdir, inputdir in zip(tagdirs, inputdirs):
        print('analyzing:', tagdir)
        print('input:', inputdir)
        print()
        
        find_peaks_vw = ['findPeaks', dataDirectory + '/' + tagdir,
                          '-i', inputDirectory + '/' + inputdir,
                          '-region',
                          '-size 1000 -minDist 2500',
                          '-o',
                          workingDirectory + '/peak_files/' + tagdir + '_variablewidth_peaks.tsv',
                          '&', '\n\n']
        
        find_peaks_nfr = ['findPeaks', dataDirectory + '/' + tagdir,
                          '-i', inputDirectory + '/' + inputdir,
                          '-nfr', '-size 200',
                          '-o',
                          workingDirectory + '/peak_files/' + tagdir + '_nfr_peaks.tsv',
                          '&', '\n\n']

        # write commands to file
        f.write(' '.join(find_peaks_vw))
        f.write(' '.join(find_peaks_nfr))      
        
    f.close()

analyzing: 00_mouse_BL6_M_10day_NeuN_ChIP_H3K27ac_1_AL_20200610_GCCTAGCC_TTGGTCTG
input: 01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC

analyzing: 00_mouse_BL6_M_12day_NeuN_ChIP_H3K27ac_1_AL_20201121_GTCGGAGC_GGTTATAA
input: 01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC

analyzing: 00_mouse_BL6_M_12day_NeuN_ChIP_H3K27ac_2_AL_20201121_ATGAGGCC_GTTAATTG
input: 01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC

analyzing: 02_mouse_BL6_M_3week_NeuN_ChIP_H3K27ac_1_AL_l20200925_TGGCCGGT_TAGAGCGC
input: 01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC

analyzing: 02_mouse_BL6_M_3week_NeuN_ChIP_H3K27ac_2_AL_20201121_TCCAACGC_TTGGACTT
input: 01_mouse_C57_M_P21_NeuN_ChIP_input_409_AL_l20191122_CTATAC

analyzing: 03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_1_JOS_20190801_ACTGAT
input: 02_mouse_BL6_M_8week_NeuN_input_1_JOS_20190801_CATTTT

analyzing: 03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_2_JOS_20191009_GTAGAG
input: 02_mouse_BL6_M_8week_NeuN_input_1_JOS_201

# Merge peaks

In [13]:
%%bash
if [ ! -d ./merged_peaks/ ]; then mkdir ./merged_peaks; fi
mergePeaks ./peak_files/*variablewidth* > merged_peaks/vw_peaks_merged.txt
mergePeaks ./peak_files/*nfr_peaks.tsv* > merged_peaks/nfr_peaks_merged.txt

Process is interrupted.


### Make subgroup specific merged peaks - these will be used later

In [None]:
%%bash
if [ ! -d ./merged_peaks/ ]; then mkdir ./merged_peaks; fi
mergePeaks ./peak_files/00_mouse*nfr* > merged_peaks/00_NeuN_2WeekOrLess_nfr_peaks_merged.txt
mergePeaks ./peak_files/02_mouse*nfr* > merged_peaks/02_NeuN_P21_nfr_peaks_merged.txt
mergePeaks ./peak_files/03_mouse*nfr* > merged_peaks/03_NeuN_8Week_nfr_peaks_merged.txt
mergePeaks ./peak_files/04_mouse*nfr* > merged_peaks/04_NeuN_4Month_nfr_peaks_merged.txt
mergePeaks ./peak_files/06_mouse*nfr* > merged_peaks/06_NeuN_20Month_nfr_peaks_merged.txt
mergePeaks ./peak_files/00_mouse*variablewidth* > merged_peaks/00_NeuN_2WeekOrLess_vw_peaks_merged.txt
mergePeaks ./peak_files/02_mouse*variablewidth* > merged_peaks/02_NeuN_P21_vw_peaks_merged.txt
mergePeaks ./peak_files/03_mouse*variablewidth* > merged_peaks/03_NeuN_8Week_vw_peaks_merged.txt
mergePeaks ./peak_files/04_mouse*variablewidth* > merged_peaks/04_NeuN_4Month_vw_peaks_merged.txt
mergePeaks ./peak_files/06_mouse*variablewidth* > merged_peaks/06_NeuN_20Month_vw_peaks_merged.txt

## Convert merged peaks to bed files for upload to UCSC

In [14]:
%%bash
if [ ! -d ./bed_files/ ]; then mkdir ./bed_files; fi

# delete the existing script file
rm ./make_bed.sh
# create a script file
touch ./make_bed.sh

for peakfile in ./merged_peaks/*;
do bedfile=${peakfile/merged_peaks/bed_files};
bedfile=${bedfile/.txt/.bed}
echo "pos2bed.pl $peakfile > $bedfile" >> make_bed.sh
done


rm: cannot remove ‘./make_bed.sh’: No such file or directory


Best to add labels to the bed files so that we can use them on the browser

In [15]:
%%bash

echo 'track name="NeuN_H3K27Ac_NFR" description="NeuN Nuclei H3K27Ac Chip-seq Nucleosome Free Regions"' \
| cat - ./bed_files/nfr_peaks_merged.bed \
> temp && mv temp ./bed_files/nfr_peaks_merged.bed

echo 'track name="NeuN_H3K27Ac_VW" description="NeuN Nuclei H3K27Ac Chip-seq Variable Width Peaks"' \
| cat - ./bed_files/vw_peaks_merged.bed \
> temp && mv temp ./bed_files/vw_peaks_merged.bed

# Annotate peaks

In [16]:
tagdirs_full = [dataDirectory+i for i in tagdirs]

In [19]:
if not os.path.isdir('./annotated_peaks/'):
    os.mkdir('./annotated_peaks/')

with open('./annotatePeaks_homer.sh', 'w') as f:
    
    annotate_nfr_raw = ['annotatePeaks.pl', './merged_peaks/nfr_peaks_merged.txt',
                       'mm10', '-size 1000', '-raw', '-d \\\n',
                       ' \\\n'.join(tagdirs_full), '>',
                       './annotated_peaks/ann_raw_nfr_peaks_merged.txt &\n\n']
    
    annotate_nfr_norm = ['annotatePeaks.pl', './merged_peaks/nfr_peaks_merged.txt',
                       'mm10', '-size 1000', '-norm 1e7', '-d \\\n',
                       ' \\\n'.join(tagdirs_full), '>',
                       './annotated_peaks/ann_norm_nfr_peaks_merged.txt &\n\n']
    
    annotate_vw_raw = ['annotatePeaks.pl', './merged_peaks/vw_peaks_merged.txt',
                   'mm10', '-size given', '-raw', '-d \\\n',
                   ' \\\n'.join(tagdirs_full), '>',
                   './annotated_peaks/ann_raw_vw_peaks_merged.txt &\n\n']

    annotate_vw_norm = ['annotatePeaks.pl', './merged_peaks/vw_peaks_merged.txt',
                       'mm10', '-size given', '-norm 1e7', '-d \\\n',
                       ' \\\n'.join(tagdirs_full), '>',
                       './annotated_peaks/ann_norm_vw_peaks_merged.txt &']

    f.write(' '.join(annotate_nfr_raw))    
    f.write(' '.join(annotate_nfr_norm))
    f.write(' '.join(annotate_vw_raw))    
    f.write(' '.join(annotate_vw_norm))
    
    f.close()