In [1]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 200
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
import sys
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from plotting_scripts import label_point, pca_rpkm_mat
from homer_preprocessing import read_annotated_peaks

Inge's annotate peak files located:

    /home/iholtman/nuclei_project/annotation_files/annotation_18_7_2019

The file that might work:

    annotated_peaks_tss_with_pooled_tagdirs_H3K27ac_norm.txt

Set working directory

In [2]:
dataDirectory = '/data/mm10/Brain_MPSIIIA/ChIP/H3K27AC/'
inputDirectory = '/data/mm10/Brain_MPSIIIA/ChIP/input/'
workingDirectory = '/home/h1bennet/brain_aging/results/04_Young_Cell_H3K27Ac/'
if not os.path.isdir(workingDirectory):
    os.mkdir(workingDirectory)
os.chdir(workingDirectory)

read in color scheme dictionary

In [32]:
hex_dict={}
rgb_dict={}
for line in open('/gpfs/data01/glasslab/home/h1bennet/brain_aging/data/brain_aging_color_table.txt','r'):
    split = line.strip().replace('\ufeff','').split('\t')
    # split2 = split[1]
    hex_dict[split[0]] = split[1]
    rgb_dict[split[0]] = split[2]    

In [33]:
print(rgb_dict)

{'olig2_young': '158,202,225', 'olig2_4month': '49,130,189', 'olig2_25month': '8,81,156', 'pu1_young': '252,146,114', 'pu1_4month': '222,45,38', 'pu1_25month': '165,15,21', 'neun_young': '161,217,155', 'neun_4month': '49,163,84', 'neun_25month': '0,109,44', 'sox9_young': '253,174,107', 'sox9_4month': '230,85,13', 'sox9_25month': '166,54,3'}


In [7]:
tagdirs = [
           # olig2 nuclei
           'Olig2/WT/03_mouse_MPSIIIAhet_M_4mongh_Olig2_ChIP_H3K27ac_1_AL_l20200925_CTCACCAA_CTAGGCAA',
           'Olig2/WT/03_mouse_MPSIIIAhet_M_4month_Olig2_ChIP_H3K27ac_2_AL_l20200925_GTGAATAT_TCTCATTC',
           'Olig2/WT/03_mouse_MPSIIIAhet_M_4month_Olig2_ChIP_H3K27ac_3_AL_20201111_GGTCACGA_GTATTATG',
           # neun nuclei
           'NeuN/WT/04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_1_AL_l20200925_ACAGGCGC_CTCTGCCT',
           'NeuN/WT/04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_2_AL_l20200925_GAACCGCG_TGACCTTA',
           'NeuN/WT/04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_3_AL_l20200925_TGCGAGAC_CATTGTTG',
           'NeuN/WT/04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_4_AL_20201111_AACTGTAG_ACGCCGCA',
           'NeuN/WT/04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_5_AL_20201121_TACCGAGG_CCTGAACT',
           # pu1 nuclei
           'PU_1/WT/05_mouse_MPSIIIAhet_M_4month_PU1_ChIP_H3K27ac_1_AL_20200925_AGGTTATA_CAGTTCCG',
           'PU_1/WT/05_mouse_MPSIIIAhet_M_4month_PU1_ChIP_H3K27ac_2_AL_20200925_TCTGTTGG_TCGAATGG',
           'PU_1/WT/05_mouse_MPSIIIAhet_M_4month_PU1_ChIP_H3K27ac_3_AL_20201111_CTGCTTCC_GATAGATC',
           # sox9 nuclei
           'Sox9/WT/03_mouse_MPSIIIAhet_M_4month_SOX9_ChIP_H3K27ac_1_AL_20200925_CATAGAGT_TGCCACCA',
           'Sox9/WT/03_mouse_MPSIIIAhet_M_4month_SOX9_ChIP_H3K27ac_2_AL_20201121_AATGCCTC_TCGATCCA']

In [8]:
inputdirs = [
             # olig2 nuclei
             'Olig2/WT/03_mouse_MPSIIIAhet_Olig2_ChIP_input_4_month_AL_l20200925_AAGTCCAA_TATGAGTA',
             'Olig2/WT/03_mouse_MPSIIIAhet_Olig2_ChIP_input_4_month_AL_l20200925_AAGTCCAA_TATGAGTA',
             'Olig2/WT/03_mouse_MPSIIIAhet_Olig2_ChIP_input_4_month_AL_l20200925_AAGTCCAA_TATGAGTA',
              # neun nuclei
             'NeuN/03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
             'NeuN/03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
             'NeuN/03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
             'NeuN/03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
             'NeuN/03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC',
              # pu1 nuclei
             'PU1/02_mouse_MPSIIIAhet_M_4month_PU1_ChIP_input_AL_l20200925_ATCCACTG_AGGTGCGT',
             'PU1/02_mouse_MPSIIIAhet_M_4month_PU1_ChIP_input_AL_l20200925_ATCCACTG_AGGTGCGT',
             'PU1/02_mouse_MPSIIIAhet_M_4month_PU1_ChIP_input_AL_l20200925_ATCCACTG_AGGTGCGT',
             # sox9 nuclei
             'SOX9/01_mouse_BL6_M_3month_SOX9_input_1_AL_20200205_ATGTCA',
             'SOX9/01_mouse_BL6_M_3month_SOX9_input_1_AL_20200205_ATGTCA']

In [None]:
makeMultiWigHub.pl mpsiiia_vis_combined_wt_h3K27ac mm10 \
-colors \
116,196,118 49,163,84 0,109,44 \
197,27,138 122,1,119 \
8,81,156 166,54,3 \
-force -d ./merged_tagdirs/*

### Call Fixed and Variable Peaks Once Again

In [9]:
%%bash
# delete the existing script file
rm ./peakCalling_homer.sh
# create a script file
touch ./peakCalling_homer.sh

# if directories don't exist, create them
if [ ! -d ./peak_files/ ]; then mkdir ./peak_files; fi

rm: cannot remove ‘./peakCalling_homer.sh’: No such file or directory


In [11]:
with open('./peakCalling_homer.sh', 'w') as f:
    for tagdir, inputdir in zip(tagdirs, inputdirs):
        peakname = tagdir.split('/')[-1]
        print('analyzing:', tagdir)
        print('input:', inputdir)
        print()

        find_peaks_vw = ['findPeaks', dataDirectory + '/' + tagdir,
                          '-i', inputDirectory + '/' + inputdir,
                          '-region',
                          '-size 1000 -minDist 2500',
                          '-o',
                          workingDirectory + '/peak_files/' + peakname + '_variablewidth_peaks.tsv',
                          '\n\n']
        
        
        # write commands to file
        f.write(' '.join(find_peaks_vw))

    f.close()

analyzing: Olig2/WT/03_mouse_MPSIIIAhet_M_4mongh_Olig2_ChIP_H3K27ac_1_AL_l20200925_CTCACCAA_CTAGGCAA
input: Olig2/WT/03_mouse_MPSIIIAhet_Olig2_ChIP_input_4_month_AL_l20200925_AAGTCCAA_TATGAGTA

analyzing: Olig2/WT/03_mouse_MPSIIIAhet_M_4month_Olig2_ChIP_H3K27ac_2_AL_l20200925_GTGAATAT_TCTCATTC
input: Olig2/WT/03_mouse_MPSIIIAhet_Olig2_ChIP_input_4_month_AL_l20200925_AAGTCCAA_TATGAGTA

analyzing: Olig2/WT/03_mouse_MPSIIIAhet_M_4month_Olig2_ChIP_H3K27ac_3_AL_20201111_GGTCACGA_GTATTATG
input: Olig2/WT/03_mouse_MPSIIIAhet_Olig2_ChIP_input_4_month_AL_l20200925_AAGTCCAA_TATGAGTA

analyzing: NeuN/WT/04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_1_AL_l20200925_ACAGGCGC_CTCTGCCT
input: NeuN/03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC

analyzing: NeuN/WT/04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_2_AL_l20200925_GAACCGCG_TGACCTTA
input: NeuN/03_mouse_MPSIIIAhet_NeuN_ChIP_input_4_month_AL_l20200925_TACTCATA_CCTGTGGC

analyzing: NeuN/WT/04_mouse_MPSIIIAhet_M_4mon

## Merge peaks

### Make subgroup specific merged peaks - these will be used later

In [36]:
if not os.path.isdir('./merged_peaks/'):
    os.mkdir('././merged_peaks/')

    mergePeaks ./peak_files/*Olig2*variablewidth* > merged_peaks/Olig2_4month_vw_peaks_merged.txt
    mergePeaks ./peak_files/*NeuN*variablewidth* > merged_peaks/NeuN_4month_vw_peaks_merged.txt
    mergePeaks ./peak_files/*PU1*variablewidth* > merged_peaks/PU1_4month_vw_peaks_merged.txt
    mergePeaks ./peak_files/*SOX9*variablewidth* > merged_peaks/SOX9_4month_vw_peaks_merged.txt
    mergePeaks ./peak_files/*variablewidth* > merged_peaks/vw_peaks_merged.txt
    

### Next: Annotate peaks

In [37]:
%%bash
# delete the existing script file
rm ./annotatePeaks_homer.sh
# create a script file
touch ./annotatePeaks_homer.sh

rm: cannot remove ‘./annotatePeaks_homer.sh’: No such file or directory


In [38]:
tagdirs_full = [dataDirectory+i for i in tagdirs]

In [40]:
with open('./annotatePeaks_homer.sh', 'w') as f:
    annotate_vw_raw = ['annotatePeaks.pl', './merged_peaks/vw_peaks_merged.txt',
                   'mm10', '-size given', '-raw', '-d \\\n',
                   ' \\\n'.join(tagdirs_full), '>',
                   './merged_peaks/ann_raw_idr_vw_peaks_merged.txt &\n\n']

    annotate_vw_norm = ['annotatePeaks.pl', './merged_peaks/vw_peaks_merged.txt',
                       'mm10', '-size given', '-norm 1e7', '-d \\\n',
                       '  \\\n'.join(tagdirs_full), '>',
                       './merged_peaks/ann_norm_idr_vw_peaks_merged.txt &\n\n']
    
    annotate_prom_raw = ['annotatePeaks.pl', 'tss',
                         'mm10', '-size given', '-norm 1e7', '-d \\\n',
                         ' \\\n'.join(tagdirs_full), '>',
                         './merged_peaks/ann_norm_promoters.txt &\n\n']
    
    annotate_prom_norm = ['annotatePeaks.pl', 'tss',
                         'mm10', '-size given', '-raw', '-d \\\n',
                         ' \\\n'.join(tagdirs_full), '>',
                         './merged_peaks/ann_raw_promoters.txt &\n\n']
    
    f.write(' '.join(annotate_vw_raw))    
    f.write(' '.join(annotate_vw_norm))
    f.write(' '.join(annotate_prom_raw))    
    f.write(' '.join(annotate_prom_norm))
    
    f.close()

## Convert merged peaks to bed files for upload to UCSC

In [4]:
%%bash
if [ ! -d ./bed_files/ ]; then mkdir ./bed_files; fi

# delete the existing script file
rm ./make_bed.sh
# create a script file
touch ./make_bed.sh

for peakfile in ./merged_peaks/*4month*vw_peaks_merged*;
do bedfile=${peakfile/merged_peaks/bed_files};
bedfile=${bedfile/.txt/.bed}
echo "pos2bed.pl $peakfile > $bedfile" >> make_bed.sh
done

In [6]:
%%bash

echo 'track name="NeuN_H3K27Ac_regions" description="NeuN Nuclei H3K27Ac regions"' \
| cat - ./bed_files/NeuN_4month_vw_peaks_merged.bed \
> temp && mv temp ./bed_files/NeuN_4month_vw_peaks_merged.bed

echo 'track name="Olig2_H3K27Ac_peaks" description="Olig2 Nuclei H3K27Ac regions"' \
| cat - ./bed_files/Olig2_4month_vw_peaks_merged.bed \
> temp && mv temp ./bed_files/Olig2_4month_vw_peaks_merged.bed

echo 'track name="PU1_H3K27Ac_peaks" description="PU1 Nuclei H3K27Ac regions"' \
| cat - ./bed_files/PU1_4month_vw_peaks_merged.bed \
> temp && mv temp ./bed_files/PU1_4month_vw_peaks_merged.bed

echo 'track name="Sox9_H3K27Ac_peaks" description="Sox9 Nuclei H3K27Ac regions"' \
| cat - ./bed_files/SOX9_4month_vw_peaks_merged.bed \
> temp && mv temp ./bed_files/SOX9_4month_vw_peaks_merged.bed
