Hunter Bennett | Glass Lab | Brain Aging Project | 19 Feb 2021

This script takes a basic look at the quality control statistics of H3K27Ac ChIP-seq libraries. Mainly we look at clonality, total reads, mapping efficiency, and IP efficiency (call variable peaks for a quick and dirty assessment of IP efficiency). This script also generates a UCSC browser hub for visualization of data to aid in sample seletion based on ChIP quality.

In [24]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 200
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
import sys
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set working directory

In [25]:
dataDirectory = '/data/mm10/Brain_MPSIIIA/ChIP/H3K27AC/PU_1/WT/'
workingDirectory = '/home/h1bennet/brain_aging/results/00_PU1_H3K27Ac_QC/'
if not os.path.isdir(workingDirectory):
    os.mkdir(workingDirectory)
os.chdir(workingDirectory)

# Tag Directory Quality control

In [26]:
qc = ngs_qc(dataDirectory, 'atac')

/data/mm10/Brain_MPSIIIA/ChIP/H3K27AC/PU_1/WT//
./WT_qc/


<Figure size 432x288 with 0 Axes>

In [27]:
qc

Unnamed: 0,uniquePositions,fragmentLengthEstimate,tagsPerBP,clonality,GC_Content,totalReads,uniquelyMappedReads,multiMappedReads,unmappedReads,uniquelyMappedFraction,mappedFraction,frac_unmappedReads_mismatch,frac_unmappedReads_short,frac_unmappedReads_other
03_mouse_BL6_M_9week_PU1_ChIP_H3K27ac_1_JOS_20190809_CTTGTA,7042900.0,80.0,0.005611,2.172,2726148000.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0
05_mouse_MPSIIIAhet_M_4month_PU1_ChIP_H3K27ac_1_AL_20200925_AGGTTATA_CAGTTCCG,6322557.0,81.0,0.002761,1.19,0.458,10165475.0,7501594.0,1467722.0,1196159.0,0.737948,0.882331,,,
05_mouse_MPSIIIAhet_M_4month_PU1_ChIP_H3K27ac_2_AL_20200925_TCTGTTGG_TCGAATGG,7245475.0,80.0,0.00357,1.343,0.456,13473231.0,9720989.0,2012760.0,1739482.0,0.721504,0.870893,,,
05_mouse_MPSIIIAhet_M_4month_PU1_ChIP_H3K27ac_3_AL_20201111_CTGCTTCC_GATAGATC,2764577.0,88.0,0.002688,2.65,0.449,10167435.0,7173124.0,1588453.0,1405858.0,0.7055,0.861729,,,
06_mouse_MPSIIIAhet_M_P240_PU1_ChIP_H3K27ac_1_AL_20191226_ATTCCT,9898629.0,114.0,0.024253,6.68,2726316000.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0
06_mouse_MPSIIIAhet_M_P240_PU1_ChIP_H3K27ac_2D_JOS_20191122_CTTGTA,4367243.0,82.0,0.001775,1.108,0.418,,,,,,,,,
06_mouse_MPSIIIAhet_M_P240_PU1_ChIP_H3K27ac_2_AL_20191122_CTTGTA,8782138.0,83.0,0.007779,2.415,2726097000.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0
07_mouse_BL6_M_23month_PU1_ChIP_H3K27ac_1_AL_20201121_ATGTAAGT_ACTCTATG,11940205.0,185.0,0.004804,1.097,0.418,20380059.0,12957321.0,4517077.0,2905661.0,0.635784,0.857426,,,
07_mouse_BL6_M_25month_PU1_ChIP_H3K27ac_1_JOS_20191018_ACTTGA,7872907.0,161.0,0.016188,5.606,2726156000.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0
07_mouse_BL6_M_25month_PU1_ChIP_H3K27ac_2_JOS_20191018_AGTTCC,10370080.0,87.0,0.013013,3.421,2725987000.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0


### Plot tag count distribution

In [28]:
# tds = glob.glob(dataDirectory+'/*')
# tds = np.sort(tds)

# fig, axs = plt.subplots(2,3, figsize=(15, 10), sharex=True, sharey=True)

# for ax, td in zip(axs.flatten(), tds):
#     df = pd.read_csv(td+'/tagCountDistribution.txt', sep='\t', index_col=0)
#     df.loc[1:10, :].plot.bar(ax=ax, legend=False)
#     ax.set_xlabel('Tags per position')
#     ax.set_ylabel('Fraction of Positions')
#     ax.set_title(td.split('/')[-1].split('_AL')[0], fontsize=8)

# make browser hub

Browser hub naming strategy (CapitalizeFirstLetters):  
hrb_project_qc/viz_celltype_ChIPTarget/input

Browser color strategy:  
* QC:
    * Sox9: 99,99,99
    * Olig2: 49,163,84
    * NeuN: 222,45,38
    * PU1: 49,130,189
* Visualize: TBD

In [10]:
np.sort(os.listdir(dataDirectory))

array(['00_mouse_BL6_M_10day_NeuN_ChIP_H3K27ac_1_AL_20200610_GCCTAGCC_TTGGTCTG',
       '00_mouse_BL6_M_12day_NeuN_ChIP_H3K27ac_1_AL_20201121_GTCGGAGC_GGTTATAA',
       '00_mouse_BL6_M_12day_NeuN_ChIP_H3K27ac_2_AL_20201121_ATGAGGCC_GTTAATTG',
       '02_mouse_BL6_M_3week_NeuN_ChIP_H3K27ac_1_AL_l20200925_TGGCCGGT_TAGAGCGC',
       '02_mouse_BL6_M_3week_NeuN_ChIP_H3K27ac_2_AL_20201121_TCCAACGC_TTGGACTT',
       '03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_1_JOS_20190801_ACTGAT',
       '03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_2_JOS_20191009_GTAGAG',
       '03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_3_AL_20191226_AGTCAA',
       '03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_4_AL_20191226_GTCCGC',
       '03_mouse_BL6_M_9week_NeuN_ChIP_H3K27ac_1_JOS_20190809_CGATGT',
       '04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_1_AL_l20200925_ACAGGCGC_CTCTGCCT',
       '04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_2_AL_l20200925_GAACCGCG_TGACCTTA',
       '04_mouse_MPSIIIAhet_M_4month_NeuN_ChIP_H3K27ac_3_AL_l

In [6]:
makeMultiWigHub.pl hrb_BrainAging_QC_NeuN_H3K27Ac mm10 \
-gradient 158,202,225 8,81,156 \
-force -d /data/mm10/Brain_MPSIIIA/ChIP/H3K27AC/NeuN/WT/*

SyntaxError: invalid syntax (<ipython-input-6-21c86d5cec7e>, line 1)

 Based on the browser tracks and the qc for these samples this experiment will probably need to be redone.

# Find peaks

In [29]:
if not os.path.isdir('./peak_files/'):
    os.mkdir('./peak_files/')

In [30]:
# set input directory
inputdir = "/data/mm10/Brain_MPSIIIA/ChIP/input/PU1/02_mouse_MPSIIIAhet_M_4month_PU1_ChIP_input_AL_l20200925_ATCCACTG_AGGTGCGT/"

with open('./peakCalling_homer.sh', 'w') as f:
    for tagdir in np.sort(os.listdir(dataDirectory)):
        find_peaks_fw = ['findPeaks', dataDirectory + '/' + tagdir,
                          '-i', inputdir,
                          '-size 1000 -minDist 2500',
                          '-o',
                          workingDirectory + '/peak_files/' + tagdir + '_fixedwidth_peaks.tsv',
                          '&', '\n']
        find_peaks = ['findPeaks', dataDirectory + '/' + tagdir,
                          '-i', inputdir, '-region',
                          '-size 1000 -minDist 2500',
                          '-o',
                          workingDirectory + '/peak_files/' + tagdir + '_variablewidth_peaks.tsv',
                          '&', '\n']
        # write commands to file
        f.write(' '.join(find_peaks_fw))
        f.write(' '.join(find_peaks))
    f.close()

In [10]:
peak_stats={}

for td in os.listdir('./peak_files/'):

    peak_dict = {'total_peaks': 0,
                 'ip_eff': 0.0,
                 'peaks_filt_input': 0,
                 'peaks_filt_local': 0,
                 'peaks_filt_clonal':0}

    with open('./peak_files/'+td, 'r') as f:
        for line in f.readlines():
            if "total peaks" in line:
                peak_dict['total_peaks'] = int(line.split(' = ')[-1])
            if "Approximate IP efficiency" in line:
                peak_dict['ip_eff'] = float(line.replace('%\n', '').split(' = ')[-1])
            if "Putative peaks filtered by input" in line:
                peak_dict['peaks_filt_input'] = int(line.split(' = ')[-1])
            if "Putative peaks filtered by local signal" in line:
                peak_dict['peaks_filt_local'] = int(line.split(' = ')[-1])
            if "Putative peaks filtered for being too clonal" in line:
                peak_dict['peaks_filt_clonal'] = int(line.split(' = ')[-1])
                
    peak_stats[td] = peak_dict
            
df = pd.DataFrame(peak_stats).T
df.sort_index(inplace=True)

In [16]:
df.loc[df.index.str.contains('variable'), :]

Unnamed: 0,total_peaks,ip_eff,peaks_filt_input,peaks_filt_local,peaks_filt_clonal
00_mouse_BL6_M_10day_NeuN_ChIP_H3K27ac_1_AL_20200610_GCCTAGCC_TTGGTCTG_variablewidth_peaks.tsv,18474.0,22.52,18.0,0.0,14.0
00_mouse_BL6_M_12day_NeuN_ChIP_H3K27ac_1_AL_20201121_GTCGGAGC_GGTTATAA_variablewidth_peaks.tsv,24392.0,18.09,643.0,0.0,0.0
00_mouse_BL6_M_12day_NeuN_ChIP_H3K27ac_2_AL_20201121_ATGAGGCC_GTTAATTG_variablewidth_peaks.tsv,38197.0,31.29,1416.0,0.0,0.0
02_mouse_BL6_M_3week_NeuN_ChIP_H3K27ac_1_AL_l20200925_TGGCCGGT_TAGAGCGC_variablewidth_peaks.tsv,44062.0,40.58,2477.0,0.0,2.0
02_mouse_BL6_M_3week_NeuN_ChIP_H3K27ac_2_AL_20201121_TCCAACGC_TTGGACTT_variablewidth_peaks.tsv,41077.0,37.31,1864.0,0.0,1.0
03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_1_JOS_20190801_ACTGAT_variablewidth_peaks.tsv,18556.0,49.31,1.0,0.0,14.0
03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_2_JOS_20191009_GTAGAG_variablewidth_peaks.tsv,19367.0,21.11,722.0,0.0,0.0
03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_3_AL_20191226_AGTCAA_variablewidth_peaks.tsv,15937.0,16.94,472.0,0.0,0.0
03_mouse_BL6_M_8week_NeuN_ChIP_H3K27ac_4_AL_20191226_GTCCGC_variablewidth_peaks.tsv,20467.0,28.23,530.0,0.0,0.0
03_mouse_BL6_M_9week_NeuN_ChIP_H3K27ac_1_JOS_20190809_CGATGT_variablewidth_peaks.tsv,27841.0,55.79,1311.0,0.0,0.0


### Write to files

In [17]:
df.loc[df.index.str.contains('variable'), :].to_csv('./variablewidth_peak_table.txt', sep='\t')
df.loc[df.index.str.contains('fixed'), :].to_csv('./fixedwidth_peak_table.txt', sep='\t')
# df.loc[df.index.str.contains('factor'), :].to_csv('./factor_peak_table.txt', sep='\t') 