Hunter Bennett | Glass Lab | Kupffer Strains Project | 25 Feb 2021  

In [109]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 200
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
import sys
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from plotting_scripts import label_point, pca_rpkm_mat
from homer_preprocessing import read_annotated_peaks

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Set working directory

In [110]:
# Root of the sequencing data and the directory holding the input tag directories
dataDirectory = '/data/scratch/nspann/'
inputDirectory = '/data/scratch/nspann/NJS_input_210204/tag_directories'
# Every relative path used later in the notebook resolves against this directory
workingDirectory = '/home/h1bennet/strains/results/99_TFE_Family_ChIP_PeakCalling_test'
# makedirs also creates missing parent directories and is a no-op when the
# directory already exists, so this cell works on a fresh checkout and on re-run
os.makedirs(workingDirectory, exist_ok=True)
os.chdir(workingDirectory)


# We will start by using Ouyang's script to get the IDR peaks for each condition
____
The sample definition files this script requires are rather annoying to make, in my opinion, but for the sake of consistency with Ty we will use them going forward. Their structure is as follows:
* Column 1: Group ID
* Column 2: Color for analysis
* Column 3: absolute path to files on epiglass for each sample in group, separated by semicolon
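* Column 4: corresponding short name for each sample in group, separated by semicolon
* Column 5: absolute path to the input tag directory for each sample in group, separated by semicolon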

Colors
* C57Bl6/J: #4daf4a
* Balbc/J: #3773b8
* AJ: #e41a1c

The output of peakIDR.R includes many things; the main ones we use are the final .idr peak files for each group.


In [111]:
# MITF tag directories: one sorted array of matching paths per glob pattern.
# Variable suffixes mirror the pattern tokens: first letter v = veh, j = JNKin;
# second letter v = veh, l = LPS, m = MSU.
mitf_tag_dirs = dataDirectory + 'NJS_AM91201MTIF_210204/tag_directories/'
mitf_vv = np.sort(glob.glob(mitf_tag_dirs + '*veh*veh*'))
mitf_vl = np.sort(glob.glob(mitf_tag_dirs + '*veh*LPS*'))
mitf_vm = np.sort(glob.glob(mitf_tag_dirs + '*veh*MSU*'))
mitf_jv = np.sort(glob.glob(mitf_tag_dirs + '*JNKin*veh*'))
mitf_jl = np.sort(glob.glob(mitf_tag_dirs + '*JNKin*LPS*'))
mitf_jm = np.sort(glob.glob(mitf_tag_dirs + '*JNKin*MSU*'))

In [112]:
# TfeB tag directories: one sorted array of matching paths per glob pattern.
# Variable suffixes mirror the pattern tokens: first letter v = veh, j = JNKin;
# second letter v = veh, l = LPS, m = MSU.
tfeb_tag_dirs = dataDirectory + 'NJS_mixTfeB_210204/tag_directories/'
tfeb_vv = np.sort(glob.glob(tfeb_tag_dirs + '*veh*veh*'))
tfeb_vl = np.sort(glob.glob(tfeb_tag_dirs + '*veh*LPS*'))
tfeb_vm = np.sort(glob.glob(tfeb_tag_dirs + '*veh*MSU*'))
tfeb_jv = np.sort(glob.glob(tfeb_tag_dirs + '*JNKin*veh*'))
tfeb_jl = np.sort(glob.glob(tfeb_tag_dirs + '*JNKin*LPS*'))
tfeb_jm = np.sort(glob.glob(tfeb_tag_dirs + '*JNKin*MSU*'))

In [113]:
# TfeC tag directories: one sorted array of matching paths per glob pattern.
# Variable suffixes mirror the pattern tokens: first letter v = veh, j = JNKin;
# second letter v = veh, l = LPS, m = MSU.
tfec_tag_dirs = dataDirectory + 'NJS_mixTfeC_210204/tag_directories/'
tfec_vv = np.sort(glob.glob(tfec_tag_dirs + '*veh*veh*'))
tfec_vl = np.sort(glob.glob(tfec_tag_dirs + '*veh*LPS*'))
tfec_vm = np.sort(glob.glob(tfec_tag_dirs + '*veh*MSU*'))
tfec_jv = np.sort(glob.glob(tfec_tag_dirs + '*JNKin*veh*'))
tfec_jl = np.sort(glob.glob(tfec_tag_dirs + '*JNKin*LPS*'))
tfec_jm = np.sort(glob.glob(tfec_tag_dirs + '*JNKin*MSU*'))

In [114]:
# Tfe3 tag directories: one sorted array of matching paths per glob pattern.
# Variable suffixes mirror the pattern tokens: first letter v = veh, j = JNKin;
# second letter v = veh, l = LPS, m = MSU.
tfe3_tag_dirs = dataDirectory + 'NJS_mixTfe3_210204/tag_directories/'
tfe3_vv = np.sort(glob.glob(tfe3_tag_dirs + '*veh*veh*'))
tfe3_vl = np.sort(glob.glob(tfe3_tag_dirs + '*veh*LPS*'))
tfe3_vm = np.sort(glob.glob(tfe3_tag_dirs + '*veh*MSU*'))
tfe3_jv = np.sort(glob.glob(tfe3_tag_dirs + '*JNKin*veh*'))
tfe3_jl = np.sort(glob.glob(tfe3_tag_dirs + '*JNKin*LPS*'))
tfe3_jm = np.sort(glob.glob(tfe3_tag_dirs + '*JNKin*MSU*'))

In [115]:
!ls /data/scratch/nspann/NJS_input_210204/tag_directories/

mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_JNKin_1h_LPS_5h_16i_NJS_l20210128_GCCGCAAC_TCATGTCT
mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_JNKin_1h_MSU_5h_13i_NJS_l20210128_TTATATCT_GTCTGCAC
mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_JNKin_1h_veh_1h_4i_NJS_l20210128_ACTGTATC_TGCTATTA
mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_veh_1h_LPS_5h_7i_NJS_l20210128_CTGTGGCG_CACTAGCC
mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_veh_1h_MSU_5h_10i_NJS_l20210128_TGTAATCA_AATCTCCA
mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_veh_1h_veh_5h_1i_NJS_l20210128_CTACCAGG_CAACTGAT


In [116]:
# Semicolon-joined tag directory paths, one string per group
# (the TfeC groups are commented out).
paths = [';'.join(tag_dirs) for tag_dirs in (
    mitf_vv, mitf_vl, mitf_jv, mitf_jl,
    tfeb_vv, tfeb_vl, tfeb_jv, tfeb_jl,
    # tfec_vv, tfec_vl, tfec_jv, tfec_jl,
    tfe3_vv, tfe3_vl, tfe3_jv, tfe3_jl,
)]

In [117]:
# Input tag directory for each condition, built as inputDirectory/<directory name>.
# Suffixes follow the same vv / vl / jv / jl naming as the ChIP groups.
input_vv = '/'.join([
    inputDirectory,
    'mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_veh_1h_veh_5h_1i_NJS_l20210128_CTACCAGG_CAACTGAT',
])
input_vl = '/'.join([
    inputDirectory,
    'mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_veh_1h_LPS_5h_7i_NJS_l20210128_CTGTGGCG_CACTAGCC',
])
input_jv = '/'.join([
    inputDirectory,
    'mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_JNKin_1h_veh_1h_4i_NJS_l20210128_ACTGTATC_TGCTATTA',
])
input_jl = '/'.join([
    inputDirectory,
    'mouse_C57Bl6_M_BMDM_RLNR1ChIP_input_JNKin_1h_LPS_5h_16i_NJS_l20210128_GCCGCAAC_TCATGTCT',
])

In [118]:
# Group IDs, one per (factor, condition). The order must match `groups` below.
ids = ['mitf_bmdm_veh1h_veh5h',
       'mitf_bmdm_veh1h_LPS5h',
       'mitf_bmdm_JNKin1h_veh5h',
       'mitf_bmdm_JNKin1h_LPS5h',
       'tfeb_bmdm_veh1h_veh5h',
       'tfeb_bmdm_veh1h_LPS5h',
       'tfeb_bmdm_JNKin1h_veh5h',
       'tfeb_bmdm_JNKin1h_LPS5h',
#        'tfec_bmdm_veh1h_veh5h',
#        'tfec_bmdm_veh1h_LPS5h',
#        'tfec_bmdm_JNKin1h_veh5h',
#        'tfec_bmdm_JNKin1h_LPS5h',
       'tfe3_bmdm_veh1h_veh5h',
       'tfe3_bmdm_veh1h_LPS5h',
       'tfe3_bmdm_JNKin1h_veh5h',
       'tfe3_bmdm_JNKin1h_LPS5h']

# Tag directory arrays for each group, in the same order as `ids`.
groups = [mitf_vv, mitf_vl, mitf_jv, mitf_jl,
          tfeb_vv, tfeb_vl, tfeb_jv, tfeb_jl,
#           tfec_vv, tfec_vl, tfec_jv, tfec_jl,
          tfe3_vv, tfe3_vl, tfe3_jv, tfe3_jl]

# set colors iteratively: one colormap per factor, n shades per colormap
n = 4
cmaps = 'Reds', 'Blues', 'Greens'
colors = []
for c in cmaps:
    # plt.get_cmap is available across matplotlib versions, whereas
    # matplotlib.cm.get_cmap was deprecated in 3.7 and removed in 3.9
    cmap = plt.get_cmap(c)
    for i in np.linspace(0.4, 0.9, num = n):
        colors.append(matplotlib.colors.rgb2hex(cmap(i)))

# One input tag directory per condition; the same four conditions repeat for each factor.
condition_inputs = [input_vv, input_vl, input_jv, input_jl] * len(cmaps)

# Fail loudly if the columns are misaligned; building the DataFrame would
# otherwise pad the shorter ones with None without any warning.
assert len(ids) == len(groups) == len(colors) == len(condition_inputs), \
    'ids, groups, colors and inputs must all have one entry per group'

# Semicolon-joined tag directory paths, one string per group.
paths = [';'.join(tag_dirs) for tag_dirs in groups]

# Repeat the input and number the short names once per replicate actually found,
# so every column has the same number of ';'-separated entries as `paths`.
# For two-replicate groups this yields exactly 'input;input' and 'id_rep1;id_rep2'.
inputs = [';'.join([group_input] * len(tag_dirs))
          for group_input, tag_dirs in zip(condition_inputs, groups)]

short_ids = [';'.join('%s_rep%d' % (group_id, rep) for rep in range(1, len(tag_dirs) + 1))
             for group_id, tag_dirs in zip(ids, groups)]

# Report groups that do not have at least two replicates so they are easy to spot.
for group_id, tag_dirs in zip(ids, groups):
    if len(tag_dirs) < 2:
        print('WARNING: %s has %d replicate(s)' % (group_id, len(tag_dirs)))

In [119]:
# Assemble the sample definition table (one row per group, columns in the
# order ids, colors, paths, short_ids, inputs) and write it tab-separated,
# without header or index.
sample_def = pd.DataFrame([ids, colors, paths, short_ids, inputs]).transpose()
sample_def.to_csv('./sampleDef.txt', sep='\t', header=False, index=False)

Delete the tfeb_bmdm_veh1h_veh5h group from sampleDef.txt, since it has only one replicate.

Run the following commands in the terminal:

    source activate r-ouyangPipe

    peakIDR.R sampleDef.txt

In [121]:
%%bash
# make sure the destination directory for the IDR peak files exists
mkdir -p ./atac_peak_files
# move the IDR peak files into it
mv *bmdm*.idr ./atac_peak_files

### Merge peaks

In [124]:
# Create the output directory for merged peak files if it is not there yet
os.makedirs('./merged_peaks/', exist_ok=True)

In [125]:
%%bash
# make sure the output directory exists
mkdir -p ./merged_peaks

# merge the IDR peak files of each factor, requiring direct overlap (-d given)
for tf in mitf tfe3 tfeb; do
    mergePeaks -d given ./atac_peak_files/"${tf}"_bmdm* > "./merged_peaks/${tf}_bmdm_merged.txt"
done

	Max distance to merge: direct overlap required (-d given)
	Merging peaks... 
	Comparing ./atac_peak_files/mitf_bmdm_JNKin1h_LPS5h.idr (11588 total) and ./atac_peak_files/mitf_bmdm_JNKin1h_LPS5h.idr (11588 total)
	Comparing ./atac_peak_files/mitf_bmdm_JNKin1h_LPS5h.idr (11588 total) and ./atac_peak_files/mitf_bmdm_JNKin1h_veh5h.idr (16302 total)
	Comparing ./atac_peak_files/mitf_bmdm_JNKin1h_LPS5h.idr (11588 total) and ./atac_peak_files/mitf_bmdm_veh1h_LPS5h.idr (21156 total)
	Comparing ./atac_peak_files/mitf_bmdm_JNKin1h_LPS5h.idr (11588 total) and ./atac_peak_files/mitf_bmdm_veh1h_veh5h.idr (14446 total)
	Comparing ./atac_peak_files/mitf_bmdm_JNKin1h_veh5h.idr (16302 total) and ./atac_peak_files/mitf_bmdm_JNKin1h_LPS5h.idr (11588 total)
	Comparing ./atac_peak_files/mitf_bmdm_JNKin1h_veh5h.idr (16302 total) and ./atac_peak_files/mitf_bmdm_JNKin1h_veh5h.idr (16302 total)
	Comparing ./atac_peak_files/mitf_bmdm_JNKin1h_veh5h.idr (16302 total) and ./atac_peak_files/mitf_bmdm_veh1h_LPS5h.i