Hunter Bennett | Glass Lab | Kupffer Strains Project | 19 April 2021

In addition to calling differential peaks, we want to examine the mutational burden in enhancers with different log2 fold changes (log2fc) between strains (similar to the analysis done in our other strains papers).

In [4]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
# Global plotting defaults: DPI for saved figures, then seaborn font scale,
# context and style.
matplotlib.rcParams['savefig.dpi'] = 200
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
import sys
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from plotting_scripts import label_point, pca_rpkm_mat, get_diff_volcano
from homer_preprocessing import read_annotated_peaks, import_homer_diffpeak, pull_comparisons_get_diff

In [5]:
# Results for this notebook are written relative to workingDirectory; every
# './...' path used in later cells resolves against it after the chdir below.
dataDirectory = ''
workingDirectory = '/home/h1bennet/strains/results/06_Strains_Control_Cohort2_ATAC/'
# makedirs creates any missing parent directories and, with exist_ok=True,
# is a no-op when the directory already exists (safe to re-run).
os.makedirs(workingDirectory, exist_ok=True)
os.chdir(workingDirectory)

In [10]:
# Paths of the replicates kept for this analysis: four AJ, four BALB/cJ and
# four C57BL/6J entries. The AJ and BALB/cJ entries carry a
# '.bowtie2_shifted_from_<STRAIN>.sam' suffix; the C57BL/6J entries do not.
# These strings are joined with '|' in a later cell and matched against the
# column names of peak_mat, so each entry must appear inside exactly the
# column it is meant to select.
best_reps = [
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_aj_Male_Kupffer_ATAC_Chow_healthyoung_AJ1_TDT_l20210213_GTAGAGAG_CTATTAAG_S11_L001_trim30.aj.bowtie2_shifted_from_AJ.sam',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_aj_Male_Kupffer_ATAC_Chow_healthyoung_AJ2_TDT_l20210213_CAGCCTCG_AAGGCTAT_S12_L001_trim30.aj.bowtie2_shifted_from_AJ.sam',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_aj_Male_Kupffer_ATAC_Chow_healthyoung_AJ3_TDT_l20210213_TGCCTCTT_GAGCCTTA_S13_L001_trim30.aj.bowtie2_shifted_from_AJ.sam',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_aj_Male_Kupffer_ATAC_Chow_healthyoung_AJ4_TDT_l20210213_TCCTCTAC_TTATGCGA_S14_L001_trim30.aj.bowtie2_shifted_from_AJ.sam',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_balbcj_Male_Kupffer_ATAC_Chow_healthyoung_BALB1_TDT_l20210213_TCATGAGC_CTCTCTAT_S15_L001_trim30.balbcj.bowtie2_shifted_from_BALBCJ.sam',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_balbcj_Male_Kupffer_ATAC_Chow_healthyoung_BALB2_TDT_l20210213_CCTGAGAT_TATCCTCT_S16_L001_trim30.balbcj.bowtie2_shifted_from_BALBCJ.sam',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_balbcj_Male_Kupffer_ATAC_Chow_healthyoung_BALB3_TDT_l20210213_TAGCGAGT_GTAAGGAG_S17_L001_trim30.balbcj.bowtie2_shifted_from_BALBCJ.sam',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_balbcj_Male_Kupffer_ATAC_Chow_healthyoung_BALB4_TDT_l20210213_GTAGCTCC_ACTGCATA_S18_L001_trim30.balbcj.bowtie2_shifted_from_BALBCJ.sam',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_c57bl6j_Male_Kupffer_ATAC_Chow_healthyoung_C571_TDT_l20210213_TACTACGC_AAGGAGTA_S19_L001_trim30',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_c57bl6j_Male_Kupffer_ATAC_Chow_healthyoung_C572_TDT_l20210213_AGGCTCCG_CTAAGCCT_S20_L001_trim30',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_c57bl6j_Male_Kupffer_ATAC_Chow_healthyoung_C573_TDT_l20210213_GCAGCGTA_CGTCTAAT_S21_L001_trim30',
       '/home/h1bennet/strains/data/ATAC/control_cohort2/tag_mouse_c57bl6j_Male_Kupffer_ATAC_Chow_healthyoung_C574_TDT_l20210213_CTGCGCAT_TCTCTCCG_S22_L001_trim30']

# Save differential peaks for MMARGE mutational burden analysis.

In [11]:
# Load the differential-peak output together with the annotated merged peak
# file via the project helper import_homer_diffpeak (homer_preprocessing).
# The helper returns four objects, unpacked here as the differential table,
# the annotated peaks, the peak matrix and a fourth matrix that is presumably
# quantile-normalized (name-based guess; confirm against the helper).
diff_peak, peaks, peak_mat, peak_mat_quant = import_homer_diffpeak(
    './merged_peaks/diff_output.txt',
    './merged_peaks/ann_norm_idr_peaks_merged.txt')

annotatePeaks all peaks (86301, 30)
getDiffExpression selected transcripts (84264, 39)
annotatePeaks selected peaks (84264, 30)


In [16]:
# pull out selected samples
# Number of leading columns of `peaks` that are always kept.
# NOTE(review): assumes these 18 columns are the annotation columns that
# precede the per-sample columns, and that the remaining columns of `peaks`
# line up one-to-one with the columns of `peak_mat` - confirm.
n_annotation_cols = 18

# Escape each path so regex metacharacters in it (e.g. '.') match literally
# rather than matching any character.
rep_pattern = '|'.join(re.escape(rep) for rep in best_reps)

# Compute the sample mask once, before peak_mat is overwritten below.
is_best_rep = peak_mat.columns.str.contains(rep_pattern)

# Mask for `peaks`: keep every leading annotation column plus the selected samples.
tst = np.append(np.ones(n_annotation_cols, dtype=bool), is_best_rep)
cols = tst.copy()  # kept so the name `cols` stays defined; identical to tst

peaks = peaks.loc[:, tst]
peak_mat = peak_mat.loc[:, is_best_rep]

In [17]:
# Split the differential table into one entry per pairwise strain comparison
# using the project helper pull_comparisons_get_diff. Keys look like
# 'aj vs. c57bl6j'; each value is later indexed by 'adj_pval' and 'log2fc'.
comp_dict = pull_comparisons_get_diff(diff_peak, seq_type='Peak')

In [18]:
# Show which pairwise comparisons were found.
comp_dict.keys()

dict_keys(['aj vs. balbcj', 'aj vs. c57bl6j', 'balbcj vs. c57bl6j'])

In [24]:
# Output directory for the per-comparison peak files and their MMARGE
# annotations. exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of isdir() followed by mkdir().
os.makedirs('./marge_mutational_burden/', exist_ok=True)

In [36]:
pval = 0.05  # cutoff applied to the 'adj_pval' column
fcs = [1,2,4]  # cutoffs applied to the absolute value of the 'log2fc' column
peak_list_dict = {}  # output label -> index of peak IDs belonging to that set

In [37]:
# Build the peak sets for every pairwise comparison:
#   '<comparison>_de_peaks_fc_<fc>' : adj_pval < pval  and |log2fc| >= fc
#   '<comparison>_nonsig'           : adj_pval >= pval and |log2fc| <  fcs[-1]
for key, df in comp_dict.items():
    # select get diff data frame columns once per comparison.
    adj_pval = df.loc[:, 'adj_pval']
    abs_log2fc = np.abs(df.loc[:, 'log2fc'])

    # significant peaks at each fold-change cutoff
    for fc in fcs:
        deg = df.index[(adj_pval < pval) & (abs_log2fc >= fc)]
        peak_list_dict[key+'_de_peaks_fc_'+str(fc)] = deg

    # add list for nonsig peaks
    # There is a single '_nonsig' entry per comparison. It was previously
    # rebuilt and overwritten for every cutoff, so only the LAST cutoff in
    # `fcs` ever took effect; that cutoff is now used explicitly.
    # NOTE(review): confirm the last (largest) cutoff is the intended bound
    # for the non-significant background set.
    if fcs:
        nondeg = df.index[(adj_pval >= pval) & (abs_log2fc < fcs[-1])]
        peak_list_dict[key+'_nonsig'] = nondeg

If np.NaN values get introduced into the matrix, pandas converts 'int' columns to 'float' columns. This is not ideal and interferes with downstream peak analysis, so we create a dict to convert the Start and End columns back to integers just in case.

In [38]:
# dtypes to enforce on the coordinate columns before writing
convert_dict = dict(Start=int, End=int)

for key, peak_ids in peak_list_dict.items():
    # one tab-separated file per peak set, named after the set with
    # ' vs. ' replaced by '_vs_'
    out_path = './marge_mutational_burden/' + key.replace(' vs. ', '_vs_') + '.txt'

    # look the peak IDs up in diff_peak, drop rows that are entirely missing,
    # and keep only the first four columns
    peak_coords = diff_peak.reindex(peak_ids).dropna(how='all')
    peak_coords = peak_coords.iloc[:, :4].astype(convert_dict)

    peak_coords.to_csv(out_path, sep='\t')

Annotate mutations with MMARGE

In [42]:
def annotate_mutations_command(peakfile, strain):
    """Build one MMARGE ``annotate_mutations`` command line.

    Parameters
    ----------
    peakfile : str
        Path of the peak file passed to ``-file``. The ``-output`` path is the
        same path with '.txt' replaced by '_anno_muts.txt'.
    strain : str
        Value passed to ``-ind``.

    Returns
    -------
    str
        The command followed by two newlines, so that consecutive commands
        are separated by a blank line in the generated script.
    """
    return ' '.join(['MMARGE.pl annotate_mutations', '-file', peakfile,
                     '-output', peakfile.replace('.txt', '_anno_muts.txt'),
                     '-ind', strain]) + '\n\n'


# (glob pattern for the peak files, strain passed to -ind)
# NOTE(review): the aj-vs-balbcj files are annotated with '-ind balbcj' only -
# confirm this is the intended individual for that comparison.
annotation_jobs = [
    ('./marge_mutational_burden/aj*c57*.txt', 'aj'),
    ('./marge_mutational_burden/balbcj*c57*.txt', 'balbcj'),
    ('./marge_mutational_burden/aj*balbcj*.txt', 'balbcj'),
]

with open('./marge_annotate_mutations.sh', 'w') as f:
    for pattern, strain in annotation_jobs:
        # sorted() makes the order of commands in the script deterministic
        for peakfile in sorted(glob.glob(pattern)):
            # The patterns also match previously generated '*_anno_muts.txt'
            # outputs; skip them so a re-run does not annotate annotations.
            if peakfile.endswith('_anno_muts.txt'):
                continue
            f.write(annotate_mutations_command(peakfile, strain))

Calculate number of mutations in each file

In [65]:
# List the annotation output files present in the output directory, sorted by name.
np.sort(glob.glob('./marge_mutational_burden/*anno_muts.txt'))

array(['./marge_mutational_burden/aj_vs_c57bl6j_de_peaks_fc_1_anno_muts.txt',
       './marge_mutational_burden/aj_vs_c57bl6j_de_peaks_fc_2_anno_muts.txt',
       './marge_mutational_burden/aj_vs_c57bl6j_de_peaks_fc_4_anno_muts.txt',
       './marge_mutational_burden/aj_vs_c57bl6j_nonsig_anno_muts.txt',
       './marge_mutational_burden/balbcj_vs_c57bl6j_de_peaks_fc_1_anno_muts.txt',
       './marge_mutational_burden/balbcj_vs_c57bl6j_de_peaks_fc_2_anno_muts.txt',
       './marge_mutational_burden/balbcj_vs_c57bl6j_de_peaks_fc_4_anno_muts.txt',
       './marge_mutational_burden/balbcj_vs_c57bl6j_nonsig_anno_muts.txt'],
      dtype='<U71')

In [60]:
# Scratch lookup of round()'s signature (round is used when computing the
# mutated fraction); not part of the analysis and can be removed before sharing.
help(round)

Help on built-in function round in module builtins:

round(number, ndigits=None)
    Round a number to a given precision in decimal digits.
    
    The return value is an integer if ndigits is omitted or None.  Otherwise
    the return value has the same type as the number.  ndigits may be negative.



In [81]:
def mutated_fraction(mutfile, ndigits=3):
    """Return the fraction of rows whose last column is non-null.

    Parameters
    ----------
    mutfile : pandas.DataFrame
        Table read from an annotation output file. Only the last column is
        inspected; it is presumably the mutation annotation added by MMARGE
        (confirm against the file layout).
    ndigits : int, optional
        Number of decimal digits to round to (default 3).

    Returns
    -------
    float
        (rows with a non-null last column) / (all rows), rounded to
        ``ndigits``; NaN when the table has no rows.
    """
    last_col = mutfile.iloc[:, -1]
    total_peaks = last_col.shape[0]
    if total_peaks == 0:
        # avoid a 0/0 division (and its runtime warning) for empty files
        return float('nan')
    nomut = last_col.isna().sum()
    return round((total_peaks - nomut) / total_peaks, ndigits=ndigits)


# file name -> fraction of peaks with a non-null last column
mut_frac_dict = {}
for peakfile in sorted(glob.glob('./marge_mutational_burden/*anno_muts.txt')):
    mutfile = pd.read_csv(peakfile, index_col=0, sep='\t')
    mut_frac_dict[os.path.basename(peakfile)] = mutated_fraction(mutfile)

In [82]:
# Fraction of peaks in each annotation file whose last column is non-null,
# one entry per file.
pd.Series(mut_frac_dict)

aj_vs_c57bl6j_de_peaks_fc_1_anno_muts.txt        0.552
aj_vs_c57bl6j_de_peaks_fc_2_anno_muts.txt        0.641
aj_vs_c57bl6j_de_peaks_fc_4_anno_muts.txt        0.654
aj_vs_c57bl6j_nonsig_anno_muts.txt               0.166
balbcj_vs_c57bl6j_de_peaks_fc_1_anno_muts.txt    0.491
balbcj_vs_c57bl6j_de_peaks_fc_2_anno_muts.txt    0.593
balbcj_vs_c57bl6j_de_peaks_fc_4_anno_muts.txt    0.641
balbcj_vs_c57bl6j_nonsig_anno_muts.txt           0.148
dtype: float64

This can be put into a fancy table for publication