Hunter Bennett | Glass Lab | Kupffer Strains Project | 14 March 2023

Notebook for motif analysis with HOMER, used to assess differential ATAC-seq peaks

### Load packages, set plot parameters

In [1]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np

Set working directory  

In [2]:
workingDirectory = '/home/h1bennet/strains_github/results/Figure2_ATAC/' # user will need to set own wd
if not os.path.isdir(workingDirectory):
    os.mkdir(workingDirectory)
os.chdir(workingDirectory)

Define custom functions

In [3]:
def import_homer_diffpeak(depath, appath):
    '''This function takes in a getDiffExpression file created from raw tag counts
       and a peak tag file created with annotatePeaks
       and processes the files so that they contain the same set of genes and can be
       used for data visualization and analysis
       Accepts:
           depath (str): path to getDiffExpression file
           appath (str): path to annotatePeaks file
       Returns:
           de (pandas.DataFrame): modified getDiffExpression file
           ge (pandas.DataFrame): modified annotatePeaks,
           ge_mat (pandas.DataFrame): annotatePeaks file without annotation
           ge_mat_quatn (pandas.DataFrame) : annotatePeaks file without annotation
           and quantile normalized
       '''
    # import packages
    import pandas as pd
    import numpy as np

    # set autosomes for filtering
    autosomes = ['chr1', 'chr2', 'chr3', 'chr4',
                 'chr5', 'chr6', 'chr7', 'chr8',
                 'chr9', 'chr10', 'chr11', 'chr12',
                 'chr13', 'chr14', 'chr15', 'chr16',
                 'chr17', 'chr18', 'chr19']

    # import differential gene expression
    de = pd.read_csv(depath, sep='\t', index_col=0)
    de.index.rename('PeakID', inplace=True)
    de = de.loc[de.Chr.str.contains('|'.join(autosomes)), :]


    # import ge file
    ap = pd.read_csv(appath, sep='\t', index_col=0)
    ap.index.rename('PeakID', inplace=True)

    # select ge transcripts that are in diff_gene
    print('annotatePeaks all peaks', ap.shape)
    print('getDiffExpression selected transcripts', de.shape)
    ap = ap.loc[de.index.tolist(), :]
    print('annotatePeaks selected peaks', ap.shape)


    # return files
    return (de, ap, ap.iloc[:, 18:]) # also return count matrix without annotation

def pull_comparisons_get_diff(diff_gene, seq_type='Repeat'):
    '''This function pulls out comparisons from a diff gene file with multiple comparision groups
    and returns a dict of pandas DataFrames with one comparison each.

    Accepts:
        diff_gene (pandas.DataFrame): diff gene file processed to have
        genes as index and column of RefSeqIDs titled RepeatID
        seq_type (str): Repeat|Peak type of annotation file. repeat for RNA
        peak for ChIP/ATAC

    Returns:
    comp_dict (dict): dictionary of 1 pandas Data Frame per each comparison
    ''' 


    # import packages
    import pandas as pd
    import re
    
    if seq_type=='Repeat':
        # extract groups
        def subset_get_diff(diff_gene, comp):
            return diff_gene.loc[:, [seq_type+'ID',
                                     comp + ' Log2 Fold Change',
                                     comp + ' p-value',
                                     comp + ' adj. p-value']]

    if seq_type=='Peak':
        # exract groups
        def subset_get_diff(diff_gene, comp):
             return diff_gene.loc[:, ['Chr', 'Start', 'End',
                                      'Annotation',
                                      'Gene Name',
                                      'Distance to TSS',
                                       comp + ' Log2 Fold Change',
                                       comp + ' p-value',
                                       comp + ' adj. p-value']]
    
    comp_dict = {}
    pattern='(\w* vs. \w*).*'
    for col in diff_gene.columns.values:
        m = re.search(string=col, pattern=pattern)
        if m:
            df = subset_get_diff(diff_gene, m.group(1))
            if seq_type=='Repeat':
                df.columns = ['RepeatID', 'log2fc', 'pval', 'adj_pval']
            if seq_type=='Peak':
                df['location'] = df.Chr.astype(str)+':'+df.Start.astype(str)+'-'+df.End.astype(str)
                df.columns = ['Chr', 'Start', 'End',
                              'Annotation', 'gene', 'TSS_dist',
                              'log2fc', 'pval', 'adj_pval', 'location']
    
            comp_dict[re.sub('G0[0-9]_', '', m.group(1))] = df
    
    return comp_dict

###  Import differential peak analysis

In [4]:
diff_peak, peaks, peak_mat = import_homer_diffpeak(
    './diff_output.txt',
    './idr_peaks_atac_norm.txt')

# create dictionary of sub data-frames for each comparison
comp_dict = pull_comparisons_get_diff(diff_peak, seq_type='Peak')

annotatePeaks all peaks (84264, 30)
getDiffExpression selected transcripts (84264, 39)
annotatePeaks selected peaks (84264, 30)


### Process for motif calling with Homer

In [5]:
if not os.path.isdir('./motif_calling/'):
    os.mkdir('./motif_calling/')

First, select distal peak subset for distal motif calling. In this case we will use 3kb to differentiate between prozimal and distal enhancers

In [6]:
diff_peak_distal = diff_peak.loc[np.abs(diff_peak.loc[:, 'Distance to TSS'])>3000, :]

# print to check that this worked
print(diff_peak.shape[0], 'peaks in differential peak file')
print()
print(diff_peak_distal.shape[0], 'distal peaks in differential peak file')
# write out distal peaks for distal background
diff_peak_distal.to_csv(
    './diff_output_github.txt',
    sep='\t')


84264 peaks in differential peak file

56602 distal peaks in differential peak file


### Extract target peaks
For the three-way comparison we are ultimately interested in two peak set options.  
1. Intersection peaks: these are peaks that are specifically increased in one strain compared to two other strains.
2. Union peaks: these are peaks that are increased in one strain compared to one of the comparator strains but not the other.

Set cutoffs

In [7]:
# set pvalue and fc thresholds
fc = np.log2(2)
pval = 0.05

# initialize dict and list of strains
peak_lists = {}
bg_lists = {}
strains = ['aj',
         'balbcj',
         'c57bl6j']

In [8]:
for strain in strains:
    print('Analyzing', strain, '...\n')
    
    # set count
    count = 0
    
    for key in comp_dict.keys():
        
        # select get diff data frame.
        df = comp_dict[key]
        
        # check on counter
        print(count)
        
        # store pairwise differential
        # strain 1
        pairwise_key = key.replace(' vs. ', '_vs_')+'_'+key.split(' vs. ')[0]+'_up'
        sig_peaks = df.index[(df['adj_pval'] < pval) & (df['log2fc'] <= -fc)].to_list()
        peak_lists[pairwise_key] = list(sig_peaks)
        # strain 2
        pairwise_key = key.replace(' vs. ', '_vs_')+'_'+key.split(' vs. ')[1]+'_up'
        sig_peaks = df.index[(df['adj_pval'] < pval) & (df['log2fc'] >= fc)].to_list()
        peak_lists[pairwise_key] = list(sig_peaks)

        # initialize lists if count == 0
        if count == 0:
            # fold change direction depends on the location of the strain in the comparison.
            # if strain is in first position it is reference - so negative log2fc is higher in first position
            if strain in key.split(' vs. ')[0]:
                u = set(df.index[(df.loc[:, 'adj_pval'] <= pval) & (df.loc[:, 'log2fc'] < -fc)])
                i = set(df.index[(df.loc[:, 'adj_pval'] <= pval) & (df.loc[:, 'log2fc'] < -fc)])
                count = count + 1
            # if strain is in second position it is comparison - so positive log2fc is higher in second position
            elif strain in key.split(' vs. ')[1]:
                u = set(df.index[(df.loc[:, 'adj_pval'] <= pval) & (df.loc[:, 'log2fc'] > fc)])
                i = set(df.index[(df.loc[:, 'adj_pval'] <= pval) & (df.loc[:, 'log2fc'] > fc)])
                count = count + 1

        else:
            # fold change direction depends on the location of the strain in the comparison.
            # if strain is in first position it is reference - so negative log2fc is higher in first position
            if strain in key.split(' vs. ')[0]:
                u = u.union(df.index[(df.loc[:, 'adj_pval'] <= pval) & (df.loc[:, 'log2fc'] < -fc)])
                i = i.intersection(df.index[(df.loc[:, 'adj_pval'] <= pval) & (df.loc[:, 'log2fc'] < -fc)])
                count = count + 1
            # if strain is in second position it is comparison - so positive log2fc is higher in second position
            elif strain in key.split(' vs. ')[1]:
                u = u.union(df.index[(df.loc[:, 'adj_pval'] <= pval) & (df.loc[:, 'log2fc'] > fc)])
                i = i.intersection(df.index[(df.loc[:, 'adj_pval'] <= pval) & (df.loc[:, 'log2fc'] > fc)])
                count = count + 1

        
        # summarize iteration
        print('For comp:', key)
        print('Union set at:', len(u), 'peaks')
        print('Intersection set at:', len(i), 'peaks\n')
     
    
        # store lists
        peak_lists[strain+'_union'] = u
        peak_lists[strain+'_intersection'] = i

Analyzing aj ...

0
For comp: aj vs. balbcj
Union set at: 1386 peaks
Intersection set at: 1386 peaks

1
For comp: aj vs. c57bl6j
Union set at: 3871 peaks
Intersection set at: 593 peaks

2
For comp: balbcj vs. c57bl6j
Union set at: 3871 peaks
Intersection set at: 593 peaks

Analyzing balbcj ...

0
For comp: aj vs. balbcj
Union set at: 2174 peaks
Intersection set at: 2174 peaks

1
For comp: aj vs. c57bl6j
Union set at: 2174 peaks
Intersection set at: 2174 peaks

1
For comp: balbcj vs. c57bl6j
Union set at: 5468 peaks
Intersection set at: 790 peaks

Analyzing c57bl6j ...

0
For comp: aj vs. balbcj
Union set at: 5468 peaks
Intersection set at: 790 peaks

0
For comp: aj vs. c57bl6j
Union set at: 3100 peaks
Intersection set at: 3100 peaks

1
For comp: balbcj vs. c57bl6j
Union set at: 4654 peaks
Intersection set at: 1550 peaks



Save peaks for motif calling

In [9]:
# make sure that start and end coordinates are saved as integers
convert_dict = {'Start': int,
                'End': int}

In [10]:
for df, txt in zip([diff_peak, diff_peak_distal], ['', '_distal']):
    for key in peak_lists.keys():
        # save cell specific promoters
        tmp = df.reindex(peak_lists[key]).dropna(how='all').iloc[:, :5]
        tmp = tmp.astype(convert_dict)
        tmp.to_csv('./motif_calling/'+key+'_act'+txt+'_peaks.txt',
                   sep='\t')
        
        # save specific background
        tmp = df.loc[df.index.difference(peak_lists[key]), :].iloc[:, :5]
        tmp = tmp.astype(convert_dict)
        tmp.to_csv('./motif_calling/'+key+'_bg'+txt+'_peaks.txt',
                   sep='\t')


### Call motifs using homer

In [11]:
strain_dict = {'c57bl6j': 'c57bl6j',
               'balbcj': 'balbcj',
               'aj': 'aj'}

Call with background

In [12]:
with open('./findMotifs_differential.sh', 'w') as f:
    for peakfile in np.sort(glob.glob('./motif_calling/*ion*act*')):
        strain = peakfile.split('/')[-1].split('_')[0]
        call = ['/home/vlink/code/marge/bin/MMARGE.pl', 'denovo_motifs',
                peakfile, 'mm10', peakfile.replace('_peaks.txt', '_motifs').replace('_act', ''),
                '-bg', peakfile.replace('_act_', '_bg_'),
                '-len 8,10,12,14,16', '-fg_strain', strain_dict[strain],
                '-bg_strain', strain_dict[strain], '-p 16\n\n']
        f.write(' '.join(call))

Call without background

In [13]:
with open('./findMotifs_differential_nobg.sh', 'w') as f:
    for peakfile in np.sort(glob.glob('./motif_calling/*ion*act*')):
        strain = peakfile.split('/')[-1].split('_')[0]
        call = ['/home/vlink/code/marge/bin/MMARGE.pl', 'denovo_motifs',
                peakfile, 'mm10', peakfile.replace('_peaks.txt', '_motifs_nobg_nodenovo').replace('_act', ''),
                '-fg_strain', strain_dict[strain],
                '-bg_strain', strain_dict[strain], '-p 8\n\n']
        f.write(' '.join(call))