Hunter Bennett | Glass Lab | Kupffer Strains Project | 15 March 2023

This notebook performs a basic analysis of motif enrichment in accessible enhancers relative to a generic background.

### Load packages, set plot parameters

In [1]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import os
import glob
import pandas as pd
import numpy as np

Set working directory

In [2]:
# Results directory for this figure; the notebook changes into it so that the
# relative './' paths used in later cells resolve against this location.
workingDirectory = '/home/h1bennet/strains_github/results/Figure2_ATAC/' # user will need to set own wd
# makedirs creates any missing parent directories as well, and exist_ok=True
# makes the call a no-op when the directory is already there.
os.makedirs(workingDirectory, exist_ok=True)
os.chdir(workingDirectory)

### Call motifs using HOMER (run through MMARGE)
____
The main difference here is that we are just using a general background. This is because most enhancers are 'accessible' by our definition within a strain, so we just want to get an idea of the enrichment of different motifs within the open chromatin of each sample.

In [3]:
# Lookup from the strain prefix of a peak file name to the lowercase strain
# identifier passed on the motif-calling command line. Peaks labelled
# 'shared' reuse the c57bl6j identifier.
strain_dict = dict(
    C57Bl6J='c57bl6j',
    shared='c57bl6j',
    BALBcJ='balbcj',
    AJ='aj',
)

In [4]:
# Build one MMARGE `denovo_motifs` command per poised-enhancer peak file and
# write them all to a shell script that is run manually afterwards.
# The commands are assembled before the script is opened, so an unrecognised
# strain prefix aborts without truncating an existing script.
commands = []
for peakfile in sorted(glob.glob('./poised_enhancers/*peaks.txt')):
    # the strain is the part of the file name before the first underscore
    strain = os.path.basename(peakfile).split('_')[0]
    if strain not in strain_dict:
        raise KeyError('no strain_dict entry for prefix %r (from %s)'
                       % (strain, peakfile))
    genome_strain = strain_dict[strain]

    # foreground and background strain are set to the same genome
    call = ['/home/vlink/code/marge/bin/MMARGE.pl', 'denovo_motifs',
            peakfile, 'mm10', peakfile.replace('_peaks.txt', '_motifs'),
            '-len 8,10,12,14,16', '-fg_strain', genome_strain,
            '-bg_strain', genome_strain, '-p 8']
    # a blank line separates consecutive commands in the script
    commands.append(' '.join(call) + '\n\n')

# the context manager closes the file; no explicit close() is needed
with open('./findMotifs_poised.sh', 'w') as f:
    f.writelines(commands)

Run on the command line:

    bash ./findMotifs_poised.sh

# Aggregate results - Homer

In [5]:
# Gather one column of significance values per sample from every
# knownResults.txt found one directory below ./poised_enhancers.
known_motif_pval_dict = {}
for results_path in np.sort(glob.glob('./poised_enhancers/*/knownResults.txt')):
    # the sample name is the directory that holds knownResults.txt
    sample_name = results_path.split('/')[2]

    # read the tab-separated table, using its first column as the index
    motif_table = pd.read_csv(results_path, sep='\t', index_col=0)

    # keep only the first row for any index label that occurs more than once
    is_repeat = motif_table.index.duplicated(keep='first')

    # third remaining column holds the significance value that is aggregated
    # (presumably the log p-value, given the output file name -- TODO confirm)
    known_motif_pval_dict[sample_name] = motif_table.loc[~is_repeat].iloc[:, 2]

# one column per sample, one row per motif
known_motif_pval_frame = pd.DataFrame(known_motif_pval_dict)

# order rows by their mean value across samples, ascending
row_order = known_motif_pval_frame.mean(axis=1).sort_values().index.tolist()
known_motif_pval_frame = known_motif_pval_frame.reindex(row_order)

# save the combined table as a tab-separated file
known_motif_pval_frame.to_csv('./homer_known_motifs_log.txt', sep='\t')