Hunter Bennett | Kupffer Strains | Glass Lab | 04 March 2023

This notebook will assess motif enrichment within accessible enhancers associated with C57BL/6J or BALB/cJ specific *trans* genes

### Import packages, set plotting parameters

In [1]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns

Set working directory

In [2]:
workingDirectory = '/home/h1bennet/strains_github/results/Figure5_ATAC/'
if not os.path.isdir(workingDirectory):
    os.mkdir(workingDirectory)
os.chdir(workingDirectory)

Define custom functions

In [3]:
def read_annotated_peaks(path, filter_sex_unknown=True):
    '''This function imports an annotated homer peak file using
    pandas read_csv function. It also performes common preprocessing steps
    such as placing the peak IDs in the index and filtering so that
    only autosomal peaks are included. Also renames index to peakID.
       
       Accepts:
           path (str): path to expression file
           filter_sex_unkown (bool): filter unknown chromosomes
           and sex chromosomes - focus analysis on autosomal chr.
       
       Returns:
           df (pandas.Dataframe): genes by samples with annotation.
           mat (pandas.DataFrame): genes by samples without annotation.
       '''
    # import functions
    import pandas as pd

    df = pd.read_csv(path, sep='\t', index_col=0)

    # rename index
    df.index.rename('PeakID', inplace=True)
    # rename sample columns so they are shorter (can add .str.split('/').str[-1] to shorten even more
    df = df.rename(
        columns=dict(zip(df.iloc[:, 18:].columns.values,
                         pd.Series(df.iloc[:, 18:].columns.values).str.split(' Tag Count').str[0])))

    
    # filter unknown and sex chromosomes
    autosomes = ['chr1', 'chr2', 'chr3', 'chr4',
                 'chr5', 'chr6', 'chr7', 'chr8',
                 'chr9', 'chr10', 'chr11', 'chr12',
                 'chr13', 'chr14', 'chr15', 'chr16',
                 'chr17', 'chr18', 'chr19']

    df = df.loc[df.Chr.str.contains('|'.join(autosomes)), :]
      
    mat = df.iloc[:, 18:]
        
    return (df, mat)

### Import files

Grab list of CB6F1J *trans* genes from previous RNA-seq analysis

In [4]:
gene_list_dict = {}
for i in np.sort(glob.glob('../Figure4/gene_lists/cb6f1*filt_pval.txt')):
    with open(i, 'r') as f:
        gene_list_dict[i.split('/')[-1].replace('.txt','')]=[j.strip() for j in f.readlines()[1:]]

Annotate poised enhancers to get nearest genes

    annotatePeaks.pl ../Figure2_ATAC/poised_enhancers/BALBcJ_poised_enhancer_peaks.txt mm10 \
    > ./BALBcJ_poised_enhancer_peaks_ann.txt

    annotatePeaks.pl ../Figure2_ATAC/poised_enhancers/BALBcJ_poised_distal_enhancer_peaks.txt mm10 \
    > ./BALBcJ_poised_distal_enhancer_peaks_ann.txt

    annotatePeaks.pl ../Figure2_ATAC/poised_enhancers/C57Bl6J_poised_enhancer_peaks.txt mm10 \
    > ./C57Bl6J_poised_enhancer_peaks_ann.txt

    annotatePeaks.pl ../Figure2_ATAC/poised_enhancers/C57Bl6J_poised_distal_enhancer_peaks.txt mm10 \
    > ./C57Bl6J_poised_distal_enhancer_peaks_ann.txt

### Link peaks to target genes

Use annotated peak files to link peaks associated with *trans* genes by nearest gene analysis

In [6]:
if not os.path.isdir('./cis_trans_motif_calling/'):
    os.mkdir('./cis_trans_motif_calling/')

In [7]:
pairs = [('./C57Bl6J_poised_enhancer_peaks_ann.txt', 'cb6f1_kupffer_trans_c57_filt_pval'),
         ('./C57Bl6J_poised_distal_enhancer_peaks_ann.txt', 'cb6f1_kupffer_trans_c57_filt_pval'),
         ('./BALBcJ_poised_enhancer_peaks_ann.txt', 'cb6f1_kupffer_trans_balb_filt_pval'),
         ('./BALBcJ_poised_distal_enhancer_peaks_ann.txt', 'cb6f1_kupffer_trans_balb_filt_pval')]

for peakfile, key in pairs:
    
    file = peakfile.split('/')[-1].replace('_peaks_ann.txt', '')
    category = key.split('_')[2]
    
    print(file,'|',category)
    
    df, mat = read_annotated_peaks(peakfile)
    
    trans_enhancers = df['Gene Name'].str.match('|'.join(gene_list_dict[key]))
    print('genes: %i' % len(gene_list_dict[key]))
    print('enhancers: %i' % trans_enhancers.sum())
    print('enhancer genes: %i' % len(df.loc[trans_enhancers, 'Gene Name'].unique()))
    df.loc[trans_enhancers, :].iloc[:, :5].to_csv(
        './cis_trans_motif_calling/'+file+'_'+category+'_peaks.txt',
        sep='\t')

C57Bl6J_poised_enhancer | trans
genes: 61
enhancers: 317
enhancer genes: 56
C57Bl6J_poised_distal_enhancer | trans
genes: 61
enhancers: 215
enhancer genes: 45
BALBcJ_poised_enhancer | trans
genes: 63
enhancers: 227
enhancer genes: 66
BALBcJ_poised_distal_enhancer | trans
genes: 63
enhancers: 137
enhancer genes: 49


### Call motifs in the selected peak sets using homer

Make file for motif analysis

In [8]:
strain_dict = {'C57Bl6J': 'c57bl6j',
               'BALBcJ': 'balbcj',
               'c57': 'c57bl6j',
               'balb': 'balbcj'}

In [9]:
with open('./findMotifs_accessible_trans_gene_enhancers_nobg.sh', 'w') as f:
    for peakfile in np.sort(glob.glob('./cis_trans_motif_calling/*enhancer_trans_peaks*')):
        strain = peakfile.split('/')[-1].split('_')[0]
        call = ['/home/vlink/code/marge/bin/MMARGE.pl', 'denovo_motifs',
                peakfile, 'mm10', peakfile.replace('_peaks.txt', '_motifs_nobg'),
                '-fg_strain', strain_dict[strain],
                '-bg_strain', strain_dict[strain], '-p 8\n\n']
        f.write(' '.join(call))

### Call motifs with a background peak set
This will control for bias in Kupffer cell enhancers

Idenfity nonspecific genes

In [10]:
log2fc = 1
padj = 0.05

df = pd.read_csv('../Figure4/c57bl6j_control.vs.balbcj_control.deseq.txt',
                 sep='\t',
                 index_col=0)

nonspec_genes = df.index[(df['padj'] > padj) | (np.abs(df['log2FoldChange']) < log2fc)]

### Set background peak set
Background will be non-differential enhancers associated with target genes

In [11]:
peakfiles = ['./C57Bl6J_poised_enhancer_peaks_ann.txt',
             './C57Bl6J_poised_distal_enhancer_peaks_ann.txt',
             './BALBcJ_poised_enhancer_peaks_ann.txt',
             './BALBcJ_poised_distal_enhancer_peaks_ann.txt']

for peakfile in peakfiles:
    
    file = peakfile.split('/')[-1].replace('_peaks_ann.txt', '')
    
    print(file)
    
    df, mat = read_annotated_peaks(peakfile)
    
    trans_enhancers = df['Gene Name'].str.match('|'.join(nonspec_genes))
    print('genes: %i' % len(nonspec_genes))
    print('enhancers: %i' % trans_enhancers.sum())
    print('enhancer genes: %i' % len(df.loc[trans_enhancers, 'Gene Name'].unique()))
    print('./cis_trans_motif_calling/'+file+'_'+'nonspec_peaks.txt')
    print()
    df.loc[trans_enhancers, :].iloc[:, :5].to_csv(
        './cis_trans_motif_calling/'+file+'_'+'nonspec_peaks.txt',
        sep='\t')

C57Bl6J_poised_enhancer
genes: 7306
enhancers: 23492
enhancer genes: 7103
./cis_trans_motif_calling/C57Bl6J_poised_enhancer_nonspec_peaks.txt

C57Bl6J_poised_distal_enhancer
genes: 7306
enhancers: 12109
enhancer genes: 3567
./cis_trans_motif_calling/C57Bl6J_poised_distal_enhancer_nonspec_peaks.txt

BALBcJ_poised_enhancer
genes: 7306
enhancers: 22428
enhancer genes: 7043
./cis_trans_motif_calling/BALBcJ_poised_enhancer_nonspec_peaks.txt

BALBcJ_poised_distal_enhancer
genes: 7306
enhancers: 11738
enhancer genes: 3442
./cis_trans_motif_calling/BALBcJ_poised_distal_enhancer_nonspec_peaks.txt



In [12]:
pairs = (('C57Bl6J_poised_enhancer_trans_peaks.txt',
          'C57Bl6J_poised_enhancer_nonspec_peaks.txt'),
         ('C57Bl6J_poised_distal_enhancer_trans_peaks.txt',
          'C57Bl6J_poised_distal_enhancer_nonspec_peaks.txt'),
         ('BALBcJ_poised_enhancer_trans_peaks.txt',
          'BALBcJ_poised_enhancer_nonspec_peaks.txt'),
         ('BALBcJ_poised_distal_enhancer_trans_peaks.txt',
          'BALBcJ_poised_distal_enhancer_nonspec_peaks.txt'))

with open('./findMotifs_accessible_trans_gene_enhancers.sh', 'w') as f:
    for peakfile, bg in pairs:
        strain = peakfile.split('/')[-1].split('_')[0]
        call = ['/home/vlink/code/marge/bin/MMARGE.pl', 'denovo_motifs',
                './cis_trans_motif_calling/'+peakfile, 'mm10',
                './cis_trans_motif_calling/'+peakfile.replace('_peaks.txt', '_motifs'),
                '-bg', './cis_trans_motif_calling/'+bg,
                '-fg_strain', strain_dict[strain],
                '-bg_strain', strain_dict[strain], '-p 8\n\n']
        f.write(' '.join(call))
        

Run the following commands in terminal

    ./findMotifs_accessible_trans_gene_enhancers_nobg.sh
    ./findMotifs_accessible_trans_gene_enhancers.sh

The *de novo* motif analysis from these is aggregated in Figure 5