Hunter Bennett | Glass Lab | Kupffer Strains Project | 14 March 2023

Notebook for motif mutation analysis with MAGGIE

### Load packages, set plot parameters

In [1]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np

Set working directory

In [2]:
# Directory that holds the inputs and outputs of this notebook; every relative
# path used in later cells is resolved against it.
workingDirectory = '/home/h1bennet/strains_github/results/Figure2_H3K27Ac/' # user will need to set own wd
# makedirs(..., exist_ok=True) also creates missing parent directories and
# does nothing when the directory is already there.
os.makedirs(workingDirectory, exist_ok=True)
os.chdir(workingDirectory)

Define custom functions

In [3]:
def import_homer_diffpeak(depath, appath):
    '''Load a getDiffExpression file created from raw tag counts and a peak
       tag file created with annotatePeaks, keep only autosomal peaks
       (chr1-chr19) and put both tables on the same set of peaks, in the
       same order, so they can be used for data visualization and analysis.

       Both files are read as tab-separated text; the first column is used
       as the index and named 'PeakID'.

       Accepts:
           depath (str): path to getDiffExpression file
           appath (str): path to annotatePeaks file
       Returns:
           de (pandas.DataFrame): getDiffExpression table restricted to
               autosomal peaks
           ap (pandas.DataFrame): annotatePeaks table restricted to the
               peaks of de, in the order of de
           ap_mat (pandas.DataFrame): ap without its first 18 columns
               (presumably the annotatePeaks annotation columns, leaving
               one tag-count column per sample - confirm for your HOMER
               version)
       Raises:
           KeyError: if a peak kept in de is missing from the
               annotatePeaks file
       '''
    # import packages
    import pandas as pd

    # set autosomes for filtering
    autosomes = ['chr' + str(number) for number in range(1, 20)]

    # import differential peak table
    de = pd.read_csv(depath, sep='\t', index_col=0)
    de.index.rename('PeakID', inplace=True)
    # Exact match on the chromosome name: a substring/regex match on 'chr1'
    # would also keep unplaced contigs such as 'chr1_GL456210_random'.
    # isin() also treats a missing chromosome as "not an autosome".
    de = de.loc[de['Chr'].isin(autosomes), :]

    # import annotatePeaks table
    ap = pd.read_csv(appath, sep='\t', index_col=0)
    ap.index.rename('PeakID', inplace=True)

    # select the annotatePeaks rows that are in the differential peak table
    print('annotatePeaks all peaks', ap.shape)
    print('getDiffExpression selected transcripts', de.shape)
    ap = ap.loc[de.index.tolist(), :]
    print('annotatePeaks selected peaks', ap.shape)

    # return files
    return (de, ap, ap.iloc[:, 18:]) # also return count matrix without annotation

def pull_comparisons_get_diff(diff_gene, seq_type='Repeat'):
    '''This function pulls out comparisons from a diff gene file with multiple
    comparison groups and returns a dict of pandas DataFrames with one
    comparison each.

    A comparison is recognised by a column name of the form
    "<group1> vs. <group2> ...". For every comparison the columns
    "<comparison> Log2 Fold Change", "<comparison> p-value" and
    "<comparison> adj. p-value" are extracted together with the annotation
    columns of the chosen seq_type and renamed to short names.

    Accepts:
        diff_gene (pandas.DataFrame): diff gene file. For seq_type 'Repeat'
        it needs a 'RepeatID' column; for seq_type 'Peak' it needs the
        columns 'Chr', 'Start', 'End', 'Annotation', 'Gene Name' and
        'Distance to TSS'.
        seq_type (str): Repeat|Peak type of annotation file. Repeat for RNA,
        Peak for ChIP/ATAC

    Returns:
        comp_dict (dict): dictionary of one pandas DataFrame per comparison,
        keyed by the comparison name with any 'G0<digit>_' prefix removed.
        Columns are RepeatID, log2fc, pval, adj_pval for 'Repeat' and
        Chr, Start, End, Annotation, gene, TSS_dist, log2fc, pval, adj_pval,
        location ("Chr:Start-End") for 'Peak'.

    Raises:
        ValueError: if seq_type is neither 'Repeat' nor 'Peak'
        KeyError: if one of the expected columns is missing from diff_gene
    '''

    # import packages
    import re

    # annotation columns to carry along and the final column names
    if seq_type == 'Repeat':
        annotation_cols = [seq_type + 'ID']
        final_cols = ['RepeatID', 'log2fc', 'pval', 'adj_pval']
    elif seq_type == 'Peak':
        annotation_cols = ['Chr', 'Start', 'End',
                           'Annotation',
                           'Gene Name',
                           'Distance to TSS']
        final_cols = ['Chr', 'Start', 'End',
                      'Annotation', 'gene', 'TSS_dist',
                      'log2fc', 'pval', 'adj_pval', 'location']
    else:
        # fail early with a clear message instead of a NameError further down
        raise ValueError("seq_type must be 'Repeat' or 'Peak', got %r" % (seq_type,))

    comp_dict = {}
    # raw string so that \w reaches the regex engine untouched; the dot of
    # "vs." is escaped so it only matches a literal dot
    pattern = r'(\w* vs\. \w*).*'
    processed = set()
    for col in diff_gene.columns.values:
        m = re.search(string=col, pattern=pattern)
        if not m:
            continue

        # every comparison shows up in three column names (fold change,
        # p-value, adj. p-value); build its data frame only once
        comp = m.group(1)
        if comp in processed:
            continue
        processed.add(comp)

        # extract groups; explicit copy so that adding a column below never
        # touches the caller's data frame
        df = diff_gene.loc[:, annotation_cols + [comp + ' Log2 Fold Change',
                                                 comp + ' p-value',
                                                 comp + ' adj. p-value']].copy()
        if seq_type == 'Peak':
            df['location'] = df.Chr.astype(str)+':'+df.Start.astype(str)+'-'+df.End.astype(str)
        df.columns = final_cols

        comp_dict[re.sub('G0[0-9]_', '', comp)] = df

    return comp_dict

###  Import differential peak analysis

In [4]:
# load the differential peak table and the matching annotatePeaks table
# (both restricted to autosomal peaks by import_homer_diffpeak)
diff_peak, peaks, peak_mat = import_homer_diffpeak(depath='./diff_output.txt',
                                                   appath='./idr_peaks_h3k27ac_norm.txt')

# split the differential peak table into one data frame per comparison
comp_dict = pull_comparisons_get_diff(diff_gene=diff_peak, seq_type='Peak')

annotatePeaks all peaks (84264, 27)
getDiffExpression selected transcripts (84264, 36)
annotatePeaks selected peaks (84264, 27)


### Process for motif calling with MAGGIE

#### Correct merge-peaks shifting of atac peak files
Make atac peak file with 200bp peaks set on center of current peak (rounding down if fractional center)

In [5]:
# Read the peak file (first column becomes the index) and capitalize the
# column names; later cells access the columns as `Start` and `End`.
atac_peaks = (pd.read_csv('./idr_peaks.txt', sep='\t', index_col=0)
              .rename(columns=str.capitalize))

In [6]:
# Re-center every peak on its midpoint and give it a fixed width of 200 bp.
half_width = 100  # bp kept on each side of the center

# Vectorized midpoint of each peak; floor division rounds a fractional
# center down.
center = atac_peaks['Start'] + (atac_peaks['End'] - atac_peaks['Start']) // 2
# plain arrays so the values are assigned by position, not by index label
new_start = (center - half_width).to_numpy()
new_end = (center + half_width).to_numpy()

# keep the first four columns of the peak file and overwrite the coordinates
atac_peaks_200bp = atac_peaks.iloc[:, :4].copy(deep=True)
atac_peaks_200bp['Start'] = new_start
atac_peaks_200bp['End'] = new_end
atac_peaks_200bp.index.rename('PeakID', inplace=True)

Next, select the distal peak subset for distal motif calling. In this case we will use 3kb to differentiate between proximal and distal enhancers.

In [7]:
# distal = absolute distance to TSS greater than 3000
is_distal = diff_peak['Distance to TSS'].abs() > 3000
diff_peak_distal = diff_peak[is_distal]

# same peaks in the 200bp peak table; rows that are missing there come back
# from reindex() as all-NaN and are dropped again
atac_peaks_200bp_distal = (atac_peaks_200bp
                           .reindex(diff_peak_distal.index)
                           .dropna(how='all'))

# print to check that this worked
print(len(diff_peak), 'peaks in differential peak file')
print(len(atac_peaks_200bp), 'peaks in ATAC-seq file')
print()
print(len(diff_peak_distal), 'distal peaks in differential peak file')
print(len(atac_peaks_200bp_distal), 'distal peaks in ATAC-seq file')

84264 peaks in differential peak file
86301 peaks in ATAC-seq file

56602 distal peaks in differential peak file
56602 distal peaks in ATAC-seq file


Here we just want to take peaks that are increased specifically in one strain when compared to another, but I would prefer to do this all in one go so that we can pool our power. Perhaps we can start by doing everything relevant to C57 - this would introduce duplicate peaks, which could affect our overall power.

To do this part it helps to have some understanding of the diffpeak file that homer puts out... the columns at the end are "group1 vs group2" with p value and log2fold change. The log2fold change here looks like this:  \

$\log_2\left(\frac{\text{group2 tags}}{\text{group1 tags}}\right)$

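So in this case peaks that are higher in group 1 will have a negative log2 fold change and peaks that are higher in group 2 will have a positive one. In the comparisons used below C57 is always group 2 ("aj vs. c57bl6j" and "balbcj vs. c57bl6j"), so all positive peaks in those comparisons will be up in C57.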

In [8]:
# output directory for the strain-specific peak files
os.makedirs('./differential_peaks/', exist_ok=True)

In [9]:
# pairwise strain comparisons, written "group1 vs. group2"; each name is the
# prefix of the matching columns in the differential peak table
comps = [
    'aj vs. balbcj',
    'aj vs. c57bl6j',
    'balbcj vs. c57bl6j',
]

# set cutoffs: log2 fold change threshold and adjusted p-value threshold
fc, pval = 1, 0.05

Select peak sets

In [10]:
# Write one 200bp peak file per comparison and direction, once for all peaks
# and once for the distal subset.
for peak_table, suffix in ((diff_peak, ''), (diff_peak_distal, '_distal')):
    print(suffix, 'analysis')
    for comp in comps:
        # extract strains ("group1 vs. group2")
        strains = comp.split(' vs. ')
        s1, s2 = strains[0], strains[1]

        significant = peak_table.loc[:, comp+' adj. p-value'] <= pval
        log2fc = peak_table.loc[:, comp+' Log2 Fold Change']

        # negative log2 fold change -> higher in group 1 (s1),
        # positive log2 fold change -> higher in group 2 (s2)
        directions = ((s1, s2, log2fc < -fc),
                      (s2, s1, log2fc > fc))
        for up_strain, other_strain, is_up in directions:
            peak_ids = peak_table.loc[significant & is_up].index.tolist()
            print(len(peak_ids), 'peaks upregualted in', up_strain, 'relative to', other_strain)
            out_path = './differential_peaks/'+up_strain+'_spec_'+other_strain+'_200bp'+suffix+'.txt'
            atac_peaks_200bp.loc[peak_ids, :].to_csv(out_path, sep='\t')

 analysis
815 peaks upregualted in aj relative to balbcj
1260 peaks upregualted in balbcj relative to aj
1620 peaks upregualted in aj relative to c57bl6j
2246 peaks upregualted in c57bl6j relative to aj
2474 peaks upregualted in balbcj relative to c57bl6j
2397 peaks upregualted in c57bl6j relative to balbcj
_distal analysis
605 peaks upregualted in aj relative to balbcj
991 peaks upregualted in balbcj relative to aj
1275 peaks upregualted in aj relative to c57bl6j
1751 peaks upregualted in c57bl6j relative to aj
1965 peaks upregualted in balbcj relative to c57bl6j
1823 peaks upregualted in c57bl6j relative to balbcj


Extract sequences for maggie

In [11]:
# output directory for the extracted sequences
os.makedirs('./differential_fasta/', exist_ok=True)

Run in terminal

    ### c57 vs balb
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind BALBCJ -file ./differential_peaks/balbcj_spec_c57bl6j_200bp.txt \
    -output ./differential_fasta/balbcj_spec_c57bl6j_ref_200bp.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind C57BL6J -file ./differential_peaks/balbcj_spec_c57bl6j_200bp.txt \
     -output ./differential_fasta/balbcj_spec_c57bl6j_mut_200bp.fa

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind C57BL6J -file ./differential_peaks/c57bl6j_spec_balbcj_200bp.txt \
     -output ./differential_fasta/c57bl6j_spec_balbcj_ref_200bp.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind BALBCJ -file ./differential_peaks/c57bl6j_spec_balbcj_200bp.txt \
     -output ./differential_fasta/c57bl6j_spec_balbcj_mut_200bp.fa

    # distal

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind BALBCJ -file ./differential_peaks/balbcj_spec_c57bl6j_200bp_distal.txt \
     -output ./differential_fasta/balbcj_spec_c57bl6j_ref_200bp_distal.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind C57BL6J -file ./differential_peaks/balbcj_spec_c57bl6j_200bp_distal.txt \
     -output ./differential_fasta/balbcj_spec_c57bl6j_mut_200bp_distal.fa

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind C57BL6J -file ./differential_peaks/c57bl6j_spec_balbcj_200bp_distal.txt \
     -output ./differential_fasta/c57bl6j_spec_balbcj_ref_200bp_distal.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind BALBCJ -file ./differential_peaks/c57bl6j_spec_balbcj_200bp_distal.txt \
     -output ./differential_fasta/c57bl6j_spec_balbcj_mut_200bp_distal.fa

    ### balb vs aj

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind BALBCJ -file ./differential_peaks/balbcj_spec_aj_200bp.txt \
     -output ./differential_fasta/balbcj_spec_aj_ref_200bp.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind AJ -file ./differential_peaks/balbcj_spec_aj_200bp.txt \
     -output ./differential_fasta/balbcj_spec_aj_mut_200bp.fa

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind AJ -file ./differential_peaks/aj_spec_balbcj_200bp.txt \
     -output ./differential_fasta/aj_spec_balbcj_ref_200bp.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind BALBCJ -file ./differential_peaks/aj_spec_balbcj_200bp.txt \
     -output ./differential_fasta/aj_spec_balbcj_mut_200bp.fa

    # distal

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind BALBCJ -file ./differential_peaks/balbcj_spec_aj_200bp_distal.txt \
     -output ./differential_fasta/balbcj_spec_aj_ref_200bp_distal.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind AJ -file ./differential_peaks/balbcj_spec_aj_200bp_distal.txt \
     -output ./differential_fasta/balbcj_spec_aj_mut_200bp_distal.fa

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind AJ -file ./differential_peaks/aj_spec_balbcj_200bp_distal.txt \
     -output ./differential_fasta/aj_spec_balbcj_ref_200bp_distal.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind BALBCJ -file ./differential_peaks/aj_spec_balbcj_200bp_distal.txt \
     -output ./differential_fasta/aj_spec_balbcj_mut_200bp_distal.fa

    ### c57 vs aj

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind C57BL6J -file ./differential_peaks/c57bl6j_spec_aj_200bp.txt \
     -output ./differential_fasta/c57bl6j_spec_aj_ref_200bp.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind AJ -file ./differential_peaks/c57bl6j_spec_aj_200bp.txt \
     -output ./differential_fasta/c57bl6j_spec_aj_mut_200bp.fa

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind AJ -file ./differential_peaks/aj_spec_c57bl6j_200bp.txt \
     -output ./differential_fasta/aj_spec_c57bl6j_ref_200bp.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind C57BL6J -file ./differential_peaks/aj_spec_c57bl6j_200bp.txt \
     -output ./differential_fasta/aj_spec_c57bl6j_mut_200bp.fa

    # distal

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind C57BL6J -file ./differential_peaks/c57bl6j_spec_aj_200bp_distal.txt \
     -output ./differential_fasta/c57bl6j_spec_aj_ref_200bp_distal.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind AJ -file ./differential_peaks/c57bl6j_spec_aj_200bp_distal.txt \
     -output ./differential_fasta/c57bl6j_spec_aj_mut_200bp_distal.fa

    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind AJ -file ./differential_peaks/aj_spec_c57bl6j_200bp_distal.txt \
     -output ./differential_fasta/aj_spec_c57bl6j_ref_200bp_distal.fa
    /home/vlink/code/marge/bin/MMARGE.pl extract_sequences \
     -ind C57BL6J -file ./differential_peaks/aj_spec_c57bl6j_200bp_distal.txt \
     -output ./differential_fasta/aj_spec_c57bl6j_mut_200bp_distal.fa


# Run Maggie

In [12]:
# output directory for the MAGGIE runs
os.makedirs('./maggie_output/', exist_ok=True)

Run in command line  

All peaks

    /home/h1bennet/anaconda3/envs/maggie/bin/python ~/maggie/bin/maggie_fasta_input.py \
    ./differential_fasta/balbcj_spec_c57bl6j_ref_200bp.fa,\
    ./differential_fasta/c57bl6j_spec_balbcj_ref_200bp.fa,\
    ./differential_fasta/aj_spec_c57bl6j_ref_200bp.fa,\
    ./differential_fasta/c57bl6j_spec_aj_ref_200bp.fa,\
    ./differential_fasta/balbcj_spec_aj_ref_200bp.fa,\
    ./differential_fasta/aj_spec_balbcj_ref_200bp.fa \
    ./differential_fasta/balbcj_spec_c57bl6j_mut_200bp.fa,\
    ./differential_fasta/c57bl6j_spec_balbcj_mut_200bp.fa,\
    ./differential_fasta/aj_spec_c57bl6j_mut_200bp.fa,\
    ./differential_fasta/c57bl6j_spec_aj_mut_200bp.fa,\
    ./differential_fasta/balbcj_spec_aj_mut_200bp.fa,\
    ./differential_fasta/aj_spec_balbcj_mut_200bp.fa \
    -o ./maggie_output/all_200bp/ \
    -p 8
    
Distal peaks

    /home/h1bennet/anaconda3/envs/maggie/bin/python ~/maggie/bin/maggie_fasta_input.py \
    ./differential_fasta/balbcj_spec_c57bl6j_ref_200bp_distal.fa,\
    ./differential_fasta/c57bl6j_spec_balbcj_ref_200bp_distal.fa,\
    ./differential_fasta/aj_spec_c57bl6j_ref_200bp_distal.fa,\
    ./differential_fasta/c57bl6j_spec_aj_ref_200bp_distal.fa,\
    ./differential_fasta/balbcj_spec_aj_ref_200bp_distal.fa,\
    ./differential_fasta/aj_spec_balbcj_ref_200bp_distal.fa \
    ./differential_fasta/balbcj_spec_c57bl6j_mut_200bp_distal.fa,\
    ./differential_fasta/c57bl6j_spec_balbcj_mut_200bp_distal.fa,\
    ./differential_fasta/aj_spec_c57bl6j_mut_200bp_distal.fa,\
    ./differential_fasta/c57bl6j_spec_aj_mut_200bp_distal.fa,\
    ./differential_fasta/balbcj_spec_aj_mut_200bp_distal.fa,\
    ./differential_fasta/aj_spec_balbcj_mut_200bp_distal.fa \
    -o ./maggie_output/all_200bp_distal/ \
    -p 8

Aggregate all maggie results into data frame

In [None]:
# Collect the 'Median p-val' column of every MAGGIE run into one table with
# one column per run.
maggie_merged_pval_dict = {}
# sorted() makes the column order independent of the order in which the
# file system lists the runs
for f in sorted(glob.glob('./maggie_output/*/maggie_output_merged.tsv')):
    # read in significance values using pandas
    frame = pd.read_csv(f, sep='\t', index_col=0)

    # extract name of sample: the directory that holds the merged output
    sample = os.path.basename(os.path.dirname(f))

    # store in dict; the column is not bound to the name `pval`, which
    # holds the adjusted p-value cutoff used when selecting peak sets.
    # NOTE: duplicated index entries are kept as they are.
    maggie_merged_pval_dict[sample] = frame.loc[:, 'Median p-val']

# create data frame
maggie_merged_pval_frame = pd.DataFrame(maggie_merged_pval_dict)

# sort rows by their average value across runs
row_order = maggie_merged_pval_frame.mean(axis=1).sort_values().index.tolist()
maggie_merged_pval_frame = maggie_merged_pval_frame.reindex(row_order)

maggie_merged_pval_frame.to_csv('./maggie_all_merged_log_github.txt', sep='\t')