Goal is to make a clear pipeline to run Zeyang's maggie script with the strains data

In [5]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 200
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
import sys
sys.path.insert(0, '/home/h1bennet/code/')
# from hbUtils import ngs_qc, quantile_normalize_df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
dataDirectory = '/gpfs/data01/glasslab/home/h1bennet/strains/data/ATAC/control_30week/'
workingDirectory = '/home/h1bennet/strains/results/200925_Strains_Control_30weeks_ATAC_Maggie/'
if not os.path.isdir(workingDirectory):
    os.mkdir(workingDirectory)
os.chdir(workingDirectory)

# 1. Pull IDR ATAC Peaks from O'Young's peakPipe and convert to BED files

In [9]:
%%bash

if [ ! -d ./bed_files/ ]; then mkdir ./bed_files; fi

#copy peakIDR
cp -r ../200923_Strains_Control_30weeks_ATAC_PeakPipe/peakIDR/ ../

# convert each idr peak file into a bed file
for peak_file in ./peakIDR/*idr;
do bed_file=${peak_file/.idr/_peaks.bed}
bed_file=${bed_file/peakIDR/bed_files}

pos2bed.pl $peak_file -o $bed_file

done

	Output File: ./bed_files/aj_kupffer_control_30weeks_peaks.bed

	Converted 48985 peaks total

	Output File: ./bed_files/balbc_kupffer_control_30weeks_peaks.bed

	Converted 32868 peaks total

	Output File: ./bed_files/c57_kupffer_control_30weeks_peaks.bed

	Converted 38018 peaks total



# 2. Find differential regions using Zeyang's tool

make directory for analysis

In [5]:
!if [ ! -d ./balb_c57_maggie/ ]; then mkdir ./balb_c57_maggie; fi

In [10]:
%%bash
source activate maggie

# find_differential.py \
# -i1 ./bed_files/balbc_kupffer_control_30weeks_peaks.bed \
# -i2 ./bed_files/c57_kupffer_control_30weeks_peaks.bed \
# -o1 ./balb_c57_maggie/balbc_specific_peaks_c57bg.txt \
# -o2 ./balb_c57_maggie/c57_specific_peaks_balbcbg.txt

In [16]:
%%bash
# delete the existing script file
rm ./find_differential.sh
# create a script file
touch ./find_differential.sh

# if directories don't exist, create them
if [ ! -d ./differential_peaks/ ]; then mkdir ./differential_peaks; fi

rm: cannot remove ‘./find_differential.sh’: No such file or directory


In [20]:
import itertools
mylist = glob.glob('./bed_files/*peaks.bed')
mylist = [i.split('/')[-1] for i in mylist]
mylist.sort()
# use pattern to extract cell type
with open('./find_differential.sh', 'w') as f:
    for i, j in itertools.permutations(mylist, r=2):
        print(i,j)
        find_dif = ['bedtools', 'intersect', '-v',
                    '-a', './bed_files/'+i, '-b', './bed_files/'+j,
                    '>', './differential_peaks/'+i.split('_')[0]+'_spec_'+j.split('_')[0]+'.bed',
                   '\n']
        f.write(' '.join(find_dif))


aj_kupffer_control_30weeks_peaks.bed balbc_kupffer_control_30weeks_peaks.bed
aj_kupffer_control_30weeks_peaks.bed c57_kupffer_control_30weeks_peaks.bed
balbc_kupffer_control_30weeks_peaks.bed aj_kupffer_control_30weeks_peaks.bed
balbc_kupffer_control_30weeks_peaks.bed c57_kupffer_control_30weeks_peaks.bed
c57_kupffer_control_30weeks_peaks.bed aj_kupffer_control_30weeks_peaks.bed
c57_kupffer_control_30weeks_peaks.bed balbc_kupffer_control_30weeks_peaks.bed


Convert bed to peaks

In [31]:
%%bash
for bed_file in ./differential_peaks/*bed;
do peak_file=${bed_file/bed/txt}
bed2pos.pl $bed_file -o $peak_file
done

	Output File: ./differential_peaks/aj_spec_balbc.txt
	Output File: ./differential_peaks/aj_spec_c57.txt
	Output File: ./differential_peaks/balbc_spec_aj.txt
	Output File: ./differential_peaks/balbc_spec_c57.txt
	Output File: ./differential_peaks/c57_spec_aj.txt
	Output File: ./differential_peaks/c57_spec_balbc.txt


Currently the find_differential.py is hanging with my toy example - trying to workaround with bedtools intersect followed by bed2pos.pl

# 3. Extract sequences with MMarge

In [7]:
if not os.path.isdir('./differential_fasta/'):
    os.mkdir('./differential_fasta/')

BALB vs C57

In [None]:
marge extract_sequences -ind BALBCJ -file ./differential_peaks/balbc_spec_c57.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/balbc_spec_c57_ref.fa
marge extract_sequences -ind C57BL6J -file ./differential_peaks/balbc_spec_c57.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/balbc_spec_c57_mut.fa

marge extract_sequences -ind C57BL6J -file ./differential_peaks/c57_spec_balbc.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/c57_spec_balbc_ref.fa
marge extract_sequences -ind BALBCJ -file ./differential_peaks/c57_spec_balbc.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/c57_spec_balbc_mut.fa


BALB vs AJ

In [None]:
marge extract_sequences -ind BALBCJ -file ./differential_peaks/balbc_spec_aj.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/balbc_spec_aj_ref.fa
marge extract_sequences -ind AJ -file ./differential_peaks/balbc_spec_aj.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/balbc_spec_aj_mut.fa

marge extract_sequences -ind AJ -file ./differential_peaks/aj_spec_balbc.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/aj_spec_balbc_ref.fa
marge extract_sequences -ind BALBCJ -file ./differential_peaks/aj_spec_balbc.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/aj_spec_balbc_mut.fa


AJ vs. C57

In [None]:
marge extract_sequences -ind C57BL6J -file ./differential_peaks/c57_spec_aj.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/c57_spec_aj_ref.fa
marge extract_sequences -ind AJ -file ./differential_peaks/c57_spec_aj.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/c57_spec_aj_mut.fa

marge extract_sequences -ind AJ -file ./differential_peaks/aj_spec_c57.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/aj_spec_c57_ref.fa
marge extract_sequences -ind C57BL6J -file ./differential_peaks/aj_spec_c57.txt -data_dir . -genome_dir ~/genomes/ -output ./differential_fasta/aj_spec_c57_mut.fa


# 4. Run MAGGIE

In [10]:
if not os.path.isdir('./maggie_output/'):
    os.mkdir('./maggie_output/')

BALB vs C57

In [11]:
if not os.path.isdir('./maggie_output/balb_c57'):
    os.mkdir('./maggie_output/balb_c57')

In [None]:
/home/h1bennet/anaconda3/envs/maggie/bin/python ~/maggie/bin/maggie_fasta_input.py \
./differential_fasta/balbc_spec_c57_ref.fa,./differential_fasta/c57_spec_balbc_ref.fa \
./differential_fasta/balbc_spec_c57_mut.fa,./differential_fasta/c57_spec_balbc_mut.fa \
-o ./maggie_output/balbc_c57/ \
-p 20

BALB vs AJ

In [12]:
if not os.path.isdir('./maggie_output/balb_aj'):
    os.mkdir('./maggie_output/balb_aj')

In [None]:
/home/h1bennet/anaconda3/envs/maggie/bin/python ~/maggie/bin/maggie_fasta_input.py \
./differential_fasta/balbc_spec_aj_ref.fa,./differential_fasta/aj_spec_balbc_ref.fa \
./differential_fasta/balbc_spec_aj_mut.fa,./differential_fasta/aj_spec_balbc_mut.fa \
-o ./maggie_output/balbc_aj/ \
-p 20

AJ vs. C57

In [13]:
if not os.path.isdir('./maggie_output/aj_c57'):
    os.mkdir('./maggie_output/aj_c57')

In [None]:
/home/h1bennet/anaconda3/envs/maggie/bin/python ~/maggie/bin/maggie_fasta_input.py \
./differential_fasta/aj_spec_c57_ref.fa,./differential_fasta/c57_spec_aj_ref.fa \
./differential_fasta/aj_spec_c57_mut.fa,./differential_fasta/c57_spec_aj_mut.fa \
-o ./maggie_output/aj_c57/ \
-p 20