# Motif-matching

This Python 3 notebook converts the coordinates of FIMO-identified motif sequences (given relative to the submitted sequence) back into global genomic coordinates, using a GTF file that describes the original sequences. Both of these inputs are generated by [`bioc-mouse-preparation-public.ipynb`](bioc-mouse-preparation-public.ipynb). The resulting BED4/BED6 files are then used in [`bioc-mouse-genomic-analysis-public.ipynb`](bioc-mouse-genomic-analysis-public.ipynb).

In [1]:
import pandas as pd

In [2]:
# hack to import slowkow's GTF.py (https://gist.github.com/slowkow/8101481)
# from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
#
# GTF.py is loaded straight from an absolute file path rather than through the
# normal import system, so this cell only works on a machine where that path exists.
# The functions below call GTF.dataframe(filename=...) on the resulting module.

import importlib.util
# Describe the module: import name "GTF", source file at the given path.
spec = importlib.util.spec_from_file_location("GTF", "/ru-auth/local/home/ezheng/ipynb/GTF.py")
# Create an empty module object from that description ...
GTF = importlib.util.module_from_spec(spec)
# ... and execute the file's code inside it, populating GTF's attributes.
spec.loader.exec_module(GTF)

# automation!

In [3]:
def process(infile, reference, outfile, DNAflank=200):
    """Convert FIMO motif hits to genomic coordinates and write a 4-column BED file.

    Parameters
    ----------
    infile : str
        Tab-separated FIMO output containing at least the columns
        "sequence name", "start", "stop" and "strand".
    reference : str
        GTF file describing the sequences that were submitted to FIMO. It is
        read with ``GTF.dataframe`` and must provide "seqname", "start" and a
        "name" attribute matching FIMO's "sequence name".
    outfile : str
        Destination path; written tab-separated without header or index as
        seqname, motif start, motif end, peak name.
    DNAflank : int, optional
        Offset subtracted from the peak start when mapping motif positions
        back to the genome (default 200, the value previously hard-coded here).

    Raises
    ------
    ValueError
        If a FIMO hit refers to a sequence name that is absent from the GTF.
    """
    peaks = GTF.dataframe(filename=reference)

    # Keep only the columns needed for the conversion. ``rename`` returns a new
    # frame, which avoids mutating a column slice in place.
    motifs = pd.read_table(infile)[["sequence name", "start", "stop", "strand"]]
    motifs = motifs.rename(columns={"stop": "end"})

    # Right join: one output row per FIMO hit, in FIMO order. Columns present in
    # both frames (start/end/strand) receive the _peak / _motif suffixes.
    merged = pd.merge(peaks, motifs, left_on="name", right_on="sequence name",
                      how="right", suffixes=("_peak", "_motif"))

    # A hit without a matching peak has no start_peak; fail with a clear message
    # instead of letting the integer cast below choke on NaN.
    unmatched = merged.loc[merged["start_peak"].isna(), "sequence name"].unique()
    if len(unmatched) > 0:
        raise ValueError(
            "FIMO hits reference sequences missing from {}: {}".format(
                reference, ", ".join(str(name) for name in unmatched)))

    # genomic = peak start + position within the submitted sequence - flank - 1
    # NOTE(review): BED starts are conventionally 0-based; whether this formula
    # yields that depends on how the peak GTF coordinates were written -- confirm.
    peak_start = merged["start_peak"].astype(int)
    merged["start_motif_genomic"] = peak_start + merged["start_motif"].astype(int) - DNAflank - 1
    merged["end_motif_genomic"] = peak_start + merged["end_motif"].astype(int) - DNAflank - 1

    merged[["seqname", "start_motif_genomic", "end_motif_genomic", "name"]].to_csv(
        outfile, header=False, index=False, sep="\t")

## mESCs (Whyte)

In [4]:
# Whyte mESC data: run the BED4 conversion once per peak set.
# Each entry is (FIMO hits, peak GTF, output BED).
for infile, reference, outfile in (
    # Tandem
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/Whyte_mESC_tandem/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/chipseq_Whyte2013/tandem_peaks.gtf",
     "~/SoxOct/public/mouse/chipseq_Whyte2013/tandem_motifs.bed"),
    # Sox
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/Whyte_mESC_Sox2/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/chipseq_Whyte2013/Sox_peaks.gtf",
     "~/SoxOct/public/mouse/chipseq_Whyte2013/Sox_motifs.bed"),
    # Oct
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/Whyte_mESC_Oct4/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/chipseq_Whyte2013/Oct_peaks.gtf",
     "~/SoxOct/public/mouse/chipseq_Whyte2013/Oct_motifs.bed"),
):
    process(infile=infile, reference=reference, outfile=outfile)

## Matsuda (EpiSC)

In [5]:
# Matsuda EpiSC data: run the BED4 conversion once per peak set.
# Each entry is (FIMO hits, peak GTF, output BED).
for infile, reference, outfile in (
    # Tandem
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/EpiSC_tandem/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/EpiSC_chip/tandempeaks.gtf",
     "~/SoxOct/public/mouse/EpiSC_chip/tandem_motifs.bed"),
    # Sox
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/EpiSC_Sox2/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/EpiSC_chip/Sox_peaks.gtf",
     "~/SoxOct/public/mouse/EpiSC_chip/Sox_motifs.bed"),
    # Oct
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/EpiSC_Oct4/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/EpiSC_chip/Oct_peaks.gtf",
     "~/SoxOct/public/mouse/EpiSC_chip/Oct_motifs.bed"),
):
    process(infile=infile, reference=reference, outfile=outfile)

# stranded processing

In [6]:
def process_strand(infile, reference, outfile, DNAflank=200):
    """Convert FIMO motif hits to genomic coordinates and write a stranded 6-column BED file.

    Parameters
    ----------
    infile : str
        Tab-separated FIMO output containing at least the columns
        "sequence name", "start", "stop" and "strand".
    reference : str
        GTF file describing the sequences that were submitted to FIMO. It is
        read with ``GTF.dataframe`` and must provide "seqname", "start",
        "strand" and a "name" attribute matching FIMO's "sequence name".
    outfile : str
        Destination path; written tab-separated without header or index as
        seqname, motif start, motif end, peak name, ".", motif strand.
    DNAflank : int, optional
        Offset from the reference GTF that is subtracted when mapping motif
        positions back to the genome. Normally 200, sometimes 0.

    Raises
    ------
    ValueError
        If a FIMO hit refers to a sequence name that is absent from the GTF.
    """
    peaks = GTF.dataframe(filename=reference)

    # Keep only the columns needed for the conversion. ``rename`` returns a new
    # frame, which avoids mutating a column slice in place.
    motifs = pd.read_table(infile)[["sequence name", "start", "stop", "strand"]]
    motifs = motifs.rename(columns={"stop": "end"})

    # Right join: one output row per FIMO hit, in FIMO order. Columns present in
    # both frames (start/end/strand) receive the _peak / _motif suffixes.
    merged = pd.merge(peaks, motifs, left_on="name", right_on="sequence name",
                      how="right", suffixes=("_peak", "_motif"))

    # A hit without a matching peak has no start_peak; fail with a clear message
    # instead of letting the integer cast below choke on NaN.
    unmatched = merged.loc[merged["start_peak"].isna(), "sequence name"].unique()
    if len(unmatched) > 0:
        raise ValueError(
            "FIMO hits reference sequences missing from {}: {}".format(
                reference, ", ".join(str(name) for name in unmatched)))

    # do the coordinate conversion:
    # genomic = peak start + position within the submitted sequence - flank - 1
    # NOTE(review): BED starts are conventionally 0-based; whether this formula
    # yields that depends on how the peak GTF coordinates were written -- confirm.
    peak_start = merged["start_peak"].astype(int)
    merged["start_motif_genomic"] = peak_start + merged["start_motif"].astype(int) - DNAflank - 1
    merged["end_motif_genomic"] = peak_start + merged["end_motif"].astype(int) - DNAflank - 1

    # Placeholder for the fifth BED column so the motif strand lands in column six.
    merged["dummy"] = "."
    bed6_columns = ["seqname", "start_motif_genomic", "end_motif_genomic",
                    "name", "dummy", "strand_motif"]
    merged[bed6_columns].to_csv(outfile, header=False, index=False, sep="\t")

## mESCs (Whyte)

In [7]:
# Whyte mESC data: run the stranded (BED6) conversion once per peak set.
# Each entry is (FIMO hits, peak GTF, output BED).
for infile, reference, outfile in (
    # Tandem
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/Whyte_mESC_tandem/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/chipseq_Whyte2013/tandem_peaks.gtf",
     "~/SoxOct/public/mouse/chipseq_Whyte2013/tandemmotifs.strand.bed"),
    # Sox
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/Whyte_mESC_Sox2/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/chipseq_Whyte2013/Sox_peaks.gtf",
     "~/SoxOct/public/mouse/chipseq_Whyte2013/Sox_motifs.strand.bed"),
    # Oct
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/Whyte_mESC_Oct4/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/chipseq_Whyte2013/Oct_peaks.gtf",
     "~/SoxOct/public/mouse/chipseq_Whyte2013/Oct_motifs.strand.bed"),
):
    process_strand(infile=infile, reference=reference, outfile=outfile, DNAflank=200)

## Matsuda (EpiSC)

In [8]:
# Matsuda EpiSC data: run the stranded (BED6) conversion once per peak set.
# Each entry is (FIMO hits, peak GTF, output BED).
for infile, reference, outfile in (
    # Tandem
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/EpiSC_tandem/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/EpiSC_chip/tandempeaks.gtf",
     "~/SoxOct/public/mouse/EpiSC_chip/tandemmotifs.strand.bed"),
    # Sox
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/EpiSC_Sox2/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/EpiSC_chip/Sox_peaks.gtf",
     "~/SoxOct/public/mouse/EpiSC_chip/Sox_motifs.strand.bed"),
    # Oct
    ("/ru-auth/local/home/ezheng/SoxOct/public/mouse/fimo/EpiSC_Oct4/fimo.txt",
     "/ru-auth/local/home/ezheng/SoxOct/public/mouse/EpiSC_chip/Oct_peaks.gtf",
     "~/SoxOct/public/mouse/EpiSC_chip/Oct_motifs.strand.bed"),
):
    process_strand(infile=infile, reference=reference, outfile=outfile, DNAflank=200)