# Motif-matching

This Python 3 notebook converts the coordinates of FIMO-identified motif sequences (reported relative to the submitted sequence) back into global genomic coordinates, using a GTF file that describes the original sequences. Both of these inputs are generated by [`bioc-human-preparation-public.ipynb`](bioc-human-preparation-public.ipynb). The resulting BED4/BED6 files are then used in [`bioc-human-analysis-public.ipynb`](bioc-human-analysis-public.ipynb).

In [1]:
import pandas as pd

In [2]:
# Load slowkow's GTF.py (https://gist.github.com/slowkow/8101481) straight from its
# file path, since it is not an installed package. Recipe from
# https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path

import importlib.util

GTF_MODULE_PATH = "/ru-auth/local/home/ezheng/ipynb/GTF.py"

# Build a module spec from the file, create the (still empty) module object,
# then execute the file's code inside it so that `GTF.dataframe` becomes available.
spec = importlib.util.spec_from_file_location("GTF", GTF_MODULE_PATH)
GTF = importlib.util.module_from_spec(spec)
spec.loader.exec_module(GTF)

# automation!

In [3]:
def process(infile, reference, outfile, DNAflank=0):
    """Map FIMO motif hits onto genomic coordinates and write them as a BED4 file.

    FIMO reports each hit relative to the sequence that was submitted to it. Every
    hit is joined to its source peak (FIMO "sequence name" == peak "name") and the
    peak start is added to the hit's start/stop.

    Parameters
    ----------
    infile : str
        Path to a tab-separated FIMO results table. It must provide the columns
        "sequence name", "start" and "stop".
    reference : str
        Path to the GTF describing the submitted sequences. It is parsed with the
        module-level ``GTF.dataframe`` helper; the resulting frame must provide
        "seqname", "start", "end" and "name".
    outfile : str
        Path of the headerless, tab-separated file to write. Columns are:
        seqname, genomic start, genomic end, peak name.
    DNAflank : int, optional
        Offset subtracted from both genomic coordinates. Defaults to 0, the value
        that used to be hard-coded here.

    Raises
    ------
    ValueError
        If at least one FIMO hit refers to a sequence name that has no peak in
        ``reference`` (such hits have no peak start to offset from).

    Notes
    -----
    NOTE(review): the coordinates are written as ``start_peak + start - DNAflank - 1``
    and ``start_peak + stop - DNAflank - 1``. Whether that matches BED's 0-based,
    half-open convention depends on how the reference GTF coordinates were
    produced -- confirm against the preparation notebook.
    """
    peaks = GTF.dataframe(filename=reference)

    # Only the hit location is needed for a BED4 file; "stop" is renamed so that the
    # merge suffixes yield the symmetric names start_motif / end_motif.
    motifs = (
        pd.read_table(infile)[["sequence name", "start", "stop"]]
        .rename(columns={"stop": "end"})
    )

    # Right join: keep every FIMO hit, in FIMO's order, and attach its peak.
    merged = pd.merge(
        peaks,
        motifs,
        left_on="name",
        right_on="sequence name",
        how="right",
        suffixes=("_peak", "_motif"),
    )

    # A hit without a peak would otherwise fail below with an opaque
    # "cannot convert float NaN to integer"; report the offending names instead.
    unmatched = merged["start_peak"].isna()
    if unmatched.any():
        missing = sorted(set(merged.loc[unmatched, "sequence name"].astype(str)))
        raise ValueError(
            "FIMO hits refer to sequences that are absent from the reference GTF: "
            + ", ".join(missing)
        )

    peak_start = merged["start_peak"].astype(int)
    merged["start_motif_genomic"] = peak_start + merged["start_motif"].astype(int) - DNAflank - 1
    merged["end_motif_genomic"] = peak_start + merged["end_motif"].astype(int) - DNAflank - 1

    bed4_columns = ["seqname", "start_motif_genomic", "end_motif_genomic", "name"]
    merged[bed4_columns].to_csv(outfile, header=False, index=False, sep="\t")

In [4]:
# Sox: Sox2 FIMO hits, located via the Sox peak GTF
infile = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/EBZ_fimo/Sox2_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Sox_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/Sox_motifs_200.bed"

process(infile=infile, reference=reference, outfile=outfile)

In [5]:
# SoxNucHi74 FIMO hits, located via the Sox peak GTF
infile = "~/SoxOct/Soufi2015/EBZ_fimo/Sox2-SoxNucHi74_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Sox_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/SoxNucHi74_motifs_200.bed"

process(infile=infile, reference=reference, outfile=outfile)

In [6]:
# Oct4 FIMO hits, located via the Oct peak GTF
infile = "~/SoxOct/Soufi2015/EBZ_fimo/Oct4_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Oct_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/Oct_motifs_200.bed"

process(infile=infile, reference=reference, outfile=outfile)

In [7]:
# OctNucHi28 FIMO hits, located via the Oct peak GTF
infile = "~/SoxOct/Soufi2015/EBZ_fimo/Oct4-OctNucHi28_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Oct_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/OctNucHi28_motifs_200.bed"

process(infile=infile, reference=reference, outfile=outfile)

In [8]:
# OctNucHi42 FIMO hits, located via the Oct peak GTF
# NOTE(review): the input directory says "OctNucHi42" but the output file is named
# "OctNucHi4" -- confirm whether the missing "2" is intentional before renaming it,
# since downstream notebooks may read this exact file name.
infile = "~/SoxOct/Soufi2015/EBZ_fimo/Oct4-OctNucHi42_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Oct_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/OctNucHi4_motifs_200.bed"

process(infile=infile, reference=reference, outfile=outfile)

## stranded processing

In [9]:
def process_strand(infile, reference, outfile, DNAflank):
    """Map FIMO motif hits onto genomic coordinates and write them as a stranded BED6 file.

    FIMO reports each hit relative to the sequence that was submitted to it. Every
    hit is joined to its source peak (FIMO "sequence name" == peak "name") and the
    peak start is added to the hit's start/stop. The hit's own strand is carried
    through to the output.

    Parameters
    ----------
    infile : str
        Path to a tab-separated FIMO results table. It must provide the columns
        "sequence name", "start", "stop" and "strand".
    reference : str
        Path to the GTF describing the submitted sequences. It is parsed with the
        module-level ``GTF.dataframe`` helper; the resulting frame must provide
        "seqname", "start", "end", "strand" and "name".
    outfile : str
        Path of the headerless, tab-separated file to write. Columns are:
        seqname, genomic start, genomic end, peak name, "." placeholder, motif strand.
    DNAflank : int
        Offset from the reference GTF, subtracted from both genomic coordinates.
        This is normally 200, but sometimes it is 0.

    Raises
    ------
    ValueError
        If at least one FIMO hit refers to a sequence name that has no peak in
        ``reference`` (such hits have no peak start to offset from).

    Notes
    -----
    NOTE(review): the coordinates are written as ``start_peak + start - DNAflank - 1``
    and ``start_peak + stop - DNAflank - 1``. Whether that matches BED's 0-based,
    half-open convention depends on how the reference GTF coordinates were
    produced -- confirm against the preparation notebook.
    """
    peaks = GTF.dataframe(filename=reference)

    # "stop" is renamed so that the merge suffixes yield the symmetric names
    # start_motif / end_motif (and strand_motif for the hit's strand).
    motifs = (
        pd.read_table(infile)[["sequence name", "start", "stop", "strand"]]
        .rename(columns={"stop": "end"})
    )

    # Right join: keep every FIMO hit, in FIMO's order, and attach its peak.
    merged = pd.merge(
        peaks,
        motifs,
        left_on="name",
        right_on="sequence name",
        how="right",
        suffixes=("_peak", "_motif"),
    )

    # A hit without a peak would otherwise fail below with an opaque
    # "cannot convert float NaN to integer"; report the offending names instead.
    unmatched = merged["start_peak"].isna()
    if unmatched.any():
        missing = sorted(set(merged.loc[unmatched, "sequence name"].astype(str)))
        raise ValueError(
            "FIMO hits refer to sequences that are absent from the reference GTF: "
            + ", ".join(missing)
        )

    # do the coordinate conversion
    peak_start = merged["start_peak"].astype(int)
    merged["start_motif_genomic"] = peak_start + merged["start_motif"].astype(int) - DNAflank - 1
    merged["end_motif_genomic"] = peak_start + merged["end_motif"].astype(int) - DNAflank - 1

    # The fifth BED column is filled with a "." placeholder so that the strand
    # lands in the sixth column.
    merged["dummy"] = "."
    bed6_columns = [
        "seqname",
        "start_motif_genomic",
        "end_motif_genomic",
        "name",
        "dummy",
        "strand_motif",
    ]
    merged[bed6_columns].to_csv(outfile, header=False, index=False, sep="\t")
In [10]:
# Sox: Sox2 FIMO hits, located via the Sox peak GTF (stranded output)
infile = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/EBZ_fimo/Sox2_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Sox_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/Sox_motifs_200.strand.bed"

process_strand(infile=infile, reference=reference, outfile=outfile, DNAflank=0)

In [11]:
# SoxNucHi74 FIMO hits, located via the Sox peak GTF (stranded output)
infile = "~/SoxOct/Soufi2015/EBZ_fimo/Sox2-SoxNucHi74_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Sox_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/SoxNucHi74_motifs_200.strand.bed"

process_strand(infile=infile, reference=reference, outfile=outfile, DNAflank=0)

In [12]:
# Oct4 FIMO hits, located via the Oct peak GTF (stranded output)
infile = "~/SoxOct/Soufi2015/EBZ_fimo/Oct4_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Oct_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/Oct_motifs_200.strand.bed"

process_strand(infile=infile, reference=reference, outfile=outfile, DNAflank=0)

In [13]:
# OctNucHi28 FIMO hits, located via the Oct peak GTF (stranded output)
infile = "~/SoxOct/Soufi2015/EBZ_fimo/Oct4-OctNucHi28_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Oct_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/OctNucHi28_motifs_200.strand.bed"

process_strand(infile=infile, reference=reference, outfile=outfile, DNAflank=0)

In [14]:
# OctNucHi42 FIMO hits, located via the Oct peak GTF (stranded output)
# NOTE(review): the input directory says "OctNucHi42" but the output file is named
# "OctNucHi4" -- confirm whether the missing "2" is intentional before renaming it,
# since downstream notebooks may read this exact file name.
infile = "~/SoxOct/Soufi2015/EBZ_fimo/Oct4-OctNucHi42_200/fimo.txt"
reference = "/ru-auth/local/home/ezheng/SoxOct/Soufi2015/Oct_peaks_200.gtf"
outfile = "~/SoxOct/public/Soufi2015/OctNucHi4_motifs_200.strand.bed"

process_strand(infile=infile, reference=reference, outfile=outfile, DNAflank=0)