In [9]:
import mysql.connector
import csv
import os
import sys

from datetime import datetime
from collections import defaultdict

In [10]:
# cnx = mysql.connector.connect(
#     user='genome', 
#     host='genome-mysql.cse.ucsc.edu',
#     database='hg19',
# )
# cursor = cnx.cursor()
# cursor.execute('SHOW COLUMNS FROM refGene')
# for entry in cursor:
#     print(entry[0])

In [11]:
def getTranscripts(assembly):
    """Download every refGene transcript of one assembly from the UCSC MySQL server.

    Parameters
    ----------
    assembly : str
        Name of the UCSC database to query (the notebook uses 'mm9' and 'hg19').

    Returns
    -------
    list of dict
        One dictionary per refGene row, keyed by the selected column names:
        'name', 'name2', 'exonStarts', 'exonEnds', 'strand' and 'chrom'.
    """
    cnx = mysql.connector.connect(
        user='genome',
        host='genome-mysql.cse.ucsc.edu',
        database=assembly,
    )
    # Release the cursor and the connection even when the query fails, so that
    # repeated calls do not pile up open connections on the server.
    try:
        cursor = cnx.cursor(dictionary=True)
        try:
            cursor.execute('SELECT name,name2,exonStarts,exonEnds,strand,chrom FROM refGene')
            # Materialise all rows before the cursor is closed.
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        cnx.close()

In [12]:
def createOutputFiles(transcripts, assembly, name):
    """Write BED tracks for TSS, TES, splice donors and splice acceptors.

    Four files are created: ``<name>_tss.bed``, ``<name>_tes.bed``,
    ``<name>_spliceDonors.bed`` and ``<name>_spliceAcceptors.bed``. Each starts
    with a ``track`` header line followed by one tab-separated row per distinct
    (chrom, strand, coordinate) position. Transcripts sharing a position are
    merged into a single row whose name column is the comma-joined list of
    transcript names. Rows are sorted by strand, then chromosome, then
    coordinate.

    Parameters
    ----------
    transcripts : iterable of dict
        Rows with the keys 'name', 'exonStarts', 'exonEnds', 'strand' and
        'chrom'. The exon fields are comma-separated coordinate lists, given
        either as ``bytes`` or as ``str``.
    assembly : str
        Assembly label, only used in the description of the track header.
    name : str
        Prefix of the output file names.

    Notes
    -----
    * Transcripts without exons or with a strand other than '+' / '-' are
      skipped and reported on stderr instead of aborting the run or silently
      inheriting the coordinates of the previous transcript.
    * All positions are collected before any file is opened and every file is
      written inside a ``with`` block, so no handle is left open on errors.
    * Rows end with a plain newline, matching the header line.
    """

    def toText(value):
        # Accept both bytes and already-decoded text for the exon fields.
        if isinstance(value, (bytes, bytearray)):
            return value.decode('utf-8')
        return value

    def parseCoordinates(field):
        # The exon fields look like "100,300," - drop the empty trailing item.
        return [item for item in toText(field).split(',') if item]

    def writeHeader(OUTPUT, name, description):
        OUTPUT.write('track name={} description="{}"\n'.format(
            name,
            description,
        ))

    def getPosition(transcript, coordinate):
        return (
            transcript['chrom'],
            transcript['strand'],
            str(coordinate),
        )

    def writeEntries(_dict, writer):
        sorted_positions = sorted(_dict, key=lambda x: (x[1], x[0], int(x[2])))
        for pos in sorted_positions:
            chrom, strand, coordinate = pos
            joined_name = ','.join(_dict[pos])
            # NOTE(review): every site is emitted as [coordinate-1, coordinate),
            # whether the coordinate came from exonStarts or from exonEnds.
            # UCSC documents exonStarts as 0-based and exonEnds as end-exclusive,
            # so start-derived sites may sit one base upstream - confirm that
            # this is the intended convention.
            writer.writerow([
                chrom,
                str(int(coordinate)-1),
                str(coordinate),
                joined_name,
                '0',
                strand,
            ])

    tss_dict = defaultdict(list)
    tes_dict = defaultdict(list)
    splice_donor_dict = defaultdict(list)
    splice_acceptor_dict = defaultdict(list)

    for transcript in transcripts:
        exon_starts = parseCoordinates(transcript['exonStarts'])
        exon_ends = parseCoordinates(transcript['exonEnds'])
        exons = sorted(zip(exon_starts, exon_ends), key=lambda exon: int(exon[0]))
        strand = transcript['strand']

        if not exons or strand not in ('+', '-'):
            print(
                'Skipping transcript {!r}: strand={!r}, {} exon(s)'.format(
                    transcript['name'], strand, len(exons)),
                file=sys.stderr,
            )
            continue

        if strand == '+':
            tss = exons[0][0]
            tes = exons[-1][1]
            donors = [end for _, end in exons[:-1]]
            acceptors = [start for start, _ in exons[1:]]
        else:
            # On the minus strand transcription runs from the highest
            # coordinate to the lowest, so the roles are mirrored.
            tss = exons[-1][1]
            tes = exons[0][0]
            donors = [start for start, _ in exons[1:]]
            acceptors = [end for _, end in exons[:-1]]

        tss_dict[getPosition(transcript, tss)].append(transcript['name'])
        tes_dict[getPosition(transcript, tes)].append(transcript['name'])
        for donor in donors:
            splice_donor_dict[getPosition(transcript, donor)].append(transcript['name'])
        for acceptor in acceptors:
            splice_acceptor_dict[getPosition(transcript, acceptor)].append(transcript['name'])

    outputs = [
        ('_tss.bed', 'RefGene TSS', tss_dict),
        ('_tes.bed', 'RefGene TES', tes_dict),
        ('_spliceDonors.bed', 'RefGene Splice Donors', splice_donor_dict),
        ('_spliceAcceptors.bed', 'RefGene Splice Acceptors', splice_acceptor_dict),
    ]
    today = datetime.now().date()

    for suffix, label, positions in outputs:
        filename = name + suffix
        # newline='' plus an explicit '\n' terminator keeps csv.writer from
        # emitting '\r\n', which would not match the header's line ending.
        with open(filename, 'w', newline='') as OUTPUT:
            writeHeader(OUTPUT, filename, '{}: {}, {}'.format(label, assembly, today))
            writeEntries(positions, csv.writer(OUTPUT, delimiter='\t', lineterminator='\n'))

In [22]:
def renameDuplicateEntries(transcripts):
    """Make transcript names unique by appending a running occurrence index.

    Every entry is modified in place: the n-th transcript carrying a given
    name is renamed to ``<name>:<n>``. Names that occur only once therefore
    still receive the suffix ``:1``. Nothing is returned.
    """
    occurrences = {}

    for record in transcripts:
        original_name = record['name']
        seen_so_far = occurrences.get(original_name, 0) + 1
        occurrences[original_name] = seen_so_far
        record['name'] = original_name + ':' + str(seen_so_far)

In [23]:
# transcripts = getTranscripts('mm9')

In [24]:
# createOutputFiles(transcripts, 'mm9', 'test')

In [25]:
# Build the four BED tracks for each assembly. The assembly name is also used
# as the file-name prefix, giving e.g. mm9_tss.bed and hg19_tss.bed.
for assembly in ('mm9', 'hg19'):
    transcripts = getTranscripts(assembly)
    # Names are suffixed in place so that repeated refGene names stay distinguishable.
    renameDuplicateEntries(transcripts)
    createOutputFiles(transcripts, assembly, name=assembly)