# Get miRNA Sequence

The simplest way to get the genomic sequence of these regions is to create a UCSC custom track and then download the sequences associated with that track using the UCSC genome browser.

To do this, a custom track in <a href="https://genome.ucsc.edu/FAQ/FAQformat.html#format3">GFF format</a> was first created containing the genomic location of each primary miRNA, extended by 20 bp on each side (&plusmn;20 bp).

In [2]:
import data_processing as dp

def make_track(track_file_loc, db_name, sql_version="MySQL", firewall=False, flank=20):
    """Write a GFF custom track holding every primary miRNA with padding.

    One tab-separated line is written per row of the PrimaryMicroRNA table,
    using "MiRBase" as the source, "ExtSeq" as the feature and the PriID in
    the last (group) column.

    Args:
        track_file_loc: Path of the GFF file to create; an existing file is
            overwritten.
        db_name: Database name handed to dp.DatabaseConnection.
        sql_version: With "MSSQL" each row is read by column attribute; with
            any other value (default "MySQL") each row is unpacked as a tuple
            in SELECT order.
        firewall: Handed through to dp.DatabaseConnection.
        flank: Number of bases subtracted from GenomeStart and added to
            GenomeEnd. Defaults to 20. The same value has to be used when the
            downloaded sequences are trimmed back to the stem-loop.
    """
    gff_line = "chr{}\tMiRBase\tExtSeq\t{}\t{}\t.\t{}\t.\t{}\n"
    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        rows = db_con.fetch_query("SELECT PriID, Chr, ChrStrand, GenomeEnd, GenomeStart FROM PrimaryMicroRNA")

        with open(track_file_loc, "w") as fout:
            for row in rows:
                if sql_version == "MSSQL":
                    # Attribute names follow the SELECT list exactly
                    # (ChrStrand, not chrStrand).
                    pri_id, chrom, strand = row.PriID, row.Chr, row.ChrStrand
                    start, end = row.GenomeStart, row.GenomeEnd
                else:
                    # Tuple order mirrors the SELECT: GenomeEnd precedes GenomeStart.
                    pri_id, chrom, strand, end, start = row
                # NOTE(review): start - flank is not clamped, so a miRNA within
                # `flank` bases of a chromosome start yields a coordinate < 1.
                fout.write(gff_line.format(chrom, start - flank, end + flank, strand, pri_id))
    finally:
        # Release the database handles even when the query or the write fails.
        db_con.close_cursor()
        db_con.close_connection()

In [3]:
# Write the padded primary-miRNA track for the "miR-test" database to Extended_miR_track.gff.
make_track("Extended_miR_track.gff", "miR-test", firewall=True)

The created track was then uploaded using <a href="https://genome.ucsc.edu/cgi-bin/hgCustom">Add Custom Tracks</a>. The custom track was then viewed in <a href="https://genome.ucsc.edu/cgi-bin/hgTables">Table Browser</a> and the "sequence" option under output format was selected. The output sequences in FASTA format were downloaded and saved as "Extended_miR_Sequences.fa".

The extended and primary sequences were then added to the database.

In [4]:
import re
import data_processing as dp

def _parse_extended_fasta(fasta_file, flank):
    """Return (pri_ids, long_seqs, stem_loop_seqs) read from a FASTA file.

    The three lists are parallel: entry i of each belongs to the same record.
    Records whose sequence is empty are skipped.
    """
    pri_re = re.compile("_(MI[0-9]{7}) ")
    pri_ids, long_seqs, stem_loop_seqs = [], [], []

    def store(pri_id, chunks):
        seq = "".join(chunks)
        if not seq:
            return
        if pri_id is None:
            raise ValueError("sequence data found before the first FASTA header in {}".format(fasta_file))
        pri_ids.append(pri_id)
        long_seqs.append(seq)
        # Drop `flank` bases from each end; written with len() so flank=0
        # keeps the whole sequence (seq[0:-0] would be empty).
        stem_loop_seqs.append(seq[flank:len(seq) - flank])

    pri_id = None
    chunks = []
    with open(fasta_file, "r") as fin:
        for line in fin:
            if line.startswith(">"):
                # A new header closes the record collected so far.
                store(pri_id, chunks)
                chunks = []
                match = pri_re.search(line)
                if match is None:
                    raise ValueError("no primary miRNA ID in FASTA header: {!r}".format(line))
                pri_id = match.group(1)
            else:
                # strip() also removes a trailing "\r" from CRLF files.
                chunks.append(line.strip())
    # The final record has no following header to close it.
    store(pri_id, chunks)
    return pri_ids, long_seqs, stem_loop_seqs


def import_pri_seq(fasta_file, db_name, sql_version="MySQL", firewall=False, flank=20):
    """Store extended and stem-loop sequences from a FASTA file in PrimaryMicroRNA.

    Each record's full sequence is written to LongSeq and the same sequence
    with `flank` bases removed from both ends is written to StemLoopSeq, keyed
    by the PriID taken from the record's header line.

    Args:
        fasta_file: Path of the FASTA file to read. Header lines must contain
            "_MI" followed by seven digits and a space.
        db_name: Database name handed to dp.DatabaseConnection.
        sql_version: Handed to dp.DatabaseConnection (default "MySQL").
        firewall: Handed through to dp.DatabaseConnection.
        flank: Bases to trim from each end to obtain the stem-loop sequence.
            Defaults to 20.

    Raises:
        ValueError: If flank is negative, a header carries no primary miRNA
            ID, or sequence lines appear before the first header.
    """
    if flank < 0:
        raise ValueError("flank must be non-negative")

    # Parse before connecting so a bad file never opens a database connection.
    pri_ids, long_seqs, stem_loop_seqs = _parse_extended_fasta(fasta_file, flank)
    if not pri_ids:
        # Empty file: nothing to update.
        return

    seq_dict = {"StemLoopSeq": stem_loop_seqs, "LongSeq": long_seqs}
    pri_dict = {"PriID": pri_ids}

    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        db_con.update_many_rows(seq_dict, pri_dict, "PrimaryMicroRNA")
    finally:
        # Release the database handles even when the update fails.
        db_con.close_cursor()
        db_con.close_connection()

In [5]:
# Load the sequences downloaded as Extended_miR_Sequences.fa into the "miR-test" database.
import_pri_seq("Extended_miR_Sequences.fa", "miR-test", firewall=True)

Once the primary miRNA sequences were loaded into the database, the mature miRNA sequences were added. Each mature sequence was extracted from its primary (stem-loop) sequence using the genomic start and end coordinates of the mature and primary miRNA.

In [8]:
import data_processing as dp

def _as_text(value):
    """Return value as str, decoding bytes/bytearray instead of taking their repr."""
    if isinstance(value, bytearray):
        value = bytes(value)
    # On Python 2 bytes is str, so this branch only runs on Python 3.
    if isinstance(value, bytes) and not isinstance(value, str):
        value = value.decode("utf-8")
    return str(value)


def import_mat_seq(db_name, sql_version="MySQL", firewall=False):
    """Cut each mature miRNA sequence out of its stem-loop and store it as MatSeq.

    PrimaryMicroRNA is joined to MatureMicroRNA on PriID. For every joined row
    the mature sequence is sliced from StemLoopSeq using the offsets between
    the mature and primary genomic coordinates, then written to
    MatureMicroRNA.MatSeq keyed by MatID.

    Args:
        db_name: Database name handed to dp.DatabaseConnection.
        sql_version: With "MSSQL" each row is read by column attribute; with
            any other value (default "MySQL") each row is unpacked as a tuple
            in SELECT order.
        firewall: Handed through to dp.DatabaseConnection.
    """
    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        rows = db_con.fetch_query("""SELECT p.StemLoopSeq, p.GenomeStart, p.GenomeEnd, p.ChrStrand, m.MatID, m.MatStart, m.MatEnd
FROM PrimaryMicroRNA AS p
JOIN MatureMicroRNA AS m
ON p.PriID = m.PriID;""")

        mat_seqs = []
        mat_ids = []
        for row in rows:
            if sql_version == "MSSQL":
                strand = row.ChrStrand
                p_start = row.GenomeStart
                p_end = row.GenomeEnd
                seq = row.StemLoopSeq
                mat_id = row.MatID
                m_start = row.MatStart
                m_end = row.MatEnd
            else:
                seq, p_start, p_end, strand, mat_id, m_start, m_end = row
                # MySQL can hand back bytearrays rather than strings.
                seq = _as_text(seq)
                mat_id = _as_text(mat_id)

            # Distance of the mature miRNA from each genomic end of the primary.
            # NOTE(review): assumes the mature coordinates lie inside the
            # primary ones; this is not validated here.
            start_offset = m_start - p_start
            end_offset = p_end - m_end

            # On any strand other than "+", the offsets swap ends.
            if strand == "+":
                lead, trail = start_offset, end_offset
            else:
                lead, trail = end_offset, start_offset

            # seq[lead:-0] would be empty, so a zero trail means "to the end".
            mat_seq = seq[lead:-trail] if trail != 0 else seq[lead:]

            mat_seqs.append(mat_seq)
            mat_ids.append(mat_id)

        db_con.update_many_rows({"MatSeq": mat_seqs}, {"MatID": mat_ids}, "MatureMicroRNA")
    finally:
        # Release the database handles even when the query or update fails.
        db_con.close_cursor()
        db_con.close_connection()

In [9]:
# Fill in the mature miRNA sequences for the "miR-test" database.
import_mat_seq("miR-test", firewall=True)

# Unit Tests

To verify the correct sequences were added, the primary, extended and mature sequences were tested using Python's <a href="https://docs.python.org/2/library/unittest.html">unittest module</a>.

In [10]:
import unittest
import data_processing as dp

class TestSeq(unittest.TestCase):
    """Spot-check stem-loop, extended and mature sequences stored in miR-test.

    Each sequence type is checked for one miRNA whose test name ends in
    "_pos" and one whose test name ends in "_neg".
    """

    def setUp(self):
        """Open a fresh database connection for the test about to run."""
        self.sql_version = "MySQL"
        self.db_con = dp.DatabaseConnection(self.sql_version, db_name="miR-test", firewall=True)

    def tearDown(self):
        """Close the connection opened by setUp so tests do not leak handles."""
        self.db_con.close_cursor()
        self.db_con.close_connection()

    def _fetch_single(self, query, column):
        """Run query and return the value of column from its first row.

        Fails the test (instead of raising IndexError) when no row comes back.
        """
        rows = self.db_con.fetch_query(query)
        try:
            row = rows[0]
        except IndexError:
            self.fail("query returned no rows: " + query)
        if self.sql_version == "MSSQL":
            # MSSQL rows expose columns as attributes.
            return getattr(row, column)
        # Other back ends return a one-element tuple for a one-column SELECT.
        value, = row
        return value

    def test_stemloop_pos(self):
        seq = self._fetch_single(
            "SELECT StemLoopSeq FROM PrimaryMicroRNA WHERE PriID LIKE 'MI0022666'", "StemLoopSeq")
        self.assertEqual(seq, "GTGCGTGGTGGCTCGAGGCGGGGGTGGGGGCCTCGCCCTGCTTGGGCCCTCCCTGACCTCTCCGCTCCGCACAG",
                         "MI0022666 sequence incorrect")

    def test_stemloop_neg(self):
        seq = self._fetch_single(
            "SELECT StemLoopSeq FROM PrimaryMicroRNA WHERE PriID LIKE 'MI0001733'", "StemLoopSeq")
        self.assertEqual(seq, "GCTAAGCACTTACAACTGTTTGCAGAGGAAACTGAGACTTTGTAACTATGTCTCAGTCTCATCTGCAAAGAAGTAAGTGCTTTGC",
                         "MI0001733 sequence incorrect")

    def test_longseq_pos(self):
        seq = self._fetch_single(
            "SELECT LongSeq FROM PrimaryMicroRNA WHERE PriID LIKE 'MI0003167'", "LongSeq")
        self.assertEqual(seq, "ACAGTGCTGGAGCAAGAAGATCTCATGATGTGACCATCTGGAGGTAAGAAGCACTTTGTGTTTTGTGAAAGAAAGTGCTTCCTTTCAGAGGGTTACTCTTTGAGAAAAGCAGCATTGAAGTTGAT",
                         "MI0003167 long sequence incorrect")

    def test_longseq_neg(self):
        seq = self._fetch_single(
            "SELECT LongSeq FROM PrimaryMicroRNA WHERE PriID LIKE 'MI0005116'", "LongSeq")
        self.assertEqual(seq, "TACCTACCTGACCTAAGGGCTTTAGGCGCTGATGAAAGTGGAGTTCAGTAGACAGCCCTTTTCAAGCCCTACGAGAAACTGGGGTTTCTGGAGGAGAAGGAAGGTGATGAAGGATCTGTTCTCGTGAGCCTGAACTTTCTAGACAAAACATGTG",
                         "MI0005116 long sequence incorrect")

    def test_matseq_pos(self):
        seq = self._fetch_single(
            "SELECT MatSeq FROM MatureMicroRNA WHERE MatID LIKE 'MIMAT0002806'", "MatSeq")
        self.assertEqual(seq, "CAACCTGGAGGACTCCATGCTG", "MIMAT0002806 sequence incorrect")

    def test_matseq_neg(self):
        seq = self._fetch_single(
            "SELECT MatSeq FROM MatureMicroRNA WHERE MatID LIKE 'MIMAT0019074'", "MatSeq")
        self.assertEqual(seq, "ACTGGACTAGGAGTCAGAAGG", "MIMAT0019074 sequence incorrect")

In [11]:
# Collect every test method of TestSeq and run them, printing one result line per test.
suite = unittest.TestLoader().loadTestsFromTestCase(TestSeq)
unittest.TextTestRunner(verbosity=2).run(suite)

test_longseq_neg (__main__.TestSeq) ... ok
test_longseq_pos (__main__.TestSeq) ... ok
test_matseq_neg (__main__.TestSeq) ... ok
test_matseq_pos (__main__.TestSeq) ... ok
test_stemloop_neg (__main__.TestSeq) ... ok
test_stemloop_pos (__main__.TestSeq) ... ok

----------------------------------------------------------------------
Ran 6 tests in 7.087s

OK


<unittest.runner.TextTestResult run=6 errors=0 failures=0>