# Finding sgRNAs

All possible sgRNAs (20 nt followed by NGG) in the miRNA stem-loop sequence plus the 20 nt on either side were identified. The sequences of these possible sgRNAs were then loaded into the SingleGuideRNA table if they were not already present in it. For every miRNA targeted by an sgRNA, the SgID, PriID, chromosome, strand, sgRNA genomic start and end sites, the start and end of the Cas9 cleavage site, and the PAM were then added to the SgRNATargetInformation table.

To maintain consistency among the various versions of the database, the SgID for a given sgRNA sequence was determined using the SgID-SgRNA pairs downloaded from the original version of the database.

In [1]:
import re
import data_processing as dp

from string import maketrans

def _sg_coordinates(pri_strand, sg_strand, pri_start, pri_end, offset):
    """
        Converts the position of a 20-nt protospacer inside LongSeq into genomic coordinates

        pri_strand: strand of the primary microRNA ("+" or anything else, treated as "-")
        sg_strand: strand of the sgRNA
        pri_start, pri_end: genomic start and end of the primary microRNA
        offset: 0-based index of the first protospacer base within LongSeq

        Returns (sg_start, sg_end, cleave_start, cleave_end)
    """
    # LongSeq carries 20 extra nts on each side of the primary microRNA, so index 0
    # of LongSeq sits 20 nts outside pri_start (+ strand) or pri_end (- strand)
    if pri_strand == "+":
        sg_start = pri_start - 20 + offset
        sg_end = sg_start + 19
    else:
        sg_end = pri_end + 20 - offset
        sg_start = sg_end - 19

    # The cleavage site is the pair of bases 3 and 4 nts in from the PAM-proximal
    # end of the protospacer: the high-coordinate end for "+" sgRNAs, the
    # low-coordinate end otherwise
    if sg_strand == "+":
        clev_start = sg_start + 16
        clev_end = sg_start + 17
    else:
        clev_start = sg_start + 2
        clev_end = sg_start + 3
    return sg_start, sg_end, clev_start, clev_end


def find_sgRNAs(sgID_file, db_name, sql_version="MySQL", firewall=False):
    """
        Finds all possible sgRNAs within each primary microRNA sequence and adds them to the
        SingleGuideRNA and SgRNATargetInformation tables (both tables are cleared first)

        Uses sgID_file to consistently associate the same sgRNA with the same sgID

        sgID_file: path of a comma separated file whose first column is the SgID and whose
                   second column is the sgRNA sequence
        db_name: name of the database to connect to
        sql_version: "MSSQL" rows are read by attribute, any other value by tuple unpacking
        firewall: passed through to dp.DatabaseConnection

        Raises KeyError if an sgRNA is found whose sequence is not listed in sgID_file
    """
    # Dictionary to find the sgID for a given sgRNA sequence.  It is filled before the
    # database is touched so an unreadable file cannot leave the tables cleared.
    sg_seq_to_id = {}
    with open(sgID_file, "r") as fin:
        for line in fin:
            line = line.rstrip("\r\n")
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            ele = line.split(",")
            sg_seq_to_id[ele[1]] = int(ele[0])

    # These regular expressions find sgRNAs (any sequence of 20 ATCG's followed by NGG) on either strand.
    # The lookahead wrapper lets overlapping candidates all be reported.
    sgRE = re.compile("(?=([ATCG]{20})([ATCG]GG))")
    revSGRE = re.compile("(?=(CC[ATCG])([ATCG]{20}))")

    # Dictionaries to hold the information about the rows to be created
    sg_dict = {"SgID": [], "SgRNA": []}
    sg_targ_dict = {"SgID": [], "PriID": [], "PAM": [], "SgStrand": [], "SgChr": [],
                    "SgStart": [], "SgEnd": [], "CleaveStart": [], "CleaveEnd": []}
    # sgRNA sequences already queued for SingleGuideRNA (set membership is O(1),
    # whereas searching sg_dict["SgRNA"] is a linear scan for every hit)
    seen_sgs = set()

    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        # Clear the tables
        db_con.clear_table("SgRNATargetInformation")
        db_con.clear_table("SingleGuideRNA")

        # Get a list of all of the primary microRNAs from the PrimaryMicroRNA table on the server
        rows = db_con.fetch_query("SELECT PriID, LongSeq, Chr, ChrStrand, GenomeStart, GenomeEnd FROM PrimaryMicroRNA;")

        # Loop through each primary sequence and find all of the sgRNAs
        for row in rows:
            # load data associated with row
            if sql_version == "MSSQL":
                seq = row.LongSeq
                priID = row.PriID
                chrom = row.Chr
                strand = str(row.ChrStrand)
                pri_start = row.GenomeStart
                pri_end = row.GenomeEnd
            else:
                priID, seq, chrom, strand, pri_start, pri_end = row
                priID = str(priID)
                seq = str(seq)
                chrom = str(chrom)
                strand = str(strand)

            seq = seq.upper()  # Capitalize the sequence
            opposite = "-" if strand == "+" else "+"

            # (sgRNA, PAM, sgRNA strand, index of the protospacer in seq) for every hit:
            # first those on the same strand as the microRNA (tested by sg24927 and sg3244) ...
            hits = [(m.group(1), m.group(2), strand, m.start(1))
                    for m in sgRE.finditer(seq)]
            # ... then those on the opposite strand, which are read off as the reverse
            # complement of the CCN + 20 nt match (tested by sg17089 and sg11638)
            hits += [(rev_comp(m.group(2)), rev_comp(m.group(1)), opposite, m.start(2))
                     for m in revSGRE.finditer(seq)]

            for sg, pam, sg_strand, offset in hits:
                sg_id = sg_seq_to_id[sg]

                # Add novel sgRNAs to SingleGuideRNA table
                if sg not in seen_sgs:
                    seen_sgs.add(sg)
                    sg_dict["SgRNA"].append(sg)
                    sg_dict["SgID"].append(sg_id)

                sg_start, sg_end, clev_start, clev_end = _sg_coordinates(
                    strand, sg_strand, pri_start, pri_end, offset)

                sg_targ_dict["SgID"].append(sg_id)
                sg_targ_dict["PriID"].append(priID)
                sg_targ_dict["PAM"].append(pam)
                sg_targ_dict["SgStrand"].append(sg_strand)
                sg_targ_dict["SgChr"].append(chrom)
                sg_targ_dict["SgStart"].append(sg_start)
                sg_targ_dict["SgEnd"].append(sg_end)
                sg_targ_dict["CleaveStart"].append(clev_start)
                sg_targ_dict["CleaveEnd"].append(clev_end)

        db_con.make_many_rows(sg_dict, "SingleGuideRNA")
        db_con.make_many_rows(sg_targ_dict, "SgRNATargetInformation")
    finally:
        # Always release the cursor and connection, even when a step above fails
        db_con.close_cursor()
        db_con.close_connection()
            
def rev_comp(seq):
    """
        Returns the reverse complement of a DNA sequence

        A/T and C/G are swapped and the result is reversed.  Any other character
        (e.g. N or lower-case bases) is passed through unchanged.
    """
    try:
        # Python 3: maketrans is a static method of str
        t = str.maketrans("ATCG", "TAGC")
    except AttributeError:
        # Python 2: str has no maketrans, so use the one from the string module
        from string import maketrans
        t = maketrans("ATCG", "TAGC")
    return seq.translate(t)[::-1]

In [2]:
# Rebuild both sgRNA tables of the test database from the saved SgID/sgRNA pairs
find_sgRNAs(
    sgID_file="SgID_SgRNA_from_local_db.csv",
    db_name="miR-test",
    firewall=True,
)

In [3]:
# Count the distinct sgRNAs and the target sites recorded for them
db_con = dp.DatabaseConnection("MySQL", db_name="miR-test", firewall=True)
try:
    row_sg = db_con.fetch_query("SELECT COUNT(*) AS sgCount FROM SingleGuideRNA")
    row_targ = db_con.fetch_query("SELECT COUNT(*) AS targCount FROM SgRNATargetInformation")
finally:
    # Release the connection once both counts have been read
    db_con.close_cursor()
    db_con.close_connection()
row_sg, row_targ

([(26344,)], [(28398,)])

There are 26,344 distinct sgRNAs. These sgRNAs target 28,398 sites.

# Unit Testing

The sgRNA selection and genomic location data were tested. The tests do not work very well behind a firewall because only a limited number of SSH tunnels can be formed in a given period of time.

In [6]:
import unittest
import data_processing as dp

class TestSeq(unittest.TestCase):
    """
        Checks the sequence, PAM and genomic coordinates stored for sg24927 (MI0000060)

        Case (+, +): presumably (microRNA strand, sgRNA strand)
    """
    sql_version = "MySQL"
    sg_id = 24927

    @classmethod
    def setUpClass(cls):
        # A single connection is shared by every test in the class; opening one
        # per test uses up the ssh tunnels available behind the firewall
        cls.db_con = dp.DatabaseConnection(cls.sql_version, db_name="miR-test", firewall=True)

    @classmethod
    def tearDownClass(cls):
        # Release the shared connection once all tests have run
        cls.db_con.close_cursor()
        cls.db_con.close_connection()

    def _fetch_value(self, column, table):
        """Returns `column` from the first row of `table` whose SgID is self.sg_id"""
        rows = self.db_con.fetch_query("SELECT %s FROM %s WHERE SgID=%d" % (column, table, self.sg_id))
        row = rows[0]
        if self.sql_version == "MSSQL":
            # MSSQL rows expose their columns as attributes
            return getattr(row, column)
        val, = row
        return val

    ###### Sg24927 (+, +) #######

    def test_sg24927(self):
        """Tests MI0000060 sg24927"""
        sg = self._fetch_value("SgRNA", "SingleGuideRNA")
        self.assertEqual(str(sg), "CCTGGATGTTCTCTTCACTG", "Sg24927 sgRNA incorrect")

    def test_pam24927(self):
        """Tests MI0000060 sg24927"""
        pam = self._fetch_value("PAM", "SgRNATargetInformation")
        self.assertEqual(str(pam), "TGG", "Sg24927 PAM incorrect")

    def test_clev_start24927(self):
        """Tests MI0000060 sg24927"""
        val = self._fetch_value("CleaveStart", "SgRNATargetInformation")
        self.assertEqual(val, 94175953, "Sg24927 Cleavage Start incorrect")

    def test_clev_end24927(self):
        """Tests MI0000060 sg24927"""
        val = self._fetch_value("CleaveEnd", "SgRNATargetInformation")
        self.assertEqual(val, 94175954, "Sg24927 Cleavage End incorrect")

    def test_sg_start24927(self):
        """Tests MI0000060 sg24927"""
        start = self._fetch_value("SgStart", "SgRNATargetInformation")
        self.assertEqual(start, 94175937, "Sg24927 start site incorrect")

    def test_sg_end24927(self):
        """Tests MI0000060 sg24927"""
        end = self._fetch_value("SgEnd", "SgRNATargetInformation")
        self.assertEqual(end, 94175956, "Sg24927 end site incorrect")

In [7]:
# Collect every test method of TestSeq and run them with per-test output
loader = unittest.TestLoader()
suite = loader.loadTestsFromTestCase(TestSeq)
runner = unittest.TextTestRunner(verbosity=2)
runner.run(suite)

test_clev_end24927 (__main__.TestSeq)
Tests MI0000060 sg24927 ... ok
test_clev_start24927 (__main__.TestSeq)
Tests MI0000060 sg24927 ... ok
test_pam24927 (__main__.TestSeq)
Tests MI0000060 sg24927 ... ok
test_sg24927 (__main__.TestSeq)
Tests MI0000060 sg24927 ... ok
test_sg_end24927 (__main__.TestSeq)
Tests MI0000060 sg24927 ... ok
test_sg_start24927 (__main__.TestSeq)
Tests MI0000060 sg24927 ... ok

----------------------------------------------------------------------
Ran 6 tests in 6.888s

OK


<unittest.runner.TextTestResult run=6 errors=0 failures=0>

In [3]:
import unittest
import data_processing as dp

class TestSeq(unittest.TestCase):
    """
        Checks the sequence, PAM and genomic coordinates stored for sg3244 (MI0000061)

        Case (-, -): presumably (microRNA strand, sgRNA strand)
    """
    sql_version = "MySQL"
    sg_id = 3244

    @classmethod
    def setUpClass(cls):
        # A single connection is shared by every test in the class; opening one
        # per test uses up the ssh tunnels available behind the firewall
        cls.db_con = dp.DatabaseConnection(cls.sql_version, db_name="miR-test", firewall=True)

    @classmethod
    def tearDownClass(cls):
        # Release the shared connection once all tests have run
        cls.db_con.close_cursor()
        cls.db_con.close_connection()

    def _fetch_value(self, column, table):
        """Returns `column` from the first row of `table` whose SgID is self.sg_id"""
        rows = self.db_con.fetch_query("SELECT %s FROM %s WHERE SgID=%d" % (column, table, self.sg_id))
        row = rows[0]
        if self.sql_version == "MSSQL":
            # MSSQL rows expose their columns as attributes
            return getattr(row, column)
        val, = row
        return val

    ###### Sg3244 (-, -) #######

    def test_sg3244(self):
        """Tests MI0000061 sg3244"""
        sg = self._fetch_value("SgRNA", "SingleGuideRNA")
        self.assertEqual(str(sg), "CATTGTGACTGCATGCTCCC", "Sg3244 sgRNA incorrect")

    def test_pam3244(self):
        """Tests MI0000061 sg3244"""
        pam = self._fetch_value("PAM", "SgRNATargetInformation")
        self.assertEqual(str(pam), "AGG", "Sg3244 PAM incorrect")

    def test_clev_start3244(self):
        """Tests MI0000061 sg3244"""
        val = self._fetch_value("CleaveStart", "SgRNATargetInformation")
        self.assertEqual(val, 122146596, "Sg3244 Cleavage Start incorrect")

    def test_clev_end3244(self):
        """Tests MI0000061 sg3244"""
        val = self._fetch_value("CleaveEnd", "SgRNATargetInformation")
        self.assertEqual(val, 122146597, "Sg3244 Cleavage End incorrect")

    def test_sg_start3244(self):
        """Tests MI0000061 sg3244"""
        start = self._fetch_value("SgStart", "SgRNATargetInformation")
        self.assertEqual(start, 122146594, "Sg3244 start site incorrect")

    def test_sg_end3244(self):
        """Tests MI0000061 sg3244"""
        end = self._fetch_value("SgEnd", "SgRNATargetInformation")
        self.assertEqual(end, 122146613, "Sg3244 end site incorrect")

In [4]:
# Collect every test method of TestSeq and run them with per-test output
loader = unittest.TestLoader()
suite = loader.loadTestsFromTestCase(TestSeq)
runner = unittest.TextTestRunner(verbosity=2)
runner.run(suite)

test_clev_end3244 (__main__.TestSeq)
Tests MI0000061 sg3244 ... ok
test_clev_start3244 (__main__.TestSeq)
Tests MI0000061 sg3244 ... ok
test_pam3244 (__main__.TestSeq)
Tests MI0000061 sg3244 ... ok
test_sg3244 (__main__.TestSeq)
Tests MI0000061 sg3244 ... ok
test_sg_end3244 (__main__.TestSeq)
Tests MI0000061 sg3244 ... ok
test_sg_start3244 (__main__.TestSeq)
Tests MI0000061 sg3244 ... ok

----------------------------------------------------------------------
Ran 6 tests in 7.965s

OK


<unittest.runner.TextTestResult run=6 errors=0 failures=0>

In [5]:
import unittest
import data_processing as dp

class TestSeq(unittest.TestCase):
    """
        Checks the sequence, PAM and genomic coordinates stored for sg17089 (MI0005568)

        Case (+, -): presumably (microRNA strand, sgRNA strand)
    """
    sql_version = "MySQL"
    sg_id = 17089

    @classmethod
    def setUpClass(cls):
        # A single connection is shared by every test in the class; opening one
        # per test uses up the ssh tunnels available behind the firewall
        cls.db_con = dp.DatabaseConnection(cls.sql_version, db_name="miR-test", firewall=True)

    @classmethod
    def tearDownClass(cls):
        # Release the shared connection once all tests have run
        cls.db_con.close_cursor()
        cls.db_con.close_connection()

    def _fetch_value(self, column, table):
        """Returns `column` from the first row of `table` whose SgID is self.sg_id"""
        rows = self.db_con.fetch_query("SELECT %s FROM %s WHERE SgID=%d" % (column, table, self.sg_id))
        row = rows[0]
        if self.sql_version == "MSSQL":
            # MSSQL rows expose their columns as attributes
            return getattr(row, column)
        val, = row
        return val

    ###### Sg17089 (+, -) #######

    def test_sg17089(self):
        """Tests MI0005568 sg17089"""
        sg = self._fetch_value("SgRNA", "SingleGuideRNA")
        self.assertEqual(str(sg), "CAGAGCACCTGCGGCCAGCA", "Sg17089 sgRNA incorrect")

    def test_pam17089(self):
        """Tests MI0005568 sg17089"""
        pam = self._fetch_value("PAM", "SgRNATargetInformation")
        self.assertEqual(str(pam), "GGG", "Sg17089 PAM incorrect")

    def test_clev_start17089(self):
        """Tests MI0005568 sg17089"""
        val = self._fetch_value("CleaveStart", "SgRNATargetInformation")
        self.assertEqual(val, 21652978, "Sg17089 Cleavage Start incorrect")

    def test_clev_end17089(self):
        """Tests MI0005568 sg17089"""
        val = self._fetch_value("CleaveEnd", "SgRNATargetInformation")
        self.assertEqual(val, 21652979, "Sg17089 Cleavage End incorrect")

    def test_sg_start17089(self):
        """Tests MI0005568 sg17089"""
        start = self._fetch_value("SgStart", "SgRNATargetInformation")
        self.assertEqual(start, 21652976, "Sg17089 start site incorrect")

    def test_sg_end17089(self):
        """Tests MI0005568 sg17089"""
        end = self._fetch_value("SgEnd", "SgRNATargetInformation")
        self.assertEqual(end, 21652995, "Sg17089 end site incorrect")

In [6]:
# Collect every test method of TestSeq and run them with per-test output
loader = unittest.TestLoader()
suite = loader.loadTestsFromTestCase(TestSeq)
runner = unittest.TextTestRunner(verbosity=2)
runner.run(suite)

test_clev_end17089 (__main__.TestSeq)
Tests MI0005568 sg17089 ... ok
test_clev_start17089 (__main__.TestSeq)
Tests MI0005568 sg17089 ... ok
test_pam17089 (__main__.TestSeq)
Tests MI0005568 sg17089 ... ok
test_sg17089 (__main__.TestSeq)
Tests MI0005568 sg17089 ... ok
test_sg_end17089 (__main__.TestSeq)
Tests MI0005568 sg17089 ... ok
test_sg_start17089 (__main__.TestSeq)
Tests MI0005568 sg17089 ... ok

----------------------------------------------------------------------
Ran 6 tests in 6.906s

OK


<unittest.runner.TextTestResult run=6 errors=0 failures=0>

In [7]:
import unittest
import data_processing as dp

class TestSeq(unittest.TestCase):
    """
        Checks the sequence, PAM and genomic coordinates stored for sg11638 (MI0025914)

        Case (-, +): presumably (microRNA strand, sgRNA strand)
    """
    sql_version = "MySQL"
    sg_id = 11638

    @classmethod
    def setUpClass(cls):
        # A single connection is shared by every test in the class; opening one
        # per test uses up the ssh tunnels available behind the firewall
        cls.db_con = dp.DatabaseConnection(cls.sql_version, db_name="miR-test", firewall=True)

    @classmethod
    def tearDownClass(cls):
        # Release the shared connection once all tests have run
        cls.db_con.close_cursor()
        cls.db_con.close_connection()

    def _fetch_value(self, column, table):
        """Returns `column` from the first row of `table` whose SgID is self.sg_id"""
        rows = self.db_con.fetch_query("SELECT %s FROM %s WHERE SgID=%d" % (column, table, self.sg_id))
        row = rows[0]
        if self.sql_version == "MSSQL":
            # MSSQL rows expose their columns as attributes
            return getattr(row, column)
        val, = row
        return val

    ###### Sg11638 (-, +) #######

    def test_sg11638(self):
        """Tests MI0025914 sg11638"""
        sg = self._fetch_value("SgRNA", "SingleGuideRNA")
        self.assertEqual(str(sg), "GAATCCCAGGCCGGTCAGCC", "Sg11638 sgRNA incorrect")

    def test_pam11638(self):
        """Tests MI0025914 sg11638"""
        pam = self._fetch_value("PAM", "SgRNATargetInformation")
        self.assertEqual(str(pam), "CGG", "Sg11638 PAM incorrect")

    def test_clev_start11638(self):
        """Tests MI0025914 sg11638"""
        val = self._fetch_value("CleaveStart", "SgRNATargetInformation")
        self.assertEqual(val, 112315, "Sg11638 Cleavage Start incorrect")

    def test_clev_end11638(self):
        """Tests MI0025914 sg11638"""
        val = self._fetch_value("CleaveEnd", "SgRNATargetInformation")
        self.assertEqual(val, 112316, "Sg11638 Cleavage End incorrect")

    def test_sg_start11638(self):
        """Tests MI0025914 sg11638"""
        start = self._fetch_value("SgStart", "SgRNATargetInformation")
        self.assertEqual(start, 112299, "Sg11638 start site incorrect")

    def test_sg_end11638(self):
        """Tests MI0025914 sg11638"""
        end = self._fetch_value("SgEnd", "SgRNATargetInformation")
        self.assertEqual(end, 112318, "Sg11638 end site incorrect")

In [8]:
# Collect every test method of TestSeq and run them with per-test output
loader = unittest.TestLoader()
suite = loader.loadTestsFromTestCase(TestSeq)
runner = unittest.TextTestRunner(verbosity=2)
runner.run(suite)

test_clev_end11638 (__main__.TestSeq)
Tests MI0025914 sg11638 ... ok
test_clev_start11638 (__main__.TestSeq)
Tests MI0025914 sg11638 ... ok
test_pam11638 (__main__.TestSeq)
Tests MI0025914 sg11638 ... ok
test_sg11638 (__main__.TestSeq)
Tests MI0025914 sg11638 ... ok
test_sg_end11638 (__main__.TestSeq)
Tests MI0025914 sg11638 ... ok
test_sg_start11638 (__main__.TestSeq)
Tests MI0025914 sg11638 ... ok

----------------------------------------------------------------------
Ran 6 tests in 6.491s

OK


<unittest.runner.TextTestResult run=6 errors=0 failures=0>