# sgRNA Extended Sequence
 
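We then fetched the LongSg sequence (NNNNsgRNANGGNNN, used for on-target activity scoring) for each sgRNA. When the extended sgRNA sequence lies entirely within the extended primary miRNA sequence, it is sliced directly from that stored sequence; when it goes beyond the primary miRNA extended sequence, it is retrieved using the <a href="http://rest.ensembl.org/documentation/info/sequence_region">ENSEMBL API</a>.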

In [1]:
import requests
import data_processing as dp

from string import maketrans

def add_extend_seq(db_name, sql_version="MySQL", firewall=False):
    """Compute the extended (LongSg) sequence of every sgRNA and store it.

    For each row joining SgRNATargetInformation, PrimaryMicroRNA and
    SingleGuideRNA, the sgRNA coordinates are widened by 4 bases upstream
    and 6 bases downstream (relative to the sgRNA strand). If that window
    fits inside the primary miRNA long sequence it is sliced from the
    stored sequence (reverse-complemented when the strands differ);
    otherwise it is requested from the ENSEMBL REST API.

    Only sequences whose inner part (``ext_seq[4:-6]``) equals the stored
    sgRNA are kept; mismatches are printed and skipped. The kept values are
    handed to ``db_con.update_many_rows`` for table SgRNATargetInformation.
    NOTE(review): presumably this sets LongSg on the rows identified by
    SgID/SgStart/SgEnd -- confirm against data_processing.

    Args:
        db_name: database name passed to ``dp.DatabaseConnection``.
        sql_version: "MSSQL" selects attribute-style row access; any other
            value unpacks each row as a 10-tuple.
        firewall: forwarded unchanged to ``dp.DatabaseConnection``.

    Raises:
        requests.exceptions.HTTPError: if ENSEMBL answers with an error
            status for a region request.
    """
    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)

    # try/finally so the cursor and connection are released even when a
    # query or an ENSEMBL request fails part-way through.
    try:
        rows = db_con.fetch_query("""SELECT t.SgID, t.SgChr, t.SgStrand, t.SgStart, t.SgEnd, s.SgRNA, 
p.LongSeq, p.GenomeStart, p.GenomeEnd, p.ChrStrand
FROM SgRNATargetInformation AS t 
JOIN PrimaryMicroRNA AS p
ON t.PriID = p.PriID
JOIN SingleGuideRNA AS s 
ON t.SgID=s.SgID""")
        long_dict = {"LongSg": []}
        sg_dict = {"SgID": [], "SgStart": [], "SgEnd": []}
        for row in rows:
            if sql_version == "MSSQL":
                sgID = row.SgID
                chrom = row.SgChr
                strand = row.SgStrand
                start = row.SgStart
                end = row.SgEnd
                sgRNA = row.SgRNA
                seq = str(row.LongSeq)
                g_start = row.GenomeStart
                g_end = row.GenomeEnd
                g_strand = row.ChrStrand
            else:
                (sgID, chrom, strand, start, end, sgRNA,
                 seq, g_start, g_end, g_strand) = row
                seq = str(seq)

            # Widen the sgRNA window: 4 bases upstream and 6 downstream in
            # sgRNA orientation, so the offsets swap on the minus strand.
            if strand == "-":
                strand_num = -1
                long_start = start - 6
                long_end = end + 4
            else:
                strand_num = 1
                long_start = start - 4
                long_end = end + 6

            # genome start and end have 20 subtracted/added to deal with
            # long sequence
            g_start = g_start - 20
            g_end = g_end + 20

            if long_start >= g_start and long_end <= g_end:
                # extended sgRNA sequence completely overlaps with long
                # miRNA sequence: slice it out of the stored sequence.
                if g_strand == "+":
                    sli_start = long_start - g_start
                    # the len is necessary to deal with the 0 special case
                    # (a bare negative offset of 0 would yield an empty slice)
                    sli_end = len(seq) + long_end - g_end
                else:
                    sli_start = g_end - long_end
                    # the len is necessary to deal with the 0 special case
                    sli_end = len(seq) + g_start - long_start

                ext_seq = seq[sli_start:sli_end]
                if strand != g_strand:
                    # stored sequence is on the opposite strand to the sgRNA
                    ext_seq = rev_comp(ext_seq)
            else:
                # Query ENSEMBL
                url = "http://rest.ensembl.org/sequence/region/homo_sapiens/{}:{}..{}:{}".format(
                    chrom, long_start, long_end, strand_num)
                r = requests.get(url, headers={"Content-Type": "text/plain"})
                # Abort on a bad response; raise_for_status() raises for
                # every status that makes r.ok false, so nothing else is
                # needed here.
                r.raise_for_status()
                # Drop any surrounding whitespace so the [4:-6] check below
                # is not shifted by a trailing newline in the body.
                ext_seq = str(r.text).strip()

            if ext_seq[4:-6] != sgRNA:
                # Single-argument parenthesised form prints identically on
                # Python 2 and Python 3.
                print("sg{}: LongSgRNA {} does not match sgRNA {}".format(sgID, ext_seq, sgRNA))
            else:
                long_dict["LongSg"].append(ext_seq)
                sg_dict["SgID"].append(sgID)
                sg_dict["SgStart"].append(start)
                sg_dict["SgEnd"].append(end)
        db_con.update_many_rows(long_dict, sg_dict, "SgRNATargetInformation")
    finally:
        db_con.close_cursor()
        db_con.close_connection()
                
def rev_comp(seq):
    """Return the reverse complement of a DNA sequence.

    A/T and C/G are swapped, preserving case, and the result is reversed.
    Any other character (for example ``N``) is kept unchanged. Uses a plain
    dict lookup rather than ``string.maketrans`` so it works on both byte
    and unicode strings and does not depend on a Python-2-only helper.

    Args:
        seq: the nucleotide sequence to reverse-complement.

    Returns:
        The reverse-complemented sequence; an empty input gives "".
    """
    complement = {
        "A": "T", "T": "A", "C": "G", "G": "C",
        "a": "t", "t": "a", "c": "g", "g": "c",
    }
    return "".join(complement.get(base, base) for base in reversed(seq))

In [None]:
# Fill in the LongSg sequences for the "miR-test" database, using the
# default SQL backend and with the firewall flag enabled.
add_extend_seq(db_name="miR-test", firewall=True)