# Cleave Site Position

Once the genomic location of each cleavage site had been determined, its position relative to the miRNA stem-loop was annotated. Cleavage sites that fall outside the stem-loop are marked 'ext'. sgRNAs that cleave inside a mature miRNA are annotated 'miR-5p' or 'miR-3p' when the name of the mature miRNA includes the strand information, and 'miR' otherwise. All other cleavage sites (inside the stem-loop but outside every mature miRNA) are marked 'other'.

In [1]:
import data_processing as dp

def _endpoint_within(start, end, low, high):
    """Return True if either endpoint (start or end) lies in the closed interval [low, high]."""
    return low <= start <= high or low <= end <= high


def _annotate_cleavage_rows(rows, sql_version):
    """
        Classify every (sgRNA, primary miRNA) pair found in the query rows.

        Each row pairs one sgRNA cleavage site with one mature miRNA of the
        targeted primary miRNA, so the same (SgID, PriID) key can appear in
        several rows. Annotations are merged with the precedence
        miR / miR-5p / miR-3p  >  other  >  ext.

        rows        -- iterable of result rows in the column order of the
                       SELECT issued by find_cleavage
        sql_version -- "MSSQL" rows are read by attribute name; any other
                       value unpacks the row as a 10-item sequence

        Returns a dict mapping (SgID, PriID) -> [annotation, MatID or None],
        in first-seen key order.
    """
    annotations = {}
    for row in rows:
        if sql_version == "MSSQL":
            sg_id = row.SgID
            c_start = row.CleaveStart
            c_end = row.CleaveEnd
            pri_id = row.PriID
            p_start = row.GenomeStart
            p_end = row.GenomeEnd
            mat_id = row.MatID
            mat_name = row.MatMiRName
            m_start = row.MatStart
            m_end = row.MatEnd
        else:
            (sg_id, c_start, c_end, pri_id, p_start, p_end,
             mat_id, mat_name, m_start, m_end) = row
            pri_id = str(pri_id)
            mat_id = str(mat_id)

        key = (sg_id, pri_id)

        if not _endpoint_within(c_start, c_end, p_start, p_end):
            # cleavage site is outside the stem-loop: 'ext' is the weakest
            # annotation and must never replace one that is already present
            annotations.setdefault(key, ["ext", None])
        elif not _endpoint_within(c_start, c_end, m_start, m_end):
            # inside the stem-loop but outside this mature miRNA.
            # 'other' replaces 'ext' but never a miR annotation; the upgrade
            # only happens when same sg targets same miRNA twice
            # ie sg14635, 13546, 26080 with MI0016783, 16832, 25911
            if key not in annotations or annotations[key][0] == "ext":
                annotations[key] = ["other", None]
        elif "3p" in mat_name:
            annotations[key] = ["miR-3p", mat_id]
        elif "5p" in mat_name:
            annotations[key] = ["miR-5p", mat_id]
        else:
            # mature miRNA name carries no strand information
            annotations[key] = ["miR", mat_id]
    return annotations


def find_cleavage(db_name, sql_version="MySQL", firewall=False):
    """
        Figures out where the sgRNA cleavage site falls in the miRNA

        Reads every sgRNA target joined with its primary miRNA and that
        primary miRNA's mature miRNAs, classifies each (SgID, PriID) pair as
        'ext', 'other', 'miR', 'miR-5p' or 'miR-3p', and writes the result to
        the CleavageSite and MatID columns of SgRNATargetInformation.

        db_name     -- database name handed to dp.DatabaseConnection
        sql_version -- SQL flavour handed to dp.DatabaseConnection; "MSSQL"
                       rows are read by attribute, anything else as tuples
        firewall    -- passed through unchanged to dp.DatabaseConnection

        Returns None. The cursor and connection are closed even when the
        query, the annotation step or the update raises.
    """
    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)

    try:
        rows = db_con.fetch_query("""SELECT t.SgID, t.CleaveStart, t.CleaveEnd, 
p.PriID, p.GenomeStart, p.GenomeEnd, 
m.MatID, m.MatMiRName, m.MatStart, m.MatEnd
FROM SgRNATargetInformation AS t
JOIN PrimaryMicroRNA AS p
ON t.PriID = p.PriID
JOIN MatureMicroRNA AS m
ON p.PriID = m.PriID""")

        annotations = _annotate_cleavage_rows(rows, sql_version)

        # reformat for update rows: parallel lists, one entry per key
        keys = list(annotations)
        site_dict = {
            "CleavageSite": [annotations[key][0] for key in keys],
            "MatID": [annotations[key][1] for key in keys],
        }
        sg_dict = {
            "SgID": [sg for sg, _ in keys],
            "PriID": [pri for _, pri in keys],
        }

        db_con.update_many_rows(site_dict, sg_dict, "SgRNATargetInformation")
    finally:
        # always release the database resources; the nested finally makes
        # sure the connection is closed even if closing the cursor fails
        try:
            db_con.close_cursor()
        finally:
            db_con.close_connection()

In [2]:
# Annotate cleavage-site positions in the "miR-test" database
# (default sql_version, firewall flag enabled).
find_cleavage(db_name="miR-test", firewall=True)