# Find Overlapping Seed sgRNAs

The <b>OverlappingSgRNAs</b> table records which sgRNAs have identical seeds (the 12 bases closest to the PAM). sgRNAs that target the same miRNA and share a seed are likely to have similar off-target sites and may create false positives. Therefore, only one sgRNA with any given seed is used when there are enough sgRNAs to allow the top four to be chosen. The SelectedSgID has a higher ZhangScore than the OverlappingSgID, which is discarded.

In [None]:
import itertools
import getpass
import pandas as pd
import data_processing as dp

def find_overlapping(db_name, sql_version="MySQL", firewall=False):
    """Populate the OverlappingSgRNAs table with pairs of same-seed sgRNAs.

    For every primary miRNA targeted by more than five non-excluded sgRNAs,
    each pair of its sgRNAs is compared.  When the two ``SgRNA`` sequences
    are identical from index 8 onward (the 12 PAM-proximal bases of a 20-nt
    guide), one row is recorded: the sgRNA with the strictly higher
    ``ZhangScore`` becomes ``SelectedSgID`` and the other becomes
    ``OverlappingSgID``.  On a tie (or an incomparable score) the second
    sgRNA of the pair is the selected one.

    Args:
        db_name: Name of the database, forwarded to ``dp.DatabaseConnection``.
        sql_version: SQL flavour, forwarded to ``dp.DatabaseConnection``.
            ``"MSSQL"`` rows are read by attribute (``row.PriID``); any other
            value reads them as one-element tuples.
        firewall: Forwarded unchanged to ``dp.DatabaseConnection``.

    Returns:
        None.  The results are written to the ``OverlappingSgRNAs`` table.
    """
    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        db_con.make_table("OverlappingSgRNAs", {"SelectedSgID": ["INT"],
                                                "OverlappingSgID": ["INT"]})

        # find primary miRNAs which have greater than five sgRNAs targeting them
        rows = db_con.fetch_query("""SELECT p.PriID
FROM SgRNATargetInformation AS t
JOIN PrimaryMicroRNA AS p
ON t.PriID = p.PriID
JOIN (SELECT SgID FROM SingleGuideRNA WHERE Exclude IS NULL) AS s
ON t.SgID = s.SgID
GROUP BY p.PriID
HAVING COUNT(DISTINCT s.SgID) > 5""")
        if sql_version == "MSSQL":
            priIDs = ["{}".format(row.PriID) for row in rows]
        else:
            priIDs = ["{}".format(priID) for priID, in rows]

        # With no qualifying miRNAs the IN (...) list below would be empty,
        # which is invalid SQL, and there would be nothing to insert anyway.
        if not priIDs:
            return

        pri_str = ",".join(["'{}'".format(pri) for pri in priIDs])

        # find corresponding possible sgRNAs
        df = db_con.fetch_query_as_df("""SELECT t.PriID, s.SgID, s.SgRNA, s.ZhangScore
FROM SingleGuideRNA AS s
JOIN SgRNATargetInformation AS t
ON s.SgID = t.SgID
WHERE s.Exclude IS NULL AND t.PriID IN ({})""".format(pri_str), "SgID")

        # priIDs were formatted as strings above, so compare against a string
        # view of the column; otherwise a numeric PriID would never match.
        pri_column = df["PriID"].astype(str)

        make_row = {"SelectedSgID": [], "OverlappingSgID": []}
        for pri in priIDs:
            sg_df = df[pri_column == pri]

            # Pull the needed columns out once as plain Python values instead
            # of doing a label lookup for every pair (DataFrame.get_value no
            # longer exists in current pandas).
            candidates = list(zip(sg_df.index.tolist(),
                                  sg_df["SgRNA"].tolist(),
                                  sg_df["ZhangScore"].tolist()))

            # Look at every combination of two sgRNAs from this miRNA's list
            for (sg1, seq1, score1), (sg2, seq2, score2) in itertools.combinations(candidates, 2):
                if seq1[8:] != seq2[8:]:
                    continue
                if score1 > score2:
                    make_row["SelectedSgID"].append(sg1)
                    make_row["OverlappingSgID"].append(sg2)
                else:
                    make_row["SelectedSgID"].append(sg2)
                    make_row["OverlappingSgID"].append(sg1)

        # inserts rows into the OverlappingSgRNAs table
        db_con.make_many_rows(make_row, "OverlappingSgRNAs")
    finally:
        # Always release the database resources, even when a query fails.
        try:
            db_con.close_cursor()
        finally:
            db_con.close_connection()

In [None]:
# Build the OverlappingSgRNAs table in the "miR-test" database.
find_overlapping(db_name="miR-test", firewall=True)