# Off Target Scoring

Along with the predicted activity at sgRNA target sites, we want to make sure the sgRNAs are not cutting other DNA sequences. Several algorithms were used to predict the off-target activity of each sgRNA.

## CRISPRSeek

After using these on-target scoring algorithms, I tried the CRISPRseek sgRNA off-target scoring method (see <a href="http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0108424">this paper</a>). CRISPRseek is a <a href="https://www.bioconductor.org/packages/3.3/bioc/html/CRISPRseek.html">Bioconductor package</a> written in R. Unfortunately, calculating a score which takes the top 100 off-target sites for each sgRNA into account using this package is highly resource intensive. The function, when run using multicore mode (see <a href="https://www.bioconductor.org/packages/3.3/bioc/vignettes/CRISPRseek/inst/doc/CRISPRseek.pdf">the documentation</a>), took all the processing power on the server to examine 1,000 sgRNAs. Therefore, only the first 1,000 sgRNAs have CRISPRseek scores.

In [1]:
import data_processing as dp

### CRISPRseek (R) Score Export and Import ###

def export_sgRNA_fasta(out_file, db_name, sql_version="MySQL", firewall=False):
    """
        Writes every (SgID, LongSg) row of SgRNATargetInformation to out_file
        as FASTA records for scoring by CRISPRseek.

        Each record has the header ">sgRNA-<SgID>" followed by the LongSg
        value with its first 4 and last 3 characters removed.

        out_file: path of the FASTA file to create (overwritten if present)
        db_name, sql_version, firewall: passed to dp.DatabaseConnection
    """
    connection = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    records = connection.fetch_query("SELECT SgID, LongSg FROM SgRNATargetInformation")
    connection.close_cursor()
    connection.close_connection()

    # MSSQL rows are read by attribute; every other backend yields plain tuples.
    rows_have_attributes = sql_version == "MSSQL"

    with open(out_file, "w") as fasta:
        for record in records:
            if rows_have_attributes:
                guide_id, long_seq = record.SgID, record.LongSg
            else:
                guide_id, long_seq = record
                long_seq = str(long_seq)
            fasta.write(">sgRNA-{}\n".format(guide_id))
            fasta.write("{}\n".format(long_seq[4:-3]))

def import_CRISPRseek(in_file, db_name, sql_version="MySQL", firewall=False):
    """
        Imports CRISPRseek scores from a tab-separated summary file into the
        CRISPRseek column of the SingleGuideRNA table.

        Only lines whose first column has the form "sgRNA-<SgID>" (the names
        written by export_sgRNA_fasta) are read; every other line is skipped.
        The score is taken from the seventh column. Rows whose score is "NA"
        are reported on stdout and are not imported.

        in_file: path of the CRISPRseek summary file
        db_name, sql_version, firewall: passed to dp.DatabaseConnection
    """
    prefix = "sgRNA-"
    sg_dict = {"SgID": []}
    seek_dict = {"CRISPRseek": []}
    with open(in_file, "r") as fin:
        for line in fin:
            # Drop the line ending first so the last column compares cleanly.
            ele = line.rstrip('\r\n').split('\t')
            if not ele[0].startswith(prefix):
                continue
            # Slice the prefix off: str.strip() removes a *set of characters*
            # from both ends, which is not the same as removing a prefix.
            sgID = int(ele[0][len(prefix):])
            score = ele[6].strip()
            if score == 'NA':
                print('No score for sgRNA {}'.format(sgID))
            else:
                sg_dict["SgID"].append(sgID)
                seek_dict["CRISPRseek"].append(float(score))

    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        db_con.update_many_rows(seek_dict, sg_dict, "SingleGuideRNA")
    finally:
        # Release the connection even when the update fails.
        db_con.close_cursor()
        db_con.close_connection()

In [2]:
# Write the sgRNA sequences stored in the "miR-test" database to a FASTA file.
out_file = "sgRNA Scoring/sgRNA_sequences.fa"
export_sgRNA_fasta(out_file, "miR-test", firewall=True)
# Load the CRISPRseek summary file into the same database
# (presumably produced by running CRISPRseek in R on the FASTA above - confirm).
in_file = "sgRNA Scoring/CRISPRseek_Scores/All sgRNA Summary.txt"
import_CRISPRseek(in_file, "miR-test", firewall=True)

No score for sgRNA 185
No score for sgRNA 245
No score for sgRNA 268
No score for sgRNA 272
No score for sgRNA 369
No score for sgRNA 379
No score for sgRNA 446
No score for sgRNA 813
No score for sgRNA 814
No score for sgRNA 891
No score for sgRNA 893


## Zhang Score

One of the most popular sgRNA off-target scoring algorithms is the method developed by the Zhang lab in <a href="http://www.nature.com/nbt/journal/v31/n9/full/nbt.2647.html">Hsu et al., 2013</a>. The code for this method was not publicly available when I started (it is now available <a href="https://github.com/bh0085/crispr">here</a>); however, they do have a <a href="http://crispr.mit.edu/">website</a>. I scripted against this site in the fall of 2015 to get the Zhang scores for each sgRNA. The code below is included for historical reasons, but no longer works with the changes made to the site.

In [3]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

import urllib2
import re
import time
import pyodbc

#### Takes a list of all unscored, high quality sgRNAs and submits them to the Zhang website & retrieves scores ####

class ZhangScores:
    """
    Submits sgRNA sequences to the Zhang lab CRISPR design site through a
    Selenium-driven Firefox browser and appends the scraped guide scores to a
    tab-separated text file.

    email_address: address typed into the site's submission form
    out_file: file the scraped scores are appended to

    Both may be left as None at construction time, but must be set (or passed
    to queryCRISPR / printWB directly) before any job is submitted.
    """

    # Captures (score, PAM, position, guide) for each guide in the page source.
    # (...) is a capture group, [0-9] matches one digit and {n} repeats the
    # preceding element n times.
    _GUIDE_RE = re.compile(
        r'"score": (0\.[0-9]+|null),.+?"nrg": "([ATGC]GG)".+?'
        r'"position": ([0-9]+), "guide": "([ATGC]{20})"')
    # Captures the job number from the URL the site redirects to after submission.
    _JOB_RE = re.compile(r'/job/([0-9]+)')

    def __init__(self, email_address=None, out_file=None):
        self.driver = webdriver.Firefox()
        self.baseURL = "http://crispr.mit.edu"
        self.email_address = email_address
        self.out_file = out_file
        # Name and sequence of the sgRNA currently being processed.
        self.miRNA = None
        self.seq = None

    def eachmiRNA(self, file_loc):
        """
        Submits every sgRNA listed in file_loc and records its scores.

        file_loc is a tab-separated file with at least six columns per line.
        Column 1 (PriID) and column 4 (SgID) form the job name, column 5
        (sgRNA sequence) and column 6 (PAM) form the submitted sequence.
        Columns 2 and 3 are not read here (presumably PriMiRName and LongSeq).

        Raises ValueError before touching the browser when no e-mail address
        or output file has been configured. The browser is always closed once
        processing has started, even if an error interrupts the run.
        """
        if self.email_address is None or self.out_file is None:
            raise ValueError("email_address and out_file must be set before submitting sgRNAs")

        try:
            with open(file_loc, 'r') as f:
                for line in f:
                    # Strip the line ending so the PAM column does not carry a newline.
                    elements = line.rstrip('\r\n').split('\t')
                    if len(elements) < 6:
                        if line.strip():
                            print("Skipping malformed line: {}".format(line.strip()))
                        continue
                    # submit with name "PriID-sgRNA-SgID"
                    self.miRNA = "{}-sgRNA-{}".format(elements[0], elements[3])
                    # Pass in sgRNA sequence plus pam
                    self.seq = "{}{}".format(elements[4], elements[5])

                    outURL = self._submitCurrent()
                    if outURL == "N/A":
                        print("{} was not submitted correctly".format(self.miRNA))
                    elif outURL == "":
                        print("No sgRNAs were found for {}".format(self.miRNA))
                    else:
                        match = self._JOB_RE.search(outURL)
                        if match is None:
                            print("No job number found in {} for {}".format(outURL, self.miRNA))
                            continue
                        time.sleep(120) # Stops for 2 min to give time for sgRNA scores to be calculated
                        self.getScores(match.group(1))
        finally:
            self.driver.quit()

    def _submitCurrent(self):
        """
        Loads the site and submits the current sgRNA; returns the result URL,
        "" when the site reported no sgRNAs, or "N/A" when submission failed.
        """
        self.driver.get(self.baseURL)
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.NAME, "name")))
        except TimeoutException:
            print("Could not get to {}. Will not find sgRNA for {}".format(self.baseURL, self.miRNA))
            return "N/A"
        try:
            return self.queryCRISPR()
        except Exception as err:
            # Best effort: one failed submission must not abort the whole batch.
            print("Error while submitting {}: {}".format(self.miRNA, err))
            return "N/A"

    def queryCRISPR(self, email_adress=None):
        """
        Actually fills in the form and submits the sequence

        email_adress: address to enter in the form; defaults to the
            email_address given to the constructor. (The parameter keeps its
            original spelling so keyword callers are unaffected.)

        Returns "" when the site raised an alert (no sgRNAs found), "N/A" when
        the results page did not appear within 10 seconds, otherwise the URL
        of the results page. Raises ValueError when no address is available.
        """
        if email_adress is None:
            email_adress = getattr(self, "email_address", None)
        if email_adress is None:
            raise ValueError("An e-mail address is required to submit a job")

        # find the form fields and fill them in
        nameInput = self.driver.find_element_by_name("name")
        nameInput.send_keys(self.miRNA)

        emailInput = self.driver.find_element_by_name("email")
        emailInput.send_keys(email_adress)

        seqInput = self.driver.find_element_by_name("query")
        seqInput.send_keys(self.seq)

        # submit the form
        seqInput.submit()

        # The site shows an alert when no sgRNAs are found in the sequence.
        try:
            WebDriverWait(self.driver, 10).until(EC.alert_is_present())
        except TimeoutException:
            pass
        else:
            self.driver.switch_to_alert().accept()
            return ""

        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, "downloads-tab")))
        except TimeoutException:
            return "N/A"
        return self.driver.current_url

    def getScores(self, jobNum, poll_interval=60, max_polls=None):
        """
        Fetches the guide listing for job jobNum and writes it with printWB
        once every guide has a score.

        While any guide's score is still "null" the page is fetched again
        after poll_interval seconds; max_polls limits the number of fetches
        (None, the default, keeps polling until all scores are present).
        Network errors are reported on stdout and end the attempt.
        """
        url = self.baseURL+"/guides/"+jobNum
        polls = 0
        while True:
            try:
                response = urllib2.urlopen(url)
            # HTTPError is a subclass of URLError, so it has to be caught first.
            except urllib2.HTTPError as e:
                print("Couldn't fullfill the request")
                print(e.code)
                print("At url "+url)
                return
            except urllib2.URLError as e:
                print("Failed to reach a server")
                print(e.reason)
                return

            try:
                html = response.read()
            finally:
                response.close()
            sgRNAs = self._GUIDE_RE.findall(html)

            # Only write the results once the scores for all sgRNAs have been calculated
            if not any(guide[0] == 'null' for guide in sgRNAs):
                self.printWB(sgRNAs)
                return

            polls += 1
            if max_polls is not None and polls >= max_polls:
                print("Scores for job {} were still not ready after {} attempts".format(jobNum, polls))
                return
            time.sleep(poll_interval) # If the sgRNA score has still not been calculated, wait and retry

    def printWB(self, sgRNAs, out_file=None):
        """
        Appends one tab-separated line per guide to out_file: job name,
        score, guide sequence, PAM, position and the submitted sequence.

        sgRNAs: iterable of (score, PAM, position, guide) tuples
        out_file: path to append to; defaults to the out_file given to the
            constructor. Raises ValueError when neither is available.
        """
        if out_file is None:
            out_file = getattr(self, "out_file", None)
        if out_file is None:
            raise ValueError("An output file is required to write the scores")

        with open(out_file, 'a') as f:
            for score, PAM, start, sgRNASeq in sgRNAs:
                output = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(self.miRNA, score, sgRNASeq, PAM, start, self.seq)
                f.write(output)

The scores scraped from the site were then imported into the database.

In [4]:
import data_processing as dp

def import_zhang_score(fileList, db_name, sql_version="MySQL", firewall=False):
    """
        Loads the Zhang scores gathered by the Oct 2015 querying of the Zhang
        website into the ZhangScore column of the SingleGuideRNA table.

        Every file in fileList is read as tab-separated text: the second
        column is the score and the third is the sgRNA sequence. Lines that
        start with "miR" are treated as headers and skipped.

        The sequences are handed to update_many_rows as the "SgRNA" values
        (presumably the column used to locate each row - confirm against
        data_processing).
    """
    sequences = []
    scores = []
    for path in fileList:
        with open(path, "r") as handle:
            for record in handle:
                # header line, when present
                if record.startswith("miR"):
                    continue
                fields = record.split('\t')
                value = float(fields[1])
                guide = fields[2]

                sequences.append(guide)
                scores.append(value)

    connection = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    connection.update_many_rows({"ZhangScore": scores}, {"SgRNA": sequences}, "SingleGuideRNA")
    connection.close_cursor()
    connection.close_connection()

In [5]:
# Score files to load into the "miR-test" database, passed to
# import_zhang_score in this order.
fileList =["sgRNA Scoring/Zhang_Scores/ZhangScores-10-20-27-15.txt", 
           "sgRNA Scoring/Zhang_Scores/ZhangScores-10-30-15.txt",
           "sgRNA Scoring/Zhang_Scores/Zhang_Scored_sgRNAs_2-6-15.txt"]
import_zhang_score(fileList, "miR-test", firewall=True)

To make sure all the scores from the original database are added to the database, since it seems a file was lost along the way, the below code was used.

In [24]:
import data_processing as dp
import pandas as pd

def fill_in_zhang(db_name, sql_version="MySQL", firewall=False,
                  csv_file="sgRNA Scoring/Zhang_Scores/OtherZhangScores.csv"):
    """
        Fills in Zhang scores from a CSV file, writing them to the ZhangScore
        column of the SingleGuideRNA table.

        The CSV must have a header row; its first column holds the SgID of
        each sgRNA and its "ZhangScore" column holds the score.

        db_name, sql_version, firewall: passed to dp.DatabaseConnection
        csv_file: path of the CSV to read; defaults to the file of scores
            recovered from the original database
    """
    df = pd.read_csv(csv_file, header=0, index_col=0)
    # Convert to plain Python numbers rather than numpy scalars, as before.
    sg_dict = {"SgID": [int(sgID) for sgID in df.index]}
    zhang_dict = {"ZhangScore": [float(score) for score in df["ZhangScore"]]}

    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        db_con.update_many_rows(zhang_dict, sg_dict, "SingleGuideRNA")
    finally:
        # Release the connection even when the update fails.
        db_con.close_cursor()
        db_con.close_connection()

In [25]:
# Add the remaining Zhang scores to the "miR-test" database.
fill_in_zhang("miR-test", firewall=True)

This leaves 4332 sgRNAs without Zhang scores; however, these are the sgRNAs that are excluded from the library due to their cleavage site or poly(T) sequences.