# Align to Genome

To find the number of exact matches in the genome for each sgRNA, the sgRNAs were aligned to hg19 using bowtie. Note: hg38 was not used originally; aligning to hg38 instead changes the classification of 63 sgRNAs targeting 29 miRNAs.

First, the sgRNA sequences were fetched from the database to create a FASTQ file.

In [None]:
import data_processing as dp

def export_sgRNA_fastq(out_file, db_name, sql_version="MySQL", firewall=False):
    """
        Exports every sgRNA in the SingleGuideRNA table as a FASTQ record
        for alignment in bowtie.

        Each record is named "sgRNA-<SgID>" and its sequence is the sgRNA
        followed by the literal suffix "NGG". The quality line is a run of
        'J' with the same length as the sequence.

        out_file:    path of the FASTQ file to write (opened in "w" mode)
        db_name:     database name passed to dp.DatabaseConnection
        sql_version: passed to dp.DatabaseConnection; "MSSQL" rows are read
                     by attribute, any other rows are unpacked as
                     (SgID, SgRNA) pairs
        firewall:    passed to dp.DatabaseConnection
    """
    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        rows = db_con.fetch_query("SELECT SgID, SgRNA FROM SingleGuideRNA")
    finally:
        # release the connection even when the query fails
        db_con.close_cursor()
        db_con.close_connection()

    rows_have_attributes = sql_version == "MSSQL"
    with open(out_file, "w") as fout:
        for row in rows:
            if rows_have_attributes:
                sgID, sgRNA = row.SgID, row.SgRNA
            else:
                sgID, sgRNA = row
            sequence = "{}NGG".format(sgRNA)
            # FASTQ needs one quality character per base, so the quality
            # line is sized from the sequence instead of assuming 20 nt + NGG
            quality = "J" * len(sequence)
            fout.write("@sgRNA-{}\n{}\n+\n{}\n".format(sgID, sequence, quality))

In [None]:
# write the sgRNAs stored in the miR-test database to a FASTQ file
export_sgRNA_fastq(
    out_file="Align sgRNAs/miR_sgRNAs.fastq",
    db_name="miR-test",
    firewall=True
)

The reads were then aligned.

In [None]:
import data_processing.trim_align as ta

def align_sgRNAs(fileloc, logfile, genomeloc, outputdir):
    """
        Runs the sgRNA FASTQ file through Bowtie against a genome index.

        Bowtie is called with -v 1 (a single mismatch is allowed, which
        covers the ambiguous character in each read), -a (all alignments
        are reported) and -p 8 (8 processors).

        fileloc:   FASTQ file handed to fileToServer
        logfile:   log file handed to ta.TrimAndAlign
        genomeloc: Bowtie index handed to align_bowtie
        outputdir: directory handed to fileFromServer for the .sam output
    """
    bowtie_options = " -v 1 -a -p 8"
    read_sample = "miR_sgRNAs"
    aligned_sample = read_sample + "_bowtie-aligned"

    # the TrimAndAlign instance is created with the log file
    aligner = ta.TrimAndAlign(logfile)

    # send the reads, align them, then fetch the resulting SAM file
    aligner.fileToServer(fileloc, read_sample, ".fastq")
    aligner.align_bowtie(read_sample, genomeloc, options=bowtie_options)
    aligner.fileFromServer(outputdir, aligned_sample, ".sam")

In [None]:
# align the exported sgRNAs against the hg19 bowtie index
align_sgRNAs(
    fileloc="Align sgRNAs/miR_sgRNAs.fastq",
    logfile="Align sgRNAs/Align miR sgRNAs Log.log",
    genomeloc="../UCSC/hg19/bowtie-indexes/hg19",
    outputdir="Align sgRNAs/"
)

## Import Aligned Reads

The aligned reads were then processed to find the number of exact matches per sgRNA.

In [None]:
import data_processing as dp

def import_aligned_reads(sam_file, db_name, sql_version="MySQL", firewall=False):
    """
        Import the number of alignments per sgRNA into the NumExactMatch
        column of the SingleGuideRNA table.

        The SAM file is parsed completely before the database is touched,
        so a missing or malformed file does not leave the column reset to 0
        without fresh counts.

        sam_file:    SAM file whose read names have the form "sgRNA-<SgID>"
        db_name:     database name passed to dp.DatabaseConnection
        sql_version: passed to dp.DatabaseConnection
        firewall:    passed to dp.DatabaseConnection

        Raises ValueError if a read name or FLAG field is not an integer.
    """
    prefix = "sgRNA-"
    match_dict = {}
    with open(sam_file, "r") as f:
        for line in f:
            # skip header lines and blank lines
            if line.startswith("@") or not line.strip():
                continue
            elements = line.rstrip("\r\n").split("\t")
            # SAM FLAG bit 0x4 marks a read that did not align; such a
            # record is not a match and must not be counted
            if len(elements) > 1 and int(elements[1]) & 0x4:
                continue
            read_name = elements[0]
            # remove the prefix explicitly: lstrip() takes a set of
            # characters, not a prefix
            if read_name.startswith(prefix):
                read_name = read_name[len(prefix):]
            sgID = int(read_name)
            match_dict[sgID] = match_dict.get(sgID, 0) + 1

    # parallel lists: the n-th count belongs to the n-th SgID
    sg_ids = list(match_dict)
    sg_dict = {"SgID": sg_ids}
    exact = {"NumExactMatch": [match_dict[sgID] for sgID in sg_ids]}

    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        # set column to 0 so sgRNAs without any alignment end up with 0
        db_con.update_row({"NumExactMatch": 0}, {}, "SingleGuideRNA")
        db_con.update_many_rows(exact, sg_dict, "SingleGuideRNA")
    finally:
        # release the connection even when an update fails
        db_con.close_cursor()
        db_con.close_connection()

In [None]:
# store the hg19 alignment counts in the miR-test database
import_aligned_reads(
    sam_file="Align sgRNAs/miR_sgRNAs_bowtie-aligned.sam",
    db_name="miR-test",
    firewall=True
)

## hg38 Alignment

The sgRNAs can also be aligned to hg38.

In [None]:
# align the same sgRNA FASTQ file against the hg38 bowtie index
align_sgRNAs(
    fileloc="Align sgRNAs/miR_sgRNAs.fastq",
    logfile="Align sgRNAs/Align hg38 miR sgRNAs Log.log",
    genomeloc="../UCSC/hg38/Sequence/BowtieIndex/genome",
    outputdir="Align sgRNAs/hg38/"
)

## Import hg38 Alignment

The hg38 read alignment was then imported into the column: 'hg38NumExactMatch'.

In [None]:
import data_processing as dp

def import_aligned_reads_hg38(sam_file, db_name, sql_version="MySQL", firewall=False):
    """
        Import the number of hg38 alignments per sgRNA into the
        hg38NumExactMatch column of the SingleGuideRNA table.

        The SAM file is parsed completely before the database is touched,
        so a missing or malformed file does not leave the column reset to 0
        without fresh counts.

        sam_file:    SAM file whose read names have the form "sgRNA-<SgID>"
        db_name:     database name passed to dp.DatabaseConnection
        sql_version: passed to dp.DatabaseConnection
        firewall:    passed to dp.DatabaseConnection

        Raises ValueError if a read name or FLAG field is not an integer.
    """
    prefix = "sgRNA-"
    match_dict = {}
    with open(sam_file, "r") as f:
        for line in f:
            # skip header lines and blank lines
            if line.startswith("@") or not line.strip():
                continue
            elements = line.rstrip("\r\n").split("\t")
            # SAM FLAG bit 0x4 marks a read that did not align; such a
            # record is not a match and must not be counted
            if len(elements) > 1 and int(elements[1]) & 0x4:
                continue
            read_name = elements[0]
            # remove the prefix explicitly: lstrip() takes a set of
            # characters, not a prefix
            if read_name.startswith(prefix):
                read_name = read_name[len(prefix):]
            sgID = int(read_name)
            match_dict[sgID] = match_dict.get(sgID, 0) + 1

    # parallel lists: the n-th count belongs to the n-th SgID
    sg_ids = list(match_dict)
    sg_dict = {"SgID": sg_ids}
    exact = {"hg38NumExactMatch": [match_dict[sgID] for sgID in sg_ids]}

    db_con = dp.DatabaseConnection(sql_version, db_name=db_name, firewall=firewall)
    try:
        # set column to 0 so sgRNAs without any alignment end up with 0
        db_con.update_row({"hg38NumExactMatch": 0}, {}, "SingleGuideRNA")
        db_con.update_many_rows(exact, sg_dict, "SingleGuideRNA")
    finally:
        # release the connection even when an update fails
        db_con.close_cursor()
        db_con.close_connection()

In [None]:
# The hg38 counts belong in hg38NumExactMatch, so the hg38 importer is used;
# import_aligned_reads would overwrite the hg19 NumExactMatch column instead.
import_aligned_reads_hg38("Align sgRNAs/hg38/miR_sgRNAs_bowtie-aligned.sam", "miR-test", firewall=True)