In [2]:
#Modules
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
import re

In [None]:
#Functions
def read_fasta(name):
    """Read ``<name>.fa`` and return its records as ``[id, sequence]`` pairs.

    Parameters
    ----------
    name : str
        Path of the FASTA file without the ``.fa`` extension.

    Returns
    -------
    list of [str, str]
        One ``[record id, sequence]`` entry per record, in file order. Each
        sequence is converted to ``str``, stripped of surrounding whitespace
        and upper-cased.
    """
    # The context manager closes the file as soon as parsing is finished;
    # previously the handle was opened inline and never closed.
    with open(name + '.fa') as handle:
        return [
            [record.id, str(record.seq).strip().upper()]
            for record in SeqIO.parse(handle, 'fasta')
        ]

In [None]:
#Data
# Scaffold table: one row per record of '<in_fasta_1>.fa' (record id, upper-cased sequence).
in_fasta_1 = 'sd108'
seqs_df = pd.DataFrame(
    data=read_fasta(in_fasta_1),
    columns=['name', 'sequence'],
)

# Candidate gRNA table: the CSV's single column is relabelled 'grna_seq'
# (the assignment below raises if the file has any other number of columns).
grna_list = pd.read_csv('grna_with_6bp_mismatch.csv')
grna_list.columns = ['grna_seq']

In [3]:
#Search for grna location in 130 scaffold and its reverse complementary scaffolds
def _find_grna_with_pam(grna_seq, scaffold_seq, pam_offset=21):
    """Return the first start of ``grna_seq`` in ``scaffold_seq`` that is followed by 'GG'.

    Every literal occurrence of ``grna_seq`` is examined from left to right
    (overlapping occurrences included). An occurrence qualifies when the two
    characters at ``[start + pam_offset, start + pam_offset + 2)`` are 'GG'.

    Parameters
    ----------
    grna_seq : str
        Sequence to look for (matched literally, not as a regular expression).
    scaffold_seq : str
        Sequence to search in.
    pam_offset : int, optional
        Distance from the match start to the 'GG' check. The default of 21
        presumably corresponds to a 20-nt gRNA plus the 'N' of an NGG PAM --
        TODO confirm against the gRNA design.

    Returns
    -------
    int or None
        Start index of the first qualifying occurrence, or None if there is none.
    """
    start = scaffold_seq.find(grna_seq)
    while start != -1:
        if scaffold_seq[start + pam_offset:start + pam_offset + 2] == 'GG':
            return start
        start = scaffold_seq.find(grna_seq, start + 1)
    return None


# Reverse-complement every scaffold once, instead of once per (gRNA, scaffold) pair.
scaffold_strands = [
    (scaffold_name, forward_seq, str(Seq(forward_seq).reverse_complement()))
    for scaffold_name, forward_seq in zip(seqs_df['name'], seqs_df['sequence'])
]

grna_location = []      # one [scaffold name, start, strand] per located gRNA
unlocated_grnas = []    # gRNAs with no 'GG'-adjacent occurrence on either strand
for grna_seq in grna_list['grna_seq']:
    hit = None
    for scaffold_name, forward_seq, reverse_seq in scaffold_strands:
        # Forward strand is tried before the reverse strand of the same scaffold;
        # the first qualifying occurrence wins. '-' positions are indices into
        # the reverse-complemented scaffold, not forward coordinates.
        for strand, strand_seq in (('+', forward_seq), ('-', reverse_seq)):
            loc = _find_grna_with_pam(grna_seq, strand_seq)
            if loc is not None:
                hit = [scaffold_name, loc, strand]
                break
        if hit is not None:
            break

    if hit is None:
        # Report the gRNA and keep going. Stopping at the first miss left
        # grna_location shorter than grna_list, so the column assignments
        # below failed with a length mismatch.
        print(grna_seq)
        unlocated_grnas.append(grna_seq)
    grna_location.append(hit)

# Number of gRNAs that were located.
count = len(grna_location) - len(unlocated_grnas)

if unlocated_grnas:
    # Drop the unlocated gRNAs so every remaining row has a scaffold/loc/strand.
    located_mask = [hit is not None for hit in grna_location]
    grna_list = grna_list[located_mask].reset_index(drop=True)
    grna_location = [hit for hit in grna_location if hit is not None]

grna_list['scaffold'] = [row[0] for row in grna_location]
grna_list['loc'] = [row[1] for row in grna_location]
grna_list['strand'] = [row[2] for row in grna_location]

In [4]:
#Keep only intergenic gRNA (drop every gRNA that overlaps an annotated gene)
gene_list = pd.read_csv('sd108_genetable.csv')
gene_loc = gene_list[["name", "chrom", "txStart", "txEnd"]]

grna_len_with_pam = 23

# Scaffold length looked up by scaffold name. grna_list['scaffold'] holds names
# taken from seqs_df['name'], so this does not depend on the name format or on
# the order of the records in the FASTA file.
scaffold_length = {
    scaffold_name: len(sequence)
    for scaffold_name, sequence in zip(seqs_df['name'], seqs_df['sequence'])
}

# Gene intervals grouped by scaffold once, rather than re-filtering the gene
# table for every gRNA.
genes_by_scaffold = {
    chrom: (genes['txStart'].values, genes['txEnd'].values)
    for chrom, genes in gene_loc.groupby('chrom')
}
no_genes = (np.array([]), np.array([]))

is_intergenic = []
for scaffold, loc, strand in zip(grna_list['scaffold'], grna_list['loc'], grna_list['strand']):
    if strand == '+':
        curr_grna_loc = loc
    else:
        # '-' positions index the reverse-complemented scaffold; convert to the
        # forward-strand start of the same 23-character window.
        curr_grna_loc = scaffold_length[scaffold] - loc - grna_len_with_pam

    tx_start, tx_end = genes_by_scaffold.get(scaffold, no_genes)
    # A gRNA is treated as genic when its forward start lies strictly between
    # txStart - 1 - grna_len_with_pam and txEnd for any gene on the scaffold.
    # NOTE(review): the extra "- 1" looks like a coordinate-convention
    # adjustment for txStart -- confirm against the gene table.
    overlaps_gene = (
        (curr_grna_loc > tx_start - 1 - grna_len_with_pam)
        & (curr_grna_loc < tx_end)
    ).any()
    is_intergenic.append(not overlaps_gene)

intergenic_grna_list = grna_list.loc[
    np.array(is_intergenic, dtype=bool),
    ['grna_seq', 'scaffold', 'loc', 'strand'],
].reset_index(drop=True)

In [6]:
#Poly T,G,C,A, BSA1 site based filtering
# Motifs that disqualify a sequence: a run of 5 T, a run of 6 G / C / A, and
# the BsaI ("BSA1") recognition site GGTCTC or its reverse complement GAGACC.
forbidden_motifs = ('TTTTT', 'GGGGGG', 'CCCCCC', 'AAAAAA', 'GGTCTC', 'GAGACC')


def _has_forbidden_motif(seq):
    """Return True if ``seq`` contains any entry of ``forbidden_motifs``."""
    return any(motif in seq for motif in forbidden_motifs)


# Row labels equal row positions here because intergenic_grna_list was built
# with reset_index(drop=True).
filter_index = [
    i for i, curr_seq in enumerate(intergenic_grna_list['grna_seq'])
    if _has_forbidden_motif(curr_seq)
]
filter_grna_list = intergenic_grna_list.drop(filter_index).reset_index(drop=True)

#Homology Arm Filtering
homology_arm_len = 50
grna_pam_len = 23

# Scaffold sequences looked up by name; reverse complements are computed at
# most once per scaffold instead of once per '-' strand gRNA.
forward_by_scaffold = dict(zip(seqs_df['name'], seqs_df['sequence']))
reverse_by_scaffold = {}

hr_filter_index = []
homology_arm = []
for i, (scaffold, curr_grna_loc, strand) in enumerate(
        zip(filter_grna_list['scaffold'], filter_grna_list['loc'], filter_grna_list['strand'])):
    if strand == '+':
        curr_scaffold = forward_by_scaffold[scaffold]
    else:
        # '-' positions index the reverse-complemented scaffold.
        if scaffold not in reverse_by_scaffold:
            reverse_by_scaffold[scaffold] = str(
                Seq(forward_by_scaffold[scaffold]).reverse_complement())
        curr_scaffold = reverse_by_scaffold[scaffold]

    # Clamp the left arm at the scaffold start: a negative slice start would
    # wrap around to the end of the scaffold and yield an empty or wrong LHR.
    # Near either scaffold end an arm can therefore be shorter than
    # homology_arm_len (the right arm is truncated by slicing as before).
    lhr_start = max(curr_grna_loc - homology_arm_len, 0)
    pam_end = curr_grna_loc + grna_pam_len
    LHR = curr_scaffold[lhr_start:curr_grna_loc]
    RHR = curr_scaffold[pam_end:pam_end + homology_arm_len]
    grna_with_PAM = curr_scaffold[curr_grna_loc:pam_end]

    # A gRNA is dropped when either arm contains a forbidden motif.
    if _has_forbidden_motif(LHR) or _has_forbidden_motif(RHR):
        hr_filter_index.append(i)
    else:
        homology_arm.append([LHR, RHR, grna_with_PAM])

filter_2_grna_list = filter_grna_list.drop(hr_filter_index).reset_index(drop=True)
filter_2_grna_list['LHR'] = [row[0] for row in homology_arm]
filter_2_grna_list['RHR'] = [row[1] for row in homology_arm]
filter_2_grna_list['grna_with_PAM'] = [row[2] for row in homology_arm]

In [18]:
#Output
# Write the gRNAs that passed every filter, together with their LHR, RHR and
# grna_with_PAM columns, to 'grna_list_to_rank.csv' without the index column.
filter_2_grna_list.to_csv('grna_list_to_rank.csv',index=False)