# Probe Designer 

## Environment

In [None]:
# basic env
import os
import pandas as pd
import time
import json

# data process of file from ncbi
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# get gene data from ncbi
from Bio import Entrez

# blast and xml file process
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML


# Working directory and per-run result folders.
# NOTE(review): the project path below is hard-coded to one machine; adjust it
# before running the notebook elsewhere.
os.chdir('C:/Users/14187/Documents/Github/Probe_designer/example_dataset_immune')
current_time = time.localtime()
formatted_time = time.strftime("%Y%m%d_%H%M%S", current_time)
# Every run gets its own timestamped folder: `output` holds the final files,
# `tmp` (nested inside it) holds the intermediate ones.
tmp = './results/' + formatted_time + '/tmp/'
output = './results/' + formatted_time + '/'
# To continue an earlier run, point both paths at its timestamp instead:
# tmp = './results/' + '20230425_103142' + '/tmp/'
# output = './results/' + '20230425_103142' + '/'
pre_binding_dir = tmp + 'pre_binding/'
# exist_ok=True tolerates an already existing folder while still surfacing real
# failures (permissions, invalid path) that a bare `except: pass` would hide.
# Creating `tmp` also creates its parent `output`.
os.makedirs(tmp, exist_ok=True)

# basic variables
# name of the text file that receives the gene names to search in the next round
gene_name_list_tosearch = "gene_name_list_tosearch.txt"
# suffix appended to a record id to name its per-gene FASTA file
pre_binding_file_suffix = "_pre_binding.fasta"
# FASTA file that collects the candidate binding sites of all genes
total_pre_binding_file_name = "_total_pre_binding.fasta"

# tmp files (the numeric prefix follows the order of the pipeline steps)
gene_name_list_file = "1_gene_name_list.txt"
gene_id_name_file = "2_id_list.txt"
gene_seq_in_file = "3_gene_seq_in_file.gb"
pre_binding_num_file = "4_pre_binding_num.json"
blast_results_file = "5_blast_results.xml"

In [None]:
# Set up the (still empty) table of fields of interest, one row per candidate binding site.
# Columns describing the binding site itself:
binding_site_FOIs = [
    'accession',
    'gene_name',
    'mol_type',
    'organism',
    'binding',
    'wanted',
]
# Columns describing the BLAST alignments of that site:
align_FOIs = [
    'align_num',
    'align_accession',
    'align_descrip',
    'plus/minus',
]
# Binding-site columns come first, alignment columns after them.
FOI = pd.DataFrame(columns=[*binding_site_FOIs, *align_FOIs])

## Get the GenBank file of each gene from the NCBI dataset
https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch

In [None]:
# Get gene id and other information from ncbi dataset(api)
## Generate gene_search_list from gene_name_list
organism_of_interest = 'Mus musculus'
n_type_of_interest = 'mRNA'
with open(tmp+gene_name_list_file) as f:
    # One gene name per line. Surrounding whitespace is stripped and blank lines
    # are skipped, so that an empty line cannot turn into the meaningless query
    # ", Mus musculus, mRNA" and padded names still match later comparisons.
    gene_name_list = [name.strip() for name in f.read().splitlines() if name.strip()]
# Each search term has the form "<gene>, <organism>, <molecule type>".
gene_search_list = [', '.join([name, organism_of_interest, n_type_of_interest]) for name in gene_name_list]


In [None]:
## Get gene id list using Entrez.esearch
# The contact address is identical for every request, so it is set once
# instead of on every loop iteration.
Entrez.email = "1418767067@qq.com"
id_list = []
for gene_search in gene_search_list:
    handle = Entrez.esearch(db="nuccore", term=gene_search)
    try:
        record = Entrez.read(handle)
    finally:
        # release the connection even when parsing the reply fails
        handle.close()
    # keep only the first hit of each search; widen the slice to read more results
    id_list += record["IdList"][:1]
# store the ids one per line so that a later session can reload them
with open(tmp+gene_id_name_file, 'w') as f:
    f.write('\n'.join(id_list))

In [None]:
## Read id_list from existing file
with open(tmp+gene_id_name_file, 'r') as f:
    # One id per line. Empty entries (an empty file, a trailing newline) are
    # dropped so that no empty id is sent to Entrez.efetch afterwards.
    id_list = [line.strip() for line in f.read().splitlines() if line.strip()]

In [None]:
# Get the genbank file of each gene by search for id list
fetch_per_round = 3  # number of ids requested per efetch call
# Ceiling division: number of requests needed to cover every id.
# (Named n_rounds so the builtin `round` is not shadowed for the rest of the session.)
n_rounds = -(-len(id_list) // fetch_per_round)
# The contact address is the same for every request, so set it once.
Entrez.email = "1418767067@qq.com"
for i in range(n_rounds):
    id_list_per_round = id_list[i*fetch_per_round: (i+1)*fetch_per_round]
    handle = Entrez.efetch(db="nuccore", strand=1,  # plus if strand=1
                        id=id_list_per_round, rettype="gbwithparts", retmode="text")
    try:
        seq_record = handle.read()
    finally:
        # release the connection even when reading the reply fails
        handle.close()
    # progress: finished round number and percentage
    print(i+1, '{:.2f} %'.format((i+1)/n_rounds*100))
    # append, so the records of all rounds end up in one GenBank file
    with open(tmp+gene_seq_in_file, 'a') as f:
        f.write(seq_record)

## Binding site Searcher

In [None]:
# Search binding sites on mRNA sequence
# For every GenBank record: build the complementary strand, cut a centred run of
# fixed-length candidate binding sites out of it, write them to FASTA files and
# register them in the FOI table.
file_in = tmp + gene_seq_in_file
file_out_dir = pre_binding_dir
# makedirs(exist_ok=True) also creates missing parent folders and no longer
# hides real errors the way `try: os.mkdir(...) except: pass` did.
os.makedirs(file_out_dir, exist_ok=True)

BDS_len = 40  # length of one candidate binding site, in bases
max_num = 35  # upper limit of candidate sites taken from one record

# Base-complement lookup; constant, so it is built once outside the loop.
translib = {'A':'T','T':'A','C':'G','G':'C'}

pre_binding_num = {}
pos, length = 0, 0  # pos: index of the next free row in FOI
for gene_seq_in in SeqIO.parse(file_in, 'genbank'):
    # get information of gene
    seq_id = gene_seq_in.id  # record id (not called `id`, to keep the builtin usable)
    # get gene_name: reset for every record so that a record without a CDS
    # feature gets 'NAN' instead of raising NameError (first record) or silently
    # inheriting the name of the previous record
    gene_name = 'NAN'
    for feature in gene_seq_in.features:
        if feature.type == "CDS":
            # when several CDS features exist, the last one wins
            gene_name = feature.qualifiers.get("gene", ['NAN'])[0]
    # get molecule_type
    mol_type = gene_seq_in.annotations['molecule_type']
    # get organism
    organism = gene_seq_in.annotations['organism']
    # get minus seq (reverse complement of the record sequence)
    plus_seq = str(gene_seq_in.seq)
    try:
        seq = ''.join(translib[base] for base in reversed(plus_seq))
    except KeyError:
        # Best-effort fallback: a base outside A/C/G/T (e.g. an ambiguity code)
        # cannot be complemented with translib, so the sequence is used as is.
        seq = plus_seq

    # set start point and pre_binding_num
    length = len(seq)
    pre_binding_num_tmp = min(length//BDS_len, max_num)
    # start so that the run of sites is centred on the middle of the sequence
    st = length // 2 - pre_binding_num_tmp * BDS_len // 2
    # generate and write pre_binding for each gene in a fasta file
    record_list = []
    file_out = file_out_dir + seq_id + pre_binding_file_suffix
    for i in range(pre_binding_num_tmp):
        site_start = st + i*BDS_len
        pre_binding_tmp = seq[site_start:site_start+BDS_len]
        record_list.append(SeqRecord(
            Seq(pre_binding_tmp),
            id='pre_binding'+str(i),
            description='|'.join([seq_id, gene_name, organism, mol_type])
        ))
        # add information about binding sites to FOI
        FOI.loc[i+pos, binding_site_FOIs] \
             = [seq_id, gene_name, mol_type, organism, pre_binding_tmp, True]

    pos += pre_binding_num_tmp
    # write pre_binding to files: one file per record ...
    with open(file_out, "w") as f:
        SeqIO.write(record_list, f, "fasta")
    # ... and one shared file (append mode) collecting the sites of all records
    with open(file_out_dir+total_pre_binding_file_name, "a") as handle:
        SeqIO.write(record_list, handle, "fasta")

    # record the num of pre_binding for each gene
    pre_binding_num[seq_id] = pre_binding_num_tmp

with open(tmp+pre_binding_num_file, "w") as f:
    json.dump(pre_binding_num, f)

## Blast and extract blast results
NCBIXML: https://homolog.us/Biopython/Bio.Blast.NCBIXML.html#read/0

BlastRecord: https://biopython.org/docs/1.75/api/Bio.Blast.Record.html

XMLReader: https://codebeautify.org/xmlviewer# 

In [None]:
# Load the candidate binding sites of all genes as one multi-FASTA query string
total_pre_binding_path = file_out_dir + total_pre_binding_file_name
with open(total_pre_binding_path, 'r') as fasta_file:
    fasta_string = fasta_file.read()
txid = [2697049]  # organism
# NOTE(review): txid is not handed to qblast below, so it does not restrict the
# search to an organism - confirm whether that is intended.

# Submit BLAST search and get handle object
qblast_options = dict(
    program='blastn',
    megablast="yes",
    database='refseq_rna',
    sequence=fasta_string,
    url_base='https://blast.ncbi.nlm.nih.gov/Blast.cgi',
    format_object='Alignment',
    format_type='Xml',
)
handle = NCBIWWW.qblast(**qblast_options)

# read handle object and save to a file
blast_xml = handle.read()
with open(tmp+blast_results_file, 'w') as xml_file:
    xml_file.write(blast_xml)

In [None]:
# Extract interested information from blast_results
# For every BLAST record, store in FOI: the number of alignments, their
# accessions, their descriptions and the strand value of each first HSP.
align_num = []

# read the id/plus-minus part/align_num
with open(tmp+blast_results_file, 'r') as blast_output:
    # Record number `loca` is written to FOI row `loca`, i.e. the records are
    # taken to be in the same order as the rows of FOI.
    for loca, blast_record in enumerate(NCBIXML.parse(blast_output)):
        alignments = blast_record.alignments
        # get align num of each binding site
        n_hits = len(alignments)
        align_num.append(n_hits)
        # Written directly with .loc: the former chained assignment
        # FOI['align_num'][:n] = ... may act on a temporary copy and is a
        # no-op under pandas Copy-on-Write.
        FOI.loc[loca, 'align_num'] = n_hits

        align_accession = []
        align_descrip_list = []
        for alignment, description in zip(alignments, blast_record.descriptions):
            descrip = description.title.split('|')
            # get accession and descrip of each align seq
            # The accession is read from the 4th '|'-separated field of the
            # title; titles with fewer fields fall back to the accession parsed
            # by Biopython instead of raising IndexError.
            if len(descrip) > 3:
                align_accession.append(descrip[3])
            else:
                align_accession.append(alignment.accession)
            align_descrip_list.append(descrip[-1])
        FOI.loc[loca, 'align_accession'] = '|'.join(str(_) for _ in align_accession)

        # add align_descrip to df
        FOI.loc[loca, 'align_descrip'] = '|'.join(str(_) for _ in align_descrip_list)

        # get plus/minus of each align seq (second entry of the first HSP's frame)
        p_m = [alignment.hsps[0].frame[1] for alignment in alignments]

        # add plus/minus to df (an empty string when the record has no alignment)
        FOI.loc[loca, 'plus/minus'] = ','.join(str(_) for _ in p_m)

## Select wanted binding sites

In [None]:
# Flag every row of FOI as wanted again (one True per existing row).
FOI['wanted'] = [True] * FOI.shape[0]

In [None]:
# sieve for the suitable binding site
# Each row of FOI is run through a chain of checks; the first failed check sets
# 'wanted' to False and the remaining checks are skipped for that row.
gene_name_list_out = list(gene_name_list)  # names not seen in FOI, searched again next round
wanted_gene_names = set(gene_name_list)    # set for fast membership tests
for i in range(len(FOI)):
    # check gene_name
    gene_name = FOI.loc[i, 'gene_name']
    if gene_name not in wanted_gene_names:
        FOI.loc[i, 'wanted'] = False
        continue
    # the gene appears in FOI, so it does not have to be searched again
    if gene_name in gene_name_list_out:
        gene_name_list_out.remove(gene_name)

    # rows whose flag is not True at this point are left untouched
    if FOI.loc[i, 'wanted'] != True:
        continue

    # check DNA or mRNA type
    if FOI.loc[i, 'mol_type'] != 'mRNA':
        FOI.loc[i, 'wanted'] = False
        continue

    # check gene_organism name: reject the site when an aligned sequence of the
    # same organism does not mention this gene in its description
    spe_ori = FOI.loc[i, 'organism']
    align_descrip = FOI.loc[i, 'align_descrip']
    if pd.isnull(align_descrip):
        # No BLAST information for this row. It is rejected here instead of
        # crashing on NaN.split(); the plus/minus check below rejects rows
        # without BLAST data as well.
        FOI.loc[i, 'wanted'] = False
        continue
    if any(gene_name not in des and spe_ori in des for des in align_descrip.split('|')):
        FOI.loc[i, 'wanted'] = False
        continue

    # check plus/minus: at least one alignment must have the value -1
    plus_minus = FOI.loc[i, 'plus/minus']
    if pd.isnull(plus_minus) or '-1' not in plus_minus.split(','):
        FOI.loc[i, 'wanted'] = False
        continue

    # check G fraction within 30%-70% and no run of 5 consecutive G
    # NOTE(review): the comment used to say 40%-70% while the code tests 0.3;
    # confirm which lower bound is intended.
    seq = FOI.loc[i, 'binding']
    if 'GGGGG' in seq:
        FOI.loc[i, 'wanted'] = False
        continue
    G_per = seq.count('G')/len(seq)
    if G_per < 0.3 or G_per > 0.7:
        FOI.loc[i, 'wanted'] = False

# write the gene name to search next round
with open(output+gene_name_list_tosearch, 'w') as f:
    f.write('\n'.join(gene_name_list_out))

# write the whole information of interest to a excel file in tmp dir
FOI.to_excel(tmp+'probes_sieve.xlsx')

# get the sub dataframe of wanted probes
output_df = FOI[FOI['wanted'] == True]
# write the output to a xlsx file
output_df.to_excel(output+'probes_wanted.xlsx')