In [None]:
# An interactive notebook that helps benchmark a directory containing FASTA sequences across the following metrics:
'''
- Codon Adaptation Index (CAI)
- GC Content
- CFD (known un-optimized gene that reduces efficiency)
- Negative CIS elements
- Negative repeat elements
'''

In [2]:
# import modules
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.select import Select
import os
from Bio import SeqIO
import time
import numpy as np

#init webdriver + selenium opts
chromeOptions = webdriver.ChromeOptions()

In [3]:
browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chromeOptions)
url_login="https://www.genscript.com/tools/rare-codon-analysis"
browser.get(url_login)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\risha\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache
  browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chromeOptions)


In [29]:
#Read all files from directory and create three arrays
#One array contains each sequence's name
#One array contains each sequence's "seq"
#One array will contain benchmark values for each seq.

dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\super_naive"
summary_name = "super_naive_benchmarks"

arr_names = []
arr_seqs = []
arr_benchmarks = []

for entry in os.scandir(dir):
    record = SeqIO.read(entry,'fasta')
    arr_names.append(entry.name)
    arr_benchmarks.append(record.seq)

print(arr_names)
print(arr_benchmarks)

['akt1_dna', 'BIRC5_dna', 'BRAF1_dna', 'CAV1_dna', 'CDK1_dna', 'CEBPZ_dna', 'CLN3_dna', 'CSNK1A1_dna', 'emg1_dna', 'falvac-1_dna', 'FGFR4_dna', 'flt1_dna', 'GSK3B_dna', 'hpdf_dna', 'KIF11_dna', 'LAMP1_dna', 'lck_dna', 'LEMD3_dna', 'MAPK1_dna', 'MAPKAPK5_dna', 'mmpl3_dna', 'msox_dna', 'NGFR_dna', 'NOC2L_dna', 'npr1_dna', 'OPRM1_dna', 'pak1_dna', 'pa_dna', 'PDCD_dna', 'pea_dna', 'PF3D7_dna', 'pim1_dna', 'PLK1_dna', 'ptp4a3_dna', 'RPS6KB1_dna', 'SMARCD1_dna', 'soxB_dna', 'TAP1_dna', 'TAS2R10_dna', 'ubtf_dna']
[Seq('ATGTCAGATGTCGCAATAGTTAAGGAGGGTTGGCTACACAAACGCGGAGAATAT...GCC'), Seq('ATGGGAGCTCCAACACTACCACCGGCATGGCAACCCTTTTTAAAGGATCATCGA...GAT'), Seq('ATGGCTGCCTTATCAGGGGGAGGGGGGGGAGGAGCGGAACCTGGACAGGCCCTT...CAC'), Seq('ATGTCTGGGGGTAAGTACGTAGACAGTGAGGGACACTTATACACTGTACCCATC...ATC'), Seq('ATGGAGGACTACACCAAGATTGAAAAGATAGGCGAAGGAACATACGGCGTTGTA...ATG'), Seq('ATGGCAGCAGTGAAAGAACCCCTTGAGTTTCACGCCAAAAGACCGTGGAGACCA...AAG'), Seq('ATGGGGGGCTGTGCAGGCTCCCGCCGAAGATTCTCTGACTCGGAGGGCGAGGAG...TCT'), Seq(

In [31]:
benchmarks = np.array(["Gene Name", "CAI", "GC Content", "CFD", "Negative CIS Elements", "Negative Repeat Elements", "Raw Sequence"])

for i in range(len(arr_benchmarks)):
    analysis_button=browser.find_element_by_name("op")
    seq_box=browser.find_element_by_name("seq")
    time.sleep(0.5)
    seq_box.clear()
    seq_box.send_keys(arr_benchmarks[i])

    time.sleep(0.5)
    analysis_button.click()

    #wait 10s for analysis
    time.sleep(10)

    #get tables on genscript page
    tables = browser.find_elements_by_xpath('//table[1]')

    #find seq
    seq = str(tables[0].text).split('Sequence')[1].strip().split('\n')
    seq.pop()
    seq = "".join(seq)

    #find body of first table
    body_1 = tables[2].text.split('\n')

    #find cai, gc, cfd
    CAI = [i for i in body_1 if i.startswith('CAI')][0][4:8]
    GC = [i for i in body_1 if i.startswith('GC Content')][0][11:17]
    CFD = [i for i in body_1 if i.startswith('CFD')][0][4:7]

    #find body of second table
    body_2 = tables[3].text.split('\n')[1].split(' ')

    #find negative cis, negative repeats
    CIS = body_2[0]
    Repeats = body_2[1]
    benchmarks = np.vstack((benchmarks,[arr_names[i],CAI,GC,CFD,CIS,Repeats,seq]))
    browser.back()

    #Wait one second before repeating
    time.sleep(1)

In [33]:
#Save calculated benchmarks to CSV file for later review!
#Repeat this script for each benchmark (original, super_naive, naive, ICOR tool)

np.savetxt("%s.csv" % (summary_name), benchmarks, fmt='%s', delimiter=",")