In [None]:
# An interactive notebook that helps benchmark a directory containing FASTA sequences across the following metrics:
'''
- Codon Adaptation Index (CAI)
- GC Content
- CFD (known un-optimized gene that reduces efficiency)
- Negative CIS elements
- Negative repeat elements
'''

In [1]:
# import modules
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.select import Select
import os
from Bio import SeqIO
import time
import numpy as np

#init webdriver + selenium opts
chromeOptions = webdriver.ChromeOptions()

In [2]:
browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chromeOptions)
url_login="https://www.genscript.com/tools/rare-codon-analysis"
browser.get(url_login)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\risha\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache
  browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chromeOptions)


In [3]:
#Read all files from directory and create two arrays
#One array (arr_names) contains each sequence's name
#One array (arr_sequences) contains each sequence's "seq"

#benchmark_sequences/dna folder contains the original sequences, so it was used for original benchmarks.
#benchmark_sequences/super_naive contains the super_naive sequences so it was used for those.
#benchmark_sequences/naive contains the naive sequences so it was used for those.
#benchmark_sequences/ICOR contains the ICOR sequences so it was used for those.

dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\dna"
summary_name = "original_benchmarks"

arr_names = []
arr_sequences = []

for entry in os.scandir(dir):
    record = SeqIO.read(entry,'fasta')
    arr_names.append(entry.name)
    arr_sequences.append(record.seq)

print(arr_names)
print(arr_sequences)

['akt1_dna.fasta', 'BIRC5_dna.fasta', 'BRAF1_dna.fasta', 'CAV1_dna.fasta', 'CDK1_dna.fasta', 'CEBPZ_dna.fasta', 'CLN3_dna.fasta', 'CSNK1A1_dna.fasta', 'emg1_dna.fasta', 'falvac-1_dna.fasta', 'FGFR4_dna.fasta', 'flt1_dna.fasta', 'GSK3B_dna.fasta', 'hpdf_dna.fasta', 'KIF11_dna.fasta', 'LAMP1_dna.fasta', 'lck_dna.fasta', 'LEMD3_dna.fasta', 'MAPK1_dna.fasta', 'MAPKAPK5_dna.fasta', 'mmpl3_dna.fasta', 'msox_dna.fasta', 'NGFR_dna.fasta', 'NOC2L_dna.fasta', 'npr1_dna.fasta', 'OPRM1_dna.fasta', 'pak1_dna.fasta', 'pa_dna.fasta', 'PDCD11_dna.fasta', 'pea_dna.fasta', 'PF3D7_dna.fasta', 'pim1_dna.fasta', 'PLK1_dna.fasta', 'ptp4a3_dna.fasta', 'RPS6KB1_dna.fasta', 'SMARCD1_dna.fasta', 'soxB_dna.fasta', 'TAP1_dna.fasta', 'TAS2R10_dna.fasta', 'ubtf_dna.fasta']
[Seq('TAATTATGGGTCTGTAACCACCCTGGACTGGGTGCTCCTCACTGACGGACTTGT...AAA'), Seq('GCGGCGCGCCATTAACCGCCAGATTTGAATCGCGGGACCCGTTGGCAGAGGTGG...CTA'), Seq('CTTCCCCCAATCCCCTCAGGCTCGGCTGCGCCCGGGGCCGCGGGCCGGTACCTG...GCA'), Seq('ACAGTTTTCATCCAGCCACGGGCCAGCATGTCT

In [4]:
benchmarks = np.array(["Gene Name", "CAI", "GC Content", "CFD", "Negative CIS Elements", "Negative Repeat Elements", "Raw Sequence"])

for i in range(len(arr_sequences)):
    print("We are now benchmarking sequence #%d!" % (i + 1))
    analysis_button=browser.find_element_by_name("op")
    seq_box=browser.find_element_by_name("seq")
    time.sleep(0.5)
    seq_box.clear()
    seq_box.send_keys(arr_sequences[i])

    time.sleep(0.5)
    analysis_button.click()

    #wait 9s for analysis
    time.sleep(9)

    #get tables on genscript page
    tables = browser.find_elements_by_xpath('//table[1]')

    #find seq
    seq = str(tables[0].text).split('Sequence')[1].strip().split('\n')
    seq.pop()
    seq = "".join(seq)

    #find body of first table
    body_1 = tables[2].text.split('\n')

    #find cai, gc, cfd
    CAI = [i for i in body_1 if i.startswith('CAI')][0][4:8]
    GC = [i for i in body_1 if i.startswith('GC Content')][0][11:17]
    CFD = [i for i in body_1 if i.startswith('CFD')][0][4:7]

    #find body of second table
    body_2 = tables[3].text.split('\n')[1].split(' ')

    #find negative cis, negative repeats
    CIS = body_2[0]
    Repeats = body_2[1]
    benchmarks = np.vstack((benchmarks,[arr_names[i],CAI,GC,CFD,CIS,Repeats,seq]))
    browser.back()

    #Wait one second before repeating
    time.sleep(1)

In [5]:
#Save calculated benchmarks to CSV file for later review!
#Repeat this script for each benchmark (original, super_naive, naive, ICOR tool)

np.savetxt("%s.csv" % (summary_name), benchmarks, fmt='%s', delimiter=",")
#the saved files were then moved to the summaries directory.