In [1]:
# An interactive notebook that benchmarks every FASTA sequence in a directory, using GenScript's Rare Codon Analysis tool, on the following metrics:
'''
- Codon Adaptation Index (CAI)
- GC Content
- CFD (known un-optimized gene that reduces efficiency)
- Negative CIS elements
- Negative repeat elements
'''

'\n- Codon Adaptation Index (CAI)\n- GC Content\n- CFD (known un-optimized gene that reduces efficiency)\n- Negative CIS elements\n- Negative repeat elements\n'

In [1]:
# import modules
import os
import time

import numpy as np
from Bio import SeqIO
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from webdriver_manager.chrome import ChromeDriverManager

#init webdriver + selenium opts
#no options are set on this object here; it is handed to webdriver.Chrome when the browser is launched
chromeOptions = webdriver.ChromeOptions()

In [2]:
#set up chromedriver
#webdriver_manager downloads (or reuses a cached) chromedriver matching the installed Chrome.
#The options object is passed through `options=`; the older `chrome_options=` keyword is
#deprecated and emits a DeprecationWarning every time the cell runs.
#NOTE(review): recent Selenium 4 releases expect the driver path to be wrapped in a Service
#object instead of being passed positionally - confirm against the installed Selenium version.
browser = webdriver.Chrome(ChromeDriverManager().install(), options=chromeOptions)

#open the GenScript Rare Codon Analysis page that the benchmark loop drives
url_login="https://www.genscript.com/tools/rare-codon-analysis"
browser.get(url_login)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
There is no [win32] chromedriver for browser 92.0.4515 in cache
Get LATEST driver version for 92.0.4515
Trying to download new driver from https://chromedriver.storage.googleapis.com/92.0.4515.107/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\risha\.wdm\drivers\chromedriver\win32\92.0.4515.107]
  browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chromeOptions)


In [3]:
#Read every file in the benchmark directory and build two parallel lists:
#  arr_names     - the file name of each sequence (e.g. "AKT1_dna.fasta")
#  arr_sequences - the "seq" of the record stored in that file
#
#Point sequence_dir at the folder of the sequence set being benchmarked and set summary_name
#to match (summary_name becomes the name of the CSV written at the end of the notebook):
#  benchmark_sequences\dna          contains the original sequences
#  benchmark_sequences\super_naive  contains the super_naive sequences
#  benchmark_sequences\naive        contains the naive sequences
#  benchmark_sequences\ICOR         contains the ICOR sequences
#  benchmark_sequences\brute        contains the brute sequences (currently selected)

#named sequence_dir rather than dir so the builtin dir() is not shadowed for the rest of the kernel session
sequence_dir = r"C:\Users\risha\Desktop\icor-codon-optimization\benchmark_sequences\brute"
summary_name = "brute"

#initialize arrays
arr_names = []
arr_sequences = []

#os.scandir yields entries in arbitrary order, so sort by file name to keep the row order
#reproducible between runs; sub-directories are skipped because they are not FASTA files.
#The with-block closes the directory iterator as soon as the listing has been collected.
with os.scandir(sequence_dir) as entries:
    fasta_entries = sorted(
        (entry for entry in entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

#read all files from directory
for entry in fasta_entries:
    #SeqIO.read expects exactly one record per file and raises ValueError otherwise
    record = SeqIO.read(entry.path, 'fasta')
    arr_names.append(entry.name)
    arr_sequences.append(record.seq)

#sanity check: print the array names to verify they were read correctly
print(arr_names)

['AKT1_dna.fasta', 'BIRC5_dna.fasta', 'BRAF1_dna.fasta', 'CAV1_dna.fasta', 'CD80_dna.fasta', 'CDK1_dna.fasta', 'CEBPZ_dna.fasta', 'CLN3_dna.fasta', 'CREB1_dna.fasta', 'CSNK1A1_dna.fasta', 'EMG1_dna.fasta', 'FALVAC-1_dna.fasta', 'FGFR4_dna.fasta', 'FLT1_dna.fasta', 'GSK3B_dna.fasta', 'HPDF_dna.fasta', 'JUN_dna.fasta', 'KIF11_dna.fasta', 'LAMP1_dna.fasta', 'LCK_dna.fasta', 'LEMD3_dna.fasta', 'MAPK1_dna.fasta', 'MAPKAPK5_dna.fasta', 'MMPL3_dna.fasta', 'NGFR_dna.fasta', 'NOC2L_dna.fasta', 'NPR1_dna.fasta', 'OPRM1_dna.fasta', 'PAK1_dna.fasta', 'PA_dna.fasta', 'PDCD11_dna.fasta', 'PEA_dna.fasta', 'PIM1_dna.fasta', 'PLK1_dna.fasta', 'PTP4A3_dna.fasta', 'RPS6KB1_dna.fasta', 'SMARCD1_dna.fasta', 'TAP1_dna.fasta', 'TAS2R10_dna.fasta', 'UBTF_dna.fasta']


In [4]:
#Benchmark every loaded sequence on the GenScript page opened in `browser` and collect one
#result row per sequence. The final `benchmarks` array has the column names in its first row
#followed by one row of strings per sequence.


def _metric_line(lines, label):
    """Return the first entry of ``lines`` that starts with ``label``.

    Raises:
        ValueError: if no line starts with ``label`` (the results table did not
            contain the expected metric).
    """
    for line in lines:
        if line.startswith(label):
            return line
    raise ValueError("no line starting with %r found in the results table" % label)


#rows are gathered in a plain list and converted to an array once, instead of re-stacking
#the whole array on every iteration; the first row holds the column names
benchmark_rows = [["Gene Name", "CAI", "GC Content", "CFD", "Negative CIS Elements", "Negative Repeat Elements", "Raw Sequence"]]

try:
    #loop through each sequence and run the benchmark
    for index, (name, sequence) in enumerate(zip(arr_names, arr_sequences), start=1):

        print("We are now benchmarking sequence #%d!" % index)
        #find_element(By...) replaces the find_element_by_* helpers, which newer Selenium releases no longer provide
        analysis_button = browser.find_element(By.NAME, "op")
        seq_box = browser.find_element(By.NAME, "seq")

        #clear the sequence box
        time.sleep(0.5)
        seq_box.clear()
        seq_box.send_keys(str(sequence))

        #click the analysis button
        time.sleep(0.5)
        analysis_button.click()

        #wait 7s for analysis (doing less may be too fast)
        time.sleep(7)

        #get tables on genscript page
        tables = browser.find_elements(By.XPATH, '//table[1]')

        #find seq: take the text after the 'Sequence' label, drop its last line, and join the rest
        seq_lines = str(tables[0].text).split('Sequence')[1].strip().split('\n')
        seq_lines.pop()
        seq = "".join(seq_lines)

        #find body of first table
        body_1 = tables[2].text.split('\n')

        #find cai, gc, cfd: fixed-width slices taken right after each label
        #NOTE(review): the slice widths assume a fixed value length on the page - confirm against the live output
        CAI = _metric_line(body_1, 'CAI')[4:8]
        GC = _metric_line(body_1, 'GC Content')[11:17]
        CFD = _metric_line(body_1, 'CFD')[4:7]

        #find body of second table: its second line holds space-separated values
        body_2 = tables[3].text.split('\n')[1].split(' ')

        #find negative cis, negative repeats
        CIS = body_2[0]
        Repeats = body_2[1]
        benchmark_rows.append([name, CAI, GC, CFD, CIS, Repeats, seq])

        #return to the input form for the next sequence
        browser.back()

        #Wait one second before repeating
        time.sleep(1)
finally:
    #build the array even if a lookup fails part-way, so the rows gathered so far can still be
    #inspected or saved; always 2-D, so an empty run still produces a proper header row
    benchmarks = np.array(benchmark_rows)

We are now benchmarking sequence #1!
We are now benchmarking sequence #2!
We are now benchmarking sequence #3!
We are now benchmarking sequence #4!
We are now benchmarking sequence #5!
We are now benchmarking sequence #6!
We are now benchmarking sequence #7!
We are now benchmarking sequence #8!
We are now benchmarking sequence #9!
We are now benchmarking sequence #10!
We are now benchmarking sequence #11!
We are now benchmarking sequence #12!
We are now benchmarking sequence #13!
We are now benchmarking sequence #14!
We are now benchmarking sequence #15!
We are now benchmarking sequence #16!
We are now benchmarking sequence #17!
We are now benchmarking sequence #18!
We are now benchmarking sequence #19!
We are now benchmarking sequence #20!
We are now benchmarking sequence #21!
We are now benchmarking sequence #22!
We are now benchmarking sequence #23!
We are now benchmarking sequence #24!


In [None]:
#Write the collected benchmark table to "<summary_name>.csv" (relative to the working directory)
#so it can be reviewed later.
#Run the whole notebook once per benchmark set (original, super_naive, naive, ICOR tool).

output_csv = "{}.csv".format(summary_name)
np.savetxt(output_csv, benchmarks, delimiter=",", fmt='%s')
#the saved files were then moved to the summaries directory.