In [152]:
import io

import pandas as pd
import pyperclip
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [164]:
def parsing_dscTable(table_element):
    """Summarise the BLAST descriptions table as a short list of messages.

    Parameters
    ----------
    table_element
        Object exposing ``get_attribute('outerHTML')`` that returns the HTML
        of a table with ``Description`` and ``E value`` columns.

    Returns
    -------
    list of str
        * ``['-']`` when no hit has ``E value < 0.001``;
        * ``['hp']`` when every one of the first 20 significant hits is a
          "hypothetical protein";
        * otherwise up to three ``"<Description>;<E value as x.xe-yy>"``
          strings for the first informative significant hits;
        * if no informative hit exists, ``['hp']`` or ``['-']`` depending on
          the share of hypothetical proteins (see the note in the body).
    """
    html_table = table_element.get_attribute('outerHTML')
    # Recent pandas deprecates passing literal HTML to read_html, so wrap it
    # in a file-like object.
    table = pd.read_html(io.StringIO(html_table))[0]
    table = table[table['E value'] < 0.001]

    if len(table) == 0:
        return ['-']

    # Build the mask once on the filtered table; na=False keeps rows with a
    # missing description from breaking the boolean logic.
    is_hp = table['Description'].str.contains(
        'hypothetical protein', regex=False, na=False
    )

    if is_hp.iloc[:20].all():
        return ['hp']

    # "conserved hypothetical protein" is already covered by the first phrase.
    uninformative = (
        'hypothetical protein',
        'unnamed protein product',
        'conserved protein of unknown function',
    )

    msg = []
    for description, e_value in zip(table['Description'], table['E value']):
        if not isinstance(description, str):
            continue
        if any(phrase in description for phrase in uninformative):
            continue
        msg.append(f"{description};{'{:.1e}'.format(e_value)}")
        if len(msg) == 3:
            break

    if msg:
        return msg

    # Fallback: there are non-"hypothetical" hits, but none is informative.
    n_hp = int(is_hp.sum())
    if n_hp == 0:
        # The original expression divided by zero here; with no hypothetical
        # proteins at all the answer cannot be 'hp'.
        return ['-']

    # NOTE(review): formula kept exactly as written. It does not look like a
    # plain "fraction of hypothetical proteins" (that would be
    # n_hp / len(table)) -- confirm the intended definition and threshold.
    ratio_hp = 1 - (len(table) / n_hp) / 100
    return ['hp'] if ratio_hp >= 0.2 else ['-']


def parsing_cddInfo(table_element):
    """Summarise the conserved-domain table as a short list of messages.

    Parameters
    ----------
    table_element
        Object exposing ``get_attribute('outerHTML')`` that returns the HTML
        of a header-less table (columns are addressed by position labels).

    Returns
    -------
    list of str
        ``['-']`` when no retained row has an E-value below 0.001, otherwise
        up to three ``"<column 1>;<column 5>"`` strings, where column 5 is
        reproduced exactly as it appears in the table.

    Raises
    ------
    ValueError
        If a retained value in column 5 cannot be converted to ``float``.
    """
    html_table = table_element.get_attribute('outerHTML')
    # Recent pandas deprecates passing literal HTML to read_html, so wrap it
    # in a file-like object.
    table_results = pd.read_html(io.StringIO(html_table))[0]

    # Strip everything that is not data: drop the first row and the first
    # column, discard incomplete rows, then keep every second remaining row.
    # NOTE(review): presumably the skipped rows are per-domain detail rows --
    # confirm against the live page layout.
    table_results = table_results.iloc[1:, 1:].dropna().iloc[::2]

    # Keep only rows with E-value < 0.001. The numeric values live in a
    # separate Series so the sliced frame is never mutated (this avoids
    # SettingWithCopyWarning and keeps the column labels purely integer).
    e_values = table_results[5].astype(float)
    significant = table_results[e_values < 0.001].head(3)

    if len(significant) == 0:
        return ['-']

    return [
        f'{name};{e_value}'
        for name, e_value in zip(significant[1], significant[5])
    ]

def _read_cdd_messages():
    """Open the conserved-domain details tab, parse it, and return to the results tab."""
    main_window = driver.current_window_handle
    known_windows = driver.window_handles

    # Find the button that leads to the additional page and follow it.
    cddInfo = driver.wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="cddInfo"]/a/img')))
    cddInfo.click()

    # The details open in a new tab; wait until it exists instead of
    # indexing window_handles right after the click.
    driver.wait.until(EC.new_window_is_opened(known_windows))
    new_window = next(
        handle for handle in driver.window_handles
        if handle not in known_windows
    )
    driver.switch_to.window(new_window)
    try:
        table_cddInfo = driver.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="std"]')))
        return parsing_cddInfo(table_cddInfo)
    finally:
        # Always close the extra tab and go back to the results tab, even when
        # the table never shows up; otherwise the next query would run against
        # the wrong window.
        driver.close()
        driver.switch_to.window(main_window)


def send_submit(sequence):
    """Submit one sequence through the BLAST form and summarise the result pages.

    Uses the module-level ``driver`` (with its ``driver.wait`` attribute) and
    expects the search form to be the current page.

    Parameters
    ----------
    sequence : str
        Sequence text pasted into the ``QUERY`` box via the clipboard.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(msg_main, msg_cddInfo)`` as produced by ``parsing_dscTable`` and
        ``parsing_cddInfo``. Both are ``['-']`` when the results page never
        becomes available or when an expected element times out; the second
        is ``['-']`` when the page reports no putative conserved domains.
    """
    box = driver.wait.until(EC.presence_of_element_located(
        (By.NAME, "QUERY")))
    button = driver.wait.until(EC.element_to_be_clickable(
        (By.CLASS_NAME, "blastbutton")))

    # Replace the box content with the sequence through the clipboard.
    # NOTE(review): Ctrl+A / Ctrl+V are not the select-all / paste shortcuts
    # on every platform -- confirm for the target OS.
    pyperclip.copy(sequence)
    box.send_keys(Keys.CONTROL, "a")
    box.send_keys(Keys.DELETE)
    box.send_keys(Keys.CONTROL, "v")

    button.click()

    # Defaults returned on every failure path, so the caller always receives
    # two lists (previously the "not found" path left both names unbound).
    msg_main = ['-']
    msg_cddInfo = ['-']

    # Poll for the "Edit Search" element. Each driver.wait.until call blocks
    # for at most the WebDriverWait timeout (5 s as configured in this
    # notebook), so 100 attempts give roughly 500 s in total.
    for _ in range(100):
        try:
            driver.wait.until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="searchOptions"]')))
            break
        except TimeoutException:
            continue
    else:
        print('We did not find Edit Search:(')
        return msg_main, msg_cddInfo

    try:
        table_result = driver.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="dscTable"]')))
        msg_main = parsing_dscTable(table_result)

        graphic_summary = driver.wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="btnGrph"]')))
        graphic_summary.click()

        lable_cddInfo = driver.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="cddDesc"]')))

        NO_SWITCH = "No putative conserved domains have been detected"

        if NO_SWITCH not in lable_cddInfo.get_attribute('outerHTML'):
            msg_cddInfo = _read_cdd_messages()
        else:
            msg_cddInfo = ['-']
    except TimeoutException:
        # Any missing element invalidates the whole record, as before.
        msg_main = ['-']
        msg_cddInfo = ['-']

    return msg_main, msg_cddInfo

In [165]:
# Load the annotation sheet; the driver loop below reads its
# `aa_sequence` column.
table = pd.read_excel(
    'CP012104_1_-_RAST.xls',
    sheet_name='Sheet1',
)

In [167]:
driver = webdriver.Chrome()
try:
    # Every driver.wait.until(...) call below (and inside send_submit) waits
    # at most 5 seconds before raising TimeoutException.
    driver.wait = WebDriverWait(driver, 5)
    driver.get(
        "https://blast.ncbi.nlm.nih.gov/"
        "Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome"
    )

    # Basic adjustment of the search form: restrict the organism and
    # click the PSI-BLAST option.
    organism = driver.wait.until(EC.presence_of_element_located(
        (By.NAME, "EQ_MENU")))
    psi_blast = driver.wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="psiBlast"]')))
    organism.send_keys('Viruses (taxid:10239)')
    psi_blast.click()

    # One tab-separated line per sequence: 1-based index, main messages and
    # conserved-domain messages, each list joined with '$'.
    with open('results_blast_analyzer.csv', 'w') as file:
        for idx, elem in enumerate(table.aa_sequence):
            msg_1, msg_2 = send_submit(elem)
            file.write(f'{idx + 1}\t')
            file.write('$'.join(msg_1))
            file.write('\t')
            file.write('$'.join(msg_2))
            file.write('\n')
            # Keep finished rows on disk even if a later query is interrupted.
            file.flush()
            # Go back to the search form for the next sequence.
            edit_search = driver.wait.until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="searchOptions"]')))
            edit_search.click()
finally:
    # Shut the browser down even when the run fails or is interrupted
    # (e.g. KeyboardInterrupt); otherwise the Chrome process is leaked.
    driver.quit()

['hp']


KeyboardInterrupt: 

In [160]:
# Show the sequence stored under index label 0 as a quick sanity check.
table['aa_sequence'].loc[0]

'MGFKFRKSIKIAPGVRMNVSKKGVGVSAGVKGARVSTGPSGTRITTSVPGTGLSYEKRL'

Unnamed: 0,contig_id,feature_id,type,location,orf,start,stop,strand,aa_sequence,function,aliases,figfam,evidence_codes,nucleotide_sequence
0,CP012104.1,fig|6666666.586532.peg.1,peg,CP012104.1_177_1,1,177,1,-,MGFKFRKSIKIAPGVRMNVSKKGVGVSAGVKGARVSTGPSGTRITT...,hypothetical protein,,,,gtgggatttaaattccgtaaaagtataaagattgctccaggtgtta...
1,CP012104.1,fig|6666666.586532.peg.2,peg,CP012104.1_842_201,2,842,201,-,MNKKLITTLACSALLFGLSACGSTEKTNNDVQTKEKDQPQQEVKKE...,hypothetical protein,,,,atgaataagaaactaataacaacattagcttgtagcgccttgttat...
2,CP012104.1,fig|6666666.586532.peg.3,peg,CP012104.1_1382_1038,3,1382,1038,-,MRHFGQILKKLRKSRGLTQEQLSHKLNLSRSQIKNWETDRYQPDLD...,"prophage LambdaBa04, DNA-binding protein",,,,atgagacattttggacagattctaaaaaaactaagaaagtcacgtg...
3,CP012104.1,fig|6666666.586532.peg.4,peg,CP012104.1_1877_3019,4,1877,3019,+,MEMLLKKMHEDLKSNGYTNRKLATLFNVSHTTVNSYFSQSTKFDFM...,hypothetical protein,,,,atggaaatgttgttaaagaagatgcacgaagacttgaagtctaacg...
4,CP012104.1,fig|6666666.586532.peg.5,peg,CP012104.1_3052_3204,5,3052,3204,+,MKLKAIKVVIAAAIIACVGFTPVHNDKEVSQQSQTPIEYRMMADPG...,hypothetical protein,,,,ttgaagctaaaagctatcaaagtagtaatagcagcagcaattattg...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,CP012104.1,fig|6666666.586532.peg.69,peg,CP012104.1_39584_39225,69,39584,39225,-,MNRKMSNLFPGSATRTTEDINKSEIQHPENNNETLTKQNDNVNNNV...,hypothetical protein,,,,ttgaatcgcaaaatgtctaatcttttccctggtagtgctacaagaa...
69,CP012104.1,fig|6666666.586532.peg.70,peg,CP012104.1_40285_39698,70,40285,39698,-,MFAKQGKKVLIVDADEQGNVLLSFGKNPDEYKLTLYDVLVDYVSPK...,Chromosome (plasmid) partitioning protein ParA,,,isu;Bacterial_Cytoskeleton isu;Plasmid_replica...,ttgttcgctaaacaagggaaaaaagtattgattgttgacgctgacg...
70,CP012104.1,fig|6666666.586532.peg.71,peg,CP012104.1_40814_41305,71,40814,41305,+,MSVAIRKPQGETFIDSWYECYLSERKKSGYVVTIDLSNEQHKQIWY...,Replication protein,,,,atgtcagtggcaattcggaagccgcaaggggaaacgttcattgatt...
71,CP012104.1,fig|6666666.586532.peg.72,peg,CP012104.1_41283_42119,72,41283,42119,+,MLMPLGADGSCSDLSRVFRLPYSTHGKTGQQITVDLWTEREYSLQE...,hypothetical protein,,,,atgcttatgccgttaggcgctgacggttcgtgttcagacttatctc...
