In [1]:
from Bio import SeqIO
from pathlib import Path
from pprint import pprint
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import numpy as np
import glob
import os

In [2]:
base_dir = "data/development_set"
# glob.glob returns matches in arbitrary (filesystem-dependent) order. Sorting
# makes the order in which batches are later combined reproducible across runs
# and machines. The sort is lexicographic, so "batch_10" precedes "batch_2".
batch_csv_files = sorted(glob.glob(os.path.join(base_dir, "all_binding_sites_batch_*.csv")))

In [3]:
# Read every batch file first and concatenate once. Calling pd.concat inside
# the loop re-copies all previously accumulated rows on each iteration.
batch_frames = [pd.read_csv(csv_file) for csv_file in batch_csv_files]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no batch file was found (the result the per-iteration version produced).
all_binding_sites_df = (
    pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
)

In [15]:
# Persist the combined table; "utf-8-sig" writes a UTF-8 byte-order mark at the
# start of the file, and index=False leaves the row index out of the CSV.
complete_csv_path = "data/development_set/all_binding_sites_complete.csv"
all_binding_sites_df.to_csv(
    complete_csv_path,
    index=False,
    encoding="utf-8-sig",
)

In [4]:
# Rows whose recorded sequence length is zero.
has_zero_length = all_binding_sites_df['sequence_length'] == 0
missing_val_df = all_binding_sites_df.loc[has_zero_length]
display(missing_val_df)

Unnamed: 0,prot_id,binding_sites,ligand_type,sequence,sequence_length,binary_binding_sites
1,P26514,"[384, 388, 408, 409, 410, 417, 419, 421, 424, ...",small,,0,[]
5,Q9S1H5,"[33, 154, 136, 202, 114, 55, 88, 153, 56, 155,...",small,,0,[]
8,Q9WYT0,"[144, 147, 30, 163, 165, 166, 169, 173, 174, 1...",small,,0,[]
9,P0AAJ3,"[133, 134, 135, 136, 137, 138, 139, 143, 144, ...",small,,0,[]
11,P12473,"[101, 155, 146, 187, 188, 189, 190]",small,,0,[]
...,...,...,...,...,...,...
19046,Q8T6T7,"[322, 178, 307, 115, 117, 310, 119, 312, 281, ...",small,,0,[]
19048,C6KT68,"[271, 272, 273, 156, 157, 158, 159, 172, 173, ...",small,,0,[]
19049,Q46085,"[486, 455, 456, 487, 426, 459, 427, 428, 430, ...",small,,0,[]
19051,P11362,"[640, 641, 512, 643, 644, 514, 642, 531, 535, ...",small,,0,[]


In [12]:
# Disabled export that targets the same path loaded below (presumably how that
# file was produced -- confirm before deleting):
# missing_val_df.to_csv(f"data/development_set/missing_val_binding_sites_df.csv", index = False,
#                       encoding = "utf-8")

# Reload the zero-length rows from disk, replacing any in-memory missing_val_df.
missing_val_csv_path = "data/development_set/missing_val_binding_sites_df.csv"
missing_val_df = pd.read_csv(missing_val_csv_path)

In [14]:
batch_size = 1000
total_samples = len(missing_val_df)

# Walk the frame in windows of `batch_size` rows. The trailing `break` stops
# after the first window, so this cell only previews rows [0, batch_size).
batch_starts = range(0, total_samples, batch_size)
for batch_start in batch_starts:
    # Clamp the final window to the end of the frame.
    batch_end = min(batch_start + batch_size, total_samples)
    batch_rows = slice(batch_start, batch_end)
    batch_df = missing_val_df.iloc[batch_rows].copy()
    display(batch_df)
    break

Unnamed: 0,prot_id,binding_sites,ligand_type,sequence,sequence_length,binary_binding_sites
0,P26514,"[384, 388, 408, 409, 410, 417, 419, 421, 424, ...",small,,0,[]
1,Q9S1H5,"[33, 154, 136, 202, 114, 55, 88, 153, 56, 155,...",small,,0,[]
2,Q9WYT0,"[144, 147, 30, 163, 165, 166, 169, 173, 174, 1...",small,,0,[]
3,P0AAJ3,"[133, 134, 135, 136, 137, 138, 139, 143, 144, ...",small,,0,[]
4,P12473,"[101, 155, 146, 187, 188, 189, 190]",small,,0,[]
...,...,...,...,...,...,...
995,Q06453,"[128, 130, 131, 132, 134, 135, 137, 139, 141, ...",nuclear,,0,[]
996,Q9UKV8,"[522, 524, 525, 526, 529, 533, 544, 545, 546, ...",nuclear,,0,[]
997,Q6AZJ8,"[33, 36, 43, 76, 77, 46, 14, 44, 16, 18, 45, 7...",nuclear,,0,[]
998,Q9RWH8,"[129, 2, 186, 110, 111, 113, 114, 115, 116, 11...",nuclear,,0,[]


In [5]:
def get_sequence_info(prot_id, driver):
    """Scrape the sequence text shown on a UniProtKB entry page.

    Args:
        prot_id: Identifier substituted into the UniProtKB entry URL.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        The concatenated text of every element with class ``sequence__chunk``,
        or ``None`` when the browser ends up on a URL different from the one
        requested, or when locating/reading those elements fails (e.g. none
        appear within 3 seconds).

    Raises:
        selenium.common.exceptions.TimeoutException: If ``document.readyState``
            does not become ``'complete'`` within 3 seconds of loading the page.
    """
    print(f"Processing {prot_id} ID")
    info_prot_url = f"https://www.uniprot.org/uniprotkb/{prot_id}/entry#sequences"
    driver.get(info_prot_url)
    # NOTE(review): Selenium's documentation advises against combining implicit
    # and explicit waits. The call is kept because it changes a driver-wide
    # setting that code outside this function may rely on -- confirm and remove.
    driver.implicitly_wait(1)
    WebDriverWait(driver, 3).until(
        lambda d: d.execute_script('return document.readyState') == 'complete'
    )

    # A changed URL after loading is treated as "entry no longer available".
    if driver.current_url != info_prot_url:
        print(f"This entry sequence: {info_prot_url} is no longer annotated in UniProtKB")
        return None

    try:
        button = WebDriverWait(driver, 3).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Show sequence')]"))
        )
        button.click()
    except Exception:
        # Best-effort step: the button is not always present. Catch Exception
        # rather than using a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate and a long scraping run can be interrupted.
        print("No need to click the button 'Show sequence' ")

    try:
        # Use explicit wait for sequence chunks
        sequence_chunks = WebDriverWait(driver, 3).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "sequence__chunk"))
        )
        final_sequence = "".join(chunk.text for chunk in sequence_chunks)
        return final_sequence
    except Exception as e:
        print(f"Failed to find sequence chunks: {str(e)}")
        return None

In [6]:
def convert_to_binary_list(original_binding_sites_lst, sequence_len):
    """Build a per-position 0/1 mask from a list of 1-based binding-site positions.

    Args:
        original_binding_sites_lst: List of 1-based positions. Any value that
            is not a ``list`` (for example ``None`` or the unparsed string form
            of a list) is ignored and yields an all-zero mask.
        sequence_len: Length of the mask to build.

    Returns:
        list[int]: ``sequence_len`` entries; entry ``i`` is 1 when position
        ``i + 1`` appears in ``original_binding_sites_lst`` and 0 otherwise.
        Entries that are not integers, or that fall outside
        ``1..sequence_len``, are skipped.
    """
    binary_list = [0] * sequence_len  # Start with no position marked

    # Only genuine lists are interpreted; everything else leaves the mask empty.
    if not isinstance(original_binding_sites_lst, list):
        return binary_list

    for idx in original_binding_sites_lst:
        # NumPy integer scalars (e.g. np.int64 values taken from pandas/NumPy
        # containers) are not instances of `int`; accept them explicitly so
        # such positions are not silently dropped.
        if isinstance(idx, (int, np.integer)) and 1 <= idx <= sequence_len:
            binary_list[idx - 1] = 1  # Positions are 1-based

    return binary_list

In [7]:

# Configure Chrome to run without a visible window.
options = Options()
options.add_argument("--headless")

# Obtain a chromedriver executable path via webdriver_manager, then start the
# browser session with it.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service, options=options)
