In [13]:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException
import urllib.parse
import time
import re
from bs4 import BeautifulSoup
import pandas as pd

In [14]:
# Make the local Chrome binaries discoverable by Selenium.
# The directory must be joined with os.pathsep: a plain `+=` glues it onto the
# last existing PATH entry, so it is never actually searched.
chrome_bin_dir = r'C:\Users\gavvi\ChromeDrivers\chrome-win64\chrome-win64'
current_path = os.environ.get('PATH', '')
# Only append once so re-running this cell does not keep growing PATH.
if chrome_bin_dir not in current_path.split(os.pathsep):
    os.environ['PATH'] = (
        f"{current_path}{os.pathsep}{chrome_bin_dir}" if current_path else chrome_bin_dir
    )

In [15]:
# Rule-based compound search page of the DLiP PPI library
url = "https://skb-insilico.com/dlip/compound-search/ppi-library/rule-based"

# Launch Chrome and open the search page
driver = webdriver.Chrome()
driver.get(url)

# Session-wide setting (not a pause): element lookups poll for up to 5 s
driver.implicitly_wait(time_to_wait=5)

In [16]:
# Select the "All" option (note the leading space in the button text)
all_btn = driver.find_element(by=By.XPATH, value="//button[text()=' All']")
all_btn.click()

# Lowers the session-wide implicit wait for later element lookups to 2 s
driver.implicitly_wait(time_to_wait=2)

In [17]:
# Wait (up to 10 s) until the green search button can be clicked, then submit
search_btn = WebDriverWait(driver, timeout=10).until(
    EC.element_to_be_clickable((By.CLASS_NAME, 'btn-green'))
)
search_btn.click()

In [18]:
# Block (up to 20 s) until the scrollable results table is present in the DOM
table = WebDriverWait(driver, timeout=20).until(
    EC.presence_of_element_located((By.CLASS_NAME, "dataTables_scrollBody"))
)

# Snapshot the table markup and hand it to BeautifulSoup for parsing
table_html = table.get_attribute('outerHTML')
soup = BeautifulSoup(table_html, 'html.parser')

# Column names are the text of every <th> cell, in document order
header = [cell.get_text() for cell in soup.find_all('th')]

# Start with a frame that has the scraped columns but no rows yet
df = pd.DataFrame([], columns=header)

In [19]:
# Display the still-empty DataFrame to check the scraped column headers
df

Unnamed: 0,DLiP-ID,Mol Image,MW,XLogP,HBA,HBD,PSA,nRotatableBonds,nRings


In [20]:
def update_dataframe_on_new_page(driver, base_url, existing_df):
    """Scrape the currently displayed results page and append its rows.

    For every table row the compound detail page is opened, its
    "Canonical SMILES(RDKit)" value is read, and that value replaces the
    "Mol Image" cell of the row. The browser navigates back to the results
    page after each compound.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session currently showing the results table.
    base_url : str
        Site root used to resolve the relative compound links.
    existing_df : pandas.DataFrame
        Rows collected so far; must contain a "Mol Image" column.

    Returns
    -------
    pandas.DataFrame
        ``existing_df`` followed by the rows of the current page
        (index reset).

    Raises
    ------
    KeyError
        If ``existing_df`` has no "Mol Image" column.
    """
    # Snapshot the table markup once; parsing the snapshot keeps the row data
    # valid even though the browser navigates away while we iterate.
    table_html = driver.find_element(By.CLASS_NAME, "dataTables_scrollBody").get_attribute('outerHTML')
    soup = BeautifulSoup(table_html, 'html.parser')

    # Loop-invariant lookups, hoisted out of the per-row loop.
    mol_image_index = existing_df.columns.get_loc("Mol Image")
    compound_href = re.compile(r'/dlip/compound/')
    smiles_xpath = '//td[text()="Canonical SMILES(RDKit)"]/following-sibling::td'

    data = []

    # The first <tr> is skipped (header row of the scroll-body table).
    for row in soup.find_all('tr')[1:]:
        row_data = [td.text for td in row.find_all('td')]

        # Link to the compound detail page (the DLiP-ID cell).
        dlip_id_link = row.find('a', {'href': compound_href})
        if dlip_id_link is None:
            # Rows without a compound link (e.g. an empty-table placeholder)
            # cannot be resolved to a SMILES string; skip instead of crashing.
            continue

        # Open the detail page and read the SMILES value.
        dlip_id_url = urllib.parse.urljoin(base_url, dlip_id_link['href'])
        driver.get(dlip_id_url)
        try:
            smiles_value = driver.find_element(By.XPATH, smiles_xpath).text
        finally:
            # Always return to the results page, even if the lookup failed,
            # so the caller is not left stranded on a detail page.
            driver.back()

        # Replace the Mol Image value with the Canonical SMILES(RDKit).
        row_data[mol_image_index] = smiles_value
        data.append(row_data)

    # Build the page frame with the same column order as the accumulated one.
    new_df = pd.DataFrame(data, columns=existing_df.columns)

    return pd.concat([existing_df, new_df], ignore_index=True)


In [21]:
# Upper bound for the pagination loop: scraping stops once the page counter
# exceeds this value, even if more result pages remain.
page_threshold = 609

In [None]:
base_url = "https://skb-insilico.com"

# Seed the DataFrame with the content of the first results page.
df = update_dataframe_on_new_page(driver, base_url, df)

# Pagination state. The stale-retry cap keeps a persistently stale page from
# turning the loop into an endless retry.
page_number = 1
max_stale_retries = 5
stale_retries = 0

# Same stop rule as before (`page_number > page_threshold` ends the loop), but
# evaluated in the loop condition so it can no longer be skipped by an exception.
while page_number <= page_threshold:
    try:
        # Wait for the loading overlay to disappear
        WebDriverWait(driver, 5).until(
            EC.invisibility_of_element_located((By.CLASS_NAME, "loadingoverlay"))
        )

        # Re-locate the paging control on every iteration: the helper navigates
        # away and back, which invalidates element references from earlier.
        next_item = driver.find_element(By.ID, "compound-list-table_next")
        next_button = next_item.find_element(By.TAG_NAME, "a")

        # Check for the last page BEFORE clicking.
        # NOTE(review): the 'disabled' class is presumably set on the wrapper
        # element rather than the <a>; both are checked - confirm against the page.
        paging_classes = (
            f"{next_item.get_attribute('class') or ''} "
            f"{next_button.get_attribute('class') or ''}"
        )
        if 'disabled' in paging_classes:
            break

        next_button.click()
    except StaleElementReferenceException:
        # The control was re-rendered between lookup and use; the click did not
        # happen, so retrying is safe. Give up after a bounded number of tries.
        stale_retries += 1
        if stale_retries > max_stale_retries:
            raise
        continue
    stale_retries = 0

    # Wait for the table to be present on the next page
    time.sleep(3)
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CLASS_NAME, "dataTables_scrollBody"))
    )

    # Append the rows of the newly shown page. Errors here propagate on purpose:
    # retrying would click "Next" again and silently skip this page. `df` still
    # holds every previously scraped page if that happens.
    df = update_dataframe_on_new_page(driver, base_url, df)

    page_number += 1


In [11]:
# Display the scraped DataFrame (Mol Image now holds the SMILES strings)
df

Unnamed: 0,DLiP-ID,Mol Image,MW,XLogP,HBA,HBD,PSA,nRotatableBonds,nRings
0,D00000,COc1cccc2c1OCC21CCN(C(=O)CC2(c3cccc(Br)c3)CCNC...,499.449,3.49,4,1,50.8,4,5
1,D00001,COc1ccccc1Cn1nc(C)c(C(=O)N2CCC(CN3Cc4ccc(F)cc4...,490.579,4.05,5,0,67.67,6,5
2,D00002,COc1cccc(C2(CC(=O)NC3(Cc4ccc(Cl)cc4)CCS(=O)(=O...,505.08,3.868,5,2,84.5,7,4
3,D00003,NC1CCN(Cc2ccccc2C(=O)NC2CCN(Cc3ccccc3C(=O)O)CC...,450.583,2.666,5,3,98.9,7,4
4,D00004,COc1cccc(C2(CC(=O)Nc3ccccc3N3CCC(C(=O)O)CC3)CC...,451.567,2.902,5,3,90.9,7,4
...,...,...,...,...,...,...,...,...,...
5620,D0045K,COc1ccccc1NC(=O)NC1CCN(C(=O)C2(Cc3ccc(F)cc3)CC...,503.565,4.626,3,2,70.67,6,4
5621,D0045L,COc1ccc2c(c1)C(=O)N(C1CCN(C(=O)CC3(c4ccc(Br)cc...,527.459,3.817,4,0,59.08,5,5
5622,D0045M,O=C(N[C@H]1CCCN(C(=O)C2CCCN(c3ncnc4ccccc34)C2)...,461.541,3.91,5,1,78.43,4,5
5623,D0045N,NC1CCN(C(=O)c2cccc(C(=O)NC3(Cc4ccccc4Cl)CCNCC3...,455.002,3.382,4,3,87.46,5,4


In [163]:
# Save the DataFrame to a CSV file
csv_file_path = "temp1.csv"
df.to_csv(csv_file_path, index=False)