# Scrape by parsing the rendered HTML with Selenium (advanced)

In [8]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import pandas as pd

# Equipment listing page; only its network location (domain) is used,
# as the first term of the Google search query.
url = "https://asia.develon-ce.com/en/equipment/excavators/?param1=CIS%2COCEANIA%2CSOUTH_EAST_ASIA%2CINDIA&param2=&param3="
# Model name appended to the domain to form the search query.
model = "DX89R-7"

def extract_domain(url):
    """Return the network-location part (host, plus port if any) of *url*.

    An input without a scheme (e.g. ``"example.com/path"``) has no network
    location as far as ``urlparse`` is concerned, so the result is ``""``.
    """
    return urlparse(url).netloc

def search_google(query):
    """Open a Google results page for *query* in headless Chrome.

    Args:
        query: Search terms. A literal ``+`` is left untouched so callers can
            use it as a word separator; every other reserved character
            (spaces, ``&``, ``#``, ...) is percent-encoded so it cannot
            truncate or corrupt the search URL.

    Returns:
        The live ``webdriver.Chrome`` instance, already navigated to the
        results page. The caller owns it and is responsible for ``quit()``.

    Raises:
        Any exception raised while starting Chrome or loading the page. If
        loading fails after the browser has started, the browser is shut
        down before the exception propagates.
    """
    # Local import keeps this block self-contained within the notebook cell.
    from urllib.parse import quote

    search_url = f"https://www.google.com/search?q={quote(str(query), safe='+')}"

    # Set up the Chrome WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run without opening a browser window
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(search_url)
        # Fixed pause to give the results time to render (adjust if needed).
        time.sleep(3)
    except BaseException:
        # The caller never receives the driver on failure, so it must be
        # closed here or the Chrome process is leaked.
        driver.quit()
        raise

    return driver

def get_first_search_result(url, model):
    """Look up the first Google hit for "<domain of url>+<model>".

    Args:
        url: Any URL on the site of interest; only its domain is used.
        model: Model name appended to the domain in the search query.

    Returns:
        The ``href`` of the first result link, or a string of the form
        ``"Error: <exception>"`` if locating or reading the link fails.
    """
    browser = search_google("+".join([extract_domain(url), model]))

    try:
        # The first organic result is located via its jsname attribute.
        anchor = browser.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        return anchor.get_attribute('href')
    except Exception as exc:
        return f"Error: {exc}"
    finally:
        # Always shut the browser down, whether or not a link was found.
        browser.quit()

# Example usage: resolve the product page for the configured model.
# NOTE: on failure this is an "Error: ..." string rather than a URL.
first_result = get_first_search_result(url, model)


# Use Selenium to extract the table from the first_result URL
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Extract the table using the specified class name
try:
    # Navigate inside the try block so that a failed page load (for example
    # when first_result is an "Error: ..." string) still reaches
    # driver.quit() instead of leaking the browser process.
    driver.get(first_result)

    # Extract the table containing details using its class name from the first result page
    detail_table = driver.find_element(By.CSS_SELECTOR, ".equipment-detail_spec-table.excavators1.en_plain_l")
    rows = detail_table.find_elements(By.TAG_NAME, "tr")

    # Collect the text of the second <th> cell of every row
    flattened_details = []
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "th")
        if len(cells) > 1:  # Skip rows with fewer than two <th> cells
            flattened_details.append(cells[1].text)  # Append the cell text (a plain string)

    if not flattened_details:
        flattened_details = ["Details not found"]
except Exception as e:
    # Report the failure in the same list shape as a successful result.
    flattened_details = ["Error: " + str(e)]

finally:
    driver.quit()

display(flattened_details)


['DX89R-7',
 '9,518 kg',
 '0.28 m³',
 '48.5/2,100 (kW/rpm)',
 'Develon D24',
 '2.7/4.7 km/h',
 '7,015 mm',
 '4,110 mm',
 '7,030 mm',
 '6,430 mm',
 '2,250 mm',
 '2,657 mm']

# Scrape with pandas `read_html` (simple)

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import pandas as pd

# Equipment listing page; only its network location (domain) is used,
# as the first term of the Google search query.
url = "https://asia.develon-ce.com/en/equipment/excavators/?param1=CIS%2COCEANIA%2CSOUTH_EAST_ASIA%2CINDIA&param2=&param3="
# Model name appended to the domain to form the search query.
model = "DX89R-7"

def extract_domain(url):
    """Return the network-location part (host, plus port if any) of *url*.

    An input without a scheme (e.g. ``"example.com/path"``) has no network
    location as far as ``urlparse`` is concerned, so the result is ``""``.
    """
    return urlparse(url).netloc

def search_google(query):
    """Open a Google results page for *query* in headless Chrome.

    Args:
        query: Search terms. A literal ``+`` is left untouched so callers can
            use it as a word separator; every other reserved character
            (spaces, ``&``, ``#``, ...) is percent-encoded so it cannot
            truncate or corrupt the search URL.

    Returns:
        The live ``webdriver.Chrome`` instance, already navigated to the
        results page. The caller owns it and is responsible for ``quit()``.

    Raises:
        Any exception raised while starting Chrome or loading the page. If
        loading fails after the browser has started, the browser is shut
        down before the exception propagates.
    """
    # Local import keeps this block self-contained within the notebook cell.
    from urllib.parse import quote

    search_url = f"https://www.google.com/search?q={quote(str(query), safe='+')}"

    # Set up the Chrome WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run without opening a browser window
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(search_url)
        # Fixed pause to give the results time to render (adjust if needed).
        time.sleep(3)
    except BaseException:
        # The caller never receives the driver on failure, so it must be
        # closed here or the Chrome process is leaked.
        driver.quit()
        raise

    return driver

def get_first_search_result(url, model):
    """Look up the first Google hit for "<domain of url>+<model>".

    Args:
        url: Any URL on the site of interest; only its domain is used.
        model: Model name appended to the domain in the search query.

    Returns:
        The ``href`` of the first result link, or a string of the form
        ``"Error: <exception>"`` if locating or reading the link fails.
    """
    browser = search_google("+".join([extract_domain(url), model]))

    try:
        # The first organic result is located via its jsname attribute.
        anchor = browser.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        return anchor.get_attribute('href')
    except Exception as exc:
        return f"Error: {exc}"
    finally:
        # Always shut the browser down, whether or not a link was found.
        browser.quit()

# Example usage: resolve the product page for the configured model.
# NOTE: get_first_search_result returns an "Error: ..." string on failure,
# in which case the read_html call below will not receive a valid URL.
first_result = get_first_search_result(url, model)


# Parse every <table> on the page into a list of DataFrames.
tables = pd.read_html(first_result)
# Place the first two tables side by side (column-wise); this indexing
# requires the page to contain at least two tables.
combined_table = pd.concat([tables[0], tables[1]], axis=1)
# Transpose so the original columns become rows.
combined_table_transposed = combined_table.T

display(combined_table_transposed)
