# Scrape model name

In [2]:
pip install --upgrade selenium


Collecting selenium
  Downloading selenium-4.28.0-py3-none-any.whl.metadata (7.1 kB)
Downloading selenium-4.28.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: selenium
  Attempting uninstall: selenium
    Found existing installation: selenium 4.27.1
    Uninstalling selenium-4.27.1:
      Successfully uninstalled selenium-4.27.1
Successfully installed selenium-4.28.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import openpyxl
from tqdm import tqdm

def scrape_excavator_data():
    url = "https://www.komatsu.com/en/products/excavators/"
    
    # Set up WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)

    # Create an Excel workbook
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Excavator Data"
    sheet.append(["Model Name"])

    # Track already scraped models
    scraped_models = set()

    while True:
        try:
            # Wait for product cards to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".product-card__name"))
            )

            # Find all product names
            model_elements = driver.find_elements(By.CSS_SELECTOR, ".product-card__name")
            for model_element in model_elements:
                model_name = model_element.text.strip()
                
                # Skip if model already scraped
                if model_name in scraped_models:
                    continue
                
                # Add model to Excel sheet and scraped set
                sheet.append([model_name])
                scraped_models.add(model_name)

            # Click all "Load More" buttons on the page
            load_more_buttons = driver.find_elements(By.CSS_SELECTOR, ".action-button--secondary.product-tiles__load-more")
            if load_more_buttons:
                for button in load_more_buttons:
                    try:
                        # Click the "Load More" button
                        button.click()

                        # Wait for new products to load
                        time.sleep(3)  # Allow time for products to load
                    except Exception as e:
                        print(f"Error clicking 'Load More' button: {e}")
                        continue
            else:
                print("No more 'Load More' buttons found. Exiting loop.")
                break

        except Exception as e:
            print(f"Error during scraping: {e}")
            break

    # Save data to an Excel file
    file_name = "komatsu_excavator_model.xlsx"
    workbook.save(file_name)
    print(f"Data successfully saved to {file_name}")

    driver.quit()


if __name__ == "__main__":
    scrape_excavator_data()


No more 'Load More' buttons found. Exiting loop.
Data successfully saved to komatsu_excavator_model.xlsx


# First url

In [23]:
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse

url = "https://www.komatsu.com/en/products/excavators/"
model = "PC30MR-5"

def extract_domain(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc

def search_google(query):
    search_url = f"https://www.google.com/search?q={query}"
    
    # Set up the Chrome WebDriver
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # Run in headless mode (without opening a browser window)
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument(
        f"user-agent=Mozilla/5.0 (Windows NT {random.randint(6, 10)}.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.4472.{random.randint(100, 200)} Safari/537.36"
    )
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(search_url)
    
    time.sleep(20)  # Wait for the page to load, adjust if needed
    
    return driver

def get_first_search_result(url, model):
    query = extract_domain(url) + "+" + model + "english"
    driver = search_google(query)
    
    # Find the first search result using the href attribute
    try:
        first_result = driver.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        link = first_result.get_attribute('href')  # Extract the href attribute (the URL)
        return link
    except Exception as e:
        return f"Error: {e}"
    finally:
        driver.quit()  # Close the browser when done

# Example usage
first_result = get_first_search_result(url, model)
print(first_result)


https://www.komatsu.com/en/products/excavators/small-excavators/pc30mr-5/


# Scrape Info

In [28]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import pandas as pd

url = "https://www.komatsu.com/en/products/excavators/"
model = "PC30MR-5"

def extract_domain(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc

def search_google(query):
    search_url = f"https://www.google.com/search?q={query}"
    
    # Set up the Chrome WebDriver
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # Run in headless mode (without opening a browser window)
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(search_url)
    
    # Wait for results to load (adjust time if necessary)
    time.sleep(20)  # Wait for the page to load, adjust if needed
    
    return driver

def get_first_search_result(url, model):
    query = extract_domain(url) + "+" + model + "english"
    driver = search_google(query)
    
    # Find the first search result using the href attribute
    try:
        first_result = driver.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        link = first_result.get_attribute('href')  # Extract the href attribute (the URL)
        return link
    except Exception as e:
        return f"Error: {e}"
    finally:
        driver.quit()  # Close the browser when done

# Example usage
first_result = get_first_search_result(url, model)


# Use Selenium to extract the table from the first_result URL
options = webdriver.ChromeOptions()
# options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(first_result)


# Extract the table using the specified class name
try:
    # Extract the table containing details using its class name from the first result page
    detail_table = driver.find_element(By.CSS_SELECTOR, "table-wrapper")
    rows = detail_table.find_elements(By.TAG_NAME, "tr")

    # Extract header and data from each row
    flattened_details = []
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) > 1:  # Ignore rows that don't have data
            flattened_details.append((cells[1].text))  # Append a tuple with two values

    if not flattened_details:
        flattened_details = ["Details not found"]
except Exception as e:
    flattened_details = ["Error: " + str(e)]

finally:
    driver.quit()

display(flattened_details)


['Error: Message: no such element: Unable to locate element: {"method":"css selector","selector":"table-wrapper"}\n  (Session info: chrome=132.0.6834.84); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception\nStacktrace:\n0   chromedriver                        0x00000001051777a4 cxxbridge1$str$ptr + 2589716\n1   chromedriver                        0x000000010517005c cxxbridge1$str$ptr + 2559180\n2   chromedriver                        0x0000000104d13f5c cxxbridge1$string$len + 88260\n3   chromedriver                        0x0000000104d59024 cxxbridge1$string$len + 371084\n4   chromedriver                        0x0000000104d931a8 cxxbridge1$string$len + 609040\n5   chromedriver                        0x0000000104d4ce40 cxxbridge1$string$len + 321448\n6   chromedriver                        0x0000000104d4da88 cxxbridge1$string$len + 324592\n7   chromedriver                        0x000000010514

# Scrape info (with link given)

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import pandas as pd

# Example usage
first_result = "https://www.komatsu.com/en/products/excavators/small-excavators/pc30mr-5/"


# Use Selenium to extract the table from the first_result URL
options = webdriver.ChromeOptions()
# options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(first_result)


# Extract the table using the specified class name
try:
    # Extract the table containing details using its class name from the first result page
    detail_table = driver.find_element(By.CSS_SELECTOR, "v-tab-content-container")
    rows = detail_table.find_elements(By.TAG_NAME, "tr")

    # Extract header and data from each row
    flattened_details = []
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) > 1:  # Ignore rows that don't have data
            flattened_details.append((cells[1].text))  # Append a tuple with two values

    if not flattened_details:
        flattened_details = ["Details not found"]
except Exception as e:
    flattened_details = ["Error: " + str(e)]

finally:
    driver.quit()

display(flattened_details)


ValueError: Shape of passed values is (1, 1), indices imply (1, 3)