# Develon

## for row in rows:

In [26]:
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import pandas as pd
import openpyxl
from tqdm import tqdm  # For progress bar

# Function to extract domain from URL
def extract_domain(url):
    """Return the network-location (host[:port]) component of *url*.

    The value is whatever ``urlparse`` reports as ``netloc``; it is an empty
    string when *url* carries no ``//`` authority part.
    """
    return urlparse(url).netloc

# Function to perform a Google search and return the driver
def search_google(query):
    """Open a headless Chrome session on the Google results page for *query*.

    Args:
        query: Search terms joined with ``+`` (callers build
            ``"<domain>+<model>"``).

    Returns:
        The live ``webdriver.Chrome`` instance, already navigated to the
        results page. The caller owns it and must call ``quit()``.

    Raises:
        Any exception Selenium raises while starting Chrome or loading the
        page. If the page load fails, the browser is shut down before the
        exception is re-raised.
    """
    from urllib.parse import quote  # local import: only urlparse is imported at the top

    # Percent-encode everything except "+" so characters such as "&", "#" or
    # spaces inside a model name cannot truncate or corrupt the query string.
    search_url = f"https://www.google.com/search?q={quote(query, safe='+')}"

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    # Randomised user-agent string so consecutive searches do not look identical.
    options.add_argument(
        f"user-agent=Mozilla/5.0 (Windows NT {random.randint(6, 10)}.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.4472.{random.randint(100, 200)} Safari/537.36"
    )
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(search_url)
    except Exception:
        # The caller never receives the driver in this case, so close it here
        # instead of leaking a Chrome process.
        driver.quit()
        raise
    return driver

# Function to extract details table from the first search result
def get_details_from_result(url, model):
    """Return the second-column spec values from the model's detail page.

    Searches Google for ``"<domain of url>+<model>"``, follows the first
    result link and reads the spec table found on that page.

    Returns:
        A list with the text of the second ``th`` of every table row that has
        more than one ``th``; ``["Details not found"]`` when reading the table
        raises ``NoSuchElementException``; or ``["Error: <message>"]`` for any
        other failure after the search browser has been opened.
    """
    browser = search_google("+".join((extract_domain(url), model)))

    try:
        # Follow the first organic result of the search page.
        top_hit = browser.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        browser.get(top_hit.get_attribute("href"))

        try:
            spec_table = browser.find_element(By.CSS_SELECTOR, ".equipment-detail_spec-table.excavators1.en_plain_l")
            # Lazily pair each row with its <th> cells, then keep the second one.
            th_groups = (
                tr.find_elements(By.TAG_NAME, "th")
                for tr in spec_table.find_elements(By.TAG_NAME, "tr")
            )
            return [ths[1].text for ths in th_groups if len(ths) > 1]
        except NoSuchElementException:
            return ["Details not found"]
    except Exception as exc:
        return [f"Error: {exc}"]
    finally:
        # Runs on every return path above, so the browser is always closed.
        browser.quit()

# Main function to scrape excavator data
def scrape_excavator_data():
    """Scrape the Develon Asia excavator listing into an Excel workbook.

    Loads the listing page in Chrome and, for every model name not seen
    before, fetches its spec values via ``get_details_from_result`` and
    appends one row (model name followed by the values) to the sheet. After
    each pass the "Load More" button is clicked; the loop ends when the button
    is no longer found or any other error occurs. Whatever was collected is
    then saved to ``develon_excavator_data_combined.xlsx``.
    """
    url = "https://asia.develon-ce.com/en/equipment/excavators/?param1=CIS%2COCEANIA%2CSOUTH_EAST_ASIA%2CINDIA&param2=&param3="

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # try/finally so the browser is shut down even if loading the page or
    # saving the workbook raises.
    try:
        driver.get(url)

        # Create an Excel workbook
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = "Excavator Data"
        sheet.append(["Model"] + [f"Detail {i}" for i in range(1, 21)])  # Adjust number of details as needed

        # Track already scraped models
        scraped_models = set()

        while True:
            try:
                model_elements = driver.find_elements(By.CSS_SELECTOR, ".product_name.en_plain_b")

                for model_element in tqdm(model_elements, desc="Scraping Models"):
                    model_name = model_element.text

                    # Skip cards with no visible name (nothing to search for;
                    # they are retried on the next pass) and models already done.
                    if not model_name or model_name in scraped_models:
                        continue

                    # Get details from search result
                    details = get_details_from_result(url, model_name)
                    sheet.append([model_name] + details)

                    # Mark the model as scraped
                    scraped_models.add(model_name)

                # Handle "Load More" button if available
                try:
                    load_more_button = driver.find_element(By.CLASS_NAME, "btn_more")
                    ActionChains(driver).move_to_element(load_more_button).perform()
                    load_more_button.click()
                except NoSuchElementException:
                    print("No 'Load More' button found. Exiting loop.")
                    break
            except Exception as e:
                # Best effort: stop scraping but still save what was collected.
                print(f"Error: {e}")
                break

        # Save data to an Excel file
        file_name = "develon_excavator_data_combined.xlsx"
        workbook.save(file_name)
        print(f"Data successfully saved to {file_name}")
    finally:
        driver.quit()

# Entry point: start the scrape only when this code runs as the main module.
if __name__ == "__main__":
    scrape_excavator_data()


Scraping Models: 100%|██████████| 22/22 [01:31<00:00,  4.14s/it]
Scraping Models: 100%|██████████| 27/27 [00:24<00:00,  1.10it/s]
Scraping Models: 100%|██████████| 33/33 [00:25<00:00,  1.31it/s]
Scraping Models: 100%|██████████| 39/39 [00:23<00:00,  1.67it/s]
Scraping Models: 100%|██████████| 41/41 [00:08<00:00,  4.93it/s]
Scraping Models: 100%|██████████| 45/45 [00:16<00:00,  2.66it/s] 

No 'Load More' button found. Exiting loop.
Data successfully saved to develon_excavator_data_combined.xlsx





## for row in rows[1:]:

In [29]:
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import pandas as pd
import openpyxl
from tqdm import tqdm  # For progress bar

# Function to extract domain from URL
def extract_domain(url):
    """Return the ``netloc`` part of *url* as parsed by ``urlparse``.

    Yields ``""`` when *url* has no ``//`` authority section.
    """
    return urlparse(url).netloc

# Function to perform a Google search and return the driver
def search_google(query):
    """Start headless Chrome and load the Google results page for *query*.

    Args:
        query: Search terms joined with ``+`` (callers build
            ``"<domain>+<model>"``).

    Returns:
        The ``webdriver.Chrome`` instance positioned on the results page.
        Ownership passes to the caller, who must call ``quit()``.

    Raises:
        Any exception Selenium raises while starting Chrome or loading the
        page. On a failed page load the browser is closed before re-raising.
    """
    from urllib.parse import quote  # local import: only urlparse is imported at the top

    # Keep "+" as the term separator but escape everything else that would
    # otherwise break the query string ("&", "#", spaces, ...).
    search_url = f"https://www.google.com/search?q={quote(query, safe='+')}"

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    # Randomised user-agent string so consecutive searches do not look identical.
    options.add_argument(
        f"user-agent=Mozilla/5.0 (Windows NT {random.randint(6, 10)}.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.4472.{random.randint(100, 200)} Safari/537.36"
    )
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(search_url)
    except Exception:
        # Nobody else holds a reference yet; quit here to avoid a stray Chrome.
        driver.quit()
        raise
    return driver

# Function to extract details table from the first search result
def get_details_from_result(url, model):
    """Return the second-column spec values, ignoring the table's first row.

    Searches Google for ``"<domain of url>+<model>"``, opens the first result
    and reads the spec table on that page.

    Returns:
        A list with the text of the second ``th`` of every row after the
        first that has more than one ``th``; ``["Details not found"]`` when
        reading the table raises ``NoSuchElementException``; or
        ``["Error: <message>"]`` for any other failure after the search
        browser has been opened.
    """
    browser = search_google("+".join((extract_domain(url), model)))

    try:
        # Follow the first organic result of the search page.
        top_hit = browser.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        browser.get(top_hit.get_attribute("href"))

        try:
            spec_table = browser.find_element(By.CSS_SELECTOR, ".equipment-detail_spec-table.excavators1.en_plain_l")
            body_rows = spec_table.find_elements(By.TAG_NAME, "tr")[1:]
            th_groups = (tr.find_elements(By.TAG_NAME, "th") for tr in body_rows)
            return [ths[1].text for ths in th_groups if len(ths) > 1]
        except NoSuchElementException:
            return ["Details not found"]
    except Exception as exc:
        return [f"Error: {exc}"]
    finally:
        # Runs on every return path above, so the browser is always closed.
        browser.quit()

# Main function to scrape excavator data
def scrape_excavator_data():
    """Scrape the Develon Asia excavator listing into an Excel workbook.

    For every model name on the listing page that has not been handled yet,
    the spec values returned by ``get_details_from_result`` are written as one
    row. "Load More" is clicked after each pass until the button is no longer
    found (or another error stops the loop), and the collected rows are saved
    to ``develon_excavator_data_combined(2).xlsx``.
    """
    url = "https://asia.develon-ce.com/en/equipment/excavators/?param1=CIS%2COCEANIA%2CSOUTH_EAST_ASIA%2CINDIA&param2=&param3="

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Guarantee the browser is closed even when page load or saving fails.
    try:
        driver.get(url)

        # Create an Excel workbook
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = "Excavator Data"
        sheet.append(["Model"] + [f"Detail {i}" for i in range(1, 21)])  # Adjust number of details as needed

        # Track already scraped models
        scraped_models = set()

        while True:
            try:
                model_elements = driver.find_elements(By.CSS_SELECTOR, ".product_name.en_plain_b")

                for model_element in tqdm(model_elements, desc="Scraping Models"):
                    model_name = model_element.text

                    # Skip cards with no visible name (nothing to search for;
                    # they are retried on the next pass) and models already done.
                    if not model_name or model_name in scraped_models:
                        continue

                    # Get details from search result
                    details = get_details_from_result(url, model_name)
                    sheet.append([model_name] + details)

                    # Mark the model as scraped
                    scraped_models.add(model_name)

                # Handle "Load More" button if available
                try:
                    load_more_button = driver.find_element(By.CLASS_NAME, "btn_more")
                    ActionChains(driver).move_to_element(load_more_button).perform()
                    load_more_button.click()
                except NoSuchElementException:
                    print("No 'Load More' button found. Exiting loop.")
                    break
            except Exception as e:
                # Best effort: stop scraping but still save what was collected.
                print(f"Error: {e}")
                break

        # Save data to an Excel file
        file_name = "develon_excavator_data_combined(2).xlsx"
        workbook.save(file_name)
        print(f"Data successfully saved to {file_name}")
    finally:
        driver.quit()

# Entry point: start the scrape only when this code runs as the main module.
if __name__ == "__main__":
    scrape_excavator_data()


WebDriverException: Message: Service /Users/jiminpark/.wdm/drivers/chromedriver/mac64/131.0.6778.264/chromedriver-mac-arm64/chromedriver unexpectedly exited. Status code was: -9


## headers.append(cells[0].text)

In [None]:
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import pandas as pd
import openpyxl
from tqdm import tqdm  # For progress bar

# Function to extract domain from URL
def extract_domain(url):
    """Return the host portion (``netloc``) that ``urlparse`` finds in *url*.

    The result is ``""`` if *url* has no ``//`` authority part.
    """
    return urlparse(url).netloc

# Function to perform a Google search and return the driver
def search_google(query):
    """Open a headless Chrome session on the Google results page for *query*.

    Args:
        query: Search terms joined with ``+`` (callers build
            ``"<domain>+<model>"``).

    Returns:
        The live ``webdriver.Chrome`` instance, already navigated to the
        results page. The caller owns it and must call ``quit()``.

    Raises:
        Any exception Selenium raises while starting Chrome or loading the
        page. If the page load fails, the browser is shut down before the
        exception is re-raised.
    """
    from urllib.parse import quote  # local import: only urlparse is imported at the top

    # Percent-encode everything except "+" so characters such as "&", "#" or
    # spaces inside a model name cannot truncate or corrupt the query string.
    search_url = f"https://www.google.com/search?q={quote(query, safe='+')}"

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    # Randomised user-agent string so consecutive searches do not look identical.
    options.add_argument(
        f"user-agent=Mozilla/5.0 (Windows NT {random.randint(6, 10)}.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.4472.{random.randint(100, 200)} Safari/537.36"
    )
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(search_url)
    except Exception:
        # The caller never receives the driver in this case, so close it here
        # instead of leaking a Chrome process.
        driver.quit()
        raise
    return driver

# Function to extract details table from the first search result
def get_details_from_result(url, model):
    """Return ``(headers, details)`` read from the model's spec table.

    Searches Google for ``"<domain of url>+<model>"``, opens the first result
    and walks the spec table on that page, skipping its first row.

    Returns:
        A pair of equally long lists. For each remaining row, the text of its
        first and second ``th`` is used, or ``""`` for both when the row has
        fewer than two ``th`` cells. When reading the table raises
        ``NoSuchElementException`` the pair is
        ``(["Header not found"], ["Details not found"])``; any other failure
        after the search browser has been opened yields
        ``(["Error: <message>"], ["Error: <message>"])``.
    """
    browser = search_google("+".join((extract_domain(url), model)))

    try:
        # Follow the first organic result of the search page.
        top_hit = browser.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        browser.get(top_hit.get_attribute("href"))

        try:
            spec_table = browser.find_element(By.CSS_SELECTOR, ".equipment-detail_spec-table.excavators1.en_plain_l")

            # One (header, detail) pair per row after the first; rows without
            # two cells contribute an empty placeholder pair.
            pairs = []
            for tr in spec_table.find_elements(By.TAG_NAME, "tr")[1:]:
                ths = tr.find_elements(By.TAG_NAME, "th")
                pairs.append((ths[0].text, ths[1].text) if len(ths) > 1 else ("", ""))

            headers = [header for header, _ in pairs]
            details = [detail for _, detail in pairs]
        except NoSuchElementException:
            headers, details = ["Header not found"], ["Details not found"]
    except Exception as exc:
        headers, details = [f"Error: {exc}"], [f"Error: {exc}"]
    finally:
        browser.quit()

    return headers, details


# Main function to scrape excavator data
def scrape_excavator_data():
    """Scrape the Develon Asia excavator listing, with real column headers.

    For every model name on the listing page that has not been handled yet,
    ``get_details_from_result`` supplies a ``(headers, details)`` pair. The
    headers of the first model become the sheet's header row; each model then
    contributes one row of details. "Load More" is clicked after each pass
    (with short random pauses) until the button is no longer found or another
    error stops the loop. The result is saved to
    ``develon_excavator_data_combined.xlsx``.
    """
    import time  # local import: this cell does not import time at the top

    def _pause(min_seconds=1, max_seconds=3):
        """Sleep for a random duration to mimic human pacing."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    url = "https://asia.develon-ce.com/en/equipment/excavators/?param1=CIS%2COCEANIA%2CSOUTH_EAST_ASIA%2CINDIA&param2=&param3="

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Guarantee the browser is closed even when page load or saving fails.
    try:
        driver.get(url)

        # Create an Excel workbook
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = "Excavator Data"

        # Track already scraped models
        scraped_models = set()
        # Explicit flag instead of inspecting sheet.max_row, which reports 1
        # both for an empty sheet and for a sheet holding only the header.
        header_written = False

        while True:
            try:
                model_elements = driver.find_elements(By.CSS_SELECTOR, ".product_name.en_plain_b")

                for model_element in tqdm(model_elements, desc="Scraping Models"):
                    model_name = model_element.text

                    # Skip cards with no visible name (nothing to search for;
                    # they are retried on the next pass) and models already done.
                    if not model_name or model_name in scraped_models:
                        continue

                    # Get headers and details from search result
                    headers, details = get_details_from_result(url, model_name)

                    # Set column headers only once
                    if not header_written:
                        sheet.append(["Model"] + headers)
                        header_written = True

                    # Append model and details
                    sheet.append([model_name] + details)

                    # Mark the model as scraped
                    scraped_models.add(model_name)

                # Handle "Load More" button if available
                try:
                    load_more_button = driver.find_element(By.CLASS_NAME, "btn_more")
                    ActionChains(driver).move_to_element(load_more_button).perform()
                    _pause()
                    load_more_button.click()
                    _pause(2, 4)  # Longer delay for content load
                except NoSuchElementException:
                    print("No 'Load More' button found. Exiting loop.")
                    break
            except Exception as e:
                # Best effort: stop scraping but still save what was collected.
                print(f"Error: {e}")
                break

        # Save data to an Excel file
        file_name = "develon_excavator_data_combined.xlsx"
        workbook.save(file_name)
        print(f"Data successfully saved to {file_name}")
    finally:
        driver.quit()

# Entry point: start the scrape only when this code runs as the main module.
if __name__ == "__main__":
    scrape_excavator_data()


# Komatsu

In [None]:
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from urllib.parse import urlparse
import openpyxl
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from tqdm import tqdm  # Import tqdm for progress bar

# Function to extract domain from URL
def extract_domain(url):
    """Return the network location (host[:port]) of *url*.

    Accepts both full URLs (``"https://www.komatsu.com/en/"``) and bare,
    scheme-less ones (``"komatsu.com"`` or ``"komatsu.com/en/"``). Without
    this fallback ``urlparse`` treats a scheme-less string as a path and
    reports an empty ``netloc``.

    Returns:
        The ``netloc`` string, or ``""`` when none can be determined.
    """
    parsed_url = urlparse(url)
    if not parsed_url.netloc and not parsed_url.scheme:
        # No scheme and no "//": re-parse as a network-path reference so the
        # leading host segment is recognised as the netloc.
        parsed_url = urlparse(f"//{url}")
    return parsed_url.netloc

# Function to search Google
def search_google(query):
    """Start headless Chrome and load the Google results page for *query*.

    Args:
        query: Search terms joined with ``+`` (callers build
            ``"<domain>+<model>"``).

    Returns:
        The ``webdriver.Chrome`` instance positioned on the results page.
        Ownership passes to the caller, who must call ``quit()``.

    Raises:
        Any exception Selenium raises while starting Chrome or loading the
        page. On a failed page load the browser is closed before re-raising.
    """
    from urllib.parse import quote  # local import: only urlparse is imported at the top

    # Keep "+" as the term separator but escape everything else that would
    # otherwise break the query string ("&", "#", spaces, ...).
    search_url = f"https://www.google.com/search?q={quote(query, safe='+')}"

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    # Randomised user-agent string so consecutive searches do not look identical.
    options.add_argument(f"user-agent=Mozilla/5.0 (Windows NT {random.randint(6, 10)}.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.4472.{random.randint(100, 200)} Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(search_url)
    except Exception:
        # Nobody else holds a reference yet; quit here to avoid a stray Chrome.
        driver.quit()
        raise
    return driver

# Add a delay to mimic human behavior
def random_delay(min_seconds=1, max_seconds=3):
    """Block for a random time drawn uniformly between the two bounds (seconds)."""
    pause = random.uniform(min_seconds, max_seconds)
    time.sleep(pause)

def get_first_search_result(url, model):
    """Return the href of the first Google result for ``"<domain>+<model>"``.

    The domain is taken from *url* via ``extract_domain``. On any failure
    after the search browser has been opened, the string
    ``"Error: <message>"`` is returned instead of a link. The browser is
    closed in every case.
    """
    browser = search_google("+".join((extract_domain(url), model)))

    try:
        top_hit = browser.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        return top_hit.get_attribute("href")
    except Exception as exc:
        return f"Error: {exc}"
    finally:
        browser.quit()

# Main function to scrape excavator data
def scrape_excavator_data():
    """Scrape the Komatsu excavator listing into an Excel workbook.

    For every model name on the listing page that has not been handled yet,
    the first Google hit for ``"<site domain>+<model>"`` is fetched with
    ``pandas.read_html`` and the second table on that page is flattened into
    one spreadsheet row. "Load More" is clicked after each pass until the
    button is no longer found or another error stops the loop. The result is
    saved to ``excavator_data_with_details.xlsx``.
    """
    url = "https://www.komatsu.com/en/products/excavators/" #APAC
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Guarantee the browser is closed even when page load or saving fails.
    try:
        driver.get(url)

        # Create Excel workbook
        workbook = openpyxl.Workbook()
        table = workbook.active
        table.title = "Excavator Data"
        table.append(["Model"] + [f"Detail {i}" for i in range(1, 20)])  # Header row (adjust number of details as needed)

        # Models already written; every pass re-reads the whole (growing) list,
        # so without this each "Load More" would duplicate earlier rows.
        scraped_models = set()

        while True:
            try:
                model_elements = driver.find_elements(By.CLASS_NAME, "product-card__name")

                for model in tqdm(model_elements, desc="Scraping Models"):
                    model_name = model.text

                    # Skip cards with no visible name and models already done.
                    if not model_name or model_name in scraped_models:
                        continue

                    random_delay()
                    # Pass the full URL: get_first_search_result derives the
                    # domain from it, and a bare "komatsu.com" parses to an
                    # empty netloc, which left the query without a site name.
                    first_result = get_first_search_result(url, model_name)

                    try:
                        # first_result may be an "Error: ..." string; read_html
                        # then raises and the placeholder below is used.
                        tables = pd.read_html(first_result)
                        detail_info = tables[1]  # Assuming details are in the second table
                        flattened_details = detail_info.values.flatten().tolist()  # Flatten DataFrame into a single list
                    except Exception:
                        flattened_details = ["Details not found"]

                    # Append model and flattened details in one row
                    table.append([model_name] + flattened_details)
                    scraped_models.add(model_name)

                try:
                    # By.CLASS_NAME accepts a single class only; a CSS selector
                    # is required to match an element carrying all three.
                    load_more_button = driver.find_element(
                        By.CSS_SELECTOR,
                        ".action-button.action-button--secondary.product-tiles__load-more",
                    )
                    ActionChains(driver).move_to_element(load_more_button).perform()
                    random_delay()
                    load_more_button.click()
                    random_delay(2, 4)  # Longer delay for content load
                except NoSuchElementException:
                    print("No 'Load More' button. Exiting loop.")
                    break

            except Exception as e:
                # Best effort: stop scraping but still save what was collected.
                print(f"Error: {e}")
                break

        file_name = "excavator_data_with_details.xlsx"
        workbook.save(file_name)
        print(f"Data successfully saved to {file_name}")
    finally:
        driver.quit()

# Entry point: start the scrape only when this code runs as the main module.
if __name__ == "__main__":
    scrape_excavator_data()


# Two-wheeler
