# In [None]:  (Jupyter cell marker left over from the notebook export)
import random
import time
from urllib.parse import quote, urlparse

import openpyxl
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from tqdm import tqdm  # Import tqdm for progress bar
from webdriver_manager.chrome import ChromeDriverManager

# Pull the host part out of a full URL
def extract_domain(url):
    """Return the ``netloc`` component of *url* as parsed by ``urlparse``.

    A URL without a scheme (e.g. ``"example.com/path"``) has no netloc, so an
    empty string is returned for it.
    """
    return urlparse(url).netloc

# Function to search Google
def search_google(query):
    """Open a headless Chrome session on the Google results page for *query*.

    Args:
        query: Search terms. A literal ``+`` is kept as the word separator;
            every other reserved character is percent-encoded.

    Returns:
        The ``webdriver.Chrome`` instance with the results page loaded. The
        caller owns the browser and is responsible for calling ``quit()``.

    Raises:
        Any exception raised while starting Chrome or loading the page. If the
        page load fails, the browser is closed before the exception is
        re-raised.
    """
    # Encode the query so characters such as '&', '#' or spaces in a model
    # name cannot truncate or alter the search; '+' stays as the separator
    # that callers already put between terms.
    search_url = f"https://www.google.com/search?q={quote(query, safe='+')}"

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    # Randomise the advertised user agent on every call.
    options.add_argument(f"user-agent=Mozilla/5.0 (Windows NT {random.randint(6, 10)}.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.4472.{random.randint(100, 200)} Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(search_url)
    except Exception:
        # The caller never receives the driver on failure, so close it here
        # instead of leaving an orphaned Chrome process behind.
        driver.quit()
        raise
    return driver

# Pause for a random interval to mimic human behavior
def random_delay(min_seconds=1, max_seconds=3):
    """Sleep for a duration drawn uniformly between the two bounds.

    Args:
        min_seconds: One bound of the sleep duration, in seconds.
        max_seconds: The other bound of the sleep duration, in seconds.
    """
    pause = random.uniform(min_seconds, max_seconds)
    time.sleep(pause)

def get_first_search_result(url, model):
    """Return the link of the first Google result for a site/model pair.

    The search terms are the domain of *url* and *model*, joined with ``+``.

    Args:
        url: URL whose domain is used as the first search term.
        model: Model name used as the second search term.

    Returns:
        The ``href`` of the first element matching ``a[jsname="UWckNb"]``, or
        a string of the form ``"Error: <exception>"`` if the lookup fails.
        The browser opened for the search is always closed.
    """
    search_terms = "+".join([extract_domain(url), model])
    browser = search_google(search_terms)

    try:
        anchor = browser.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        return anchor.get_attribute('href')
    except Exception as exc:
        # Failures are reported through the return value, not raised.
        return f"Error: {exc}"
    finally:
        browser.quit()

# Main function to scrape excavator data
def scrape_excavator_data():
    """Scrape excavator models from the listing page and save them to Excel.

    For every model name found on the listing page, the first Google result
    for "<listing domain>+<model>" is fetched and its second HTML table is
    flattened into one spreadsheet row. The "load more" button is clicked
    until it disappears, stops yielding new models, or an error occurs.
    Whatever has been collected up to that point is written to
    ``excavator_data_with_details.xlsx`` in the current working directory.

    Each model name is processed once, even though the listing is re-read
    after every "load more" click.
    """
    url = "https://asia.develon-ce.com/en/equipment/excavators/"
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # try/finally so the browser is closed even if loading the page or saving
    # the workbook raises.
    try:
        driver.get(url)

        # Create Excel workbook
        workbook = openpyxl.Workbook()
        table = workbook.active
        table.title = "Excavator Data"
        table.append(["Model"] + [f"Detail {i}" for i in range(1, 20)])  # Header row (adjust number of details as needed)

        seen_models = set()       # model names already written to the sheet
        load_more_clicked = False

        while True:
            try:
                model_elements = driver.find_elements(By.CSS_SELECTOR, ".product_name.en_plain_b")

                # Read each name once and keep only models not handled in an
                # earlier pass; the listing is re-queried in full after every
                # "load more" click, so without this rows would be duplicated.
                new_models = []
                for element in model_elements:
                    name = element.text
                    if name not in seen_models:
                        seen_models.add(name)
                        new_models.append(name)

                if load_more_clicked and not new_models:
                    # The button is still there but nothing new appeared;
                    # stop instead of clicking forever.
                    print("'Load More' produced no new models. Exiting loop.")
                    break

                for name in tqdm(new_models, total=len(new_models), desc="Scraping Models"):
                    random_delay()
                    first_result = get_first_search_result(url, name)

                    try:
                        tables = pd.read_html(first_result)
                        detail_info = tables[1]  # Assuming details are in the second table
                        flattened_details = detail_info.values.flatten().tolist()  # Flatten DataFrame into a single list
                    except Exception:
                        # Deliberate best effort: a failed search, an
                        # unreachable page or a missing table all yield a
                        # placeholder row rather than aborting the scrape.
                        flattened_details = ["Details not found"]

                    # Append model and flattened details in one row
                    table.append([name] + flattened_details)

                try:
                    load_more_button = driver.find_element(By.CLASS_NAME, "btn_more")
                except NoSuchElementException:
                    print("No 'Load More' button. Exiting loop.")
                    break

                ActionChains(driver).move_to_element(load_more_button).perform()
                random_delay()
                load_more_button.click()
                load_more_clicked = True
                random_delay(2, 4)  # Longer delay for content load

            except Exception as e:
                # Stop on any unexpected failure, but still save what was
                # collected so far.
                print(f"Error: {e}")
                break

        file_name = "excavator_data_with_details.xlsx"
        workbook.save(file_name)
        print(f"Data successfully saved to {file_name}")
    finally:
        driver.quit()

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    scrape_excavator_data()
