# Basic


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
import time
import openpyxl

# Scrape model names and info texts from the excavator listing page into an
# Excel workbook, pressing the "Load More" button until it is no longer found.

# Initialize the WebDriver
options = Options()
b = webdriver.Chrome(options=options)

try:
    # Open the target URL
    url = "https://asia.develon-ce.com/en/equipment/excavators/"
    b.get(url)

    # Create a new Excel workbook and sheet
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    element = b.find_element(By.CLASS_NAME, "en_plain_b")  # Get the WebElement
    # Excel sheet names may not contain \ / * ? : [ ] and are limited to
    # 31 characters, so sanitise the scraped text before using it as a title.
    raw_title = element.text + "data"
    sheet.title = "".join("_" if ch in "\\/*?:[]" else ch for ch in raw_title)[:31]

    # Add headers to the Excel sheet
    sheet.append(["Model", "Info"])

    # Rows already written; each pass re-reads every element on the page, so
    # this prevents the same product being appended once per "Load More" click.
    seen_rows = set()

    # Extract data and insert into the Excel sheet
    while True:
        try:
            # Find elements for model names and info
            model_elements = b.find_elements(By.CSS_SELECTOR, ".product_name.en_plain_b")
            info_elements = b.find_elements(By.CSS_SELECTOR, ".product_info.en_plain_l")

            # The two lists are paired by position, so they must line up
            if len(model_elements) != len(info_elements):
                print("Mismatch in number of model and info elements. Exiting loop.")
                break

            # Add data to the Excel sheet (two columns, matching the header)
            new_rows = 0
            for model, info in zip(model_elements, info_elements):
                row = (model.text, info.text)
                if row in seen_rows:
                    continue
                seen_rows.add(row)
                sheet.append(list(row))
                new_rows += 1

            # A pass that adds nothing after earlier passes did means the
            # button is no longer loading content; stop instead of looping forever.
            if new_rows == 0 and seen_rows:
                print("No new rows after loading more. Exiting loop.")
                break

            # Check if a "Load More" button exists
            try:
                load_more_button = b.find_element(By.CLASS_NAME, "btn_more")
                ActionChains(b).move_to_element(load_more_button).perform()
                time.sleep(0.5)
                load_more_button.click()
                time.sleep(2)  # Wait for content to load
            except NoSuchElementException:
                print("No 'Load More' button found. Exiting loop.")
                break  # Exit loop if no "Load More" button is found
        except Exception as e:
            # Best effort: keep whatever was collected so far and save it below
            print(f"Error: {e}")
            break

    # Save the workbook to an Excel file
    file_name = "excavator_data.xlsx"
    workbook.save(file_name)
    print(f"Data successfully saved to {file_name}")
finally:
    # Close the WebDriver even if something above raised
    b.quit()


# Type1

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import pandas as pd
import openpyxl

# Function to extract domain from URL
def extract_domain(url):
    """Return the network-location component (host and optional port) of *url*.

    An empty string is returned when *url* has no ``//`` authority section.
    """
    return urlparse(url).netloc

# Function to search Google
def search_google(query):
    """Open a Google search for *query* in a new Chrome window.

    Args:
        query: Search terms; ``+`` may be used as the word separator.

    Returns:
        The live Chrome WebDriver showing the results page. The caller is
        responsible for calling ``quit()`` on it.

    Raises:
        Any exception raised while navigating; the browser is closed first so
        a failed navigation does not leave a Chrome process behind.
    """
    from urllib.parse import quote

    # Percent-encode the query so characters such as spaces, '&' or '#' cannot
    # truncate or corrupt the URL. '+' stays literal because callers use it to
    # join search terms.
    encoded_query = quote(query, safe="+")
    search_url = f"https://www.google.com/search?q={encoded_query}"

    # Set up the Chrome WebDriver
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # Run in headless mode (without opening a browser window)
    # options.add_argument("--disable-gpu")
    # options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(search_url)
    except Exception:
        driver.quit()
        raise

    # Wait for results to load
    time.sleep(3)  # Wait for the page to load, adjust if needed

    return driver

# Function to get the first search result
def get_first_search_result(url, model):
    """Search Google for ``<domain of url>+<model>`` and return the first result link.

    Returns the ``href`` of the first ``a[jsname="UWckNb"]`` element, or a
    string of the form ``"Error: ..."`` if locating or reading it fails. The
    browser opened for the search is always closed before returning.
    """
    search_terms = "+".join((extract_domain(url), model))
    driver = search_google(search_terms)

    try:
        # The result anchor is located through its jsname attribute
        anchor = driver.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        return anchor.get_attribute('href')
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Close the browser when done
        driver.quit()

# Imported here so this cell also runs on its own: the loop below catches
# NoSuchElementException, which this cell's import list does not provide.
from selenium.common.exceptions import NoSuchElementException

# Scrape model names and info texts from the excavator listing page, look up
# each model through get_first_search_result, and store any tables found on
# the resulting page as text in a third "Details" column.

# Initialize the WebDriver for the first website
options = Options()
b = webdriver.Chrome(options=options)

try:
    # Open the target URL
    url = "https://asia.develon-ce.com/en/equipment/excavators/"
    b.get(url)

    # Create a new Excel workbook and sheet
    workbook = openpyxl.Workbook()
    table = workbook.active
    element = b.find_element(By.CLASS_NAME, "en_plain_b")  # Get the WebElement
    # Excel sheet names may not contain \ / * ? : [ ] and are limited to
    # 31 characters, so sanitise the scraped text before using it as a title.
    raw_title = element.text + "data"
    table.title = "".join("_" if ch in "\\/*?:[]" else ch for ch in raw_title)[:31]

    # Add headers to the Excel sheet
    table.append(["Model", "Info", "Details"])

    # Rows already handled; each pass re-reads every element on the page, so
    # this avoids duplicate rows and repeated searches after "Load More".
    seen_rows = set()

    # Extract data and insert into the Excel sheet
    while True:
        try:
            # Find elements for model names and info
            model_elements = b.find_elements(By.CSS_SELECTOR, ".product_name.en_plain_b")
            info_elements = b.find_elements(By.CSS_SELECTOR, ".product_info.en_plain_l")

            # The two lists are paired by position, so they must line up
            if len(model_elements) != len(info_elements):
                print("Mismatch in number of model and info elements. Exiting loop.")
                break

            new_rows = 0
            for model, info in zip(model_elements, info_elements):
                # Read each text once; every .text access is a WebDriver round trip
                model_name = model.text
                info_text = info.text
                row_key = (model_name, info_text)
                if row_key in seen_rows:
                    continue
                seen_rows.add(row_key)
                new_rows += 1

                # Get the first search result for the model
                first_result = get_first_search_result(url, model_name)

                # Extract details from the first result. The helper reports
                # failures as "Error: ..." strings, which are not worth fetching.
                details = "Details not found"
                if isinstance(first_result, str) and not first_result.startswith("Error:"):
                    try:
                        tables = pd.read_html(first_result)
                        # Combine up to the first two tables side by side, then transpose
                        combined_table = pd.concat(tables[:2], axis=1)
                        # openpyxl cannot store a DataFrame in a cell, so keep its text form
                        details = combined_table.T.to_string()
                    except Exception as e:
                        print(f"Could not read details for {model_name}: {e}")

                # Append model, info, and details to the sheet
                table.append([model_name, info_text, details])

            # A pass that adds nothing after earlier passes did means the
            # button is no longer loading content; stop instead of looping forever.
            if new_rows == 0 and seen_rows:
                print("No new rows after loading more. Exiting loop.")
                break

            # Check if a "Load More" button exists
            try:
                load_more_button = b.find_element(By.CLASS_NAME, "btn_more")
                ActionChains(b).move_to_element(load_more_button).perform()
                time.sleep(0.5)
                load_more_button.click()
                time.sleep(2)  # Wait for content to load
            except NoSuchElementException:
                print("No 'Load More' button found. Exiting loop.")
                break  # Exit loop if no "Load More" button is found
        except Exception as e:
            # Best effort: keep whatever was collected so far and save it below
            print(f"Error: {e}")
            break

    # Save the workbook to an Excel file
    file_name = "excavator_data_with_details.xlsx"
    workbook.save(file_name)
    print(f"Data successfully saved to {file_name}")
finally:
    # Close the WebDriver even if something above raised
    b.quit()

# TEST

import time
from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import WebDriverException
from urllib.parse import urlparse
import pandas as pd
import openpyxl

import os

# Userinfo ("user:password") embedded in the remote WebDriver URL below.
# SECURITY: the fallback literal is a credential committed to source control.
# Rotate it and supply the value through the BRD_AUTH environment variable;
# the literal is kept only so existing runs continue to work unchanged.
AUTH = os.environ.get('BRD_AUTH', 'brd-customer-hl_e5bd583b-zone-ai_sraper:n1arzv61qr2a')
# Command-executor URL handed to selenium's Remote() by search_google.
SBR_WEBDRIVER = f'https://{AUTH}@brd.superproxy.io:9515'

# Function to extract domain from URL
def extract_domain(url):
    """Return the ``netloc`` part of *url* (empty string if it has none)."""
    _scheme, netloc, *_rest = urlparse(url)
    return netloc

# Function to search Google
def search_google(query):
    """Load the Google results page for *query* via the remote WebDriver.

    Args:
        query: Search terms; ``+`` may be used as the word separator.

    Returns:
        The page source (HTML string) of the results page, or ``None`` if a
        ``WebDriverException`` occurred. The remote session is always closed
        before returning.
    """
    from urllib.parse import quote

    # Percent-encode the query so characters such as spaces, '&' or '#' cannot
    # truncate or corrupt the URL. '+' stays literal because callers use it to
    # join search terms.
    search_url = "https://www.google.com/search?q=" + quote(query, safe="+")

    # Set up Chrome options
    options = ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    # NOTE(review): this passes the part of AUTH before ':' (the user name) as
    # the proxy host, which looks unintended -- confirm before relying on it.
    options.add_argument(f"--proxy-server=http://{AUTH.split(':')[0]}")  # Use proxy

    driver = None  # Stays None if the session could not be created

    try:
        # Create a Remote WebDriver instance
        driver = Remote(command_executor=SBR_WEBDRIVER, options=options)
        driver.get(search_url)

        # Wait for the page to load and return the HTML
        time.sleep(3)  # You can use WebDriverWait for better control
        return driver.page_source

    except WebDriverException as e:
        print(f"Error during WebDriver execution: {e}")
        return None

    finally:
        # Ensure the driver is quit properly; a failure here must not mask the result
        if driver:
            try:
                driver.quit()
            except Exception as e:
                print(f"Error while quitting WebDriver: {e}")

# Function to get the first search result
def _first_result_href(page_html):
    """Return the href of the first ``<a jsname="UWckNb">`` in *page_html*, or None."""
    from html.parser import HTMLParser
    from urllib.parse import urljoin

    class _AnchorFinder(HTMLParser):
        """Remember the href of the first matching anchor start tag."""

        def __init__(self):
            super().__init__()
            self.href = None

        def handle_starttag(self, tag, attrs):
            if self.href is not None or tag != "a":
                return
            attr_map = dict(attrs)
            if attr_map.get("jsname") == "UWckNb" and attr_map.get("href"):
                self.href = attr_map["href"]

    finder = _AnchorFinder()
    finder.feed(page_html)
    if finder.href is None:
        return None
    # Resolve relative links against the search host; absolute links are unchanged
    return urljoin("https://www.google.com/", finder.href)


def get_first_search_result(url, model):
    """Search Google for ``<domain of url>+<model>`` and return the first result link.

    ``search_google`` in this cell returns the results page as an HTML string
    (or ``None`` on failure), so the link is extracted by parsing that HTML
    rather than by calling WebDriver methods on it. A driver-like return value
    is still supported for callers that pair this function with a
    ``search_google`` returning a live driver.

    Returns:
        The link URL, or a string of the form ``"Error: ..."`` on failure.
    """
    query = extract_domain(url) + "+" + model
    result = search_google(query)

    if result is None:
        return "Error: Could not complete Google search."

    if isinstance(result, str):
        try:
            link = _first_result_href(result)
        except Exception as e:
            return f"Error: {e}"
        if link is None:
            return "Error: No search result link found."
        return link

    # Driver-like object: locate the anchor through the browser and close it afterwards
    try:
        first_result = result.find_element(By.CSS_SELECTOR, 'a[jsname="UWckNb"]')
        return first_result.get_attribute('href')  # Extract the href attribute (the URL)
    except Exception as e:
        return f"Error: {e}"
    finally:
        result.quit()  # Ensure the driver is properly closed



# Imported here so this cell also runs on its own: it uses webdriver.Chrome
# and catches NoSuchElementException, neither of which this cell's import
# list provides.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Scrape model names and info texts from the excavator listing page, look up
# each model through get_first_search_result, and store any tables found on
# the resulting page as text in a third "Details" column.

# Initialize the WebDriver for the first website
options = Options()
b = webdriver.Chrome(options=options)

try:
    # Open the target URL
    url = "https://asia.develon-ce.com/en/equipment/excavators/"
    b.get(url)

    # Create a new Excel workbook and sheet
    workbook = openpyxl.Workbook()
    table = workbook.active
    element = b.find_element(By.CLASS_NAME, "en_plain_b")  # Get the WebElement
    # Excel sheet names may not contain \ / * ? : [ ] and are limited to
    # 31 characters, so sanitise the scraped text before using it as a title.
    raw_title = element.text + "data"
    table.title = "".join("_" if ch in "\\/*?:[]" else ch for ch in raw_title)[:31]

    # Add headers to the Excel sheet
    table.append(["Model", "Info", "Details"])

    # Rows already handled; each pass re-reads every element on the page, so
    # this avoids duplicate rows and repeated searches after "Load More".
    seen_rows = set()

    # Extract data and insert into the Excel sheet
    while True:
        try:
            # Find elements for model names and info
            model_elements = b.find_elements(By.CSS_SELECTOR, ".product_name.en_plain_b")
            info_elements = b.find_elements(By.CSS_SELECTOR, ".product_info.en_plain_l")

            # The two lists are paired by position, so they must line up
            if len(model_elements) != len(info_elements):
                print("Mismatch in number of model and info elements. Exiting loop.")
                break

            new_rows = 0
            for model, info in zip(model_elements, info_elements):
                # Read each text once; every .text access is a WebDriver round trip
                model_name = model.text
                info_text = info.text
                row_key = (model_name, info_text)
                if row_key in seen_rows:
                    continue
                seen_rows.add(row_key)
                new_rows += 1

                # Get the first search result for the model
                first_result = get_first_search_result(url, model_name)

                # Extract details from the first result. The helper reports
                # failures as "Error: ..." strings, which are not worth fetching.
                details = "Details not found"
                if isinstance(first_result, str) and not first_result.startswith("Error:"):
                    try:
                        tables = pd.read_html(first_result)
                        # Combine up to the first two tables side by side, then transpose
                        combined_table = pd.concat(tables[:2], axis=1)
                        # openpyxl cannot store a DataFrame in a cell, so keep its text form
                        details = combined_table.T.to_string()
                    except Exception as e:
                        print(f"Could not read details for {model_name}: {e}")

                # Append model, info, and details to the sheet
                table.append([model_name, info_text, details])

            # A pass that adds nothing after earlier passes did means the
            # button is no longer loading content; stop instead of looping forever.
            if new_rows == 0 and seen_rows:
                print("No new rows after loading more. Exiting loop.")
                break

            # Check if a "Load More" button exists
            try:
                load_more_button = b.find_element(By.CLASS_NAME, "btn_more")
                ActionChains(b).move_to_element(load_more_button).perform()
                time.sleep(0.5)
                load_more_button.click()
                time.sleep(2)  # Wait for content to load
            except NoSuchElementException:
                print("No 'Load More' button found. Exiting loop.")
                break  # Exit loop if no "Load More" button is found
        except Exception as e:
            # Best effort: keep whatever was collected so far and save it below
            print(f"Error: {e}")
            break

    # Save the workbook to an Excel file
    file_name = "excavator_data_with_details.xlsx"
    workbook.save(file_name)
    print(f"Data successfully saved to {file_name}")
finally:
    # Close the WebDriver even if something above raised
    b.quit()
