## imports

In [1]:
# Selenium modules for controlling Chrome browser
from selenium.webdriver import Chrome  # For initializing and controlling the Chrome browser
from selenium import webdriver  # Provides access to the webdriver, allowing interaction with web browsers
from selenium.webdriver.chrome.options import Options  # For configuring Chrome browser options (e.g., headless mode)
from selenium.webdriver.chrome.service import Service  # For managing the ChromeDriver service (e.g., starting, stopping)

# Selenium modules for interacting with web elements
from selenium.webdriver.common.by import By  # For locating elements on a webpage (e.g., By.ID, By.XPATH)
from selenium.webdriver.support.ui import Select  # For interacting with <select> HTML elements (dropdowns)
from selenium.webdriver.support.ui import WebDriverWait  # For implementing explicit waits until a condition is met
from selenium.webdriver.support import expected_conditions as EC  # For defining conditions to wait for (e.g., element visibility)

# Other useful libraries
from fake_useragent import UserAgent  # For generating random user agents to mimic different browsers
import time  # For adding delays (e.g., time.sleep) during the script execution
import requests  # For making HTTP requests to interact with websites directly without using a browser
from bs4 import BeautifulSoup  # For parsing and extracting data from HTML content
import pandas as pd 

## browser Automation with Selenium

In [None]:
# Set up Chrome options
options = Options()
ua = UserAgent()
userAgent = ua.random

In [None]:
# Set random user-agent and disable notifications
options.add_argument(f'user-agent={userAgent}')
options.add_argument("--disable-notifications")  # Disable notification popups
options.add_argument('--blink-settings=imagesEnabled=false')  # Disable images to save bandwidth
# options.add_argument("--headless")  # Uncomment to run in headless mode

In [None]:
# Initialize WebDriver with options
driver = webdriver.Chrome(options=options)

In [None]:
# Open the Jarir website
driver.get("https://www.jarir.com/computers-tablets.html")

In [None]:
# List to store the laptop details
laptop_data = []

In [None]:
try:
    # Wait for the page to load and the "English" link to appear
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//a[contains(@href, '/sa-en/computers-tablets.html?country=sa')]"))
    )

    # Find and click the "English" link to switch the language
    english_button = driver.find_element(By.XPATH, "//a[contains(@href, '/sa-en/computers-tablets.html?country=sa')]")
    english_button.click()

    # Wait for the language change to take effect
    time.sleep(3)

    # Wait for the page to load and the specific element to appear
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="viewport"]/div[2]/div[2]'))
    )

    # Scroll the page to load more products
    def scroll_page():
        # Scroll down the page by a small amount, simulating user scrolling
        driver.execute_script("window.scrollBy(0, 1000);")
        time.sleep(3)  # Wait for new products to load

    # Scroll and scrape multiple times
    for _ in range(5):  # Adjust the range to scroll more times, or use a while loop for infinite scroll
        scroll_page()

        # Locate the starting point (specific element) where we want to begin scraping
        start_element = driver.find_element(By.XPATH, '//*[@id="viewport"]/div[2]/div[2]')

        # Find all product elements within this container
        product_elements = start_element.find_elements(By.CLASS_NAME, 'product-title__title')

        # Loop through multiple products and extract data
        for product_name in product_elements:
            try:
                # Extracting product price (adjust XPath based on your actual page structure)
                product_price = product_name.find_element(By.XPATH, '../../..//span[contains(@class,"price_alignment")]//span[2]')

                # Find the product-title__info container that holds the specifications
                product_info_container = product_name.find_element(By.XPATH, '../../..//p[contains(@class,"product-title__info")]')

                # Find all the <span> elements inside the product-title__info container
                info_boxes = product_info_container.find_elements(By.CLASS_NAME, 'product-title__info--box')

                # Assigning each product detail to a variable, with 'null' if not found
                details = {
                    'product_name': product_name.text.strip(),
                    'screen_size': 'null',
                    'processor': 'null',
                    'ram': 'null',
                    'storage': 'null',
                    'os': 'null',
                    'price': 'null'  # Set default price as 'null'
                }

                # Get price if it exists
                try:
                    details['price'] = product_price.text.strip()
                except:
                    details['price'] = 'null'  # If price not found, set it as 'null'

                # Loop through the info_boxes and assign values to the details dictionary
                if len(info_boxes) > 0:
                    details['screen_size'] = info_boxes[0].text.strip()
                if len(info_boxes) > 1:
                    details['processor'] = info_boxes[1].text.strip()
                if len(info_boxes) > 2:
                    details['ram'] = info_boxes[2].text.strip()
                if len(info_boxes) > 3:
                    details['storage'] = info_boxes[3].text.strip()
                if len(info_boxes) > 4:
                    details['os'] = info_boxes[4].text.strip()

                # Append the details to the laptop_data list
                laptop_data.append(details)

            except Exception as e:
                print(f"Error while processing product '{product_name.text}':", e)


except Exception as e:
    print("Error occurred while scraping:", e)

finally:
    # Close the driver after scraping
    driver.quit()

In [3]:
df = pd.DataFrame(laptop_data)

In [4]:
# Print the DataFrame to see the results
df

Unnamed: 0,product_name,screen_size,processor,ram,storage,os,price
0,MSI Titan 18HX Gaming Laptop,"18""",Intel Core i9,128 GB RAM,,,20029
1,MSI Vector 16 HX Gaming Laptop,"16""",Intel Core i9,32 GB RAM,,,9029
2,MSI Raider 18 HX Gaming Laptop,"18""",Intel Core i9,32 GB RAM,,,15029
3,HP OMEN (with Headset) Gaming Laptop,"16.1""",Intel Core i7,32 GB RAM,,,9499
4,Acer Predator Helios 16 Gaming Laptop,"16""",Intel Core i9,32 GB RAM,1 TB SSD,,10029
...,...,...,...,...,...,...,...
235,Huawei MatePad Pro 13.2 Wi-Fi Tablet PC,"13.2""",256 GB,Octa Core,Golden Black,,2949
236,Apple iPad 10.2 9th Gen Tablet - 4G,"10.2""",256 GB,Silver,,,2199
237,Huawei MatePad 11.5 S Tablet - Wi-Fi (with Key...,"11.5""",256 GB,Octa Core,Violet,,1549
238,Huawei MatePad 11.5 S Tablet - Wi-Fi (with Key...,"11.5""",256 GB,Octa Core,Space Grey,,1549


In [5]:
df.shape

(240, 7)

In [6]:
# Optionally, save the DataFrame to a CSV file
df.to_csv('laptop_data.csv', index=False)