```ruby

               __                 __               
  ______ ____ |  |   ____   ____ |__|__ __  _____  
 /  ___// __ \|  | _/ __ \ /    \|  |  |  \/     \ 
 \___ \\  ___/|  |_\  ___/|   |  \  |  |  /  Y Y  \
/____  >\___  >____/\___  >___|  /__|____/|__|_|  /
     \/     \/          \/     \/               \/ 
```
~ a guide to selenium ____-_🖋

____table of contents
* [imports](#imports)
* [setup webdriver](#setup-webdriver)
* [finding elements](#finding-elements)
* [loop & extract data](#loop--extract-data)
* [wait for elements (optional)](#wait-for-elements-optional)
* [saving stuff](#saving-stuff)
* [navigate pages (pagination)](#navigate-pages-pagination)
* [finish & close browser](#finish--close-browser)
* [exercise one](#exercise-one)
* [exercise two](#exercise-two)
* [exercise two but with pagination](#exercise-two-but-with-pagination)

In [None]:
!pip install selenium requests webdriver-manager pandas

## imports

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By # 'By' is used to locate elements
from selenium.webdriver.chrome.service import Service # 'Service' is used to manage the ChromeDriver
import pandas as pd
import time

## setup webdriver

In [None]:
# Start a Chrome session via a ChromeDriver binary and open a test page.
# The commented-out variants below show the equivalent setup for Edge and for macOS.

# NOTE: for chrome, adjust path if needed
# ("chromedriver.exe" is a relative path -- presumably resolved from the notebook's working directory)
service = Service("chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.get("https://example.com")

# NOTE: for edge
# from selenium.webdriver.edge.service import Service
# service = Service("msedgedriver.exe")
# driver = webdriver.Edge(service=service)

# NOTE: for mac
# service = Service()  # Selenium will try to find chromedriver in your PATH
# driver = webdriver.Chrome(service=service)
# driver.get("https://www.example.com")

## finding elements

### by tag, ID, class and more

In [None]:
# find_element returns a single WebElement; find_elements returns a list of matches.
# Only the first lookup is live -- the rest are commented-out templates with placeholder values.

# Finds the <h1> element by tag name (its WebElement is echoed as the cell output)
driver.find_element(By.TAG_NAME, "h1")

# Finds an element by its id attribute
# driver.find_element(By.ID, "exampleId")

# Finds an element by its class name
# driver.find_element(By.CLASS_NAME, "exampleClass")

# Finds a link element by its link text
# driver.find_element(By.LINK_TEXT, "Example Link")

# Finds a link element by partial link text
# driver.find_element(By.PARTIAL_LINK_TEXT, "Example")

# Finds elements by tag name
# driver.find_elements(By.TAG_NAME, "p")  # Returns a list of all <p> elements

# Finds elements by name
# driver.find_elements(By.NAME, "exampleName")  # Returns a list of all elements with name="exampleName"

<selenium.webdriver.remote.webelement.WebElement (session="928c2ea8efa2e600d7813be1fcf7bedc", element="f.30BA493C03CABC9D03895A97D6D0E550.d.F4A797658EBAD061EB4276A2DBE8115F.e.3")>

### by css selector or xpath

Basic Syntax

| XPath Syntax | Description                                                                 |
|--------------|-----------------------------------------------------------------------------|
| `/`          | Selects from the root node.                                                 |
| `//`         | Selects nodes in the document from the current node that match the selection no matter where they are. |
| `.`          | Selects the current node.                                                   |
| `..`         | Selects the parent of the current node.                                     |
| `@`          | Used to select attributes.                                                  |

In [39]:
# Two equivalent ways to locate the <h1>: a CSS selector and an XPath expression.
# Only the last expression in a cell is echoed, so the output shown is the XPath result.
driver.find_element(By.CSS_SELECTOR, "h1")
driver.find_element(By.XPATH, "//h1")

<selenium.webdriver.remote.webelement.WebElement (session="928c2ea8efa2e600d7813be1fcf7bedc", element="f.30BA493C03CABC9D03895A97D6D0E550.d.F4A797658EBAD061EB4276A2DBE8115F.e.3")>

### find multiple elements

In [None]:
# find_elements (plural) returns a list of every matching element.
# "p" is a tag name, not a CSS class, so it must be located with By.TAG_NAME;
# By.CLASS_NAME would look for class="p" instead.
articles = driver.find_elements(By.TAG_NAME, "p")

## loop & extract data

In [None]:
# Build one [title, date] row per element with class "article" on the current page.
data = []

articles = driver.find_elements(By.CLASS_NAME, "article")
for article in articles:
    # Title comes from the article's <h2>, date from its child with class "date".
    title = article.find_element(By.TAG_NAME, "h2").text.strip()
    date = article.find_element(By.CLASS_NAME, "date").text.strip()
    data.append([title, date])

## wait for elements (optional)

In [None]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Explicit wait: poll with a timeout of 10 (seconds, per the WebDriverWait API) until an
# element with id "someId" is present in the DOM; the located element is stored in `element`.
# "someId" is a placeholder -- replace it with an id that exists on the page being scraped.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "someId"))
)

## saving stuff

In [None]:
# Turn the scraped rows (the `data` list of [title, date] pairs) into a table and write it to disk.
df = pd.DataFrame(data, columns=["Title", "Date"])
df.to_csv("output.csv", index=False) # Save as CSV
# NOTE: writing .xlsx requires an Excel engine such as openpyxl to be installed
df.to_excel("output.xlsx", index=False) # Save as Excel

# Other export options (commented-out templates; `connection` and `product_list_df` are not defined in this cell):
# df.to_json("output.json", orient="records", lines=True) # Save as JSON
# df.to_html("output.html", index=False) # Save as HTML
# df.to_sql("table_name", con=connection, if_exists="replace", index=False) # Save to SQL database
# product_list_df.to_csv('study_desks.txt', index=False, sep='\t') # to txt, tab-separated

## navigate pages (pagination)

In [None]:
# Visit pages 1-3 by URL and append each page's [title, date] rows to the existing `data` list.
for page in range(1, 4):
    url = f"https://example.com/page/{page}"
    driver.get(url)
    time.sleep(2)  # fixed pause so the page has time to load

    articles = driver.find_elements(By.CLASS_NAME, "article")
    for article in articles:
        # Same extraction as the single-page loop: <h2> text and the "date" element's text.
        title = article.find_element(By.TAG_NAME, "h2").text.strip()
        date = article.find_element(By.CLASS_NAME, "date").text.strip()
        data.append([title, date])

## finish & close browser

In [None]:
# End the WebDriver session when finished; `driver` cannot be reused after this call.
driver.quit()

## exercise one

In [46]:
import requests
import pandas as pd
import time

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Exercise one: scrape title and author for every entry on the NYT combined
# print & e-book fiction best-seller list, print the table and save it to books.csv.
url = "https://www.nytimes.com/books/best-sellers/combined-print-and-e-book-fiction/"

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

book_list = []
try:
    driver.get(url)
    time.sleep(5)  # fixed pause so the page has time to render

    # parse and extract html
    # NOTE(review): the css-* class names look auto-generated and may change;
    # re-inspect the page if zero books are found.
    books = driver.find_elements(By.CSS_SELECTOR, "li.css-sggj6j")
    print(f"Found {len(books)} books")

    for book in books:
        try:
            title = book.find_element(By.CSS_SELECTOR, "h3.css-2jegzb").text
            author = book.find_element(By.CSS_SELECTOR, "p.css-1aaqvca").text
        except NoSuchElementException as e:
            # Skip entries missing a title or author instead of aborting the whole run.
            print("Element not found in one of the entries:", e)
            continue

        # Appended directly rather than via a variable named `data`, which would
        # overwrite the `data` list used by the earlier cells of this guide.
        book_list.append({"title": title, "author": author})
finally:
    # Always close the browser, even if navigation or extraction raised.
    driver.quit()

# Convert the list of dictionaries to a DataFrame
book_list_df = pd.DataFrame(book_list)
print(book_list_df)

# Save the DataFrame to a CSV file
book_list_df.to_csv("books.csv", index=False)


Found 15 books
                          title                 author
0        A CURSE CARVED IN BONE  by Danielle L. Jensen
1                    THE TENANT     by Freida McFadden
2       THE EMPEROR OF GLADNESS         by Ocean Vuong
3      GREAT BIG BEAUTIFUL LIFE         by Emily Henry
4              CAN'T GET ENOUGH        by Kennedy Ryan
5             ONE GOLDEN SUMMER      by Carley Fortune
6                    THE DEVILS     by Joe Abercrombie
7                   FEVER BEACH        by Carl Hiaasen
8   REMARKABLY BRIGHT CREATURES     by Shelby Van Pelt
9            SHIELD OF SPARROWS        by Devney Perry
10                   MY FRIENDS     by Fredrik Backman
11                   ONYX STORM      by Rebecca Yarros
12          MARBLE HALL MURDERS    by Anthony Horowitz
13                  FOURTH WING      by Rebecca Yarros
14          I HOPE YOU REMEMBER         by Josie Balka


## exercise two

In [51]:
import requests
import pandas as pd
import time

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Exercise two: scrape name and current price for every product on the first
# listing page, print the results and save them to furniture_products.csv.
url = "https://www.courts.com.sg/furniture/furniture/study-desks"

# Launch Chrome with a driver binary supplied by webdriver-manager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

try:
    driver.get(url)
    time.sleep(5)  # fixed pause so the listing has time to load

    # Each product is an <li> whose class attribute is exactly this string
    products = driver.find_elements(By.CSS_SELECTOR, "li[class = 'item product product-item']")
    print(f"Found {len(products)} products")

    product_list = []
    for product in products:
        # Name: text of the product-name heading, or "N/A" when the heading is missing
        try:
            product_name = product.find_element(
                By.CSS_SELECTOR, "h3[class = 'product name product-item-name']"
            ).text.strip()
        except NoSuchElementException:
            product_name = "N/A"

        # Price: prefer the price nested inside the special-price span; otherwise
        # fall back to the first plain price span; "N/A" when neither exists
        try:
            special_price_element = product.find_element(By.CSS_SELECTOR, "span[class = 'special-price']")
            current_price = special_price_element.find_element(
                By.CSS_SELECTOR, "span[class = 'price']"
            ).text.strip()
        except NoSuchElementException:
            try:
                current_price = product.find_element(
                    By.CSS_SELECTOR, "span[class = 'price']"
                ).text.strip()
            except NoSuchElementException:
                current_price = "N/A"

        product_list.append({'name': product_name, 'price': current_price})

    # Show the rows one per line
    print("\nExtracted Product Information:")
    for item in product_list:
        print(f"Name: {item['name']}, Price: {item['price']}")

    # Tabular view of the same rows
    df = pd.DataFrame(product_list)
    print("\nPandas DataFrame:")
    print(df)

    # Persist to CSV without the index column
    df.to_csv("furniture_products.csv", index=False)
    print("\nData saved to 'furniture_products.csv'")

except Exception as e:
    # Report any failure; the browser is still closed below
    print(f"An error occurred: {e}")

finally:
    driver.quit()

Found 16 products

Extracted Product Information:
Name: INDEX AGENT OFFICE CHAIR (HIGH BACK), Price: S$199.00
Name: COOLERMASTER CMI-GCR2C-GY CALIBER R2C GAMING CHAIR WITH COOL IN TECH, Price: S$349.00
Name: COOLERMASTER CMI-GCX1C-GY CALIBER X1C GAMING CHAIR WITH COOL IN TECH, Price: S$399.00
Name: ORTHO BACK BACK SUPPORT - ASSORTED COLOUR, Price: S$77.00
Name: JOURNALIST OFFICE CHAIR, Price: S$89.00
Name: INDEX TEMPO MESH OFFICE CHAIR, Price: S$129.00
Name: MORGEN MESH CHAIR (BLACK), Price: S$139.00
Name: MORGEN MESH CHAIR (BLUE), Price: S$139.00
Name: HEALING ORTHO BACK FOLDING CHAIR, Price: S$148.00
Name: CURVO HOME OFFICE CHAIR BLACK MID BACK CHAIR, Price: S$198.00
Name: CURVO HOME OFFICE CHAIR GREY MID BACK CHAIR, Price: S$198.00
Name: INDEX EXECUTIVE HIGH BACK OFFICE CHAIR, Price: S$199.00
Name: CELLO MID BACK STUDY CHAIR - BLACK, Price: S$228.00
Name: MOLLER HI BACK DIRECTOR MESH CHAIR, Price: S$228.00
Name: INDEX GRIS OFFICE CHAIR, Price: S$229.00
Name: INDEX FORUMER HIGH BACK 

## exercise two but with pagination

### with page buttons (1, 2, 3)

In [4]:
import requests
import pandas as pd
import time

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Exercise two with pagination: walk the listing pages via the `p` query
# parameter, collect name and price per product, and save everything to one CSV.
url = "https://www.courts.com.sg/furniture/furniture/study-desks"

# --- USER CONFIGURATION ---
# Upper bound on the number of listing pages to visit (1 = first page only,
# 5 = pages 1 through 5). Scraping stops earlier if a page has no products.
PAGES_TO_SCRAPE = 3 # <--- CHANGE THIS VALUE TO YOUR DESIRED NUMBER OF PAGES
# --------------------------

# Launch Chrome with a driver binary supplied by webdriver-manager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

all_product_data = []  # rows accumulated across every page
page_number = 1  # pages are numbered from 1

try:
    while page_number <= PAGES_TO_SCRAPE:
        print(f"\n--- Scraping Page {page_number} ---")
        current_url = f"{url}?p={page_number}"
        print(f"Navigating to: {current_url}")
        driver.get(current_url)
        time.sleep(5)  # fixed pause so the listing has time to load

        products = driver.find_elements(By.CSS_SELECTOR, "li[class = 'item product product-item']")
        print(f"Found {len(products)} products on this page")

        # An empty page means the listing has run out before the page limit
        if not products:
            print("No more products found on this page. Stopping scraping.")
            break

        product_list = []  # rows for this page only
        for product in products:
            # Name: text of the product-name heading, or "N/A" when it is missing
            try:
                product_name = product.find_element(
                    By.CSS_SELECTOR, "h3[class = 'product name product-item-name']"
                ).text.strip()
            except NoSuchElementException:
                product_name = "N/A"

            # Price: special price if present, else the first plain price, else "N/A"
            try:
                special_price_element = product.find_element(By.CSS_SELECTOR, "span[class = 'special-price']")
                current_price = special_price_element.find_element(
                    By.CSS_SELECTOR, "span[class = 'price']"
                ).text.strip()
            except NoSuchElementException:
                try:
                    current_price = product.find_element(
                        By.CSS_SELECTOR, "span[class = 'price']"
                    ).text.strip()
                except NoSuchElementException:
                    current_price = "N/A"

            product_list.append({'name': product_name, 'price': current_price})

        # Merge the page into the overall result only once the page is fully parsed
        all_product_data.extend(product_list)

        page_number += 1

    # page_number only exceeds the limit when the loop ran out of pages to request
    if page_number > PAGES_TO_SCRAPE:
        print(f"\nScraping completed. Reached the specified limit of {PAGES_TO_SCRAPE} pages.")


    print(f"\nTotal products extracted: {len(all_product_data)}")

    # Tabular view of all collected rows
    df = pd.DataFrame(all_product_data)
    print("\nPandas DataFrame:")
    print(df)

    # Persist to 'all_furniture_products_pp.csv' without the index column
    df.to_csv("all_furniture_products_pp.csv", index=False)
    print("\nAll data saved to 'all_furniture_products_pp.csv'")

except Exception as e:
    # Report any failure; the browser is still closed below
    print(f"An error occurred: {e}")

finally:
    driver.quit()



--- Scraping Page 1 ---
Navigating to: https://www.courts.com.sg/furniture/furniture/study-desks?p=1
Found 16 products on this page

--- Scraping Page 2 ---
Navigating to: https://www.courts.com.sg/furniture/furniture/study-desks?p=2
Found 16 products on this page

--- Scraping Page 3 ---
Navigating to: https://www.courts.com.sg/furniture/furniture/study-desks?p=3
Found 14 products on this page

Scraping completed. Reached the specified limit of 3 pages.

Total products extracted: 46

Pandas DataFrame:
                                                 name       price
0                INDEX AGENT OFFICE CHAIR (HIGH BACK)    S$199.00
1   COOLERMASTER CMI-GCR2C-GY CALIBER R2C GAMING C...    S$349.00
2   COOLERMASTER CMI-GCX1C-GY CALIBER X1C GAMING C...    S$399.00
3           ORTHO BACK BACK SUPPORT - ASSORTED COLOUR     S$77.00
4                             JOURNALIST OFFICE CHAIR     S$89.00
5                       INDEX TEMPO MESH OFFICE CHAIR    S$129.00
6                           M

### manipulating url

In [53]:
import requests
import pandas as pd
import time

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Scrape the first `num_pages_to_scrape` listing pages by appending ?p=<page>
# to the category URL, then save every product's name and price to one CSV.
url = "https://www.courts.com.sg/furniture/furniture/study-desks"
num_pages_to_scrape = 3

# Setup Chrome WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

all_product_data = []

try:
    for page_number in range(1, num_pages_to_scrape + 1):
        print(f"\n--- Scraping Page {page_number} ---")
        current_url = f"{url}?p={page_number}"
        print(f"Navigating to: {current_url}")
        driver.get(current_url)
        time.sleep(5)  # Allow time for the page to load

        # Find all product list items on the current page
        products = driver.find_elements(By.CSS_SELECTOR, "li[class = 'item product product-item']")
        print(f"Found {len(products)} products on this page")

        product_list = []
        for product in products:
            try:
                # Extract product name
                product_name_element = product.find_element(By.CSS_SELECTOR, "h3[class = 'product name product-item-name']")
                product_name = product_name_element.text.strip()
            except NoSuchElementException:
                product_name = "N/A"

            try:
                # Prefer the price nested inside the special-price span
                special_price_element = product.find_element(By.CSS_SELECTOR, "span[class = 'special-price']")
                current_price_element = special_price_element.find_element(By.CSS_SELECTOR, "span[class = 'price']")
                current_price = current_price_element.text.strip()
            except NoSuchElementException:
                try:
                    # If no special price, get the regular price
                    regular_price_element = product.find_element(By.CSS_SELECTOR, "span[class = 'price']")
                    current_price = regular_price_element.text.strip()
                except NoSuchElementException:
                    current_price = "N/A"

            product_info = {
                'name': product_name,
                'price': current_price
            }
            product_list.append(product_info)

        all_product_data.extend(product_list)

    # Print the total extracted data
    print(f"\nTotal products extracted: {len(all_product_data)}")

    # Convert the list of dictionaries to a Pandas DataFrame
    df = pd.DataFrame(all_product_data)
    print("\nPandas DataFrame:")
    print(df)

    # Save the DataFrame to a CSV file. The name is derived from the page count so it
    # stays accurate when num_pages_to_scrape changes (3 pages -> the original file name).
    output_file = f"furniture_products_page1_to_{num_pages_to_scrape}.csv"
    df.to_csv(output_file, index=False)
    # f-string: without the prefix the placeholder was printed literally
    print(f"\nData from pages 1 to {num_pages_to_scrape} saved to '{output_file}'")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the browser
    driver.quit()


--- Scraping Page 1 ---
Navigating to: https://www.courts.com.sg/furniture/furniture/study-desks?p=1
Found 16 products on this page

--- Scraping Page 2 ---
Navigating to: https://www.courts.com.sg/furniture/furniture/study-desks?p=2
Found 16 products on this page

--- Scraping Page 3 ---
Navigating to: https://www.courts.com.sg/furniture/furniture/study-desks?p=3
Found 14 products on this page

Total products extracted: 46

Pandas DataFrame:
                                                 name       price
0                INDEX AGENT OFFICE CHAIR (HIGH BACK)    S$199.00
1   COOLERMASTER CMI-GCR2C-GY CALIBER R2C GAMING C...    S$349.00
2   COOLERMASTER CMI-GCX1C-GY CALIBER X1C GAMING C...    S$399.00
3           ORTHO BACK BACK SUPPORT - ASSORTED COLOUR     S$77.00
4                             JOURNALIST OFFICE CHAIR     S$89.00
5                       INDEX TEMPO MESH OFFICE CHAIR    S$129.00
6                           MORGEN MESH CHAIR (BLACK)    S$139.00
7                       