In [2]:
# Text Analytics
# Lab Assignment 1 - Web Scraping
# Name: Muhammad Ammar Wafiy & Nur Aisya Safiyyah
# ID: IS01082517 & IS01082522

# Import the necessary libraries
import undetected_chromedriver as uc         # Stealth Chrome driver to avoid detection by Shopee's anti-bot system
from selenium.webdriver.common.by import By  # Provides access to locator strategies like CLASS_NAME, XPATH, etc.
import pandas as pd                          # Used to structure scraped data into DataFrames and save as CSV
import time                                  # Adds delay or pause to ensure pages and all elements are fully loaded

# Function to set up the stealth Chrome driver
def setup_undetected_driver():
    """Create and return an undetected-chromedriver Chrome instance.

    The browser is configured with a fixed set of command-line switches
    (sandbox / shared-memory switches, a 1200x800 window) and a desktop
    Chrome user-agent string so the session looks like a regular user.

    Returns:
        The `uc.Chrome` driver object, ready to navigate.
    """
    # Every switch handed to Chrome, in the order it is applied
    chrome_switches = (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--window-size=1200,800',
        # Custom user-agent so requests resemble a real desktop browser
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.6998.89 Safari/537.36",
    )

    chrome_options = uc.ChromeOptions()
    for switch in chrome_switches:
        chrome_options.add_argument(switch)

    # Start the stealth browser with the switches configured above
    return uc.Chrome(options=chrome_options)

# Function to scrape Shopee reviews from the specified product page
def _read_child_text(parent, by, locator, default):
    """Return the stripped text of the first child of *parent* matching the locator.

    Args:
        parent: Selenium element to search inside.
        by: Locator strategy (a `By` constant).
        locator: Locator value for that strategy.
        default: Value returned when the child cannot be located or read.

    Returns:
        The child's text with surrounding whitespace removed, or *default*.
    """
    # Local import: selenium is already a dependency of this file
    from selenium.common.exceptions import WebDriverException

    try:
        node = parent.find_element(by, locator)
        text = node.text
        if not text.strip():
            # `.text` only reports rendered text; fall back to the raw DOM text.
            # NOTE(review): added because a recorded run printed empty fields for
            # every review - confirm this resolves it on the live page.
            text = node.get_attribute('textContent') or ''
    except WebDriverException:
        # Element missing or gone stale: use the caller's placeholder
        return default
    return text.strip()


def _extract_review(element):
    """Build the record dictionary for one review container element."""
    return {
        # Reviewer name, "Unknown" when the name element is absent
        'Reviewer Name': _read_child_text(
            element, By.CLASS_NAME, 'shopee-product-rating__author-name', "Unknown"),
        # Review date and variation info, "Unknown" when absent
        'Review Date': _read_child_text(
            element, By.CLASS_NAME, 'shopee-product-rating__time', "Unknown"),
        # Review body, located by its inline style; blank when there is none
        'Review Content': _read_child_text(
            element, By.XPATH, ".//div[@style='margin-top: 0.75rem;']", ""),
    }


# Function to scrape Shopee reviews from the specified product page
def scrape_shopee_reviews(url, max_pages=5):
    """Scrape product reviews from a Shopee product page and save them as CSV.

    Opens the Shopee homepage so the user can log in manually, then loads
    *url*, walks through the paginated review list and writes the collected
    rows to "Lab Assignment 1 - Extracted Data.csv".

    Args:
        url: Address of the product page whose reviews are scraped.
        max_pages: Upper bound on the number of review pages visited.

    Returns:
        None. Results are written to the CSV file and progress is printed.
    """
    # Local import: selenium is already a dependency of this file
    from selenium.common.exceptions import WebDriverException

    driver = setup_undetected_driver()
    reviews = []                           # One dictionary per scraped review

    try:
        # Step 1: Open Shopee homepage for manual login
        print("Opening Shopee homepage for manual login...")
        driver.get("https://shopee.com.my/")
        time.sleep(15)                     # Give the page time to load before prompting

        # Step 2: Wait for manual confirmation that login is complete
        input("Press Enter AFTER you have succesfully logged in to Shopee manually...")

        # Step 3: Open the product page to scrape reviews from
        print(f"Opening product page: {url}")
        driver.get(url)
        time.sleep(10)                     # Wait for page elements and reviews section to load

        # Scroll to the middle of the page to trigger review section loading
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        time.sleep(5)                      # Wait for reviews to appear after scrolling

        # Step 4: Visit up to max_pages review pages
        for page in range(max_pages):
            print(f"\n🔎 Scraping page {page + 1}...")

            # Each container element holds exactly one review
            review_items = driver.find_elements(By.CLASS_NAME, 'shopee-product-rating')
            print(f"Found {len(review_items)} reviews on page {page + 1}.")

            # Nothing to read on this page: stop paging
            if not review_items:
                print("No reviews found on this page.")
                break

            # Step 5: Extract and store every review on the current page
            for element in review_items:
                try:
                    record = _extract_review(element)
                    reviews.append(record)

                    # Echo each collected review so progress is visible in the terminal
                    print(f"- Review by {record['Reviewer Name']} on {record['Review Date']} | Content: {record['Review Content'][:200]}...")
                except Exception as e:
                    # Per-review boundary: one bad review must not abort the whole run
                    print("Error extracting one review:", e)
                    continue

            # Step 6: Close any popup that may block the page or the next button
            try:
                close_popup = driver.find_element(By.CLASS_NAME, 'shopee-popup__close-btn')
                close_popup.click()
                print("Closed a popup!")
                time.sleep(2)              # Let the page settle after closing
            except WebDriverException:
                pass                       # Best effort: no popup (or not clickable), carry on

            # Step 7: Navigate to the next page of reviews
            try:
                next_button = driver.find_element(By.CLASS_NAME, 'shopee-icon-button--right')

                # A "disabled" class on the button means there are no more pages
                if 'disabled' in (next_button.get_attribute('class') or ''):
                    print("Next button disabled. No more pages.")
                    break

                # Scroll the button into view before clicking it
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(1)

                # JavaScript click avoids interception by popups or overlays
                print("Clicking next page button using JavaScript (to bypass interception)...")
                driver.execute_script("arguments[0].click();", next_button)

                time.sleep(7)              # Wait for the new page of reviews to load
            except WebDriverException as e:
                print(f"Next button not found or not clickable on page {page + 1}: {e}")
                break                      # Cannot advance, stop paging
    finally:
        # Step 8: Always close the browser, even when scraping fails part-way
        driver.quit()

    # Step 9: Convert the list of reviews to a DataFrame; the explicit columns
    # keep the CSV header present even when no review was collected
    df_reviews = pd.DataFrame(
        reviews, columns=['Reviewer Name', 'Review Date', 'Review Content'])

    # Step 10: Export the DataFrame as a CSV file
    csv_filename = "Lab Assignment 1 - Extracted Data.csv"
    df_reviews.to_csv(csv_filename, index=False, encoding='utf-8-sig')

    print(f"\n✅ Scraping completed! {len(df_reviews)} reviews saved to '{csv_filename}'.")

# Step 13: Product page whose reviews will be scraped (URL kept exactly as copied, query string included)
product_url = "https://shopee.com.my/Apple-iPhone-16-Pro-Max-i.1278832578.27559929362?sp_atk=d499961a-c335-4f77-9ea1-b61262c66474&xptdk=d499961a-c335-4f77-9ea1-b61262c66474&is_from_login=true"

# Step 14: Run the scraper on that page, visiting at most five pages of reviews
scrape_shopee_reviews(url=product_url, max_pages=5)

Opening Shopee homepage for manual login...


Press Enter AFTER you have succesfully logged in to Shopee manually... 


Opening product page: https://shopee.com.my/Apple-iPhone-16-Pro-Max-i.1278832578.27559929362?sp_atk=d499961a-c335-4f77-9ea1-b61262c66474&xptdk=d499961a-c335-4f77-9ea1-b61262c66474&is_from_login=true

🔎 Scraping page 1...
Found 6 reviews on page 1.
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
Clicking next page button using JavaScript (to bypass interception)...

🔎 Scraping page 2...
Found 6 reviews on page 2.
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
Clicking next page button using JavaScript (to bypass interception)...

🔎 Scraping page 3...
Found 6 reviews on page 3.
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
- Review by  on  | Content: ...
