In [1]:
pip install undetected-chromedriver

Collecting undetected-chromedriver
  Downloading undetected-chromedriver-3.5.5.tar.gz (65 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting selenium>=4.9.0 (from undetected-chromedriver)
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting websockets (from undetected-chromedriver)
  Downloading websockets-15.0.1-cp312-cp312-win_amd64.whl.metadata (7.0 kB)
Collecting trio~=0.17 (from selenium>=4.9.0->undetected-chromedriver)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium>=4.9.0->undetected-chromedriver)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium>=4.9.0->undetected-chromedriver)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium>=4.9.0->undetected-chromedriver)
  Downloading outcome-1.3.0.post0-py2.py3-none

In [2]:
#necessary libraries
#selenium (undetected_chromedriver) is used to mimick a browser
import re
import json
import csv
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure Chrome options
options = uc.ChromeOptions()
#variety of user-agent to prevent bot detection
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")
#hides selenium usage from Lazada
options.add_argument("--disable-blink-features=AutomationControlled")  # Prevent bot detection
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")

# Start undetected Chrome driver
driver = uc.Chrome(version_main=134, options=options)

# Lazada product URL
url = 'https://www.lazada.com.my/products/ugreen-braided-mfi-lightning-usb-a-to-lightning-cable-iphone-fast-charging-cable-for-iphoneipad-i12749949-s10763212739.html?scm=1007.17760.398138.0&pvid=fdff5322-4592-45e2-88f6-1a80f6075881&search=flashsale&spm=a2o4k.homepage.FlashSale.d_12749949'

# Extract item ID from URL so it can be used to put in the API to extract reviews
match = re.search(r'-i(\d+)-s', url)
if not match:
    print("Invalid URL format.")
    driver.quit()
    exit()
item_id = match.group(1)

all_reviews = []

try:
    #Opens the Lazada homepage to simulate real browser
    driver.get("https://www.lazada.com.my")
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body"))) # wait time to avoid bot detection

    #Opens the product page
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    #Looping of reviews from page 1 to 5
    for page in range(1, 6):
        # Lazada review API
        ratings_url = f'https://my.lazada.com.my/pdp/review/getReviewList?itemId={item_id}&pageSize=5&filter=0&sort=0&pageNo={page}'
        
        #Fetches the reviews API
        driver.get(ratings_url)

        #Lazada uses captcha to prevent bots. So when a captcha comes up it needs to be solved manually. Hence if a captcha is detected a wait time is used.
        try:
            captcha_detected = WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.XPATH, "//iframe[contains(@src, 'captcha')]"))
            )
            if captcha_detected:
                print("\nCAPTCHA DETECTED!\n")
                
                #waits until the captcha is solved
                WebDriverWait(driver, 120).until_not(
                    EC.presence_of_element_located((By.XPATH, "//iframe[contains(@src, 'captcha')]"))
                )
                print("\nCAPTCHA Solved!\n")
        
        except:
            print("No captcha detected. Continuing...")

        #Extract JSON response using javascript because to API uses JSON text. if empty it moves to the next page
        print(f"Extracting API response from page {page}...")
        body_text = driver.execute_script("return document.body.innerText;").strip()
        if not body_text:
            print(f"No response received from API (Page {page}).")
            continue

        #converts the extracted text to python dictionary
        try:
            data = json.loads(body_text)
        except json.JSONDecodeError:
            print(f"Failed to parse JSON response (Page {page}).")
            continue

        #Extract and display reviews from using the JSON structure 
        reviews = data.get("model", {}).get("items", [])
        if reviews:
            for review in reviews:
                review_data = {
                    "Reviewer Name": review.get('buyerName', 'Anon'),
                    "Review Date": review.get('boughtDate', 'Unknown'),
                    "Review Content": review.get('reviewContent', 'No review content')
                }
                #adds the extracted review into review_data
                all_reviews.append(review_data)
        else:
            print(f"No reviews found on page {page}.")

        #Random wait time before going to next review page
        wait_time = random.uniform(5, 10)  # Random wait between 5-10 sec
        print(f"Waiting for {wait_time:.2f} seconds before fetching next page...")
        WebDriverWait(driver, wait_time).until(lambda driver: True)

except Exception as e:
    print("Error:", e)

finally:
    driver.quit()

# Save reviews to scrapedData.csv
if all_reviews:
    with open('scrapedData.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Reviewer Name', 'Review Date', 'Review Content']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_reviews)

    print(f"Successfully saved {len(all_reviews)} reviews to scrapedData.csv")
else:
    print("No reviews collected, CSV not created.")


No captcha detected. Continuing...
Extracting API response from page 1...
Failed to parse JSON response (Page 1).
No captcha detected. Continuing...
Extracting API response from page 2...
Failed to parse JSON response (Page 2).
No captcha detected. Continuing...
Extracting API response from page 3...
Failed to parse JSON response (Page 3).
No captcha detected. Continuing...
Extracting API response from page 4...
Failed to parse JSON response (Page 4).
No captcha detected. Continuing...
Extracting API response from page 5...
Failed to parse JSON response (Page 5).
No reviews collected, CSV not created.
