## Temu Product Review Scrapping

In [2]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options

from selenium.webdriver.common.by import By # to find tags or elements
from selenium.webdriver.support.ui import WebDriverWait # a good way to wait
from selenium.webdriver.support import expected_conditions as EC # for scrolling

import time
import pandas as pd
import os
import requests

In [3]:
## Testing url - Temu
url = 'https://www.temu.com/ng/transparent-soft-tpu-case--for-galaxy-s25-s24-s23-s22-s23-fe--s24fe-s23fe-s21fe-s24plus-s23plus-s22plus-s21plus-s25ultra-s24ultra-s23ultra-s22ultra-with-wireless-charge-clear-cover-g-601099933583474.html?_oak_mp_inf=EPKQkOCn1ogBGhZmbGFzaF9zYWxlX2xpc3RfdTVuaThtIOqqstrQMg%3D%3D&top_gallery_url=https%3A%2F%2Fimg.kwcdn.com%2Fproduct%2Ffancy%2F5cb1c7dd-2661-4bc3-88bd-c88ceba7677d.jpg&spec_gallery_id=5465637388&refer_page_sn=10132&refer_source=0&freesia_scene=116&_oak_freesia_scene=116&_oak_rec_ext_1=Mzk2NzAw&_oak_gallery_order=658009297%2C633944919%2C1556163267%2C500529664%2C1329255741&refer_page_el_sn=201401&_x_channel_src=1&_x_channel_scene=spike&_x_sessn_id=u04ciiluh4&refer_page_name=lightning-deals&refer_page_id=10132_1739651326276_px3f554rbs'
response = requests.get(url)
response.status_code

200

In [4]:
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
# Get and click the load more reviews button
load_more_button = soup.find('span', {'class': '_3cgghkPI'})
response = requests.post(load_more_button.get('class'))
#response.status_code

AttributeError: 'NoneType' object has no attribute 'get'

In [4]:
## Function to get the reviews on each page
def get_page_review(driver):
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    first_leg = soup.find('div', class_ = '_3Rsl6Owq')
    reviews = first_leg.find_all('div', class_='_244ldJXl')

    # Extract review data
    review_data = []
    for review in reviews:
        # Reviewer Comment
        try:
            reviewer_comment = review.find('div', class_ = '_2EO0yd2j').text
        except AttributeError:
            reviewer_comment = ''
        
        # Review Dates
        try:
            review_date = review.find('span', style='font-size: 14px; color: rgb(170, 170, 170); font-weight: 400; text-decoration: none;').text
        except AttributeError:
            review_date = ''
        
        # Reviewer Name
        try:
            reviewer_name = review.find("div", class_='XTEkYdlM _3a8V1xkt').text
        except AttributeError:
            reviewer_name = ''
        
        # product star
        try:
            product_star = review.find('div', class_ = '_7JDNQb0g _1uEtAYnT').text
        except AttributeError:
            product_star = ''
        
        review_data.append({
            'reviewer_name': reviewer_name,
            'review_date': review_date,
            'product_star': product_star,
            'reviewer_comment': reviewer_comment            
        })
    
    return review_data

In [1]:
# Run the scrap function
def shopify_scrap(product_url, product_name):
    # Clean product name incase it contains spaces
    product_name = product_name.replace(' ', '_')

    # Configure Selenium EdgeDriver options
    options = Options()
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59") # code to prevent the website from detecting bot activity
    options.add_argument("--disable-blink-features=AutomationControlled") # Disable the automation line at the top of the browser
    options.add_argument('headless')
    options.add_argument('disable-gpu')
    options.use_chromium = True
    service = Service(executable_path=r'C:\Users\Joshu\Desktop\msedgedriver.exe') # Always check to confirm that the version of edge driver matches the version or MS Edge browser
    driver = webdriver.Edge(service=service, options=options) # Initialize the webdriver

    driver.get(product_url)
    time.sleep(8) # To load the product page

    # Scrape all reviews variable
    all_reviews = []
    
    # Clicking 'See all reviews' button
    try:
        read_more_reviews_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
            (By.XPATH, '//span[@class="_3cgghkPI"]'))
        )
        
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", read_more_reviews_button) # Scroll the page to view the button for Next Page
        time.sleep(2)  # Wait for the scroll
        
        read_more_reviews_button.click() # Click the read more reviews button
        time.sleep(5)
        
        # Then we're changing the driver to now point to the new pop-up window
        reviews_window = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
            (By.XPATH, '//div[@class="_2OaJDN8Y _3Rsl6Owq _1xAk_zzX"]')
        ))

        # Logic for the scrolling down on the reviews window
        # Get the current scroll height
        scroll_height = driver.execute_script('return arguments[0].scrollHeight', reviews_window)
        
        while True:
            #scroll down little by little by 5000 pixels
            driver.execute_script('arguments[0].scrollBy(0, 5000);', reviews_window)
            time.sleep(6)

            new_scroll_height = driver.execute_script('return arguments[0].scrollHeight', reviews_window)
            
            if scroll_height == new_scroll_height:
                break
            scroll_height = new_scroll_height
        
        d_reviews = get_page_review(driver)
        all_reviews.extend(d_reviews)

    except Exception as e:
        print("No more pages to load", e)

    driver.quit()

    # Save to CSV
    # output_dir = r'Reviews\Temu'
    # file_name = f'{product_name}_reviews.csv'
    # output_path = os.path.join(output_dir, file_name)
    # os.makedirs(output_dir, exist_ok=True)

    # product_df = pd.DataFrame(all_reviews)
    # product_df.to_csv(output_path, index=False)
    # print(f'{len(product_df)} {product_name} reviews successfully written to {output_path}! Nice work!!!')

In [2]:
# Update the start URL
start_url = 'https://www.temu.com/ng/transparent-soft-tpu-case--for-galaxy-s25-s24-s23-s22-s23-fe--s24fe-s23fe-s21fe-s24plus-s23plus-s22plus-s21plus-s25ultra-s24ultra-s23ultra-s22ultra-with-wireless-charge-clear-cover-g-601099933583474.html?_oak_mp_inf=EPKQkOCn1ogBGhZmbGFzaF9zYWxlX2xpc3RfdTVuaThtIOqqstrQMg%3D%3D&top_gallery_url=https%3A%2F%2Fimg.kwcdn.com%2Fproduct%2Ffancy%2F5cb1c7dd-2661-4bc3-88bd-c88ceba7677d.jpg&spec_gallery_id=5465637388&refer_page_sn=10132&refer_source=0&freesia_scene=116&_oak_freesia_scene=116&_oak_rec_ext_1=Mzk2NzAw&_oak_gallery_order=658009297%2C633944919%2C1556163267%2C500529664%2C1329255741&refer_page_el_sn=201401&_x_channel_src=1&_x_channel_scene=spike&_x_sessn_id=u04ciiluh4&refer_page_name=lightning-deals&refer_page_id=10132_1739651326276_px3f554rbs' # less review data
# '' # more review data
product_name = 'Case for SamSung for Galaxy'

shopify_scrap(start_url, product_name)

NameError: name 'Options' is not defined