## Scrape Data from Shopify

In [5]:
## Import the libraries

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options

from selenium.webdriver.common.by import By # to find tags or elements
from selenium.webdriver.support.ui import WebDriverWait # a good way to wait
from selenium.webdriver.support import expected_conditions as EC # for scrolling

import time
import pandas as pd
import os

In [6]:
## Function to get the reviews on each page
def get_page_review(driver):
    """Extract every review currently rendered in the driver's page.

    The page HTML is read from ``driver.page_source`` and parsed with
    BeautifulSoup. Elements are located by their exact CSS class strings,
    so a site restyle will make the lookups return nothing rather than fail.

    Args:
        driver: Object exposing a ``page_source`` attribute with the page
            HTML (a Selenium WebDriver in this notebook).

    Returns:
        list[dict]: One dict per review card with the keys ``reviewer_name``,
        ``reviewer_header_comment``, ``reviewer_detail_comment``,
        ``review_date`` and ``product_star``. Any field that cannot be found
        is an empty string. An empty list is returned when the reviews
        container is not present in the page.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    container = soup.find('div', class_='flex w-full flex-col gap-space-16 overflow-x-hidden pb-space-32 md:gap-space-24')
    if container is None:
        # Nothing to parse (reviews panel not rendered or markup changed).
        return []

    def paragraph_text(card, css_class):
        """Return the text of the first matching <p> in ``card``, or ''."""
        node = card.find('p', class_=css_class)
        return node.text if node is not None else ''

    review_data = []
    for card in container.find_all('div', class_='flex flex-col gap-space-16 md:gap-space-24'):
        reviewer_header_comment = paragraph_text(card, 'font-bodySmallBold text-bodySmallBold')
        reviewer_detail_comment = paragraph_text(card, 'font-bodySmall text-bodySmall text-text')

        # The byline holds "<name>·<date>" in a single paragraph. Parse it once.
        # When the separator is absent we cannot tell which half is present,
        # so both fields are left empty (same as the previous behaviour).
        byline_parts = paragraph_text(card, 'font-caption text-caption text-text-tertiary').split('·')
        if len(byline_parts) > 1:
            reviewer_name, review_date = byline_parts[0], byline_parts[1]
        else:
            reviewer_name, review_date = '', ''

        # Star rating is exposed through the element's aria-label attribute.
        # Guard both a missing element (subscripting None raises TypeError)
        # and a missing attribute (KeyError) so one odd card cannot abort
        # the whole page.
        rating = card.find('div', class_='flex flex-row items-center gap-space-2 border-border-tertiary')
        product_star = rating.get('aria-label', '') if rating is not None else ''

        review_data.append({
            'reviewer_name': reviewer_name,
            'reviewer_header_comment': reviewer_header_comment,
            'reviewer_detail_comment': reviewer_detail_comment,
            'review_date': review_date,
            'product_star': product_star
        })

    return review_data

In [7]:
# Scrape all reviews of one product and save them to CSV
def shopify_scrap(product_url, product_name,
                  driver_path=r'Driver\msedgedriver.exe',
                  output_dir=r'Reviews\Shopify'):
    """Open a product page in Edge, load every review and write them to CSV.

    The function opens ``product_url``, clicks the "read more reviews"
    button, scrolls the reviews panel until its scroll height stops growing,
    parses the loaded page with ``get_page_review`` and writes the rows to
    ``<output_dir>/<product_name>_reviews.csv``.

    Failures while locating the button, scrolling or parsing are reported
    with ``print`` and the CSV is still written with whatever was collected
    (possibly nothing). Failures while starting the browser or opening the
    URL propagate to the caller. The browser is always closed.

    Args:
        product_url: URL of the product page.
        product_name: Label used for the output file name; spaces are
            replaced with underscores.
        driver_path: Path to the Edge WebDriver executable. Its version must
            match the installed MS Edge browser.
        output_dir: Directory the CSV is written to; created if missing.

    Returns:
        None
    """
    # Clean product name in case it contains spaces
    product_name = product_name.replace(' ', '_')

    # Configure Selenium EdgeDriver options
    options = Options()
    options.use_chromium = True
    service = Service(executable_path=driver_path)
    driver = webdriver.Edge(service=service, options=options) # Initialize the webdriver

    all_reviews = []
    try:
        driver.get(product_url)
        time.sleep(15) # To load the product page

        try:
            # Clicking 'Read more reviews' button
            read_more_reviews_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, '//button[@class="bg-bg-fill-secondary stroke-text text-text hover:bg-bg-fill-secondary-hover focus-visible:outline-none focus-visible:ring focus-visible:ring-border-input-active focus-visible:ring-offset-2 false text-buttonMedium font-buttonMedium rounded-radius-10 _button_m_95bul_61 min-w-[72px] p-space-8 transition active:scale-[0.99] w-full relative"]'))
            )

            # Bring the button into view before clicking it
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", read_more_reviews_button)
            time.sleep(2)  # Wait for the scroll

            read_more_reviews_button.click()
            time.sleep(5)

            # The reviews open in a scrollable panel; locate that panel
            reviews_window = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
                (By.XPATH, '//div[@class="flex-1 overflow-y-auto px-screen-margin md:pt-0"]')
            ))

            # Keep scrolling the panel until its scroll height stops growing,
            # i.e. no further reviews were appended after the last scroll.
            scroll_height = driver.execute_script('return arguments[0].scrollHeight', reviews_window)
            while True:
                # Scroll down 5000 pixels at a time, then give new reviews time to load
                driver.execute_script('arguments[0].scrollBy(0, 5000);', reviews_window)
                time.sleep(6)

                new_scroll_height = driver.execute_script('return arguments[0].scrollHeight', reviews_window)
                if scroll_height == new_scroll_height:
                    break
                scroll_height = new_scroll_height

            all_reviews.extend(get_page_review(driver))

        except Exception as e:
            # Best effort: report the problem and still write what was collected.
            print("Review scraping stopped early:", repr(e))
    finally:
        # Always release the browser, even when loading the page fails or
        # the run is interrupted during one of the long sleeps.
        driver.quit()

    # Save to CSV
    file_name = f'{product_name}_reviews.csv'
    output_path = os.path.join(output_dir, file_name)
    if output_dir:
        # os.makedirs('') raises, so only create a directory when one is given
        os.makedirs(output_dir, exist_ok=True)

    product_df = pd.DataFrame(all_reviews)
    product_df.to_csv(output_path, index=False)
    print(f'{len(product_df)} {product_name} reviews successfully written to {output_path}! Nice work!!!')

In [8]:
# Product page to scrape; swap in the commented URL below for a larger data set
start_url = 'https://shop.app/products/8779524505845' # less review data
# 'https://shop.app/products/7533915177117' # more review data

# Label used to build the output CSV file name
product_name = 'Mercury Keyboard'

shopify_scrap(product_url=start_url, product_name=product_name)

14 Mercury_Keyboard reviews successfully written to Reviews\Shopify\Mercury_Keyboard_reviews.csv! Nice work!!!
