In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time

### 1) Product Information Scraping

In [None]:
#Listing page to start scraping from -- insert your url link here
url  = "insert_your_url_link_here"

#Location of the browser driver executable -- insert your driver file path here
driver_path = r'driver_file_path'

#Start Chrome through the driver service, then open the listing page
service = Service(driver_path)
driver = webdriver.Chrome(service=service)
driver.get(url)

#Obtain the last page for page navigation
page_element = driver.find_elements(By.CLASS_NAME, "pagination-item")

#Keep only the numeric page labels. Other elements share the "pagination-item"
#class (e.g. the next-page link) and must not be mistaken for a page number.
page_list = []
for page in page_element:
    label = page.text.strip()
    if label.isdecimal():
        page_list.append(label)

#When no numbered pagination is found (e.g. the listing fits on one page),
#fall back to a single page instead of failing with IndexError/ValueError.
last_page = int(page_list[-1]) if page_list else 1

brand_list = [] #Stores the product brand information
name_list = []  #Stores the product name information
price_list = [] #Stores the product price information
page_links = [] #Stores the page_links information

driver.maximize_window() #Maximize the window

for n in range(last_page):
    #Wait for page to load
    time.sleep(5)
    #Obtaining current url for checking purposes
    current_url = driver.current_url
    print(current_url)

    try:
        #Finding the brand, name and price
        product_list = driver.find_element(By.CLASS_NAME, "products-grid-container")
        product_brand = product_list.find_elements(By.CLASS_NAME, "brand")
        product_name = product_list.find_elements(By.CLASS_NAME, "product-name")
        product_price = product_list.find_elements(By.CLASS_NAME, "prices")
        product_links = product_list.find_elements(By.TAG_NAME, "a")

        #Read everything from the page into local lists first, so that an error
        #half-way through cannot leave the result lists with different lengths
        page_brands = [brand.text for brand in product_brand]
        page_names = [name.text for name in product_name]
        page_prices = [price.text for price in product_price]
        #Retrieving product links to 'click into'; anchors without an href
        #return None and cannot be visited later, so they are dropped here
        page_hrefs = [product_link.get_attribute('href') for product_link in product_links]
        page_hrefs = [href for href in page_hrefs if href]
    except Exception as err:
        #'except Exception' (not a bare except) keeps the kernel interruptible,
        #and the reason for stopping early is reported instead of being hidden
        print(f"Stopped scraping at {current_url}: {err!r}")
        break

    brand_list.extend(page_brands)
    name_list.extend(page_names)
    price_list.extend(page_prices)
    page_links.extend(page_hrefs)

    #Nothing is scraped after the final iteration, so do not navigate further
    if n == last_page - 1:
        break

    try:
        #Finding the next button and click on it once it can be clicked
        next_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR,"a.pagination-item.next-page"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
        next_button.click()
    except Exception as err:
        print(f"Could not move past {current_url}: {err!r}")
        break

#There are duplicated links at every next position, remove those to ensure the information index matches
#Only a link equal to the one directly before it is dropped; an empty
#page_links (nothing scraped) yields an empty result instead of an IndexError.
page_links_no_duplicate = []
for link in page_links:
    if not page_links_no_duplicate or link != page_links_no_duplicate[-1]:
        page_links_no_duplicate.append(link)

#These four counts are expected to be equal if links and product fields line up
print(len(page_links_no_duplicate))
print(len(brand_list))
print(len(name_list))
print(len(price_list))

product_ratings = [] #Stores the product rating information
review_counts = [] #Stores the review count information
ingredients = [] #Stores the ingredients information

#Go into every product link and extract the ingredient, rating, review count information
#Each product appends exactly one entry to each of the three lists, using a
#placeholder when the information cannot be found, so the indexes stay aligned.
for number, links in enumerate(page_links_no_duplicate, start=1):
    print(number) #Check which product i am at now
    print(links)

    #Short pause before requesting the next product page
    time.sleep(2)
    driver.get(links)

    #'except Exception' rather than a bare 'except' so that interrupting the
    #kernel (KeyboardInterrupt) stops the loop instead of being recorded as
    #missing information.
    try:
        #The ingredient text is only looked up after clicking this button, so
        #wait until the button can actually be clicked, not merely present
        ingredient_button = WebDriverWait(driver, 2).until(
            EC.element_to_be_clickable((By.XPATH,'//*[@id="single_column"]/div[2]/div[2]/div[2]/div[3]/div/div[2]/div[2]/button'))
        )
        ingredient_button.click()
        ingredient_list = WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.CLASS_NAME,"product-ingredients-values"))
        )
        ingredients.append(ingredient_list.text)
    except Exception:
        ingredients.append('No info')

    try:
        #Rating
        rating = WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.CLASS_NAME,"product-rating-text"))
        )
        product_ratings.append(rating.text)
    except Exception:
        product_ratings.append(None)

    try:
        #Review count
        review_count = WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.CLASS_NAME,"product-rating-count"))
        )
        review_counts.append(review_count.text)
    except Exception:
        review_counts.append('0 Reviews')

#The review scraping step starts its own browser, so close this one here
#instead of leaving the window and driver process running
driver.quit()

### 2) Review Scraping
Visit each product URL and scrape its reviews.

This is done as a separate step to avoid confusion.

In [None]:
driver_path = r'driver_file_path'

service = Service(driver_path)
driver = webdriver.Chrome(service=service)

#The seven lists below are index-aligned: entry k of each list describes the same review
review_heading_list = []     #product name
reviewer_list = []           #reviewer name
review_date_list = []        #date of review
review_country_list = []     #country of reviewer
review_description_list = [] #review description
review_url_list = []         #product url
review_rating_list = []      #review rating

try:
    for i, url in enumerate(page_links_no_duplicate):
        try:
            driver.get(url)
            print(i, url)
            #date, author, country, review title, review text, review-variant-name (x), rateit-range, pagination-container
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            #Obtaining the last page of the reviews
            time.sleep(1)
            page_element = driver.find_elements(By.CLASS_NAME, 'page')
            #Keep only numeric labels; the next link ('a.page.next') shares the 'page' class
            page_list = []
            for page in page_element:
                label = page.text.strip()
                if label.isdecimal():
                    page_list.append(label)
            #Without numbered pagination, still scrape the page that is shown
            #instead of skipping the whole product
            last_page = int(page_list[-1]) if page_list else 1
            print(last_page)

            #Retrieving review information
            for page in range(last_page):
                print(page)

                time.sleep(1)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                try:
                    heading = driver.find_element(By.CLASS_NAME, 'product-heading')
                    ratings = driver.find_elements(By.CLASS_NAME, 'rateit-range')
                    date_list = driver.find_elements(By.CLASS_NAME, 'date')
                    country_list = driver.find_elements(By.CLASS_NAME, 'country')
                    reviewer = driver.find_elements(By.CLASS_NAME, 'author')
                    review_title = driver.find_elements(By.CLASS_NAME, 'review-title')
                    review_list = driver.find_elements(By.CLASS_NAME, 'review-text')
                    print(heading.text)
                except Exception:
                    #Ignoring reviews with missing information
                    continue

                #The first two 'rateit-range' elements are skipped
                #(presumably product-level rating widgets -- TODO confirm)
                try:
                    page_scores = [float(rating.get_attribute('aria-valuenow')) for rating in ratings[2:]]
                except (TypeError, ValueError):
                    #A rating without a usable 'aria-valuenow' counts as missing information
                    page_scores = None

                #Record the page only when every field, ratings included, has one
                #value per review; otherwise the lists would fall out of alignment
                field_counts = {len(date_list), len(country_list), len(reviewer),
                                len(review_title), len(review_list)}
                if page_scores is not None and field_counts == {len(page_scores)}:
                    #Read all texts first, then extend every list together, so an
                    #error while reading cannot leave the lists half-updated
                    heading_text = heading.text
                    page_descriptions = [title.text + " " + body.text
                                         for title, body in zip(review_title, review_list)]
                    page_dates = [date.text for date in date_list]
                    page_countries = [country.text for country in country_list]
                    page_reviewers = [author.text for author in reviewer]

                    review_rating_list.extend(page_scores)
                    review_description_list.extend(page_descriptions)
                    review_date_list.extend(page_dates)
                    review_country_list.extend(page_countries)
                    reviewer_list.extend(page_reviewers)
                    review_heading_list.extend([heading_text] * len(page_descriptions))
                    review_url_list.extend([url] * len(page_descriptions))
                else:
                    #Ignoring reviews with missing information
                    print("Skipping review page with incomplete information")

                #Checking for consistency
                print(len(review_heading_list))
                print(len(reviewer_list))
                print(len(review_date_list))
                print(len(review_country_list))
                print(len(review_description_list))
                print(len(review_url_list))
                print(len(review_rating_list))

                #Navigate to next page
                try:
                    next_button = driver.find_element(By.CSS_SELECTOR, 'a.page.next')
                    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                    next_button.click()
                except Exception:
                    #End at last page
                    break
        except Exception as err:
            #Move on to the next product, but report why this one was cut short.
            #'except Exception' leaves KeyboardInterrupt free to stop the run.
            print(f"Skipping the rest of {url}: {err!r}")
finally:
    #Always close the browser, also when the run is interrupted or fails
    driver.quit()