## Scrape Review Data from Jumia

#### To Do
- Search for a product on Jumia and manually check for results that have a good number of reviews.
    - For this test, I will pick just one result with a good number of reviews.
- Get the review URL.
- Use the URL to scrape all the reviews.

In [1]:
## Import necessary Libraries

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
import time
import pandas as pd
import os, re

In [15]:
## Function to get product details
def get_product_details(driver):
    """Extract the product's name, price, features and specification.

    Parses ``driver.page_source`` with BeautifulSoup and looks each field up
    by its tag name and CSS classes. A field whose element is not present in
    the page is returned as an empty string instead of raising
    ``AttributeError``, mirroring how ``get_page_review`` treats missing
    review fields, so one absent section does not abort the whole scrape.

    Args:
        driver: Object exposing the rendered page HTML as ``page_source``.

    Returns:
        dict: The keys ``product_name``, ``price``, ``features`` and
        ``specification``. ``price`` has every non-digit character removed.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    def text_of(tag_name, css_class):
        """Return the stripped text of the first matching tag, or ''."""
        node = soup.find(tag_name, class_=css_class)
        return node.text.strip() if node is not None else ''

    # Extract product details
    product_details = {
        'product_name': text_of('h1', '-fs20 -pts -pbxs'),
        # 'brand': soup.find('div', class_='-pvxs').find_all('a')[0].text.strip(),
        # Keep only the digits of the price (drops currency symbol, commas, spaces).
        'price': re.sub(r'[^\d]', '', text_of('span', '-b -ubpt -tal -fs24 -prxs')),
        # 'description': soup.find('div', class_='markup -mhm -pvl -oxa -sc').text.strip(),
        'features': text_of('div', 'markup -pam'),
        'specification': text_of('ul', '-pvs -mvxs -phm -lsn'),
    }

    return product_details

In [16]:
## Function to get the reviews on each page
def get_page_review(driver):
    """Collect every review shown on the page currently loaded in *driver*.

    Parses ``driver.page_source`` with BeautifulSoup and reads one record per
    ``<article class="-pvs -hr _bet">`` element. Any field whose element is
    missing is recorded as an empty string.

    Args:
        driver: Object exposing the rendered page HTML as ``page_source``.

    Returns:
        list[dict]: One dict per review with the keys ``reviewer_name``,
        ``reviewer_header_comment``, ``reviewer_detail_comment``,
        ``review_date`` and ``product_star``. Empty if the page has no
        matching review articles.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    def text_of(parent, tag_name, css_class, strip=False):
        """Return the text of the first matching child of *parent*, or ''."""
        node = parent.find(tag_name, class_=css_class)
        if node is None:
            return ''
        return node.text.strip() if strip else node.text

    def reviewer_name_of(review):
        """Return the reviewer's name from the review's meta row, or ''."""
        meta_row = review.find('div', class_='-df -j-bet -i-ctr -gy5')
        if meta_row is None:
            return ''
        spans = meta_row.find_all('span')
        # The name is read from the second span; guard against rows with
        # fewer spans, which previously raised an uncaught IndexError.
        if len(spans) < 2:
            return ''
        name = spans[1].text
        # Drop only the leading "by " label. A blanket replace() would also
        # eat "by " inside the name itself (e.g. "Abby Smith" -> "AbSmith").
        prefix = 'by '
        if name.startswith(prefix):
            name = name[len(prefix):]
        return name

    # Extract review data
    review_data = []
    for review in soup.find_all('article', class_='-pvs -hr _bet'):
        review_data.append({
            'reviewer_name': reviewer_name_of(review),
            'reviewer_header_comment': text_of(review, 'h3', '-m -fs16 -pvs', strip=True),
            'reviewer_detail_comment': text_of(review, 'p', '-pvs', strip=True),
            'review_date': text_of(review, 'span', '-prs'),
            'product_star': text_of(review, 'div', 'stars _m _al -mvs'),
        })

    return review_data

In [18]:
# Run the scrap function
def jumia_scrap(product_url):
    """Scrape a Jumia product page and all of its reviews into a CSV file.

    Opens *product_url* in a headless Edge browser, reads the product details,
    follows the link to the review listing, then walks the "Next Page" links
    collecting the reviews on each page. Every review row is joined with the
    product details and the result is written to
    ``Reviews\\Jumia\\<product name>_reviews.csv``.

    The browser is always closed, even if scraping fails part-way through.

    Args:
        product_url: URL of the page to load first.

    Returns:
        None. Progress messages are printed and the CSV is written to disk.
    """
    # Imported locally so this function carries its own dependency.
    from selenium.common.exceptions import WebDriverException

    # Configure Selenium EdgeDriver options
    options = Options()
    options.use_chromium = True
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59") # code to prevent the website from detecting bot activity
    options.add_argument("--disable-blink-features=AutomationControlled") # Disable the automation line at the top of the browser
    options.add_argument('headless')
    options.add_argument('disable-gpu')
    service = Service(executable_path=r'Driver\msedgedriver.exe') # Always check to confirm that the version of edge driver matches the version of the MS Edge browser
    driver = webdriver.Edge(service=service, options=options) # Initialize the webdriver

    def click_and_wait(xpath):
        """Scroll to the element at *xpath*, click it and wait for the load.

        Returns True on success, False if Selenium could not find or click
        the element.
        """
        try:
            button = driver.find_element(By.XPATH, xpath)
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
            time.sleep(2)  # Wait for the scroll
            button.click()
            time.sleep(10)  # Wait for the next page to load
        except WebDriverException:
            return False
        return True

    all_reviews = []
    try:
        driver.get(product_url)
        time.sleep(10) # To load the product page

        # Get the product details
        product_details = get_product_details(driver)

        # Open the review listing page, if the product links to one
        if not click_and_wait('//a[@class="btn _def _ti -mhs -fsh0"]'):
            print("There are no reviews for this product")

        # Scrape all reviews, page by page, until there is no "Next Page" link
        while True:
            all_reviews.extend(get_page_review(driver))
            if not click_and_wait('//a[@aria-label="Next Page"]'):
                print("No more pages to load")
                break
    finally:
        # Always release the browser, even when scraping raised.
        driver.quit()

    # Use the part of the product name before the first quote character
    # (e.g. the inch mark in 6.88") as a short name for the output file.
    product_name = re.split(r"[\'\"]", product_details['product_name'])[0]
    # Replace characters that are not allowed in file names (or that would be
    # read as a path separator) so the CSV path is always valid.
    product_name = re.sub(r'[<>:"/\\|?*]', '_', product_name).strip() or 'product'

    # Join product details and the reviews
    for review in all_reviews:
        review.update(product_details)

    # Save to CSV
    product_df = pd.DataFrame(all_reviews)

    output_dir = r'Reviews\Jumia'
    file_name = f'{product_name}_reviews.csv'
    output_path = os.path.join(output_dir, file_name)
    os.makedirs(output_dir, exist_ok=True)

    product_df.to_csv(output_path, index=False)
    print(f'{len(product_df)} {product_name} reviews successfully written to {output_path}! Nice work!!!')

In [24]:
# Page to scrape; replace this URL to scrape a different product.
start_url = 'https://www.jumia.com.ng/xiaomi-redmi-14c-6.88-8gb-ram256gb-rom-android-12-sage-green-373375543.html'
# Alternative start URLs kept for reference (these appear to be direct review-listing pages):
# 'https://www.jumia.com.ng/catalog/productratingsreviews/sku/AC431CL57R8EKNAFAMZ/' # small review data for testing
# 'https://www.jumia.com.ng/catalog/productratingsreviews/sku/GE779EA1BY9XTNAFAMZ/' - more data
# product_name = 'Wireless Rechargeable Mouse'  # not needed: jumia_scrap takes only the URL

# Run the scraper; it prints progress and writes the reviews CSV under Reviews\Jumia.
jumia_scrap(start_url)

# https://www.jumia.com.ng/itel-2163-wireless-fm-torchlight-dual-sim-black-84833750.html
# https://www.jumia.com.ng/itel-p55-5g-6.6-hd-hole-6gb-ram128gb-rom-android-13-blue-274011937.html
# https://www.jumia.com.ng/xiaomi-redmi-14c-6.88-8gb-ram256gb-rom-android-12-sage-green-373375543.html
# https://www.jumia.com.ng/agm-note-n1-6.52-8gb-ram-128gb-expandable-rom-android-13-grey-235195482.html
# https://www.jumia.com.ng/nokia-105african-edition-1.77-4mb4mb-800-mah-dual-sim-blue-132260276.html
# https://www.jumia.com.ng/samsung-galaxy-a05-6.7-4gb-ram64gb-rom-android-13-black-277533765.html
# https://www.jumia.com.ng/tecno-t101-1.8-dual-sim-black-382115813.html
# https://www.jumia.com.ng/nokia-105african-edition1.774mb4mb800mah-dual-sim-charcoal-132260136.html
# https://www.jumia.com.ng/xiaomi-redmi-14c-6.88-8gb-ram256gb-rom-android-12-sage-green-373375543.html

No more pages to load
62 XIAOMI Redmi 14C 6.88 reviews successfully written to Reviews\Jumia\XIAOMI Redmi 14C 6.88_reviews.csv! Nice work!!!
