## Scrape Data for Mouse on Jumia

#### To Do
- Search for a laptop mouse on Jumia and manually check for results that have a good number of reviews.
    - For this test, I will pick just 1 result with a good number of reviews.
- Get the URL - [mouse link](https://www.jumia.com.ng/catalog/productratingsreviews/sku/GE779EA0A9NPFNAFAMZ/?page=1)
- Use the URL to scrape all the reviews.

In [7]:
## Import necessary Libraries

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
import time
import pandas as pd
import os

In [8]:
## Function (plus a small helper) to get the reviews on each page
def _text_or_empty(node):
    """Return ``node.text``, or '' when ``node`` is None (element not found)."""
    return node.text if node is not None else ''


def get_page_review(driver):
    """Extract every review from the page currently loaded in ``driver``.

    The page HTML is read from ``driver.page_source`` and parsed with
    BeautifulSoup. Each ``<article>`` with class ``-pvs -hr _bet`` is treated
    as one review. The class strings used below are hard-coded and presumably
    mirror Jumia's markup at the time of writing -- verify them if the
    function starts returning empty fields.

    Parameters
    ----------
    driver : object exposing a ``page_source`` attribute (a Selenium
        WebDriver in this notebook).

    Returns
    -------
    list of dict
        One dict per review with the keys ``reviewer_name``,
        ``reviewer_header_comment``, ``reviewer_detail_comment``,
        ``review_date`` and ``product_star``. A field whose element cannot
        be found is stored as an empty string.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    reviews = soup.find_all('article', class_='-pvs -hr _bet')

    # Extract review data
    review_data = []
    for review in reviews:
        # Reviewer header comment (title of the review)
        reviewer_header_comment = _text_or_empty(review.find('h3', class_='-m -fs16 -pvs'))

        # Reviewer detail comment (body of the review)
        reviewer_detail_comment = _text_or_empty(review.find('p', class_='-pvs'))

        # Review date
        review_date = _text_or_empty(review.find('span', class_='-prs'))

        # Reviewer name: taken from the SECOND <span> of the footer block.
        # AttributeError covers a missing block (find() returned None);
        # IndexError covers a block with fewer than two spans, which
        # previously escaped the handler and aborted the whole scrape.
        try:
            div = review.find("div", class_="-df -j-bet -i-ctr -gy5")
            reviewer_name = div.find_all('span')[1].text
        except (AttributeError, IndexError):
            reviewer_name = ''

        # Product star rating (text of the stars element)
        product_star = _text_or_empty(review.find('div', class_='stars _m _al -mvs'))

        review_data.append({
            'reviewer_name': reviewer_name,
            'reviewer_header_comment': reviewer_header_comment,
            'reviewer_detail_comment': reviewer_detail_comment,
            'review_date': review_date,
            'product_star': product_star
        })

    return review_data

In [17]:
# Define the scrape function: walks every review page and saves the result to CSV
def jumia_scrap(product_url, product_name, driver_path=r'Driver\msedgedriver.exe',
                output_dir=r'Reviews\Jumia', page_load_wait=10):
    """Scrape all reviews reachable from ``product_url`` and write them to a CSV.

    Opens the URL in Microsoft Edge through Selenium, collects the reviews of
    the current page with ``get_page_review``, then keeps clicking the
    "Next Page" link until it can no longer be found. The collected reviews
    are written to ``<output_dir>/<product_name>_reviews.csv``.

    Parameters
    ----------
    product_url : str
        URL of the first reviews page.
    product_name : str
        Name used for the output file; spaces are replaced with underscores.
    driver_path : str, optional
        Path to the Edge driver executable. Always confirm that the driver
        version matches the installed MS Edge browser version.
    output_dir : str, optional
        Directory the CSV is written to; created if it does not exist.
    page_load_wait : int or float, optional
        Seconds to sleep after opening the first page and after each
        "Next Page" click.

    Returns
    -------
    None
        The result is the CSV file plus progress messages printed to stdout.
    """
    # Local import keeps the notebook's import cell untouched; selenium is
    # already a dependency of this notebook.
    from selenium.common.exceptions import NoSuchElementException

    # Clean product name in case it contains spaces
    product_name = product_name.replace(' ', '_')

    # Configure Selenium EdgeDriver options
    options = Options()
    options.use_chromium = True
    service = Service(executable_path=driver_path)
    driver = webdriver.Edge(service=service, options=options)  # Initialize the webdriver

    all_reviews = []

    # try/finally guarantees the browser is closed even if loading or
    # parsing a page raises, so no orphaned Edge/driver process is left behind.
    try:
        driver.get(product_url)
        time.sleep(page_load_wait)  # To load the product page

        # Scrape all reviews, one page per loop iteration
        while True:
            all_reviews.extend(get_page_review(driver))

            # A missing 'Next Page' link is the normal end of pagination.
            try:
                next_page_button = driver.find_element(By.XPATH, '//a[@aria-label="Next Page"]')
            except NoSuchElementException:
                print("No more pages to load")
                break

            # Any other failure while moving to the next page is reported
            # with its cause instead of being mislabelled as "no more pages";
            # the reviews collected so far are still saved below.
            try:
                # Scroll the button into view before clicking it
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_page_button)
                time.sleep(2)  # Wait for the scroll

                next_page_button.click()
                time.sleep(page_load_wait)  # Wait for the next page to load
            except Exception as e:
                print(f"Stopping pagination early, could not open the next page: {e!r}")
                break
    finally:
        driver.quit()

    # Save to CSV
    product_df = pd.DataFrame(all_reviews)

    file_name = f'{product_name}_reviews.csv'
    output_path = os.path.join(output_dir, file_name)
    os.makedirs(output_dir, exist_ok=True)

    product_df.to_csv(output_path, index=False)
    print(f'{len(product_df)} {product_name} reviews successfully written to {output_path}! Nice work!!!')

In [None]:
# Pick the reviews page to scrape (edit these two values for another product).
# A page with more data, for a fuller run:
# 'https://www.jumia.com.ng/catalog/productratingsreviews/sku/GE779EA1BY9XTNAFAMZ/'
start_url = 'https://www.jumia.com.ng/catalog/productratingsreviews/sku/AC431CL57R8EKNAFAMZ/'  # small review data for testing
product_name = 'Ace Laptop'

# Launch the scrape; the CSV is named after product_name.
jumia_scrap(product_url=start_url, product_name=product_name)

No more pages to load
11 Ace_Laptop reviews successfully written to Reviews\Jumia\Ace_Laptop_reviews.csv! Nice work!!!
