In [1]:
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

In [2]:
# Configure Selenium WebDriver: a headless Chrome session driven through a
# locally installed chromedriver binary.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model

# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH
# environment variable to run the notebook on another machine; when it is
# unset, the original local path is used so existing setups keep working.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    'C:/Users/elitebook/Downloads/chromedriver-win64/chromedriver.exe',
)
service = Service(chromedriver_path)

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)


In [5]:
# Listing URL template for the Unilever store on Jumia Kenya; the page number
# is substituted into the `page` query parameter via str.format().
base_url = "https://www.jumia.co.ke/mlp-unilever-store/?page={}"

# Number of listing pages to walk through (the store has 4 pages).
num_pages = 4

# Accumulator for the scraped records: one dict per product found.
products = list()

In [6]:
# Fixed pause (seconds) after each navigation to allow the page to load.
PAGE_LOAD_WAIT_SECONDS = 3


def _extract_product(item):
    """Build one product record from a product anchor tag.

    Parameters
    ----------
    item : bs4 Tag
        An ``<a class="core">`` element taken from a listing page.

    Returns
    -------
    dict
        Keys ``'ProductName'``, ``'Rating'`` and ``'Price'``. Each value is the
        stripped text of the matching element, or ``'N/A'`` when the element
        is not present inside ``item``.
    """
    # The name is looked up in a 'div' first, then in an 'h3', both with class 'name'.
    name_tag = item.find('div', class_='name') or item.find('h3', class_='name')
    name_text = name_tag.text.strip() if name_tag else 'N/A'

    # Only the first space-separated token of the rating element's text is kept.
    rating_tag = item.find('div', class_='stars _s')
    rating_text = rating_tag.text.strip().split(' ')[0] if rating_tag else 'N/A'

    price_tag = item.find('div', class_='prc')
    price_text = price_tag.text.strip() if price_tag else 'N/A'

    return {
        'ProductName': name_text,
        'Rating': rating_text,
        'Price': price_text
    }


try:
    for page in range(1, num_pages + 1):
        try:
            # Construct the URL for the current page and open it
            url = base_url.format(page)
            driver.get(url)
            time.sleep(PAGE_LOAD_WAIT_SECONDS)

            # Parse the rendered page content with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Every product on the page is an anchor with class 'core'
            for item in soup.find_all('a', class_='core'):
                product = _extract_product(item)

                # Log the extracted product details
                print(f"Product found - Name: {product['ProductName']}, Rating: {product['Rating']}, Price: {product['Price']}")

                products.append(product)

            # Print progress
            print(f"Scraped page {page}/{num_pages}")

        except Exception as e:
            # Best effort: report the failing page and carry on with the next one.
            print(f"Error on page {page}: {e}")
finally:
    # Always close the browser session, even if the loop is interrupted, so no
    # headless Chrome/chromedriver process is left running. Re-run the
    # WebDriver setup cell before executing this cell again.
    driver.quit()

Product found - Name: Vaseline Cocoa Glow Void 200ml, Rating: N/A, Price: KSh 302
Product found - Name: Vaseline Dry Skin Repair Void 200ml, Rating: N/A, Price: KSh 302
Product found - Name: Vaseline Vaseline  PJ Baby 240ML, Rating: N/A, Price: KSh 272
Product found - Name: Geisha Black Soap 225g, Rating: N/A, Price: KSh 137
Product found - Name: Rexona Active Dry Antiperspirant Deodorant Roll On - 50ml, Rating: N/A, Price: KSh 361
Product found - Name: Vaseline Aloe Soothe Void 200ml, Rating: N/A, Price: KSh 310
Product found - Name: Geisha Moringa Oil Soap 225G, Rating: N/A, Price: KSh 116
Product found - Name: Vaseline Vaseline  PJ Baby 240ML, Rating: N/A, Price: KSh 272
Product found - Name: Royco Mchuzi Mix Beef Flavour Seasoning - 500g, Rating: N/A, Price: KSh 291
Product found - Name: Knorr Soft Cube Chicken Seasoning 6's, Rating: N/A, Price: KSh 93
Product found - Name: Omo Multi-purpose Bleach Lemon- 700ml, Rating: N/A, Price: KSh 332
Product found - Name: Omo Hand Washing Pow

In [11]:
# Create a DataFrame from the list of products. The columns are named
# explicitly so the frame keeps its ProductName/Rating/Price schema even when
# `products` is empty; a column-less frame would make the later
# drop_duplicates(subset=['ProductName', 'Price']) fail with a KeyError.
df = pd.DataFrame(products, columns=['ProductName', 'Rating', 'Price'])

Unnamed: 0,ProductName,Rating,Price
42,Sunlight Dish Washing Paste Lemon - 800g,5.0,KSh 319
69,Sunlight Sunlight Scourer Lemon 500g,,KSh 81
25,Sunlight Spring Sensations 1kg + Free 400g,4.6,KSh 450
109,Axe Apollo Deodorant Aerosol - 150ml,,KSh 609
49,Sunlight Sunlight Scourer Lavender 1Kg,4.3,KSh 144
38,VIM Scouring Powder Lavender 500g,5.0,KSh 92
90,Vaseline Petroleum Jelly Baby- 95ml,5.0,KSh 100
18,Vaseline Lotion Men Cooling 400ml + FREE 95ml ...,4.6,KSh 550
3,Geisha Black Soap 225g,,KSh 137
20,Vaseline Petroleum Jelly Men Fresh- 240ml,4.7,KSh 262


In [7]:
# Remove duplicates based on ProductName and Price (the first occurrence is kept).
df = df.drop_duplicates(subset=['ProductName', 'Price'])

# Preview up to 10 random rows. The size is capped at len(df) because sample()
# raises ValueError when asked for more rows than the frame contains.
df.sample(n=min(10, len(df)))

In [12]:
# Write the product table to a CSV file in the current working directory;
# index=False leaves the DataFrame index out of the file.
output_csv = 'UnileverJumiaproducts.csv'
df.to_csv(output_csv, index=False)