This project involves scraping data from an e-commerce website and creating a list of all products currently offered on discount.perfotm EDA on the data  and build a ui page where the items can be listed.
This project is part of the luxEA bootcamp.
technologies used include pyhton,beautiful soup,selenium,scrapy,pandas, numpy,flask/fast api/dash

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import logging

In [7]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Create a Service object using ChromeDriverManager
service = Service(ChromeDriverManager().install())

# Initialize the Chrome WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)

2024-09-03 14:01:21,327 - INFO - Get LATEST chromedriver version for google-chrome
2024-09-03 14:01:22,406 - INFO - Get LATEST chromedriver version for google-chrome
2024-09-03 14:01:23,765 - INFO - Driver [C:\Users\user\.wdm\drivers\chromedriver\win64\128.0.6613.119\chromedriver-win32/chromedriver.exe] found in cache


In [8]:
# Define a list to hold product information
products = []

def clean_price(price):
    if pd.isna(price) or price in ['No current price', 'No old price', 0]:
        return 0
    
    if isinstance(price, float):
        return price
    
    if isinstance(price, str):
        price = price.replace('KSh ', '').replace(',', '')
        price = re.sub(r'[^\d.]', '', price)
    
    try:
        return float(price)
    except ValueError:
        logging.warning(f"Unable to convert '{price}' to float")
        return 0

def extract_name(article):
    # Try different selectors for name
    selectors = [
        ('h3', 'name'),
        ('div', 'name'),
        ('a', 'core')
    ]
    
    for tag, class_name in selectors:
        name_elem = article.find(tag, class_=class_name)
        if name_elem:
            return name_elem.text.strip()
    
    logging.warning("Unable to find product name")
    return 'No name'


In [9]:
def scrape_page(url):
    driver.get(url)
    wait = WebDriverWait(driver, 20)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'article.prd')))
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    articles = soup.find_all('article', class_='prd')
    
    for article in articles:
        try:
            name = extract_name(article)
            
            price_div = article.find('div', class_='prc')
            current_price = clean_price(price_div.text.strip()) if price_div else 0
            old_price = clean_price(price_div.get('data-oprc', '0').strip()) if price_div else 0
            
            discount_div = article.find('div', class_=lambda x: x and '_dsct' in x.split())
            discount = discount_div.text.strip() if discount_div else 'No discount'
            
            rating_elem = article.find('div', class_='stars _s')
            if rating_elem:
                rating_style = rating_elem.find('div', class_='in')['style'] if rating_elem.find('div', class_='in') else ''
                rating = re.search(r'width:(\d+)%', rating_style)
                rating = f"{rating.group(1)}%" if rating else 'No rating'
            else:
                rating = 'No rating'

            core_elem = article.find('a', class_='core')
            brand = core_elem.get('data-brand') or core_elem.get('data-gtm-brand') if core_elem else 'No brand'
            category = core_elem.get('data-category') or core_elem.get('data-gtm-category') if core_elem else 'No category'
            
            product = {
                'name': name,
                'price': current_price,
                'old_price': old_price,
                'discount': discount,
                'rating': rating,
                'brand': brand,
                'category': category
            }
            products.append(product)
            logging.info(f"Scraped product: {product}")
        except Exception as e:
            logging.error(f"Error processing product: {e}", exc_info=True)
            continue


In [10]:
def scrape_page_set(start_page, end_page):
    for page in range(start_page, end_page + 1):
        url = f'https://www.jumia.co.ke/mlp-fulfilled-by-jumia/?shop_premium_services=shop_express&shipped_from=country_local&page={page}#catalog-listing'
        logging.info(f'Scraping page {page}...')
        scrape_page(url)
        time.sleep(2)

total_pages = 40
pages_per_set = 10
num_sets = total_pages // pages_per_set

for set_num in range(num_sets):
    start_page = set_num * pages_per_set + 1
    end_page = start_page + pages_per_set - 1
    logging.info(f"\nScraping set {set_num + 1} (pages {start_page}-{end_page})...")
    scrape_page_set(start_page, end_page)
    
    if set_num < num_sets - 1:
        logging.info(f"Pausing for 30 seconds before the next set...")
        time.sleep(30)

driver.quit()

2024-09-03 14:01:29,677 - INFO - 
Scraping set 1 (pages 1-10)...
2024-09-03 14:01:29,679 - INFO - Scraping page 1...
2024-09-03 14:01:38,213 - INFO - Scraped product: {'name': 'Black Portable Outdoor Mountaineering Beer Belt For Men', 'price': 493.0, 'old_price': 1094.0, 'discount': '55%', 'rating': 'No rating', 'brand': 'Generic', 'category': 'Sporting Goods/Outdoor Recreation/Camping & Hiking/Backpacks & Bags/Daypacks & Casual Bags/Waist Packs'}
2024-09-03 14:01:38,214 - INFO - Scraped product: {'name': 'Universal Negative Ion Supercharged Shower Head', 'price': 748.0, 'old_price': 1496.0, 'discount': '50%', 'rating': 'No rating', 'brand': 'Generic', 'category': 'Home & Office/Home & Kitchen/Bath/Bathroom Accessories/Showerheads'}
2024-09-03 14:01:38,216 - INFO - Scraped product: {'name': 'Dimmable LED Panel Book Reading Lamp Eye Protect For Night', 'price': 739.0, 'old_price': 1478.0, 'discount': '50%', 'rating': 'No rating', 'brand': 'Generic', 'category': 'Home & Office/Home & Kit

In [11]:
df = pd.DataFrame(products)
df.to_csv('discounted_products.csv', index=False, encoding='utf-8')
logging.info('Data has been written to discounted_products.csv')

# Print summary statistics
logging.info(f"Total products scraped: {len(df)}")
logging.info(f"Products with 'No name': {df['name'].value_counts().get('No name', 0)}")
logging.info(f"Unique brands: {df['brand'].nunique()}")
logging.info(f"Unique categories: {df['category'].nunique()}")

2024-09-03 14:06:29,099 - INFO - Data has been written to discounted_products.csv
2024-09-03 14:06:29,100 - INFO - Total products scraped: 3680
2024-09-03 14:06:29,142 - INFO - Products with 'No name': 0
2024-09-03 14:06:29,148 - INFO - Unique brands: 55
2024-09-03 14:06:29,151 - INFO - Unique categories: 471


In [12]:
logging.info(f"Total products scraped: {len(df)}")
logging.info(f"Products with 'No name': {df['name'].value_counts().get('No name', 0)}")
logging.info(f"Products with 'No price': {df['price'].value_counts().get('No price', 0)}")
logging.info(f"Products with 'No old price': {df['old_price'].value_counts().get('No old price', 0)}")
logging.info(f"Products with 'No discount': {df['discount'].value_counts().get('No discounts', 0)}")
logging.info(f"Products with 'No rating': {df['name'].value_counts().get('No rating', 0)}")
logging.info(f"Unique brands: {df['brand'].nunique()}")
logging.info(f"Unique categories: {df['category'].nunique()}")

2024-09-03 14:06:29,166 - INFO - Total products scraped: 3680
2024-09-03 14:06:29,172 - INFO - Products with 'No name': 0
2024-09-03 14:06:29,177 - INFO - Products with 'No price': 0
2024-09-03 14:06:29,180 - INFO - Products with 'No old price': 0
2024-09-03 14:06:29,184 - INFO - Products with 'No discount': 0
2024-09-03 14:06:29,189 - INFO - Products with 'No rating': 0
2024-09-03 14:06:29,192 - INFO - Unique brands: 55
2024-09-03 14:06:29,196 - INFO - Unique categories: 471


In [14]:
df.tail()

Unnamed: 0,name,price,old_price,discount,rating,brand,category
3675,"General Motors Leather Center Armrest Mat, Car...",420.0,0.0,22%,100%,Generic,Automobile/Replacement Parts/Body & Trim/Trim/...
3676,Wireless Headset Volume Control Is Lightweight...,780.0,0.0,22%,No rating,Generic,Phones & Tablets/Accessories/Bluetooth Accesso...
3677,"3-in-1 Men's Electric Shaver, Electric Trimmer...",1980.0,0.0,49%,No rating,Generic,Health & Beauty/Beauty & Personal Care/Persona...
3678,Wireless Headset Volume Control Is Lightweight...,780.0,0.0,22%,No rating,Generic,Phones & Tablets/Accessories/Bluetooth Headsets
3679,Fashion Men Glossy Tassel Brogue Leather Shoes...,1320.0,0.0,49%,No rating,Fashion,Fashion/Men's Fashion/Shoes


In [15]:
df.shape

(3680, 7)

In [None]:

#loadind data from a csv file
def load_data()