In [2]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import requests
import time
import re

In [3]:
# Define categories and their URLs
categories = {
    "Laptops": "https://www.jumia.co.ke/catalog/?q=laptop&page=",
    "Phones": "https://www.jumia.co.ke/catalog/?q=phones&page=",
    "Skincare": "https://www.jumia.co.ke/catalog/?q=skincare&page=",
    "Home Appliances": "https://www.jumia.co.ke/catalog/?q=home+appliances&page=",
    "TVs": "https://www.jumia.co.ke/catalog/?q=tv&page=",
    "Watches": "https://www.jumia.co.ke/catalog/?q=watches&page=",
    "Shoes": "https://www.jumia.co.ke/catalog/?q=shoes&page=",
    "Kitchen Appliances": "https://www.jumia.co.ke/catalog/?q=kitchen+appliances&page="
}

In [4]:
# Function to extract product details
def scrape_products(category, base_url, max_pages=5):
    products = []
    
    for page in range(1, max_pages + 1):
        url = f"{base_url}{page}"
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to retrieve {category} data on page {page}")
            continue
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract product details
        for item in soup.find_all('article', class_='prd _fb col c-prd'):
            name = item.find('h3', class_='name')
            price = item.find('div', class_='prc')
            old_price = item.find('div', class_='old')
            discount = item.find('div', class_='bdg _dsct')
            rating = item.find('div', class_='stars _s')
            review_count = item.find('div', class_='rev')
            
            products.append({
                "Name": name.text.strip() if name else None,
                "Brand": name.text.split()[0] if name else None,
                "Price": price.text.strip() if price else None,
                "Old Price": old_price.text.strip() if old_price else None,
                "Discount": discount.text.strip() if discount else None,
                "Rating": rating.get('aria-label', None) if rating else None,
                "Review Count": review_count.text.strip() if review_count else None,
                "Type of Product": category
            })
        
        print(f"Scraped {category} - Page {page}")
        time.sleep(2)  # Avoid getting blocked
    
    return products

In [5]:
# Scrape all categories
data = []
for category, url in categories.items():
    print(f"Scraping {category}...")
    data.extend(scrape_products(category, url))
    time.sleep(2)  # Avoid getting blocked by the website
print('Congratulations!Scrapping Complete!!')

Scraping Laptops...
Scraped Laptops - Page 1
Scraped Laptops - Page 2
Scraped Laptops - Page 3
Scraped Laptops - Page 4
Scraped Laptops - Page 5
Scraping Phones...
Scraped Phones - Page 1
Scraped Phones - Page 2
Scraped Phones - Page 3
Scraped Phones - Page 4
Scraped Phones - Page 5
Scraping Skincare...
Scraped Skincare - Page 1
Scraped Skincare - Page 2
Scraped Skincare - Page 3
Scraped Skincare - Page 4
Scraped Skincare - Page 5
Scraping Home Appliances...
Scraped Home Appliances - Page 1
Scraped Home Appliances - Page 2
Scraped Home Appliances - Page 3
Scraped Home Appliances - Page 4
Scraped Home Appliances - Page 5
Scraping TVs...
Scraped TVs - Page 1
Scraped TVs - Page 2
Scraped TVs - Page 3
Scraped TVs - Page 4
Scraped TVs - Page 5
Scraping Watches...
Scraped Watches - Page 1
Scraped Watches - Page 2
Scraped Watches - Page 3
Scraped Watches - Page 4
Scraped Watches - Page 5
Scraping Shoes...
Scraped Shoes - Page 1
Scraped Shoes - Page 2
Scraped Shoes - Page 3
Scraped Shoes - Pag

In [6]:
# Convert to DataFrame and save as CSV
df = pd.DataFrame(data)
df.to_csv("jumia_products.csv", index=False)
print("Scraping completed! Data saved to jumia_products.csv")

Scraping completed! Data saved to jumia_products.csv


In [7]:
df=pd.read_csv('jumia_products.csv')
df.head(5)

Unnamed: 0,Name,Brand,Price,Old Price,Discount,Rating,Review Count,Type of Product
0,"HP Chromebook 11 G6 EE, Intel Celeron Dual N33...",HP,"KSh 8,999","KSh 16,799",,,3.9 out of 5(48),Laptops
1,HP Refurbished EliteBook 8470p Core I5 - HDD 5...,HP,"KSh 13,999","KSh 23,000",,,3.7 out of 5(48),Laptops
2,Lenovo Refurbished ThinkPad Yoga 11e X360 Int...,Lenovo,"KSh 11,499","KSh 22,500",,,4 out of 5(162),Laptops
3,HP Refurbished EliteBook 8460p Core I5 - HDD 5...,HP,"KSh 12,899","KSh 22,000",,,3.7 out of 5(61),Laptops
4,"Lenovo ThinkPad X260 Intel Core I5, 8GB RAM, 2...",Lenovo,"KSh 15,795","KSh 28,000",,,4 out of 5(43),Laptops


In [8]:
# Function to clean and extract product details
def clean_product_name(name):
    if not isinstance(name, str):
        return None, None, None
    
    # Extract storage (RAM and SSD/HDD sizes)
    storage_match = re.findall(r'\b(\d+GB\s*(?:RAM|SSD|HDD))\b', name, re.IGNORECASE)
    storage = ', '.join(storage_match) if storage_match else "N/A"
    
    # Remove storage details and extra info to clean the name
    clean_name = re.sub(r'\b(\d+GB\s*(?:RAM|SSD|HDD))\b', '', name, flags=re.IGNORECASE)
    clean_name = clean_name.split(',')[0].strip()  # Keep the first main part
    
    # Extract remaining details
    more_details = name.replace(clean_name, '').strip().strip(',')
    
    return clean_name, storage, more_details

In [9]:
# Apply the function to clean names and extract storage
df[['Clean Name', 'Storage', 'More Details']] = df.apply(
    lambda row: pd.Series(clean_product_name(row['Name'])), axis=1
)

df.head(5)

Unnamed: 0,Name,Brand,Price,Old Price,Discount,Rating,Review Count,Type of Product,Clean Name,Storage,More Details
0,"HP Chromebook 11 G6 EE, Intel Celeron Dual N33...",HP,"KSh 8,999","KSh 16,799",,,3.9 out of 5(48),Laptops,HP Chromebook 11 G6 EE,"4GB RAM, 16GB SSD","Intel Celeron Dual N3350 , Storage 4GB RAM/16..."
1,HP Refurbished EliteBook 8470p Core I5 - HDD 5...,HP,"KSh 13,999","KSh 23,000",,,3.7 out of 5(48),Laptops,HP Refurbished EliteBook 8470p Core I5 - HDD 5...,8GB RAM,HP Refurbished EliteBook 8470p Core I5 - HDD 5...
2,Lenovo Refurbished ThinkPad Yoga 11e X360 Int...,Lenovo,"KSh 11,499","KSh 22,500",,,4 out of 5(162),Laptops,Lenovo Refurbished ThinkPad Yoga 11e X360 Int...,4GB RAM,Lenovo Refurbished ThinkPad Yoga 11e X360 Int...
3,HP Refurbished EliteBook 8460p Core I5 - HDD 5...,HP,"KSh 12,899","KSh 22,000",,,3.7 out of 5(61),Laptops,HP Refurbished EliteBook 8460p Core I5 - HDD 5...,4GB RAM,HP Refurbished EliteBook 8460p Core I5 - HDD 5...
4,"Lenovo ThinkPad X260 Intel Core I5, 8GB RAM, 2...",Lenovo,"KSh 15,795","KSh 28,000",,,4 out of 5(43),Laptops,Lenovo ThinkPad X260 Intel Core I5,"8GB RAM, 256GB SSD","8GB RAM, 256GB SSD, 12.5'' REFURBISHED"


In [10]:
df.tail(5)

Unnamed: 0,Name,Brand,Price,Old Price,Discount,Rating,Review Count,Type of Product,Clean Name,Storage,More Details
1595,Ice Cubes Small Ice Maker Machine Ice Cube Mak...,Ice,"KSh 26,999","KSh 30,000",,,,Kitchen Appliances,Ice Cubes Small Ice Maker Machine Ice Cube Mak...,,Fast Ice Making
1596,Compact Portable Ice Cube Maker Counter top Ic...,Compact,"KSh 26,999","KSh 30,000",,,,Kitchen Appliances,Compact Portable Ice Cube Maker Counter top Ic...,,
1597,RAF 24 Hours Countertop Crushed Ice Chewable I...,RAF,"KSh 30,000","KSh 60,000",,,,Kitchen Appliances,RAF 24 Hours Countertop Crushed Ice Chewable I...,,
1598,RAF Portable High Quality Adjustable-size Ice ...,RAF,"KSh 30,000","KSh 60,000",,,,Kitchen Appliances,RAF Portable High Quality Adjustable-size Ice ...,,
1599,RAF Automatic Smart Countertop Ice Making Mach...,RAF,"KSh 30,000","KSh 60,000",,,,Kitchen Appliances,RAF Automatic Smart Countertop Ice Making Mach...,,


In [11]:
df.isnull().sum()

Name                  0
Brand                 0
Price                 0
Old Price           128
Discount           1600
Rating             1600
Review Count        606
Type of Product       0
Clean Name            0
Storage               0
More Details          0
dtype: int64

In [12]:
# Convert price columns to numerical values
def clean_price(price):
    if isinstance(price, str):
        return int(''.join(filter(str.isdigit, price)))  # Remove non-numeric characters
    return 0

df["Price"] = df["Price"].apply(clean_price)
df["Old Price"] = df["Old Price"].apply(clean_price)

In [13]:
# Calculate discount percentage where old price is available
df["Discount"] = (((df["Old Price"] - df["Price"]) / df["Old Price"]) * 100).round(2)
# Set discount to 0 where old price is missing
df.loc[df["Old Price"] == 0, "Discount"] = 0

# Extract numeric rating and review count
df["Rating"] = df["Review Count"].str.extract(r'(\d+\.\d+|\d+)').astype(float, errors='ignore')
df["Review Count"] = df["Review Count"].str.extract(r'\((\d+)\)').astype(float, errors='ignore')

# Handle missing values
df.fillna({"Price": 0, "Old Price": 0, "Discount": 0, "Rating": 0, "Review Count": 0}, inplace=True)
df.head(5)

Unnamed: 0,Name,Brand,Price,Old Price,Discount,Rating,Review Count,Type of Product,Clean Name,Storage,More Details
0,"HP Chromebook 11 G6 EE, Intel Celeron Dual N33...",HP,8999,16799,46.43,3.9,48.0,Laptops,HP Chromebook 11 G6 EE,"4GB RAM, 16GB SSD","Intel Celeron Dual N3350 , Storage 4GB RAM/16..."
1,HP Refurbished EliteBook 8470p Core I5 - HDD 5...,HP,13999,23000,39.13,3.7,48.0,Laptops,HP Refurbished EliteBook 8470p Core I5 - HDD 5...,8GB RAM,HP Refurbished EliteBook 8470p Core I5 - HDD 5...
2,Lenovo Refurbished ThinkPad Yoga 11e X360 Int...,Lenovo,11499,22500,48.89,4.0,162.0,Laptops,Lenovo Refurbished ThinkPad Yoga 11e X360 Int...,4GB RAM,Lenovo Refurbished ThinkPad Yoga 11e X360 Int...
3,HP Refurbished EliteBook 8460p Core I5 - HDD 5...,HP,12899,22000,41.37,3.7,61.0,Laptops,HP Refurbished EliteBook 8460p Core I5 - HDD 5...,4GB RAM,HP Refurbished EliteBook 8460p Core I5 - HDD 5...
4,"Lenovo ThinkPad X260 Intel Core I5, 8GB RAM, 2...",Lenovo,15795,28000,43.59,4.0,43.0,Laptops,Lenovo ThinkPad X260 Intel Core I5,"8GB RAM, 256GB SSD","8GB RAM, 256GB SSD, 12.5'' REFURBISHED"


In [14]:
df.tail()

Unnamed: 0,Name,Brand,Price,Old Price,Discount,Rating,Review Count,Type of Product,Clean Name,Storage,More Details
1595,Ice Cubes Small Ice Maker Machine Ice Cube Mak...,Ice,26999,30000,10.0,0.0,0.0,Kitchen Appliances,Ice Cubes Small Ice Maker Machine Ice Cube Mak...,,Fast Ice Making
1596,Compact Portable Ice Cube Maker Counter top Ic...,Compact,26999,30000,10.0,0.0,0.0,Kitchen Appliances,Compact Portable Ice Cube Maker Counter top Ic...,,
1597,RAF 24 Hours Countertop Crushed Ice Chewable I...,RAF,30000,60000,50.0,0.0,0.0,Kitchen Appliances,RAF 24 Hours Countertop Crushed Ice Chewable I...,,
1598,RAF Portable High Quality Adjustable-size Ice ...,RAF,30000,60000,50.0,0.0,0.0,Kitchen Appliances,RAF Portable High Quality Adjustable-size Ice ...,,
1599,RAF Automatic Smart Countertop Ice Making Mach...,RAF,30000,60000,50.0,0.0,0.0,Kitchen Appliances,RAF Automatic Smart Countertop Ice Making Mach...,,


In [15]:
df.isnull().sum()

Name               0
Brand              0
Price              0
Old Price          0
Discount           0
Rating             0
Review Count       0
Type of Product    0
Clean Name         0
Storage            0
More Details       0
dtype: int64