In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import re

def web_scraping(base_url, first_page_url):
    descriptions = []
    prices = []
    older_prices = []
    reviews_list = []

    # Scrape the first page
    r = requests.get(first_page_url)
    print(f"Fetching data from: {first_page_url} - Status: {r.status_code}")

    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')

        # Extract product prices
        price_elements = soup.find_all("div", {"class": "prc"})
        for price_element in price_elements:
            prices.append(price_element.text.strip())

        # Extract old prices
        old_price_elements = soup.find_all("div", class_="old")
        for old_price_element in old_price_elements:
            older_prices.append(old_price_element.text.strip())

        # Extract product descriptions
        desc_elements = soup.find_all("h3", class_="name")
        for desc_element in desc_elements:
            descriptions.append(desc_element.text.strip())

        # Extract reviews
        reviews = soup.find_all("div", class_="stars _s")
        for rev in reviews:
            reviews_list.append(rev.text.strip())
    else:
        print(f"Failed to fetch data from {first_page_url}")

    # Scrape pages 2 to 50
    for i in range(2, 51):
        page_url = f"{base_url}{i}#catalog-listing"
        r = requests.get(page_url)
        print(f"Fetching data from: {page_url} - Status: {r.status_code}")

        if r.status_code != 200:
            print(f"Failed to fetch data from {page_url}")
            continue

        soup = BeautifulSoup(r.text, 'html.parser')

        # Extract product prices
        price_elements = soup.find_all("div", {"class": "prc"})
        for price_element in price_elements:
            prices.append(price_element.text.strip())

        # Extract old prices
        old_price_elements = soup.find_all("div", class_="old")
        for old_price_element in old_price_elements:
            older_prices.append(old_price_element.text.strip())

        # Extract product descriptions
        desc_elements = soup.find_all("h3", class_="name")
        for desc_element in desc_elements:
            descriptions.append(desc_element.text.strip())

        # Extract reviews
        reviews = soup.find_all("div", class_="stars _s")
        for rev in reviews:
            reviews_list.append(rev.text.strip())

    # Return collected data
    rows = list(zip(descriptions, prices, older_prices, reviews_list))
    return rows



In [4]:
def collect_data_phones():

    data = web_scraping(base_url, first_page_url)

    # Convert to DataFrame for easier handling
    df = pd.DataFrame(data, columns=['Description', 'Price', 'Old Price', 'Reviews'])

    # Save to CSV
    df.to_csv('jumia_mobile_phones.csv', index=False)

In [5]:
 #Base URL for pages 2 to 50
base_url = "https://www.jumia.co.ke//mobile-phones/?page="

# URL for the first page
first_page_url = "https://www.jumia.co.ke/mobile-phones/"

collect_data_phones()

Fetching data from: https://www.jumia.co.ke/mobile-phones/ - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=2#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=3#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=4#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=5#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=6#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=7#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=8#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=9#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=10#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-p

In [6]:
def phones_cleaning(csv_path):
    dataset = pd.read_csv(csv_path)
    dataset["price"] = dataset["Price"].str.replace("KSh", "").str.replace(",", "")
    # dataset.drop("Price", axis = 1)
    dataset["old_price"] = dataset["Old Price"].str.replace("KSh", "").str.replace(",", "")
    # dataset.drop("Old Price", axis = 1)
    dataset["reviews"] = dataset["Reviews"].str.replace("out of 5", "")
    # dataset.drop("Reviews", axis = 1)
    pattern_brand = r"^[a-zA-Z0-9\s]+"
    dataset["brand"] = dataset["Description"].str.extract(f"({pattern_brand})")
    pattern_ram = r"\d\s*[GB]+\s+RAM"
    dataset["RAM"] = dataset["Description"].str.extract(f"({pattern_ram})")
    pattern_rom = r"\b(128GB|64GB|256GB)\b"
    values = dataset["Description"].str.extract(f"({pattern_rom})").fillna("Unknown")
    dataset["storage"] = values[0]
    pattern_bat = r"[0-9]+\s*(mah|MAH|MaH|mAh|MAh)"
    result = dataset["Description"].str.extract(f"({pattern_bat})")
    dataset["Battery"] = result[0]
    dataset = dataset.drop(columns = ["Price", "Old Price", "Reviews"])
    columns = ["Description", "brand", "price", "old_price", "reviews", "RAM", "storage", "Battery"]
    dataset = dataset[columns]
    dataset.to_csv("cleaned_phones_jumia", index = False)
    
    return dataset


In [7]:
phones_cleaning("C:\\Users\\charity.ngari\\Desktop\\e-commerce-product-analysis\\jumia_mobile_phones.csv")

Unnamed: 0,Description,brand,price,old_price,reviews,RAM,storage,Battery
0,"Itel S23+ 6.78"", 128GB + 4GB RAM, 50MP Camera,...",Itel S23,12200,23000,4.5,4GB RAM,128GB,5000mAh
1,"Tecno Spark 30c, 6.67'' HD+, UP to 128GB ROM+ ...",Tecno Spark 30c,9200,14999,4.3,4GB RAM,128GB,5000 mAh
2,"XIAOMI Redmi 13 6.79'' 8GB+256GB Dual SIM, 4G,...",XIAOMI Redmi 13 6,13580,23000,4.3,,256GB,5030 MAh
3,"VILLAON V101 177"" Kabambe, Wireless FM, Camera...",VILLAON V101 177,12999,1200,4.1,,Unknown,1000mAh
4,"XIAOMI Redmi A3x, 6.71"", 128GB + 4GB RAM (Dual...",XIAOMI Redmi A3x,21060,12000,4.1,4GB RAM,128GB,5000mAh
...,...,...,...,...,...,...,...,...
1115,"Oale A18 3G- 5.5"" 3GB RAM 32GB ROM 8MP 3050mAh...",Oale A18 3G,19400,30000,4.3,3GB RAM,Unknown,3050mAh
1116,"Samsung Galaxy A04e, 6.5"" Display, 3GB RAM + ...",Samsung Galaxy A04e,22799,19999,5,3GB RAM,Unknown,
1117,"Spark10 Pro Smartphone5.5inch,16GB RAM/1TB ROM...",Spark10 Pro Smartphone5,9690,19999,3.9,6GB RAM,Unknown,6800mAh
1118,"XIAOMI Redmi 14C, 6.88"" (256GB Storage+8GB RAM...",XIAOMI Redmi 14C,16350,19999,5,8GB RAM,256GB,5160mAh
