In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import re

def web_scraping(base_url, first_page_url):
    descriptions = []
    prices = []
    older_prices = []
    reviews_list = []

    # Scrape the first page
    r = requests.get(first_page_url)
    print(f"Fetching data from: {first_page_url} - Status: {r.status_code}")

    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')

        # Extract product prices
        price_elements = soup.find_all("div", {"class": "prc"})
        for price_element in price_elements:
            prices.append(price_element.text.strip())

        # Extract old prices
        old_price_elements = soup.find_all("div", class_="old")
        for old_price_element in old_price_elements:
            older_prices.append(old_price_element.text.strip())

        # Extract product descriptions
        desc_elements = soup.find_all("h3", class_="name")
        for desc_element in desc_elements:
            descriptions.append(desc_element.text.strip())

        # Extract reviews
        reviews = soup.find_all("div", class_="stars _s")
        for rev in reviews:
            reviews_list.append(rev.text.strip())
    else:
        print(f"Failed to fetch data from {first_page_url}")

    # Scrape pages 2 to 50
    for i in range(2, 51):
        page_url = f"{base_url}{i}#catalog-listing"
        r = requests.get(page_url)
        print(f"Fetching data from: {page_url} - Status: {r.status_code}")

        if r.status_code != 200:
            print(f"Failed to fetch data from {page_url}")
            continue

        soup = BeautifulSoup(r.text, 'html.parser')

        # Extract product prices
        price_elements = soup.find_all("div", {"class": "prc"})
        for price_element in price_elements:
            prices.append(price_element.text.strip())

        # Extract old prices
        old_price_elements = soup.find_all("div", class_="old")
        for old_price_element in old_price_elements:
            older_prices.append(old_price_element.text.strip())

        # Extract product descriptions
        desc_elements = soup.find_all("h3", class_="name")
        for desc_element in desc_elements:
            descriptions.append(desc_element.text.strip())

        # Extract reviews
        reviews = soup.find_all("div", class_="stars _s")
        for rev in reviews:
            reviews_list.append(rev.text.strip())

    # Return collected data
    rows = list(zip(descriptions, prices, older_prices, reviews_list, urls))
    return rows



In [2]:
def collect_data_phones():

    data = web_scraping(base_url, first_page_url)

    # Convert to DataFrame for easier handling
    df = pd.DataFrame(data, columns=['Description', 'Price', 'Old Price', 'Reviews', 'urls'])

    # Save to CSV
    df.to_csv(r'..\data\scraped_data\jumia_scaped_phones.csv', index=False)
    


In [3]:
 #Base URL for pages 2 to 50
base_url = "https://www.jumia.co.ke//mobile-phones/?page="

# URL for the first page
first_page_url = "https://www.jumia.co.ke/mobile-phones/"

collect_data_phones()

Fetching data from: https://www.jumia.co.ke/mobile-phones/ - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=2#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=3#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=4#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=5#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=6#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=7#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=8#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=9#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones/?page=10#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-p

ValueError: 5 columns passed, passed data had 4 columns

In [107]:
import pandas as pd

def phones_cleaning(csv_path):
    dataset = pd.read_csv(csv_path)
    id = [i for i in range(1, len(dataset) + 1)]
    dataset["id"] = id
    dataset["id"] = dataset["id"].astype(int)
    dataset["source"] = ["Jumia"] * len(data)
    dataset["price"] = dataset["Price"].str.replace("KSh", "").str.replace(",", "")
    # dataset.drop("Price", axis = 1)
    dataset["price"] = dataset["price"].str.strip().str.replace("-", "").str.extract(r"(\d+)", expand = False)
    dataset["price"] = dataset["price"].astype(int)
    dataset["old_price"] = dataset["Old Price"].str.replace("KSh", "").str.replace(",", "")
    dataset["old_price"] = dataset["old_price"].str.strip().str.replace("-", "").str.extract(r"(\d+)", expand = False)
    dataset["old_price"] = dataset["old_price"].astype(int)
    # dataset.drop("Old Price", axis = 1)
    dataset["reviews"] = dataset["Reviews"].str.replace("out of 5", "").astype(float)
    # dataset.drop("Reviews", axis = 1)
    pattern_brand = r"^[a-zA-Z0-9\s]+"
    dataset["brand"] = dataset["Description"].str.extract(f"({pattern_brand})")
    pattern_ram = r"\d\s*[GB]+\s+RAM"
    dataset["RAM"] = dataset["Description"].str.extract(f"({pattern_ram})")
    dataset["RAM"] = dataset["RAM"].str.strip().str.replace("GB RAM", "")
    dataset["RAM"] = pd.to_numeric(dataset["RAM"], errors="coerce")
    pattern_rom = r"\b(128GB|64GB|256GB)\b"
    values = dataset["Description"].str.extract(f"({pattern_rom})").fillna("Unknown")
    dataset["storage"] = values[0]
    dataset["storage"] = dataset["storage"].str.replace("GB", "").str.strip()
    dataset["storage"] = pd.to_numeric(dataset["storage"], errors = "coerce")
    pattern_bat = r"[0-9]+\s*(mah|MAH|MaH|mAh|MAh)"
    result = dataset["Description"].str.extract(f"({pattern_bat})")
    dataset["Battery"] = result[0]
    dataset["Battery"] = dataset["Battery"].str.replace("mAh", "").str.replace("mah","").str.replace("MAH","").str.replace("MaH","").str.replace("MAh","")
    dataset["Battery"] = pd.to_numeric(data["Battery"], errors = "coerce")
    dataset = dataset.drop(columns = ["Price", "Old Price", "Reviews"])
    columns = ["id", "Description", "brand", "price", "old_price", "reviews", "RAM", "storage", "Battery", "source"]
    dataset = dataset[columns]
    dataset.to_csv(r"..\data\clean_data\jumia_clean_phones.csv", index = False)
    
    return dataset


In [109]:

data = phones_cleaning(r'..\data\scraped_data\jumia_scaped_phones.csv')
# data["price"] = data["price"].str.strip().str.replace("-", "").str.extract(r"(\d+)", expand = False)
# data["price"] = data["price"].astype(int)
# data["RAM"] = data["RAM"].str.strip().str.replace("GB RAM", "")
# data["RAM"] = pd.to_numeric(data["RAM"], errors="coerce")
# data["storage"] = data["storage"].str.replace("GB", "").str.strip()
# data["storage"] = pd.to_numeric(data["storage"], errors = "coerce")
# data["Battery"] = data["Battery"].str.replace("mAh", "").str.replace("mah","").str.replace("MAH","").str.replace("MaH","").str.replace("MAh","")
# data["Battery"] = pd.to_numeric(data["Battery"], errors = "coerce")
data

Unnamed: 0,id,Description,brand,price,old_price,reviews,RAM,storage,Battery,source
0,1,"Itel S23+ 6.78"", 128GB + 4GB RAM, 50MP Camera,...",Itel S23,8999,23000,4.5,4.0,128.0,5000.0,Jumia
1,2,"Oppo A3X, 6.67"", 64GB + 4GB RAM, 4G LTE, 45W F...",Oppo A3X,13000,15999,5.0,4.0,64.0,100.0,Jumia
2,3,"Samsung Galaxy A05, 6.7'' 4GB RAM + 64GB ROM (...",Samsung Galaxy A05,13580,14999,4.1,4.0,64.0,5000.0,Jumia
3,4,"Infinix Smart 8 6.6"" HD, 2GB RAM + 64GB , Andr...",Infinix Smart 8 6,7999,10000,4.1,2.0,64.0,5000.0,Jumia
4,5,"XIAOMI Redmi 14C, 6.88"" (4GB RAM+128GB Storage...",XIAOMI Redmi 14C,21060,19999,4.2,4.0,128.0,5160.0,Jumia
...,...,...,...,...,...,...,...,...,...,...
1125,1126,"Bontel 2720//SCREEN DISPLAY1.77""//1000mAh//fre...",Bontel 2720,17399,220000,5.0,,,1000.0,Jumia
1126,1127,"Nokia 6310,2.8 Inchs, 8MB+16MB,Dual Sim - Black",Nokia 6310,18590,39999,5.0,,,,Jumia
1127,1128,"Tecno T301 , Dual Sim Kabambe/Katululu/ Mulika...",Tecno T301,69999,27999,1.0,,,,Jumia
1128,1129,"XIAOMI Redmi Note 13 Pro, 6.67"", 512GB ROM + ...",XIAOMI Redmi Note 13 Pro,2249,2500,2.0,2.0,,5100.0,Jumia


In [104]:
data.dtypes

Description     object
brand           object
price            int64
old_price        int64
reviews        float64
RAM            float64
storage        float64
Battery        float64
dtype: object

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import re

def web_scraping(base_url, first_page_url):
    descriptions = []
    prices = []
    older_prices = []
    reviews_list = []
    urls = []

    # Scrape the first page
    r = requests.get(first_page_url)
    print(f"Fetching data from: {first_page_url} - Status: {r.status_code}")

    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')

        # Extract product prices
        price_elements = soup.find_all("div", {"class": "prc"})
        for price_element in price_elements:
            prices.append(price_element.text.strip())

        # Extract old prices
        old_price_elements = soup.find_all("div", class_="old")
        for old_price_element in old_price_elements:
            older_prices.append(old_price_element.text.strip())

        # Extract product descriptions
        desc_elements = soup.find_all("h3", class_="name")
        for desc_element in desc_elements:
            descriptions.append(desc_element.text.strip())

        # Extract reviews
        reviews = soup.find_all("div", class_="stars _s")
        for rev in reviews:
            reviews_list.append(rev.text.strip())

        
        products = soup.find_all('a', class_='core')

        for product in products:
                # Extract product link
                link = product['href'] if 'href' in product.attrs else None
                # Complete the URL if the link is relative
                link = f"https://www.jumia.co.ke{link}" if link and link.startswith('/') else link
                urls.append(link)


    else:
        print(f"Failed to fetch data from {first_page_url}")

    # Scrape pages 2 to 50
    for i in range(2, 51):
        page_url = f"{base_url}{i}#catalog-listing"
        r = requests.get(page_url)
        print(f"Fetching data from: {page_url} - Status: {r.status_code}")

        if r.status_code != 200:
            print(f"Failed to fetch data from {page_url}")
            continue

        soup = BeautifulSoup(r.text, 'html.parser')

        # Extract product prices
        price_elements = soup.find_all("div", {"class": "prc"})
        for price_element in price_elements:
            prices.append(price_element.text.strip())

        # Extract old prices
        old_price_elements = soup.find_all("div", class_="old")
        for old_price_element in old_price_elements:
            older_prices.append(old_price_element.text.strip())

        # Extract product descriptions
        desc_elements = soup.find_all("h3", class_="name")
        for desc_element in desc_elements:
            descriptions.append(desc_element.text.strip())

        # Extract reviews
        reviews = soup.find_all("div", class_="stars _s")
        for rev in reviews:
            reviews_list.append(rev.text.strip())
        
        products = soup.find_all('a', class_='core')

        for product in products:
            # Extract product link 
            link = product['href'] if 'href' in product.attrs else None
            # Complete the URL if the link is relative
            link = f"https://www.jumia.co.ke{link}" if link and link.startswith('/') else link
            urls.append(link)


    # Return collected data
    rows = list(zip(descriptions, prices, older_prices, reviews_list, urls))
    return rows




In [11]:
def collect_data_phones():

    data = web_scraping(base_url, first_page_url)

    # Convert to DataFrame for easier handling
    df = pd.DataFrame(data, columns=['Description', 'Price', 'Old Price', 'Reviews', 'urls'])

    # Save to CSV
    df.to_csv(r"..\data\scraped_data\jumia_scraped_phones.csv", index = False)


 #Base URL for pages 2 to 50
base_url = "https://www.jumia.co.ke//mobile-phones//?page="

# URL for the first page
first_page_url = "https://www.jumia.co.ke//mobile-phones//"

collect_data_phones()


Fetching data from: https://www.jumia.co.ke//mobile-phones// - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones//?page=2#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones//?page=3#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones//?page=4#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones//?page=5#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones//?page=6#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones//?page=7#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones//?page=8#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones//?page=9#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.ke//mobile-phones//?page=10#catalog-listing - Status: 200
Fetching data from: https://www.jumia.co.k

In [19]:
def phones_cleaning(csv_path):
    dataset = pd.read_csv(csv_path)
    id = [i for i in range(1, len(dataset) + 1)]
    dataset["id"] = id
    dataset["id"] = dataset["id"].astype(int)
    dataset["source"] = ["Jumia"] * len(dataset)
    dataset["price"] = dataset["Price"].str.replace("KSh", "").str.replace(",", "")
    # dataset.drop("Price", axis = 1)
    dataset["price"] = dataset["price"].str.strip().str.replace("-", "").str.extract(r"(\d+)", expand = False)
    dataset["price"] = dataset["price"].astype(int)
    dataset["old_price"] = dataset["Old Price"].str.replace("KSh", "").str.replace(",", "")
    dataset["old_price"] = dataset["old_price"].str.strip().str.replace("-", "").str.extract(r"(\d+)", expand = False)
    dataset["old_price"] = dataset["old_price"].astype(int)
    # dataset.drop("Old Price", axis = 1)
    dataset["reviews"] = dataset["Reviews"].str.replace("out of 5", "").astype(float)
    # dataset.drop("Reviews", axis = 1)
    pattern_brand = r"^[a-zA-Z0-9\s]+"
    dataset["brand"] = dataset["Description"].str.extract(f"({pattern_brand})")
    pattern_ram = r"\d\s*[GB]+\s+RAM"
    dataset["RAM"] = dataset["Description"].str.extract(f"({pattern_ram})")
    dataset["RAM"] = dataset["RAM"].str.strip().str.replace("GB RAM", "")
    dataset["RAM"] = pd.to_numeric(dataset["RAM"], errors="coerce")
    pattern_rom = r"\b(128GB|64GB|256GB)\b"
    values = dataset["Description"].str.extract(f"({pattern_rom})").fillna("Unknown")
    dataset["storage"] = values[0]
    dataset["storage"] = dataset["storage"].str.replace("GB", "").str.strip()
    dataset["storage"] = pd.to_numeric(dataset["storage"], errors = "coerce")
    pattern_bat = r"[0-9]+\s*(mah|MAH|MaH|mAh|MAh)"
    result = dataset["Description"].str.extract(f"({pattern_bat})")
    dataset["Battery"] = result[0]
    dataset["Battery"] = dataset["Battery"].str.replace("mAh", "").str.replace("mah","").str.replace("MAH","").str.replace("MaH","").str.replace("MAh","")
    dataset["Battery"] = pd.to_numeric(dataset["Battery"], errors = "coerce")
    dataset = dataset.drop(columns = ["Price", "Old Price", "Reviews"])
    columns = ["id", "Description", "brand", "price", "old_price", "reviews", "RAM", "storage", "Battery", "source", "urls"]
    dataset = dataset[columns]
    dataset.to_csv(r"..\data\clean_data\jumia_clean_phones.csv", index = False)
    
    return dataset


csv_path = r"..\data\scraped_data\jumia_scraped_phones.csv"


phones_cleaning(csv_path)

Unnamed: 0,id,Description,brand,price,old_price,reviews,RAM,storage,Battery,source,urls
0,1,"Itel S23+ 6.78"", 128GB + 4GB RAM, 50MP Camera,...",Itel S23,8999,23000,4.5,4.0,128.0,5000.0,Jumia,https://www.jumia.co.ke/infinix-smart-8-6.6-hd...
1,2,"Tecno Spark 30c, 6.67'' HD+, UP to 128GB ROM+ ...",Tecno Spark 30c,13000,14999,4.1,4.0,128.0,5000.0,Jumia,https://www.jumia.co.ke/infinix-hot-50i-6.7-12...
2,3,"Oppo A3X, 6.67"", 64GB + 4GB RAM, 4G LTE, 45W F...",Oppo A3X,13580,15999,5.0,4.0,64.0,100.0,Jumia,https://www.jumia.co.ke/poco-c75-6.88-128gb-6g...
3,4,"VILLAON V101 177"" Kabambe, Wireless FM, Camera...",VILLAON V101 177,7999,1200,4.0,,,1000.0,Jumia,https://www.jumia.co.ke/blackview-wave-6c-6.5-...
4,5,"Samsung Galaxy A05, 6.7'' 4GB RAM + 64GB ROM (...",Samsung Galaxy A05,21060,14999,4.1,4.0,64.0,5000.0,Jumia,https://www.jumia.co.ke/itel-rs4-6.56-8gb-ram-...
...,...,...,...,...,...,...,...,...,...,...,...
1142,1143,"Infinix NOTE 30, 256GB ROM+8GB RAM, DISPLAY 6....",Infinix NOTE 30,12099,28888,3.0,8.0,256.0,,Jumia,https://www.jumia.co.ke/tecno-pop-9-6.67-128gb...
1143,1144,Modio 5G Tablet - Kids Android Tablets PC M730,Modio 5G Tablet,2250,40000,5.0,,,,Jumia,https://www.jumia.co.ke/bontel-911size-display...
1144,1145,"Tecno Transformers Spark 30, 6.8'' UP to 16GB ...",Tecno Transformers Spark 30,10270,80000,4.0,6.0,256.0,5000.0,Jumia,https://www.jumia.co.ke/xiaomi-a3x-6.71-3gb-ra...
1145,1146,"Nokia 3310, 2.4 Inches 2MP Dual SIM Cards-Dark...",Nokia 3310,10200,9000,4.0,,,,Jumia,https://www.jumia.co.ke/realme-c30s-6.5display...
