# Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Define constants (headers & base URL)

In [2]:
# Browser-like User-Agent header sent with every HTTP request in this notebook.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# Listing-page URL template; the `{}` placeholder is filled with the page number via str.format.
base_url = "https://www.jumia.com.ng/laptops/?page={}"

# Accumulator for scraped records: the scraping loop appends one dict per product.
products = list()

# Function to get the rating and number of reviews for a given product

In [3]:
def get_reviews(product_url):
    """Fetch a product page and extract its rating and review-count text.

    The page is requested with the notebook-level ``headers`` and a 10 second
    timeout, then searched for the first text node containing "out of 5"
    (the rating) and the first one containing "verified ratings" (the number
    of reviews). Both values are returned as stripped strings, unparsed.

    Args:
        product_url: URL of the product page to fetch.

    Returns:
        A ``(rating, num_reviews)`` tuple. Each element is ``None`` when the
        matching text is not found. ``(None, None)`` is returned when the
        response status is not 200 or when any exception occurs (the error
        is printed rather than raised).
    """
    nothing = (None, None)

    try:
        response = requests.get(product_url, headers=headers, timeout=10)
        if response.status_code != 200:
            return nothing

        page = BeautifulSoup(response.text, 'html.parser')

        def first_text_containing(phrase):
            # First text node in the document that contains `phrase`, stripped.
            node = page.find(string=lambda text: text and phrase in text)
            if node:
                return node.strip()
            return None

        rating = first_text_containing("out of 5")
        num_reviews = first_text_containing("verified ratings")
        return rating, num_reviews

    except Exception as err:
        # Best-effort: a failed product page must not stop the whole scrape.
        print(f"Error fetching reviews for {product_url}: {err}")
        return nothing


## Scraping multiple pages with live progress updates

In [None]:
import requests
from requests.exceptions import ReadTimeout, ConnectionError
import time

number_of_pages = 5

# Start from an empty list so that re-running this cell does not append
# duplicates of the products collected by a previous run.
products.clear()

for page in range(1, number_of_pages + 1):
    print(f"Scraping page {page}...")

    try:
        resp = requests.get(base_url.format(page), headers=headers, timeout=20)  # longer timeout
        resp.raise_for_status()
    except (ReadTimeout, ConnectionError) as e:
        print(f"⚠️ Timeout/Connection error on page {page}, skipping... ({e})")
        continue
    except requests.RequestException as e:
        # raise_for_status() raises HTTPError on 4xx/5xx responses; without this
        # handler a single bad listing page would abort the whole scrape.
        print(f"⚠️ Request failed on page {page}, skipping... ({e})")
        continue

    soup = BeautifulSoup(resp.text, 'html.parser')
    items = soup.select('a.core')
    if not items:
        print(f"⚠️ No items found on page {page}")
        continue

    for item in items:
        # Cards without a non-empty title are skipped.
        title_tag = item.select_one('h3.name')
        if not title_tag:
            continue
        title = title_tag.text.strip()
        if not title:
            continue

        price_tag = item.select_one('.prc')
        price = price_tag.text.strip() if price_tag else None
        brand = item.get('data-gtm-brand', None)

        # An anchor without an href would make the concatenation raise TypeError;
        # keep the product but leave its URL (and therefore its reviews) empty.
        href = item.get('href')
        product_url = ("https://www.jumia.com.ng" + href) if href else None

        img_tag = item.select_one('img.img')
        # Prefer the 'data-src' attribute and fall back to 'src'.
        image_url = (img_tag.get('data-src') or img_tag.get('src')) if img_tag else None

        # get_reviews() catches its own exceptions and returns (None, None) on
        # failure, so no extra try/except is needed around the call.
        if product_url:
            rating, num_reviews = get_reviews(product_url)
        else:
            rating, num_reviews = None, None

        products.append({
            'title': title,
            'price': price,
            'brand': brand,
            'rating': rating,
            'num_reviews': num_reviews,
            'product_url': product_url,
            'image_url': image_url
        })

        time.sleep(1)  # Delay between product requests

    print(f"Finished page {page}")
    time.sleep(2)  # Delay between pages

print("Done scraping")


Scraping page 1...


# Save results to CSV

In [None]:
# Build one table from the scraped records (one row per product dict) and
# write it to a CSV file in the current working directory, without the index column.
df = pd.DataFrame(products)
df.to_csv("jumia_laptops.csv", index=False)
# Braces (not parentheses) are required for the f-string to interpolate the page count.
print(f"Collected {len(products)} products from {number_of_pages} pages.")


In [None]:
# Bare last expression of the cell: the notebook renders the DataFrame built from `products`.
df