In [2]:
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# URL template for the Jiji Uganda cars listing; the `{}` placeholder is
# filled with the page number via BASE_URL.format(page) in scrape_jiji.
BASE_URL = "https://jiji.ug/cars?page={}"

In [4]:
def clean_price(text):
    """Turn a displayed price string into an integer amount.

    The "USh" label and comma separators are removed first, then every
    remaining non-digit character is dropped.

    Returns:
        The price as an ``int``, or ``None`` when the text holds no digits.
    """
    without_label = text.replace("USh", "").replace(",", "").strip()
    digits_only = re.sub(r"\D", "", without_label)
    if not digits_only:
        return None
    return int(digits_only)

In [5]:
def extract_year(title):
    """Extract a four-digit year (1900-2099) from a listing title.

    Only a standalone group of exactly four digits starting with 19 or 20 is
    accepted. Digits embedded in a longer number (for example the "2000"
    inside a mileage such as "320000") are ignored.

    Args:
        title: Listing title; may be ``None`` or empty.

    Returns:
        The first matching year as an ``int``, or ``None`` if there is none.
    """
    if not title:
        return None
    # The lookarounds reject matches that are part of a longer digit run.
    match = re.search(r"(?<!\d)(?:19|20)\d{2}(?!\d)", title)
    return int(match.group()) if match else None

In [6]:
def parse_listing(card):
    """Extract data from a single car card.

    Every field is best-effort: anything that cannot be found in the card is
    returned as ``None`` instead of raising.

    Args:
        card: A BeautifulSoup tag for one listing (as produced by the
            ``find_all`` call in ``scrape_jiji``).

    Returns:
        A dict with the keys ``title``, ``make``, ``model``, ``year``,
        ``mileage``, ``fuel``, ``transmission``, ``condition``, ``price``
        and ``url``.
    """
    # The title anchor supplies both the title text and the listing link,
    # so look it up once.
    title_link = card.find("a", {"class": "b-list-advert__item-title"})
    title = title_link.text.strip() if title_link is not None else None

    price = None
    price_box = card.find("div", {"class": "b-list-advert__item-price"})
    if price_box is not None:
        try:
            price = clean_price(price_box.text.strip())
        except ValueError:
            # clean_price ends in int(); keep the field best-effort if the
            # conversion is rejected.
            price = None

    href = title_link.get("href") if title_link is not None else None
    link = "https://jiji.ug" + href if href else None

    # Extract attributes like mileage, fuel type, transmission, etc
    attributes = card.find_all("div", {"class": "b-list-advert__item-attr"})
    mileage, fuel, transmission, condition = None, None, None, None

    for attr in attributes:
        text = attr.text.lower()
        if "km" in text:
            # Stripped like the other attributes so no stray whitespace
            # ends up in the output.
            mileage = text.strip()
        elif "fuel" in text:
            fuel = text.replace("fuel: ", "").strip()
        elif "automatic" in text or "manual" in text:
            transmission = text.strip()
        elif "foreign" in text or "uganda used" in text:
            condition = text.strip()

    year = extract_year(title or "")

    # Extract make & model from title: the first two whitespace-separated
    # words, only when the title has at least two words.
    make, model = None, None
    if title:
        parts = title.split()
        if len(parts) >= 2:
            make, model = parts[0], parts[1]

    return {
        "title": title,
        "make": make,
        "model": model,
        "year": year,
        "mileage": mileage,
        "fuel": fuel,
        "transmission": transmission,
        "condition": condition,
        "price": price,
        "url": link,
    }

In [7]:
def scrape_jiji(max_pages=20, delay=2, timeout=10):
    """Scrape multiple pages from Jiji Uganda Cars section.

    Pages are requested in order starting from 1. A page that fails to load
    (network error or non-200 status) is skipped; the first page that loads
    but contains no listing cards ends the run.

    Args:
        max_pages: Highest page number to request.
        delay: Seconds to wait between consecutive requests.
        timeout: Seconds to wait for each HTTP response before giving up,
            so a stalled connection cannot hang the notebook.

    Returns:
        A pandas DataFrame with one row per listing (see ``parse_listing``
        for the columns); empty if nothing was scraped.
    """
    all_cars = []

    for page in range(1, max_pages + 1):
        # Pause before every request after the first, so failed pages are
        # followed by a delay as well.
        if page > 1:
            time.sleep(delay)

        print(f"Scraping page {page}...")
        url = BASE_URL.format(page)

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Skip this page instead of aborting and losing earlier results.
            print(f"Failed to load page {page}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to load page {page}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all("div", {"class": "b-list-advert__item"})

        if not cards:
            print("No more listings found. Stopping.")
            break

        all_cars.extend(parse_listing(card) for card in cards)

    return pd.DataFrame(all_cars)

In [8]:
df = scrape_jiji(max_pages=30, delay=2)

print("Scraped:", len(df), "cars")

# Save CSV
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (e.g. on a fresh checkout).
output_path = Path("raw_data/jiji_raw.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

df.head()

Scraping page 1...
No more listings found. Stopping.
Scraped: 0 cars
