In [2]:
import requests
import threading
import time
from queue import Queue
import pandas as pd
from urllib.parse import urljoin
from datetime import datetime

In [3]:
# --- Config ---
# Site root (used to turn advert URLs into absolute ones) and the listing
# endpoint that is queried page by page.
BASE_SITE = "https://jiji.com.et"
BASE_URL = "https://jiji.com.et/api_web/v1/listing"

# Category to scrape; sent as the "slug" query parameter.
CATEGORY_SLUG = "agriculture-and-foodstuff"

# Value sent as the "lsmid" query parameter on every request.
# NOTE(review): meaning of this value is not visible here — confirm with the API.
LSMID = "1768997666323"

# Number of scraper threads, and the pause (in seconds) each one takes
# between requests.
NUM_WORKERS = 5
DELAY_SECONDS = 2

# Timestamp taken once at start-up and stamped on every scraped record.
date = datetime.now()

# --- Shared variables ---
# Each piece of shared state is paired with the lock that guards it.

# Next page number to hand out to a worker.
next_page, page_lock = 1, threading.Lock()

# GUIDs already collected, used for de-duplication across workers.
seen_guids, seen_lock = set(), threading.Lock()

# Accumulated product records from all workers.
all_products, products_lock = [], threading.Lock()

# Set by a worker to tell every worker to stop.
stop_event = threading.Event()

# --- Worker function ---
# Number of times a page request is attempted before the page is skipped.
_MAX_FETCH_ATTEMPTS = 3


def _fetch_page(worker_id, page):
    """Request one listing page and return its decoded JSON object.

    The request is retried up to ``_MAX_FETCH_ATTEMPTS`` times, pausing
    ``DELAY_SECONDS`` after each failure, so that a transient error does not
    silently drop the whole page.

    Args:
        worker_id: Identifier used only in log messages.
        page: Page number sent as the "page" query parameter.

    Returns:
        The decoded JSON object (a dict), or ``None`` if every attempt failed.
    """
    params = {
        "slug": CATEGORY_SLUG,
        "init_page": "true" if page == 1 else "false",
        "page": page,
        "webp": "false",
        "lsmid": LSMID,
    }

    for _ in range(_MAX_FETCH_ATTEMPTS):
        try:
            resp = requests.get(BASE_URL, params=params, timeout=10)
            resp.raise_for_status()
            data = resp.json()
            # The rest of the pipeline calls .get() on the payload; anything
            # that is not a JSON object is treated like a failed request.
            if not isinstance(data, dict):
                raise ValueError(
                    f"unexpected JSON payload of type {type(data).__name__}"
                )
            return data
        except Exception as e:
            # Deliberately broad: this is a best-effort scraper and a single
            # bad response must not kill the worker thread.
            print(f"[Worker {worker_id}] Request failed on page {page}: {e}")
            time.sleep(DELAY_SECONDS)

    print(
        f"[Worker {worker_id}] Giving up on page {page} "
        f"after {_MAX_FETCH_ATTEMPTS} attempts"
    )
    return None


def _build_product(item, guid):
    """Convert one raw advert dict into the flat record that gets saved.

    Nested fields use ``or`` fallbacks rather than ``.get(key, default)``
    because a key that is present with a JSON ``null`` value would otherwise
    yield ``None`` and break the chained lookups.
    """
    price_obj = item.get("price_obj") or {}
    images = [
        img.get("url") for img in (item.get("images") or []) if img.get("url")
    ]
    condition = next(
        (
            attr.get("value")
            for attr in (item.get("attrs") or [])
            if attr.get("name") == "Condition"
        ),
        None,
    )

    return {
        "title": item.get("title"),
        "category": item.get("category_name"),
        "category_id": item.get("category_id"),
        "description": item.get("details") or item.get("short_description"),
        "price": price_obj.get("value"),
        "price_text": item.get("price_title"),
        "currency": price_obj.get("currency") or "ETB",
        "condition": condition,
        "region": item.get("region_name"),
        "region_id": item.get("region_id"),
        "city": item.get("region_parent_name"),
        "url": urljoin(BASE_SITE, item.get("url") or ""),
        "guid": guid,
        "images": images,
        "count_images": item.get("count_images"),
        "user_id": item.get("user_id"),
        "status": item.get("status"),
        "scrape_date": date,
    }


def worker(worker_id):
    """Scrape listing pages until a page without adverts is found.

    Each iteration claims the next unclaimed page number under ``page_lock``,
    fetches it, and appends every advert whose GUID has not been seen before
    to ``all_products``. The first worker to receive a page with no adverts
    sets ``stop_event``, which makes every worker leave its loop.

    Args:
        worker_id: Identifier used only in log messages.
    """
    global next_page
    while not stop_event.is_set():
        # Claim the next page number atomically so no two workers fetch the
        # same page.
        with page_lock:
            page = next_page
            next_page += 1

        data = _fetch_page(worker_id, page)
        if data is None:
            # All attempts failed; skip this page and carry on.
            # NOTE(review): with a permanently unreachable server this loop
            # never terminates (same as before) — consider a failure cap.
            continue

        adverts = (data.get("adverts_list") or {}).get("adverts") or []

        # Stop if no adverts
        if not adverts:
            print(f"[Worker {worker_id}] Page {page} has no adverts, stopping scraper")
            stop_event.set()
            break

        new_items = 0
        for item in adverts:
            guid = item.get("guid")
            if not guid:
                continue

            # Deduplicate: check-and-add must be one atomic step.
            with seen_lock:
                if guid in seen_guids:
                    continue
                seen_guids.add(guid)

            product = _build_product(item, guid)

            with products_lock:
                all_products.append(product)
            new_items += 1

        print(f"[Worker {worker_id}] Page {page} scraped, {new_items} new items")
        # Pause between requests, but wake up immediately once another worker
        # has signalled the end of the listing.
        stop_event.wait(DELAY_SECONDS)

# --- Start threads ---
# Launch NUM_WORKERS scraper threads, numbered from 1 for the log messages.
threads = []
for i in range(NUM_WORKERS):
    t = threading.Thread(target=worker, args=(i+1,))
    t.start()
    threads.append(t)

# Wait for all threads to finish
try:
    for t in threads:
        t.join()
finally:
    # If the wait is interrupted (e.g. the notebook cell is stopped), make
    # sure the non-daemon workers are told to exit instead of running on.
    stop_event.set()

# --- Save results ---
# One JSON object per line. The file name follows CATEGORY_SLUG so that
# changing the category does not overwrite another category's output.
# `index=False` is intentionally not passed: the "records" orient never
# writes the index, and older pandas versions reject that combination.
df = pd.DataFrame(all_products)
output_path = f"jiji_{CATEGORY_SLUG}.jsonl"
df.to_json(output_path, orient="records", lines=True)
print(f"Scraping completed: {len(all_products)} products collected")


[Worker 7] Page 7 scraped, 20 new items
[Worker 6] Page 6 scraped, 20 new items
[Worker 3] Page 3 scraped, 20 new items
[Worker 5] Page 5 scraped, 20 new items
[Worker 4] Page 4 scraped, 20 new items
[Worker 1] Page 1 scraped, 20 new items
[Worker 2] Page 2 scraped, 20 new items
[Worker 3] Page 10 scraped, 20 new items[Worker 7] Page 8 scraped, 20 new items

[Worker 6] Page 9 scraped, 20 new items
[Worker 4] Page 12 scraped, 20 new items
[Worker 5] Page 11 scraped, 20 new items
[Worker 1] Page 13 scraped, 20 new items
[Worker 2] Page 14 scraped, 20 new items
[Worker 7] Page 15 scraped, 20 new items[Worker 6] Page 17 scraped, 20 new items

[Worker 4] Page 18 scraped, 20 new items
[Worker 3] Page 16 scraped, 20 new items
[Worker 5] Page 19 scraped, 20 new items
[Worker 1] Page 20 scraped, 20 new items
[Worker 2] Page 21 scraped, 20 new items
[Worker 5] Page 26 scraped, 20 new items
[Worker 6] Page 23 scraped, 20 new items
[Worker 4] Page 24 scraped, 20 new items
[Worker 7] Page 22 scrape