In [12]:
import requests
import json
import pandas as pd
import itertools
from itertools import product
import time

In [13]:
# Search vocabulary used to build the queries: every category slug is combined
# with every location slug. No slug contains whitespace, so each text block
# below is turned into a list simply by splitting on whitespace.
categories = """
    Accounting-Finance
    Administration
    Analyst-Research
    Android-Jobs
    Banking
    Business-Development
    C-Level-Executive-GM-Director
    Creative-Design-Art
    Customer-Service-Support
    Education-Teaching
    Engineering-Construction-Civil-Architecture
    Engineering-Mechanical-Electrical
    Engineering-Oil-Gas-Energy
    Engineering-Other
    Engineering-Telecom-Technology
    Fashion
    Hospitality-Hotels-Food-Services
    Human-Resources
    IT-Software-Development
    Installation-Maintenance-Repair
    Internships-in-Egypt
    Legal
    Logistics-Supply-Chain
    Manufacturing-Production
    Marketing-PR-Advertising
    Media-Journalism-Publishing
    Medical-Healthcare
    Operations-Management
    Pharmaceutical
    Project-Program-Management
    Purchasing-Procurement
    Quality
    R-D-Science
    Sales-Retail
    Sports-and-Leisure
    Startup-Jobs
    Strategy-Consulting
    Tourism-Travel
    Training-Instructor
    Writing-Editorial
""".split()

locations = """
    Cairo
    Dubai
    Riyadh
    Giza
    Alexandria
    Sharqia
    Makkah
    Monufya
    Red-Sea
    Suez
    Ras-al-Khaimah
    Dakahlia
    Qalubia
    Gharbia
    Matruh
    Jeddah
    Ismailia
    Tabuk
    Dammam
    Beheira
    Beni-Suef
    Abu-Dhabi
    Port-Said
    Damietta
    Assiut
    New-Cairo
    Maadi
    Nasr-City
    6th-of-October
    Heliopolis
    Sheikh-Zayed
    Obour-City
    Sheraton
    10th-of-Ramadan-City
    Dokki
    Mohandessin
    Mokattam
    Badr-City
    Alsadat-City
    Haram
    New-Capital
    Katameya
    Ameria
    New-Nozha
    Bourj-Alarab
    Smouha
    Downtown
    Ain-Sokhna
    Shorouk-City
""".split()

In [14]:
# --- Scraper configuration ---

# Endpoint the search payloads are POSTed to, and the header declaring them as JSON.
API_URL: str = "https://wuzzuf.net/api/search/job"
HEADERS: dict = {"content-type": "application/json;charset=UTF-8"}

# Paging / throttling knobs read by the crawl loop.
page_size: int = 10000                # sent as "pageSize" with every request
delay_between_requests: float = 1.0   # seconds slept after each processed page

# Accumulator filled by the crawl loop: one dict per job returned by the API.
all_jobs: list = []

In [15]:
# --- Crawl every (category, location) search and collect the returned job ids ---

# Start from an empty list so that re-running this cell does not append a
# second copy of every record to the results of a previous run.
all_jobs = []

MAX_ATTEMPTS = 4                               # tries per page before the pair is abandoned
BACKOFF_BASE_SECONDS = 2.0                     # wait before the first retry; doubles afterwards
RETRYABLE_STATUS = {429, 500, 502, 503, 504}   # HTTP statuses treated as transient


def _fetch_page(cat, loc, start):
    """Request one page of search results, retrying transient failures.

    Args:
        cat: Category slug interpolated into the query text.
        loc: Location slug interpolated into the query text.
        start: Value sent as "startIndex".

    Returns:
        The list stored under "data" in the JSON response (empty when the
        API has nothing more to return), or None when the page could not be
        fetched: a non-retryable HTTP status, or MAX_ATTEMPTS exhausted.
    """
    payload = {
        "startIndex": start,
        "pageSize": page_size,
        "longitude": "0",
        "latitude": "0",
        "query": f"{cat} Jobs in {loc}",
        "searchFilters": {}
    }

    for attempt in range(MAX_ATTEMPTS):
        try:
            res = requests.post(API_URL, headers=HEADERS, data=json.dumps(payload), timeout=20)
            if res.status_code == 200:
                return res.json().get("data", []) or []

            print(f"  ! HTTP {res.status_code} for {cat}-{loc} start={start}")
            if res.status_code not in RETRYABLE_STATUS:
                return None
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers an undecodable JSON body on requests versions
            # whose JSON error is not a RequestException subclass.
            print(f" Request error for {cat}-{loc} start={start}: {e}")

        # Exponential backoff before the next attempt (none after the last one).
        if attempt + 1 < MAX_ATTEMPTS:
            time.sleep(BACKOFF_BASE_SECONDS * 2 ** attempt)

    return None


for cat, loc in itertools.product(categories, locations):
    start = 0
    while True:
        jobs = _fetch_page(cat, loc, start)
        if not jobs:
            # None -> the page failed for good; [] -> no more results for this pair.
            break

        all_jobs.extend(
            {
                "search_category": cat,
                "search_location": loc,
                "id": j.get("id"),
            }
            for j in jobs
        )

        # Advance by what was actually received rather than by the requested
        # page size, so a server-side cap on "pageSize" cannot skip records.
        # NOTE(review): assumes "startIndex" is an item offset - confirm against the API.
        start += len(jobs)
        time.sleep(delay_between_requests)

In [16]:
# --- Convert to DataFrame ---
df_jobs = pd.DataFrame(all_jobs)
print(f"\nTotal jobs collected: {len(df_jobs)}")


Total jobs collected: 1108969


In [17]:
# Show the collected rows as the cell output (long frames are rendered abbreviated).
df_jobs

Unnamed: 0,search_category,search_location,id
0,Accounting-Finance,Cairo,3831f42a-fe69-40b9-a9f7-a2d725a18ae4
1,Accounting-Finance,Cairo,df8a8659-85d6-4e30-9666-d5f27156e911
2,Accounting-Finance,Cairo,2550c42d-cec8-4beb-93d8-79513c56baa0
3,Accounting-Finance,Cairo,0e06538b-941b-40a2-b6c4-a473939da9a6
4,Accounting-Finance,Cairo,b7d17a61-61e3-4253-9694-5b2096ce71e8
...,...,...,...
1108964,Writing-Editorial,Shorouk-City,93a170b1-28ba-43cd-a2a9-f479143d92f3
1108965,Writing-Editorial,Shorouk-City,c5aa5f79-4e83-4520-9c54-d14a20d10453
1108966,Writing-Editorial,Shorouk-City,f338654f-4e41-482a-8b8d-f1b8cbde1a2f
1108967,Writing-Editorial,Shorouk-City,2c5f4729-4e9c-4462-9571-4d5ce5a6903e


In [20]:
# De-duplicate on the job id: the first row seen for each id is kept, and the
# surviving rows keep their original index labels from df_jobs.
jobs_cleaned = df_jobs.drop_duplicates(
    subset="id",
    keep="first",
)

In [21]:
# Persist the de-duplicated table; the row index is not written to the file.
output_csv = 'wuzzuf job listings output 12-oct-2025.csv'
jobs_cleaned.to_csv(output_csv, index=False)

In [22]:
# Show the de-duplicated rows; index labels are those of df_jobs, so gaps are expected.
jobs_cleaned

Unnamed: 0,search_category,search_location,id
0,Accounting-Finance,Cairo,3831f42a-fe69-40b9-a9f7-a2d725a18ae4
1,Accounting-Finance,Cairo,df8a8659-85d6-4e30-9666-d5f27156e911
2,Accounting-Finance,Cairo,2550c42d-cec8-4beb-93d8-79513c56baa0
3,Accounting-Finance,Cairo,0e06538b-941b-40a2-b6c4-a473939da9a6
4,Accounting-Finance,Cairo,b7d17a61-61e3-4253-9694-5b2096ce71e8
...,...,...,...
1103782,Tourism-Travel,Suez,d01a8fbf-7cd1-4a8e-98b0-638809a6cbaf
1103788,Tourism-Travel,Suez,b50ec3c0-8003-427e-a5d9-c85cf8db2e23
1107677,Writing-Editorial,Cairo,58a909d1-73f2-4032-8f30-c4cfc654451f
1107703,Writing-Editorial,Dubai,69df21aa-78c4-406b-81b7-fb520ade8fe5
