In [None]:
# pakwheels_scrape_bs4.py
import requests
from bs4 import BeautifulSoup
import csv
import re

# Search-results page that gets scraped (PakWheels used cars, Karachi region).
URL = "https://www.pakwheels.com/used-cars/search/-/rg_karachi/"

# Request headers: a browser-like User-Agent passed to requests.get().
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def text_or_empty(el):
    """Return the stripped text of ``el``, or "" when ``el`` is falsy.

    ``el`` is expected to expose ``get_text(strip=True)``; a falsy value
    (such as the ``None`` returned by a failed lookup) yields an empty string
    instead of raising.
    """
    if not el:
        return ""
    return el.get_text(strip=True)

def first_or_empty(lst):
    """Return the first item of ``lst``, or "" when ``lst`` is empty or None."""
    if not lst:
        return ""
    return lst[0]

# Heuristic patterns for the free-text spec items of a listing. Each pattern
# is applied to one spec string at a time.
#
# A 4-digit year must not be directly followed by a km/cc unit; otherwise a
# mileage such as "2000 km" or an engine size such as "2000 cc" would be
# mistaken for the model year.
_YEAR_RE = re.compile(r"\b(?:19|20)\d{2}\b(?!\s*(?:kms?|cc)\b)", re.IGNORECASE)
# Mileage and engine size require digits in front of the unit, so words that
# merely contain the letters "km" or "cc" (e.g. "Accident") are not picked up.
_MILEAGE_RE = re.compile(r"\d[\d,]*\s*km", re.IGNORECASE)
_ENGINE_RE = re.compile(r"\d+\s*cc\b", re.IGNORECASE)
_TRANSMISSION_RE = re.compile(r"\b(automatic|manual|cvt|prosmatec)\b", re.IGNORECASE)
_FUEL_RE = re.compile(r"\b(petrol|hybrid|diesel|cng|lpg)\b", re.IGNORECASE)


def _first_match(pattern, texts):
    """Return the first match object of ``pattern`` over ``texts``, or None."""
    for text in texts:
        found = pattern.search(text)
        if found:
            return found
    return None


def _first_spec(pattern, texts):
    """Return the first whole spec string that ``pattern`` matches, or ""."""
    found = _first_match(pattern, texts)
    # Match.string is the full text that was searched, i.e. the spec item.
    return found.string if found else ""


def parse_listing(li):
    """Extract the fields of one listing element into a flat dict.

    Args:
        li: A parsed HTML element for a single listing. It must provide
            ``find``, ``select`` and ``find_all`` (``scrape_page`` passes
            BeautifulSoup elements).

    Returns:
        A dict with the keys ``Title``, ``Price``, ``Location``, ``Year``,
        ``Mileage``, ``Fuel``, ``Engine``, ``Transmission`` and ``Updated``.
        Every value is a string; a field that cannot be found is "".
        ``Year`` is the bare 4-digit number, while the other spec fields hold
        the full text of the spec item that matched.
    """
    # Each field tries a primary selector first and a fallback second.
    title = text_or_empty(li.find("a", class_="car-name")) or text_or_empty(li.find("h3"))

    price = (
        text_or_empty(li.find("p", class_="price"))
        or text_or_empty(li.find("div", class_="price-details"))
    )

    location = (
        text_or_empty(li.find("p", class_="location"))
        or text_or_empty(li.find("span", class_="pull-left"))
    )

    updated = (
        text_or_empty(li.find("span", class_="timeago"))
        or text_or_empty(li.find("span", class_="updated"))
    )

    # Spec items: prefer li.list-inline-item, otherwise any nested <li>.
    specs = li.select("li.list-inline-item") or li.find_all("li")
    spec_texts = [s.get_text(strip=True) for s in specs]

    # The specs carry no labels, so each field is recognised by its shape.
    # Scanning item by item is sufficient: a pattern can never match across
    # two items, so a second pass over the joined text would find nothing new.
    year_match = _first_match(_YEAR_RE, spec_texts)

    return {
        "Title": title,
        "Price": price,
        "Location": location,
        "Year": year_match.group(0) if year_match else "",
        "Mileage": _first_spec(_MILEAGE_RE, spec_texts),
        "Fuel": _first_spec(_FUEL_RE, spec_texts),
        "Engine": _first_spec(_ENGINE_RE, spec_texts),
        "Transmission": _first_spec(_TRANSMISSION_RE, spec_texts),
        "Updated": updated
    }

def scrape_page(url):
    """Download ``url`` and return the parsed listings found on the page.

    The page is fetched with the module-level ``HEADERS`` and a 15-second
    timeout; a non-success HTTP status raises via ``raise_for_status()``.
    Each listing element is passed to ``parse_listing``, and only results
    that have a title or a price are kept.

    Returns:
        A list of dicts as produced by ``parse_listing`` (possibly empty).
    """
    response = requests.get(url, headers=HEADERS, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Primary selector first; fall back to the div-based containers when the
    # page has no li.classified-listing elements.
    listings = (
        soup.find_all("li", class_="classified-listing")
        or soup.select("div.col-md-9.grid-box, div.listing")
    )

    parsed = (parse_listing(node) for node in listings)
    # Drop entries that have neither a title nor a price.
    return [item for item in parsed if item["Title"] or item["Price"]]

def save_csv(rows, filename="pakwheels_karachi_parsed.csv"):
    """Write ``rows`` (a list of dicts) to ``filename`` as UTF-8 CSV.

    The column order is taken from the keys of the first row. When ``rows``
    is empty nothing is written and a notice is printed instead.
    """
    if rows:
        columns = list(rows[0])
        # newline="" lets the csv module control line endings itself.
        with open(filename, "w", newline="", encoding="utf-8") as out:
            writer = csv.DictWriter(out, fieldnames=columns)
            writer.writeheader()
            writer.writerows(rows)
        print(f"Saved {len(rows)} rows to {filename}")
    else:
        print("No rows to save.")

def print_preview(rows, n=10):
    """Print a numbered, human-readable summary of the first ``n`` rows.

    Each row is a dict with the keys ``Title``, ``Price``, ``Location``,
    ``Year``, ``Mileage``, ``Engine``, ``Transmission`` and ``Updated``.
    The "Updated" line is printed only when that value is non-empty, and
    every entry ends with an 80-character dashed separator.
    """
    separator = "-" * 80
    for number, listing in enumerate(rows[:n], start=1):
        print(f"{number}. {listing['Title']}")
        print(f"   Price: {listing['Price']}  |  Location: {listing['Location']}  |  Year: {listing['Year']}")
        print(f"   Mileage: {listing['Mileage']}  |  Engine: {listing['Engine']}  |  Transmission: {listing['Transmission']}")
        if listing['Updated']:
            print(f"   Updated: {listing['Updated']}")
        print(separator)

if __name__ == "__main__":
    # Script entry point: scrape the configured page, then preview and save
    # the results, or explain why nothing was found.
    rows = scrape_page(URL)
    if rows:
        print_preview(rows, n=10)
        save_csv(rows)
    else:
        print("No listings found on page. The site may load listings with JavaScript.")
        print("If you get an empty result, tell me and I'll provide the Selenium/API version.")