In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

In [27]:
BASE_URL = "https://www.autorec.co.jp/car-stock?page={}"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0)"
}

USD_TO_UGX = 3600

In [28]:
def clean_price(text):
    text = text.replace("US", "").replace("$", "").replace(",", "").strip()
    digits = re.sub(r"\D", "", text)
    return int(digits) if digits else None

In [23]:
def extract_year(text):
    m = re.search(r"(19|20)\d{2}", text)
    return int(m.group()) if m else None

In [29]:
def parse_row(row):
    """Parse a single <tr> row containing car info."""
    cols = row.find_all("td")
    if len(cols) < 6:
        return None

    # TITLE (inside <strong>)
    strong = cols[0].find("strong")
    title = strong.text.strip() if strong else None

    # YEAR (2nd column)
    year_tag = cols[1].find("span", {"class": "date-display-single"})
    year = int(year_tag.text.strip()) if year_tag else None

    # ENGINE CC (3rd column)
    engine_cc = cols[2].text.strip()

    # MILEAGE (4th column)
    mileage = cols[3].text.strip()

    # PRICE USD (5th column)
    price_usd = clean_price(cols[4].text)
    price_ugx = price_usd * USD_TO_UGX if price_usd else None

    # CAR DETAIL URL (6th column)
    link_tag = cols[5].find("a")
    url = link_tag["href"] if link_tag else None

    # Make/Model parsing
    make = model = None
    if title:
        parts = title.split()
        if len(parts) >= 2:
            # title = "2013 Mazda Verisa"
            # parts = ["2013", "Mazda", "Verisa"]
            make = parts[1]
        if len(parts) >= 3:
            model = parts[2]

    return {
        "title": title,
        "make": make,
        "model": model,
        "year": year,
        "engine_cc": engine_cc,
        "mileage": mileage,
        "price_usd": price_usd,
        "price_ugx": price_ugx,
        "url": url
    }

In [30]:
def scrape_autorec(max_pages=20, delay=1.5):
    all_cars = []

    for page in range(max_pages):
        url = BASE_URL.format(page)
        print(f"Scraping {url}")

        resp = requests.get(url, headers=HEADERS)
        if resp.status_code != 200:
            print("Failed:", resp.status_code)
            break

        soup = BeautifulSoup(resp.text, "html.parser")

        # The car rows are inside: <table> <tbody> <tr>
        rows = soup.select("table tbody tr")
        if not rows:
            print("No more rows found. Stopping.")
            break

        print(f"Found {len(rows)} cars")

        for row in rows:
            data = parse_row(row)
            if data:
                all_cars.append(data)

        time.sleep(delay)

    return pd.DataFrame(all_cars)

In [32]:
df_autorec = scrape_autorec(max_pages=100)
df_autorec.to_csv("autorec_raw.csv", index=False)

df_autorec.head()

Scraping https://www.autorec.co.jp/car-stock?page=0
Found 11 cars
Scraping https://www.autorec.co.jp/car-stock?page=1
Found 11 cars
Scraping https://www.autorec.co.jp/car-stock?page=2
Found 11 cars
Scraping https://www.autorec.co.jp/car-stock?page=3
Found 11 cars
Scraping https://www.autorec.co.jp/car-stock?page=4
Found 11 cars
Scraping https://www.autorec.co.jp/car-stock?page=5
No more rows found. Stopping.


Unnamed: 0,title,make,model,year,engine_cc,mileage,price_usd,price_ugx,url
0,2013 Mazda Verisa,Mazda,Verisa,2013,"1,500cc","93,200km",4500,16200000,/cars-list/hatchback/2013-mazda-verisa-7204?se...
1,2011 Mazda Biante,Mazda,Biante,2011,"2,000cc","100,500km",4500,16200000,/cars-list/mini-van/2011-mazda-biante-7203?sel...
2,2008 Toyota Vellfire,Toyota,Vellfire,2008,"2,390cc","102,700km",5200,18720000,/cars-list/mini-van/2008-toyota-vellfire-7200?...
3,2012 Subaru Legacy Touring Wagon,Subaru,Legacy,2012,"2,490cc","102,400km",4900,17640000,/cars-list/wagon/2012-subaru-legacy-touring-wa...
4,2012 Subaru Impreza,Subaru,Impreza,2012,"1,590cc","66,900km",4700,16920000,/cars-list/hatchback/2012-subaru-impreza-7197?...
