In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import json
import time
import random

# Scraper settings
# URL template for one results page; the placeholder takes the page number.
base_url = "https://www.carlist.my/cars-for-sale/malaysia?page_number={}&page_size=25"
# First and last results page to visit.
start_page, end_page = 3471, 5205
# Number of request attempts allowed per page.
max_retries = 3
# CSV destination and the in-memory list of row dicts collected for it.
output_file = 'car_listings.csv'
car_listings = list()

# Timestamp taken before any work starts.
start_time = time.time()

# Main scraping loop
#
# Walks every results page from start_page to end_page (inclusive). For each
# page it downloads the HTML, pairs every <article class="listing"> element
# with the JSON-LD entry at the same position, and appends one row dict per car
# to car_listings. Pressing Ctrl-C stops the walk but keeps the rows collected
# so far.


def _fetch_soup(url):
    """Download *url* and return it parsed, or None once retries run out.

    Makes up to ``max_retries`` attempts and sleeps 2 seconds after each
    failed one. Only ``requests`` errors (including the HTTP error statuses
    raised by ``raise_for_status``) trigger a retry.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"⚠️ Network error: {e}. Retrying ({attempt}/{max_retries})...")
            time.sleep(2)
    return None


def _find_item_list(soup):
    """Return the ``itemListElement`` list from the page's JSON-LD, or None.

    Only JSON-LD documents whose top level is a list are inspected, and within
    each one the first dict carrying ``itemListElement`` is used. When several
    <script> tags qualify, the last one on the page wins.
    """
    found = None
    for script in soup.find_all('script', type='application/ld+json'):
        raw = script.string
        if not raw:
            # .string is None for an empty tag (or one with several children),
            # so there is nothing to hand to the JSON parser.
            continue
        try:
            data = json.loads(raw)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            print(f"⚠️ Error parsing JSON-LD: {e}")
            continue
        if not isinstance(data, list):
            continue
        for d in data:
            # Non-dict entries are skipped: `in` on a string would be a
            # substring test and on a number would raise.
            if isinstance(d, dict) and 'itemListElement' in d:
                if isinstance(d['itemListElement'], list):
                    found = d['itemListElement']
                break
    return found


def _parse_listing(article, item):
    """Build one output row from a listing <article> and its JSON-LD entry.

    Args:
        article: the BeautifulSoup tag for one listing.
        item: the JSON-LD list entry paired with it; its ``item`` member holds
            the vehicle description.

    Returns:
        A dict keyed by the CSV column names, or None when *item* does not
        contain a usable ``item`` object.
    """
    car = item.get('item') if isinstance(item, dict) else None
    if not isinstance(car, dict):
        return None

    mileage = ""
    location = ""
    for icon in article.find_all('i'):
        classes = icon.get('class', [])
        if 'icon--meter' in classes:
            sibling = icon.next_sibling
            # Without this check a missing sibling would be stored as the
            # literal text "None".
            mileage = str(sibling).strip() if sibling is not None else ""
        elif 'icon--location' in classes:
            location_text = []
            for sib in icon.next_siblings:
                text = sib.get_text(strip=True) if hasattr(sib, 'get_text') else str(sib).strip()
                if text:
                    location_text.append(text)
            location = ' '.join(location_text)

    # `offers` may be a single object or a list of them; use the first offer
    # of a list and fall back to an empty price for any other shape.
    offers = car.get('offers')
    if isinstance(offers, list):
        offers = offers[0] if offers else None
    price = offers.get('price', '') if isinstance(offers, dict) else ''

    # Anything that does not mention "new" (including a missing value) is
    # reported as "Used".
    condition = str(car.get('itemCondition') or '').lower()
    condition = "New" if "new" in condition else "Used"

    sales_channel = ""
    dealer_div = article.find('div', class_='listing__spec--dealer')
    if dealer_div:
        sales_channel = dealer_div.get_text(strip=True)

    return {
        'Car Name': article.get('data-title', ''),
        'Car Brand': article.get('data-make', ''),
        'Car Model': article.get('data-model', ''),
        'Manufacture Year': car.get('vehicleModelDate', ''),
        'Body Type': article.get('data-body-type', ''),
        'Fuel Type': car.get('fuelType', ''),
        'Mileage': mileage,
        'Transmission': article.get('data-transmission', ''),
        'Color': car.get('color', ''),
        'Price': price,
        'Installment': article.get('data-installment', ''),
        'Condition': condition,
        'Seat Capacity': car.get('seatingCapacity', ''),
        'Location': location,
        'Sales Channel': sales_channel,
        'URL': car.get('url', ''),
    }


try:
    for page_num in range(start_page, end_page + 1):
        url = base_url.format(page_num)
        print(f"\n🔎 Extracting page {page_num} - {url}")

        soup = _fetch_soup(url)
        if soup is None:
            print(f"❌ Failed to fetch page {page_num} after {max_retries} retries. Skipping...")
            continue

        # Listing elements and their JSON-LD entries are matched by position.
        articles = soup.find_all('article', class_='listing')
        ld_json = _find_item_list(soup)

        if not ld_json:
            print(f"⚠️ No JSON-LD found on page {page_num}. Skipping...")
            # Keep the polite pause on this path too, so a run of unusable
            # pages does not turn into back-to-back requests.
            time.sleep(random.uniform(2, 5))
            continue

        if len(articles) != len(ld_json):
            # zip() below stops at the shorter sequence; make that visible.
            print(
                f"⚠️ Page {page_num}: {len(articles)} listing elements but "
                f"{len(ld_json)} JSON-LD entries; extra ones are ignored."
            )

        page_count = 0
        for article, item in zip(articles, ld_json):
            row = _parse_listing(article, item)
            if row is None:
                # One bad entry must not abort the run and lose earlier rows.
                print(f"⚠️ Skipping malformed JSON-LD entry on page {page_num}.")
                continue
            car_listings.append(row)
            page_count += 1

        print(f"✅ Found {page_count} cars on page {page_num}")
        print(f"📄 Total scraped: {len(car_listings)}")

        # Randomized delay to prevent IP blocking
        delay = random.uniform(2, 5)
        print(f"⏳ Waiting {delay:.2f} seconds before next page...")
        time.sleep(delay)

except KeyboardInterrupt:
    print("\n🚨 Script interrupted by the user. Saving progress...")

# Write the collected rows to output_file, or report that there is nothing to write
if not car_listings:
    print("\n⚠️ No car listings found.")
else:
    # Column order follows the key order of the first collected row.
    columns = list(car_listings[0])
    with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
        csv_out = csv.DictWriter(csv_file, fieldnames=columns)
        csv_out.writeheader()
        for row in car_listings:
            csv_out.writerow(row)
    print(f"\n✅ Saved {len(car_listings)} cars to '{output_file}'")

# Report the wall-clock duration of the run, measured from start_time
end_time = time.time()
execution_time = end_time - start_time
elapsed_report = f"\n🕒 Total execution time: {execution_time:.2f} seconds"
print(elapsed_report)


🔎 Extracting page 3471 - https://www.carlist.my/cars-for-sale/malaysia?page_number=3471&page_size=25
✅ Found 25 cars on page 3471
📄 Total scraped: 25
⏳ Waiting 2.69 seconds before next page...

🔎 Extracting page 3472 - https://www.carlist.my/cars-for-sale/malaysia?page_number=3472&page_size=25
✅ Found 25 cars on page 3472
📄 Total scraped: 50
⏳ Waiting 2.83 seconds before next page...

🔎 Extracting page 3473 - https://www.carlist.my/cars-for-sale/malaysia?page_number=3473&page_size=25
✅ Found 25 cars on page 3473
📄 Total scraped: 75
⏳ Waiting 4.43 seconds before next page...

🔎 Extracting page 3474 - https://www.carlist.my/cars-for-sale/malaysia?page_number=3474&page_size=25
✅ Found 25 cars on page 3474
📄 Total scraped: 100
⏳ Waiting 2.02 seconds before next page...

🔎 Extracting page 3475 - https://www.carlist.my/cars-for-sale/malaysia?page_number=3475&page_size=25
✅ Found 25 cars on page 3475
📄 Total scraped: 125
⏳ Waiting 4.92 seconds before next page...

🔎 Extracting page 3476 - ht