In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote_plus
import pandas as pd
import time
import re
from random import uniform

# --- Scraper configuration -------------------------------------------------
# Listing URL prefix; the URL-encoded, lower-cased state name is appended.
BASE_URL = "https://www.cardekho.com/used-cars+in+"

# Headers sent with every request (a desktop browser User-Agent).
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# States to scrape, in the order they are visited.
STATES = [
    "Maharashtra",
    "Uttar Pradesh",
    "Karnataka",
    "Gujarat",
    "Rajasthan",
    "Madhya Pradesh",
    "Tamil Nadu",
    "Haryana",
    "Kerala",
    "Delhi",
    "Bihar",
    "Jharkhand",
    "Chhattisgarh",
    "Odisha",
    "West Bengal",
]

def extract_specs(card, car_name):
    """Collect fuel type, transmission, mileage and model year for a listing.

    Args:
        card: Listing element; only its ``find`` method is used, to locate
            the ``div`` with class ``dotsDetails``.
        car_name: Listing title, searched for a four-digit year starting
            with 19 or 20.

    Returns:
        dict with the keys 'Fuel Type', 'Transmission', 'Km Driven' and
        'Manufacture Year'. Any value that cannot be found stays 'N/A'.
    """
    result = dict.fromkeys(
        ('Fuel Type', 'Transmission', 'Km Driven', 'Manufacture Year'), 'N/A'
    )

    details = card.find('div', class_='dotsDetails')
    # Normalised text fragments of the details block (empty when it is absent).
    tokens = [piece.strip().lower() for piece in details.stripped_strings] if details else []

    # A fragment must equal the keyword exactly; earlier keywords take priority.
    fuel_hit = next(
        (name for name in ('petrol', 'diesel', 'cng', 'electric', 'hybrid') if name in tokens),
        None,
    )
    if fuel_hit is not None:
        result['Fuel Type'] = fuel_hit.title()

    gearbox_hit = next((name for name in ('manual', 'automatic') if name in tokens), None)
    if gearbox_hit is not None:
        result['Transmission'] = gearbox_hit.title()

    # Only the first fragment mentioning "km" is considered; keep its digits.
    km_token = next((tok for tok in tokens if 'km' in tok), None)
    digits = re.sub(r'[^\d]', '', km_token) if km_token else ''
    if digits:
        result['Km Driven'] = digits

    # The model year is taken from the listing title.
    year = re.search(r'(19|20)\d{2}', car_name)
    if year is not None:
        result['Manufacture Year'] = year.group()

    return result

def _parse_price(price_text):
    """Convert a lower-cased price label (e.g. '₹8.90 lakh') to rupees.

    Returns None when the label names neither 'lakh' nor 'crore', or when it
    contains no number.
    """
    if 'lakh' in price_text:
        multiplier = 1e5
    elif 'crore' in price_text:
        multiplier = 1e7
    else:
        return None

    # Read the first number only. Stripping every non-digit/non-dot character
    # and calling float() breaks on labels such as "rs. 8.9 lakh" (-> ".8.9"),
    # and the resulting ValueError used to discard the whole state's rows.
    number = re.search(r'\d*\.?\d+', price_text.replace(',', ''))
    if number is None:
        return None
    return float(number.group()) * multiplier


def scrape_state(state, timeout=30):
    """Scrape the used-car listings page for a single state.

    Args:
        state: State name; it is lower-cased and URL-encoded onto BASE_URL.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            fails instead of hanging the run indefinitely.

    Returns:
        list of dicts, one per listing card, with the keys 'Car Name',
        'Location', 'Price (INR)', 'Fuel Type', 'Transmission', 'Km Driven'
        and 'Manufacture Year'. 'Location' is always the ``state`` argument;
        'Price (INR)' is None when no price could be parsed.

    Raises:
        Whatever ``requests.get`` or ``Response.raise_for_status`` raise
        (connection errors, timeouts, HTTP error statuses).
    """
    url = BASE_URL + quote_plus(state.lower())
    time.sleep(uniform(1, 3))  # polite delay
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    data = []
    for card in soup.find_all('div', class_='NewUcExCard posR'):
        name_tag = card.find('h3', class_='title')
        car_name = name_tag.get_text(strip=True) if name_tag else "N/A"

        price_tag = card.find('div', class_='Price hover')
        price = _parse_price(price_tag.get_text(strip=True).lower()) if price_tag else None

        specs = extract_specs(card, car_name)
        data.append({
            'Car Name': car_name,
            'Location': state,
            'Price (INR)': round(price) if price else None,
            'Fuel Type': specs['Fuel Type'],
            'Transmission': specs['Transmission'],
            'Km Driven': specs['Km Driven'],
            'Manufacture Year': specs['Manufacture Year'],
        })
    return data

def scrape_all_states():
    """Scrape every state in STATES and return the rows as one DataFrame.

    A state whose scrape raises is reported on stdout and skipped, so one
    failure does not abort the remaining states.
    """
    rows = []
    for state_name in STATES:
        print(f"Scraping {state_name}...")
        try:
            rows += scrape_state(state_name)
        except Exception as err:
            # Best effort: report the failure and move on to the next state.
            print(f"Failed to scrape {state_name}: {err}")
    return pd.DataFrame(rows)

def clean_data(df):
    """Clean and enrich the scraped listings.

    The input frame is modified in place and a re-indexed result is returned.

    Steps:
        * 'Km Driven' and 'Manufacture Year' become numeric; unparseable
          values such as 'N/A' become NaN.
        * 'Price (INR)' becomes an integer column; missing prices become 0.
        * A space is inserted between a four-digit run and a letter that
          directly follows it in 'Car Name' ("2023Hyundai" -> "2023 Hyundai").
        * 'Brand' is the first word after a 19xx/20xx year in 'Car Name'
          (NaN when the name has no such year).
        * Rows in which every column is missing are dropped.

    Args:
        df: DataFrame of scraped rows. It may be completely empty (no rows
            and no columns), which happens when every state failed.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.
    """
    # A frame built from zero rows has no columns at all; create whatever is
    # missing so the column lookups below cannot raise KeyError.
    expected_columns = (
        'Car Name', 'Location', 'Price (INR)', 'Fuel Type',
        'Transmission', 'Km Driven', 'Manufacture Year',
    )
    for column in expected_columns:
        if column not in df.columns:
            df[column] = pd.Series(dtype=object)

    df['Km Driven'] = pd.to_numeric(df['Km Driven'], errors='coerce')
    df['Manufacture Year'] = pd.to_numeric(df['Manufacture Year'], errors='coerce')
    # Coerce first so an all-None (object dtype) price column is handled too.
    df['Price (INR)'] = (
        pd.to_numeric(df['Price (INR)'], errors='coerce').fillna(0).astype(int)
    )

    # Extract car brand after year in name
    df['Car Name'] = df['Car Name'].str.replace(r'(\d{4})([A-Za-z])', r'\1 \2', regex=True)
    df['Brand'] = df['Car Name'].str.extract(r'\b(?:19|20)\d{2}\s+(\w+)', expand=False)

    return df.dropna(how='all').reset_index(drop=True)

def main():
    """Run the scraper end to end.

    Scrapes all states, cleans the result, writes it to a timestamped CSV
    in the current directory, and prints progress plus the first rows.
    """
    print("Starting scraper...")
    started_at = time.time()

    raw = scrape_all_states()
    print(f"\nScraped {len(raw)} rows.")

    cleaned = clean_data(raw)
    print(f"Cleaned data: {len(cleaned)} rows.")

    # The file name carries the moment of writing, e.g. used_cars_20250717_095647.csv
    stamp = pd.Timestamp.now()
    out_path = f"used_cars_{stamp:%Y%m%d_%H%M%S}.csv"
    cleaned.to_csv(out_path, index=False)

    elapsed = time.time() - started_at
    print(f"\nData saved to {out_path} in {elapsed:.2f} seconds.")
    print("\nSample:")
    print(cleaned.head())


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Starting scraper...
Scraping Maharashtra...
Scraping Uttar Pradesh...
Scraping Karnataka...
Scraping Gujarat...
Scraping Rajasthan...
Scraping Madhya Pradesh...
Scraping Tamil Nadu...
Scraping Haryana...
Scraping Kerala...
Scraping Delhi...
Scraping Bihar...
Scraping Jharkhand...
Scraping Chhattisgarh...
Scraping Odisha...
Scraping West Bengal...

Scraped 300 rows.
Cleaned data: 300 rows.

Data saved to used_cars_20250717_095647.csv in 45.10 seconds.

Sample:
                              Car Name     Location  Price (INR) Fuel Type  \
0  2023 Hyundai ExterSX CNG 4 Cylinder  Maharashtra       890000       Cng   
1          2024 Kia SonetHTX Turbo DCT  Maharashtra      1325000    Petrol   
2             2024 Audi Q3Bold Edition  Maharashtra      4450000    Petrol   
3                2022 Renault KigerRXT  Maharashtra       604000    Petrol   
4               2024 Maruti CelerioZXI  Maharashtra       595000    Petrol   

  Transmission  Km Driven  Manufacture Year    Brand  
0       Manu