In [1]:
# Generate Tunisia Cars Dataset - Enhanced Version
import os
import random

import numpy as np
import pandas as pd

# ------------------------------
# Configuration
# ------------------------------
# Everything a reader might want to tune lives in this section: dataset size,
# the brand/model catalogue, categorical vocabularies, price anchors and seeds.
N = 60000  # Number of rows (increased for better coverage)
SEED = 42  # Shared seed for both the stdlib and NumPy generators

# Brands and Models (23 brands total).
# NOTE: insertion order matters - brands/models are drawn by position with a
# seeded RNG, so reordering entries changes the generated dataset.
brands_models = {
    "Peugeot": ["206", "207", "208", "301", "308", "3008", "2008", "Partner"],
    "Renault": ["Clio", "Megane", "Symbol", "Captur", "Kangoo", "Duster"],
    "Volkswagen": ["Golf 4", "Golf 5", "Golf 6", "Golf 7", "Polo", "Passat", "Tiguan"],
    "Kia": ["Picanto", "Rio", "Sportage", "Cerato", "Carens"],
    "Hyundai": ["i10", "i20", "Elantra", "Tucson", "Accent", "Santa Fe"],
    "Toyota": ["Corolla", "Yaris", "Avensis", "Hiace", "Rav4", "Land Cruiser"],
    "Suzuki": ["Swift", "Celerio", "Vitara", "Jimny"],
    "Dacia": ["Logan", "Sandero", "Duster", "Dokker"],
    "Fiat": ["Punto", "Tipo", "500", "Panda"],
    "BMW": ["116i", "118i", "320i", "520d", "X1", "X3"],
    "Mercedes": ["C180", "C200", "E200", "GLA", "Vito"],
    "Audi": ["A3", "A4", "A6", "Q3", "Q5"],
    "Opel": ["Corsa", "Astra", "Insignia"],
    "Ford": ["Focus", "Fiesta", "Kuga", "Ranger"],
    "Nissan": ["Micra", "Qashqai", "X-Trail", "Navara"],
    "Mitsubishi": ["Lancer", "Outlander", "L200"],
    "Chery": ["Tiggo", "Arrizo"],
    # New brands popular in Tunisia
    "Isuzu": ["D-Max"],  # Very popular pickup
    "Seat": ["Ibiza", "Leon", "Ateca"],
    # Accented names were previously stored as mis-decoded text
    # (UTF-8 bytes read as cp1252); they are written here as real Unicode.
    "Citroën": ["C3", "C4", "C-Elysée", "Berlingo"],
    "Skoda": ["Octavia", "Fabia", "Kodiaq"],
    "Mazda": ["3", "CX-5", "CX-3"],
    "Jeep": ["Renegade", "Compass", "Wrangler"],
}

# Car attributes (categorical vocabularies; values are kept in French, unaccented)
fuel_types = ["Essence", "Diesel", "Hybride"]
gearbox_types = ["Manuelle", "Automatique"]
conditions = ["excellent", "tres bon etat", "bon etat", "moyen", "a reparer"]
car_bodies = ["citadine", "compacte", "berline", "SUV", "break", "monospace", "pickup"]

# All 24 governorates of Tunisia
locations = [
    # Greater Tunis (4)
    "Tunis", "Ariana", "Ben Arous", "Manouba",
    # North East (4)
    "Nabeul", "Zaghouan", "Bizerte", "Beja",
    # North West (3)
    "Jendouba", "Kef", "Siliana",
    # Central East (4)
    "Sousse", "Monastir", "Mahdia", "Sfax",
    # Central West (3)
    "Kairouan", "Kasserine", "Sidi Bouzid",
    # South East (3)
    "Gabes", "Medenine", "Tataouine",
    # South West (3)
    "Gafsa", "Tozeur", "Kebili",
]

colors = ["blanc", "noir", "gris", "argent", "rouge", "bleu", "vert", "beige"]
options_list = ["air_conditioning", "parking_sensor", "rear_camera", "sunroof", "alloy_wheels", "bluetooth", "gps"]

# Base prices by brand (TND). Keys must match `brands_models` exactly.
brand_base = {
    "Peugeot": 27000, "Renault": 26000, "Volkswagen": 32000, "Kia": 24000, "Hyundai": 25000,
    "Toyota": 30000, "Suzuki": 22000, "Dacia": 18000, "Fiat": 20000, "BMW": 70000, "Mercedes": 75000,
    "Audi": 68000, "Opel": 23000, "Ford": 24000, "Nissan": 26000, "Mitsubishi": 25000, "Chery": 20000,
    # New brands
    "Isuzu": 80000, "Seat": 25000, "Citroën": 26000, "Skoda": 28000, "Mazda": 32000, "Jeep": 90000,
}

# Horsepower ranges by car body type, as inclusive (low, high) bounds
body_hp_ranges = {
    "citadine": (60, 95),
    "compacte": (85, 130),
    "berline": (110, 180),
    "SUV": (120, 220),
    "break": (100, 160),
    "monospace": (100, 160),
    "pickup": (140, 200),  # New: pickups have higher power
}

# Sanity checks: a typo in a key would otherwise go unnoticed, because the
# price lookup falls back to a default base price for unknown brands.
assert set(brand_base) == set(brands_models), "brand_base and brands_models keys differ"
assert set(body_hp_ranges) == set(car_bodies), "body_hp_ranges must cover every car body"

# Set random seeds for reproducibility (both generators are used below)
np.random.seed(SEED)
random.seed(SEED)

# ------------------------------
# Generate Dataset
# ------------------------------
# Builds `rows`, a list of N dicts (one per synthetic car listing). Every
# attribute is drawn from the seeded `random` / `np.random` generators, then a
# price is derived from those attributes by the additive/multiplicative model
# below.
print(f"Generating {N:,} car records...")

# --- Loop-invariant tables -------------------------------------------------
# None of these depends on the row being generated, so they are built once
# instead of on every iteration. Building them consumes no random numbers,
# so the seeded output is unchanged.
reference_year = 2025                      # "today" for age computation
brand_names = list(brands_models.keys())
candidate_years = np.arange(2005, 2025)    # 2005..2024 inclusive
year_weights = np.linspace(1, 1.5, 20)     # newer years up to 1.5x as likely
year_probs = year_weights / year_weights.sum()

# Brands that may be assigned the "pickup" body, and the body-type weights
# (aligned with `car_bodies`) for them versus every other brand.
pickup_brands = ["Isuzu", "Ford", "Nissan", "Mitsubishi", "Toyota"]
pickup_brand_body_weights = [0.1, 0.15, 0.15, 0.2, 0.1, 0.1, 0.2]
default_body_weights = [0.2, 0.25, 0.2, 0.2, 0.1, 0.05, 0]

# Model-name fragments (matched as lowercase substrings) that move the
# base price up (+8%) or down (-5%).
premium_model_keys = [
    "sportage", "3008", "rav4", "qashqai", "tucson", "x-trail", "tiggo", "duster",
    "d-max", "land cruiser", "x3", "q5", "santa fe", "kodiaq", "cx-5", "compass",
    "wrangler", "ranger", "navara", "l200",
]
budget_model_keys = [
    "punto", "clio", "picanto", "i10", "micra", "corsa", "sandero", "ibiza", "fabia", "c3",
]

# Horsepower baseline per body type; only HP above it earns a premium.
body_avg_hp = {
    "citadine": 77, "compacte": 107, "berline": 145,
    "SUV": 170, "break": 130, "monospace": 130, "pickup": 170,
}

# Multiplicative premium per body type.
body_premiums = {
    "citadine": 1.0,
    "compacte": 1.03,
    "berline": 1.05,
    "SUV": 1.15,
    "break": 1.08,
    "monospace": 1.10,
    "pickup": 1.20,
}

# Condition adjustment, expressed as a fraction of the base price.
condition_rates = {
    "excellent": 0.15,
    "tres bon etat": 0.05,
    "bon etat": 0.00,
    "moyen": -0.12,
    "a reparer": -0.30,
}

# Flat value (TND) added for each option that is present.
feature_values = {
    "sunroof": 600,
    "rear_camera": 500,
    "air_conditioning": 400,
    "alloy_wheels": 700,
    "parking_sensor": 400,
    "gps": 500,
    "bluetooth": 200,
}

# Brands for which being imported lowers rather than raises the price.
import_discount_brands = ["Dacia", "Fiat", "Chery"]

# Multiplicative premium per governorate; unlisted locations are neutral.
location_premiums = {
    "Tunis": 1.05,
    "Ariana": 1.03,
    "Ben Arous": 1.02,
    "Sfax": 1.01,
    "Sousse": 1.01,
}

rows = []
for i in range(N):
    if (i + 1) % 10000 == 0:
        print(f"  Generated {i + 1:,} rows...")

    # Basic info
    brand = random.choice(brand_names)
    model = random.choice(brands_models[brand])
    year = int(np.random.choice(candidate_years, p=year_probs))
    age = reference_year - year

    # Mileage based on age (~15k km per year, 35% relative spread, floor 3k km)
    mileage_mean = max(5000, age * 15000)
    mileage = int(max(3000, np.random.normal(mileage_mean, mileage_mean * 0.35)))

    # Attributes
    fuel = random.choices(fuel_types, weights=[0.6, 0.35, 0.05])[0]
    gearbox = random.choices(gearbox_types, weights=[0.65, 0.35])[0]
    condition = random.choices(conditions, weights=[0.12, 0.35, 0.35, 0.12, 0.06])[0]

    # Car body type (pickup only possible for certain brands)
    if brand in pickup_brands:
        car_body = random.choices(car_bodies, weights=pickup_brand_body_weights)[0]
    else:
        car_body = random.choices(car_bodies, weights=default_body_weights)[0]

    # Technical specs based on body type
    hp_low, hp_high = body_hp_ranges[car_body]
    horsepower = int(np.random.randint(hp_low, hp_high + 1))  # upper bound inclusive
    # Engine size (litres) tracks horsepower with noise, clamped to [0.9, 3.5]
    engine_size = round(max(0.9, min(3.5, horsepower / 80.0 + np.random.normal(0, 0.2))), 1)

    # Ownership history
    number_of_owners = int(np.random.choice([1, 2, 3, 4], p=[0.45, 0.35, 0.15, 0.05]))
    accident_history = int(np.random.choice([0, 1], p=[0.85, 0.15]))
    import_or_local = random.choices(["imported", "local"], weights=[0.3, 0.7])[0]

    # Location and color
    location = random.choice(locations)
    color = random.choice(colors)

    # Optional features (each present with probability 0.4)
    options = {opt: int(np.random.choice([0, 1], p=[0.6, 0.4])) for opt in options_list}

    # ------------------------------
    # Price Calculation Model (Enhanced with Stronger Impacts)
    # ------------------------------
    # The order of the steps below matters: additive and multiplicative
    # adjustments are interleaved, so reordering them changes the result.
    model_key = model.lower()
    model_factor = 1.0
    if any(fragment in model_key for fragment in premium_model_keys):
        model_factor += 0.08
    if any(fragment in model_key for fragment in budget_model_keys):
        model_factor -= 0.05

    # Unknown brands fall back to a 22,000 TND base
    base_price = brand_base.get(brand, 22000) * model_factor

    # Age depreciation (4% of base price per year)
    age_penalty = 0.04 * base_price
    price = base_price - (age * age_penalty)

    # Mileage penalty (12 TND per 1k km)
    price -= (mileage / 1000) * 12

    # Horsepower premium (200 TND per HP above baseline; no penalty below it)
    hp_baseline = body_avg_hp[car_body]
    if horsepower > hp_baseline:
        price += (horsepower - hp_baseline) * 200

    # Body Type Premium (NEW - stronger direct impact)
    price *= body_premiums[car_body]

    # Condition adjustments (INCREASED from 8% to 15%)
    condition_add = condition_rates[condition] * base_price
    price += condition_add

    # Gearbox impact (INCREASED from 4% to 8%)
    if gearbox == "Automatique":
        price += 0.08 * base_price

    # Fuel impact (INCREASED - Hybride from 6% to 10%, added Diesel penalty)
    if fuel == "Hybride":
        price += 0.10 * base_price
    elif fuel == "Diesel":
        price -= 0.03 * base_price

    # Options additive
    for opt, val in options.items():
        if val == 1:
            price += feature_values.get(opt, 0)

    # Import/local impact (INCREASED from ±3% to ±8%)
    if import_or_local == "imported":
        price *= (0.92 if brand in import_discount_brands else 1.08)

    # Location premium (NEW - was completely missing!)
    price *= location_premiums.get(location, 1.0)

    # Color premium (NEW - was completely missing!)
    if color in ["blanc", "noir", "gris"]:
        price += 300
    elif color == "argent":
        price += 200

    # Accident penalty
    if accident_history == 1:
        price -= 0.15 * base_price

    # Owners penalty (350 TND per owner beyond the first)
    price -= (number_of_owners - 1) * 350

    # Additive noise
    price += np.random.normal(0, 500)

    # Final constraints: clamp to [5,000, 250,000] TND and truncate to int
    price = int(min(max(price, 5000), 250000))

    # Build row
    row = {
        "brand": brand,
        "model": model,
        "year": year,
        "mileage": mileage,
        "fuel": fuel,
        "gearbox": gearbox,
        "vehicle_condition": condition,
        "car_body": car_body,
        "horsepower": horsepower,
        "engine_size": engine_size,
        "number_of_owners": number_of_owners,
        "accident_history": accident_history,
        "import_or_local": import_or_local,
        "location": location,
        "color": color,
        "price": price,
    }

    # Add optional features (one 0/1 column per option)
    for opt in options_list:
        row[opt] = options[opt]

    rows.append(row)

# Create DataFrame (one record per generated row)
df = pd.DataFrame(rows)

# Reorder columns: descriptive attributes first, then the option flags,
# with the target column `price` last.
cols_order = [
    "brand", "model", "year", "mileage", "fuel", "gearbox", "vehicle_condition", "car_body",
    "horsepower", "engine_size", "number_of_owners", "accident_history", "import_or_local",
    "location", "color"
] + options_list + ["price"]
df = df[cols_order]

# Save to CSV
out_file = "../data/raw/tunisia_cars_dataset.csv"
# `to_csv` does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail after the whole generation has run.
out_dir = os.path.dirname(out_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(out_file, index=False, encoding="utf-8")

# Display summary: headline statistics of the generated dataset.
# The status glyphs were previously stored as mis-decoded text (UTF-8 bytes
# read as cp1252); they are written here as the intended Unicode characters.
print("\n" + "="*60)
print("✅ Dataset Generated Successfully!")
print("="*60)
print(f"File: {out_file}")
print(f"Total Rows: {len(df):,}")
print(f"Columns: {len(df.columns)}")
print(f"Brands: {df['brand'].nunique()}")
print(f"Models: {df['model'].nunique()}")
print(f"Locations: {df['location'].nunique()}")
print(f"Price Range: {df['price'].min():,} - {df['price'].max():,} TND")
print(f"Average Price: {df['price'].mean():,.0f} TND")
print("="*60)

# Show sample: 8 rows, drawn with a fixed seed so the preview is reproducible
print("\n📊 Sample Data:")
print(df.sample(8, random_state=42).to_string(index=False))


Generating 60,000 car records...
  Generated 10,000 rows...
  Generated 10,000 rows...
  Generated 20,000 rows...
  Generated 20,000 rows...
  Generated 30,000 rows...
  Generated 30,000 rows...
  Generated 40,000 rows...
  Generated 40,000 rows...
  Generated 50,000 rows...
  Generated 50,000 rows...
  Generated 60,000 rows...
  Generated 60,000 rows...

✅ Dataset Generated Successfully!
File: ../data/raw/tunisia_cars_dataset.csv
Total Rows: 60,000
Columns: 23
Brands: 23
Models: 98
Locations: 24
Price Range: 5,000 - 147,796 TND
Average Price: 26,216 TND

📊 Sample Data:
  brand    model  year  mileage    fuel     gearbox vehicle_condition car_body  horsepower  engine_size  number_of_owners  accident_history import_or_local  location  color  air_conditioning  parking_sensor  rear_camera  sunroof  alloy_wheels  bluetooth  gps  price
   Fiat     Tipo  2020    79674 Essence    Manuelle     tres bon etat      SUV         186          2.2                 1                 1        impor