Imports, paths, config

In [1]:
import json, pathlib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# ── Paths
# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = pathlib.Path().resolve().parents[0]
_data_dir = PROJECT_ROOT / "data"
RAW_FILE = _data_dir / "raw" / "car.csv"
INTERIM_FILE = _data_dir / "interim" / "cleaned.csv"
PROCESSED_DIR = _data_dir / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# ── Config
SEED = 42          # random_state for the train/test split
TEST_SIZE = 0.20   # fraction of rows held out for testing
TOP_OPTIONS = 10   # how many option flags to extract
TARGET = "Price($)"

# Columns treated as categorical; later cells skip any that are absent.
CATEGORICAL_COLS = [
    "Brand",
    "Model",
    "Condition",
    "FuelType",
    "Transmission",
    "DriveType",
    "BodyType",
    "Color",
    "Interior",
    "City",
    "AccidentHistory",
    "Insurance",
    "RegistrationStatus",
]

# Columns treated as numeric; the target is deliberately the last entry.
NUMERIC_COLS = [
    "Year",
    "Mileage(km)",
    "EngineSize(L)",
    "Horsepower",
    "Torque",
    "Doors",
    "Seats",
    "FuelEfficiency(L/100km)",
    "PricePerKm",
    TARGET,
]

print("Project root:", PROJECT_ROOT)
print("Raw file:", RAW_FILE)

Project root: C:\Users\User\Desktop\Projects\Car_Price
Raw file: C:\Users\User\Desktop\Projects\Car_Price\data\raw\car.csv


Load & initial shape

In [2]:
# Read the raw CSV into the working frame, report its size and preview it.
df = pd.read_csv(RAW_FILE)
initial_shape = df.shape
print("Initial shape:", initial_shape)
df.head()


Initial shape: (50100, 25)


Unnamed: 0,Brand,Model,Year,CarAge,Condition,Mileage(km),EngineSize(L),FuelType,Horsepower,Torque,...,Color,Interior,Options,City,AccidentHistory,Insurance,RegistrationStatus,FuelEfficiency(L/100km),PricePerKm,Price($)
0,Porsche,Panamera,2008,17,Used,256395,3.3,Gasoline,513,395,...,White,Cloth,"Navigation, Cruise Control, Heated Seats, Blue...",Tehran,No,Valid,Incomplete,11.96,0.05,13884
1,Audi,A6,2023,2,Used,20433,2.2,Diesel,302,270,...,Black,Cloth,"Parking Sensors, Cruise Control, Touchscreen",Berlin,Yes,Expired,Incomplete,8.74,1.9,38888
2,BMW,X5,2022,3,Used,52328,3.2,Gasoline,400,388,...,Gray,Leather,"Touchscreen, Bluetooth, Cruise Control, Naviga...",Tokyo,Yes,Valid,Complete,15.68,0.63,33074
3,Hyundai,Tucson,2019,6,Used,91878,1.6,Hybrid,187,219,...,Silver,Cloth,"Sunroof, Rear Camera, Bluetooth, Parking Senso...",Delhi,No,Expired,Complete,9.45,0.14,12966
4,Fiat,500,2012,13,Damaged,192331,1.1,Gasoline,90,112,...,Red,Leather,"Heated Seats, Touchscreen",Delhi,No,Valid,Complete,7.16,0.01,2670


Drop exact duplicates

In [3]:
# Remove rows that are identical in every column and report how many went.
rows_before = len(df)
df = df.drop_duplicates()
n_removed = rows_before - len(df)
print(f"Removed duplicates: {n_removed} | Shape now: {df.shape}")

Removed duplicates: 100 | Shape now: (50000, 25)


Drop redundant columns (CarAge) and tidy strings

In [4]:
# 1) Drop CarAge (redundant with Year)
if "CarAge" in df.columns:
    df = df.drop(columns=["CarAge"])
    print("Dropped column: CarAge")

# 2) Strip whitespace from text columns.
#    Missing cells are kept as NaN. A blanket astype(str) would rewrite them as
#    the literal string "nan", which the fillna-based categorical imputation
#    further down cannot detect, so "nan" would survive as a category of its own.
obj_cols = df.select_dtypes(include="object").columns.tolist()
for c in obj_cols:
    stripped = df[c].astype(str).str.strip()
    df[c] = df[c].where(df[c].isna(), stripped)
print("Stripped whitespace on object columns:", len(obj_cols))

# 3) Drop less important or redundant columns
DROP_COLS = [
    "Color",
    "Interior",
    "Options",
    "City",
    "Insurance",
    "RegistrationStatus",
    "PricePerKm",  # leakage
    "Doors",
    "Seats"
]

# Only drop what is actually present so the cell tolerates a trimmed input file.
existing_drops = [c for c in DROP_COLS if c in df.columns]
df = df.drop(columns=existing_drops)
print("Dropped columns:", existing_drops)


Dropped column: CarAge
Stripped whitespace on object columns: 14
Dropped columns: ['Color', 'Interior', 'Options', 'City', 'Insurance', 'RegistrationStatus', 'PricePerKm', 'Doors', 'Seats']


Coerce numeric columns safely

In [5]:
# Force every configured numeric column to a numeric dtype. Values that cannot
# be parsed become NaN; the report counts how many NaNs each coercion added.
coerced_report = {}
for col in (name for name in NUMERIC_COLS if name in df.columns):
    na_before = df[col].isna().sum()
    df[col] = pd.to_numeric(df[col], errors="coerce")
    na_added = df[col].isna().sum() - na_before
    coerced_report[col] = {"added_NA_from_coercion": int(na_added)}
pd.DataFrame(coerced_report).T

Unnamed: 0,added_NA_from_coercion
Year,0
Mileage(km),0
EngineSize(L),0
Horsepower,0
Torque,0
FuelEfficiency(L/100km),0
Price($),0


Ensure target present, drop rows with missing target

In [6]:
# The target column must exist; rows without a target value are removed.
if TARGET not in df.columns:
    raise ValueError(f"Target column {TARGET!r} not found in data.")
before = len(df)
df = df.dropna(subset=[TARGET])
n_missing_target = before - len(df)
print(f"Dropped rows with missing target: {n_missing_target} | Shape: {df.shape}")

Dropped rows with missing target: 0 | Shape: (50000, 15)


Missing value imputation

In [7]:
# Fill gaps column by column: the median for numeric columns, the most frequent
# value for categorical ones ("Unknown" when a column has no observed value).
# NOTE(review): these statistics are computed on the full frame, i.e. before the
# train/test split performed later in the notebook.
for col in (name for name in NUMERIC_COLS if name in df.columns):
    df[col] = df[col].fillna(df[col].median())

for col in (name for name in CATEGORICAL_COLS if name in df.columns):
    modes = df[col].mode(dropna=True)
    fallback = "Unknown" if modes.empty else modes.iloc[0]
    df[col] = df[col].fillna(fallback)

Outlier handling

In [8]:
# Clip selected columns to their 1st/99th percentiles and remember the cutoffs.
_winsor_candidates = (
    "Price($)",
    "Mileage(km)",
    "Horsepower",
    "EngineSize(L)",
    "FuelEfficiency(L/100km)",
)
winsor_cols = [name for name in _winsor_candidates if name in df.columns]
winsor_cutoffs = {}
for name in winsor_cols:
    lower, upper = df[name].quantile([0.01, 0.99]).tolist()
    df[name] = df[name].clip(lower=lower, upper=upper)
    winsor_cutoffs[name] = {"q01": float(lower), "q99": float(upper)}
winsor_cutoffs

{'Price($)': {'q01': 1492.99, 'q99': 85854.33000000007},
 'Mileage(km)': {'q01': 202.0, 'q99': 303929.03},
 'Horsepower': {'q01': 67.0, 'q99': 848.0},
 'EngineSize(L)': {'q01': 0.0, 'q99': 4.9},
 'FuelEfficiency(L/100km)': {'q01': 0.0, 'q99': 17.800100000000022}}

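Feature engineering from Options (disabled: the cell below is commented out, and the Options column is dropped earlier along with the other unused columns)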

In [9]:
# # Create OptionsCount + top-K option flags; then drop raw Options
# from collections import Counter

# top_options = []

# if "Options" in df.columns:
#     # Clean nulls
#     df["Options"] = df["Options"].fillna("").astype(str)

#     # 1) OptionsCount
#     df["OptionsCount"] = df["Options"].apply(lambda s: len([t for t in s.split(",") if t.strip()]))

#     # 2) Find top-K options (on the whole dataset is fine here since these are just presence flags)
#     tokens = []
#     for v in df["Options"]:
#         tokens += [t.strip().lower() for t in v.split(",") if t.strip()]
#     top_options = [opt for opt, _ in Counter(tokens).most_common(TOP_OPTIONS)]

#     # 3) Build option-flag features
#     for opt in top_options:
#         colname = f"opt_{opt.replace(' ', '_')}"
#         df[colname] = df["Options"].str.lower().apply(
#             lambda s, o=opt: int(o in [t.strip() for t in s.split(",") if t.strip()])
#         )

#     # 4) Drop raw text column to keep dataset clean
#     df = df.drop(columns=["Options"])
#     print("Created OptionsCount +", len(top_options), "option flags; dropped raw 'Options' column.")

# else:
#     print("'Options' column not found; skipping option features.")


In [10]:
# --- EXTRA PREPROCESSING CONFIG ---
TARGET = "Price($)"

# Minimum count to keep a category (Brand/Model)
RARE_THRESH = 50
CORR_DROP_THRESHOLD = 0.95
# Auto-detect skewed numeric cols (excludes TARGET)
LOG_FEATURES_AUTO = True
# Apply log1p to fix skew if needed
LOG_FEATURES_MANUAL = ["Mileage(km)"]

# Record metadata: start from the JSON already on disk when there is one,
# otherwise from an empty dict.
from pathlib import Path
import json
META_PATH = PROJECT_ROOT / "data" / "processed" / "preprocess_meta.json"
if META_PATH.exists():
    with open(META_PATH) as f:
        meta = json.load(f)
else:
    meta = {}


In [11]:
# --- Categorical normalization (final: only kept columns) ---

def _norm(s):
    return str(s).strip().lower().replace('-', ' ').replace('_', ' ')

# Synonym tables: normalized spelling -> canonical label.
fuel_map = {
    "gasoline": "gasoline",
    "petrol": "gasoline",
    "benzine": "gasoline",
    "diesel": "diesel",
    "hybrid": "hybrid",
    "hybrid electric": "hybrid",
    "electric": "electric",
    "ev": "electric",
}

trans_map = {
    "auto": "automatic",
    "automatic": "automatic",
    "at": "automatic",
    "manual": "manual",
    "mt": "manual",
}

cond_map = {
    "used": "used",
    "new": "new",
    "damaged": "damaged",
    "accident": "damaged",
    "salvage": "damaged",
}

drive_map = {
    "fwd": "fwd",
    "front wheel drive": "fwd",
    "rwd": "rwd",
    "rear wheel drive": "rwd",
    "awd": "awd",
    "4wd": "awd",
    "4x4": "awd",
    "all wheel drive": "awd",
}

body_map = {
    "sedan": "sedan",
    "saloon": "sedan",
    "hatchback": "hatchback",
    "suv": "suv",
    "crossover": "suv",
    "coupe": "coupe",
    "wagon": "wagon",
    "estate": "wagon",
    "pickup": "pickup",
    "truck": "pickup",
    "convertible": "convertible",
    "cabrio": "convertible",
}

# Apply each table to its column when the column exists. A value that is not in
# the table falls back to its normalized spelling.
_synonym_tables = {
    "FuelType": fuel_map,
    "Transmission": trans_map,
    "Condition": cond_map,
    "DriveType": drive_map,
    "BodyType": body_map,
}
for _column, _table in _synonym_tables.items():
    if _column in df.columns:
        df[_column] = df[_column].map(
            lambda raw, table=_table: table.get(_norm(raw), _norm(raw))
        )

# AccidentHistory becomes "yes" when the normalized text contains "yes", else "no".
if "AccidentHistory" in df.columns:
    df["AccidentHistory"] = df["AccidentHistory"].map(
        lambda raw: "yes" if "yes" in _norm(raw) else "no"
    )

# Brand / Model: collapse every run of whitespace into a single space.
for _column in ("Brand", "Model"):
    if _column not in df.columns:
        continue
    df[_column] = df[_column].astype(str).str.replace(r"\s+", " ", regex=True)

_normalization_meta = meta.setdefault("normalization", {})
_normalization_meta["applied"] = True
print("✅ Normalized: FuelType, Transmission, Condition, DriveType, BodyType, AccidentHistory; tidied Brand/Model.")


✅ Normalized: FuelType, Transmission, Condition, DriveType, BodyType, AccidentHistory; tidied Brand/Model.


In [12]:
from collections import Counter

def group_rare(df, col, min_count, other_label="Other"):
    """Replace infrequent values of ``df[col]`` with ``other_label``, in place.

    Values are compared as strings. A value is kept when it occurs at least
    ``min_count`` times; every other value is overwritten with ``other_label``.

    Returns the kept values as a sorted list of strings.
    """
    as_text = df[col].astype(str)
    counts = as_text.value_counts()
    keep = set(counts[counts >= min_count].index)
    df[col] = as_text.where(as_text.isin(keep), other_label)
    return sorted(keep)

# Model can be very granular, so it gets a lower threshold than Brand.
_model_min_count = max(20, RARE_THRESH // 2)

rare_info = {}

if "Brand" in df.columns:
    kept_brands = group_rare(df, "Brand", RARE_THRESH, "Other")
    rare_info.update({"Brand_min_count": RARE_THRESH, "Brand_kept": kept_brands})

if "Model" in df.columns:
    kept_models = group_rare(df, "Model", _model_min_count, "Other")
    rare_info.update({"Model_min_count": _model_min_count, "Model_kept": kept_models})

# Record thresholds and surviving categories alongside the other metadata.
meta["rare_category"] = rare_info


In [13]:
# --- Log-transform skewed numeric features (after dropping extra columns) ---
# NOTE(review): LOG_FEATURES_AUTO and LOG_FEATURES_MANUAL are not consulted
# here; selection is purely by the skewness test below.
from scipy.stats import skew
logged_cols = set()

# Numeric predictors only: never the target, never PricePerKm.
_excluded = {TARGET, "PricePerKm"}
candidate_log = [
    name for name in df.select_dtypes(include=[np.number]).columns
    if name not in _excluded
]

for name in candidate_log:
    observed = df[name].dropna()
    strictly_positive = not (observed <= 0).any()
    # Columns holding zeros/negatives are skipped; the rest are transformed
    # only when |skewness| exceeds 1.0.
    if strictly_positive and abs(skew(observed)) > 1.0:
        df[name] = np.log1p(df[name])
        logged_cols.add(name)

_logged_sorted = sorted(list(logged_cols))
meta["log_transform"] = {"columns": _logged_sorted}
print("✅ Log-transformed columns:", _logged_sorted)


✅ Log-transformed columns: ['Horsepower']


In [14]:
# Persist the metadata collected by the extra preprocessing cells.
META_PATH.write_text(json.dumps(meta, indent=2))

print("Extra preprocessing recorded to preprocess_meta.json")


Extra preprocessing recorded to preprocess_meta.json


Train/Test split

In [15]:
# Hold out TEST_SIZE of the rows; SEED is passed as random_state.
train_df, test_df = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=SEED,
)
print("Train shape:", train_df.shape, " | Test shape:", test_df.shape)

Train shape: (40000, 15)  | Test shape: (10000, 15)


Save processed data + metadata

In [16]:
# ---- Define what we actually kept/dropped (FINAL 14 for modeling) ----
# The 14 modeling columns (no Doors/Seats/Options/etc.), in output order.
KEPT_ORDER = [
    "Brand", "Model", "Year", "Condition",
    "Mileage(km)", "EngineSize(L)", "FuelType",
    "Horsepower", "Torque", "Transmission",
    "DriveType", "BodyType",
    "FuelEfficiency(L/100km)",
    "Price($)"  # target
]

# Keep only those that exist (in case columns were renamed upstream)
kept_cols = [c for c in KEPT_ORDER if c in df.columns]

# Everything else still present in the frame is considered dropped
dropped_cols = [c for c in df.columns if c not in kept_cols]

# What the categorical-normalization cell did to each column. These columns are
# NOT raw any more, so the metadata must say so. If you remove the category
# unification steps, empty this dict.
_normalization_notes = {
    "Brand": "whitespace runs collapsed to a single space",
    "Model": "whitespace runs collapsed to a single space",
    "Condition": "stripped, lower-cased, '-'/'_' -> space; synonyms unified via cond_map",
    "FuelType": "stripped, lower-cased, '-'/'_' -> space; synonyms unified via fuel_map",
    "Transmission": "stripped, lower-cased, '-'/'_' -> space; synonyms unified via trans_map",
    "DriveType": "stripped, lower-cased, '-'/'_' -> space; synonyms unified via drive_map",
    "BodyType": "stripped, lower-cased, '-'/'_' -> space; synonyms unified via body_map",
}
# Only describe columns that end up in the modeling set.
normalization_info = {
    col: note for col, note in _normalization_notes.items() if col in kept_cols
}

# If you didn’t run the log-transform step or it found none, make sure this exists
if "logged_cols" not in locals():
    logged_cols = []


In [17]:
# Make sure both output directories exist BEFORE anything is written into them.
INTERIM_FILE.parent.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Interim copy: the full cleaned frame (pre-split), including any column that is
# not part of the modeling set.
df.to_csv(INTERIM_FILE, index=False)

# Final splits (train_df / test_df come from the train/test split cell above).
# Write only the kept columns so the files match the "kept_columns" and
# "dropped_columns" recorded in the metadata below; writing the frames as-is
# would still include every column listed as dropped.
train_path = PROCESSED_DIR / "train.csv"
test_path  = PROCESSED_DIR / "test.csv"
train_df[kept_cols].to_csv(train_path, index=False)
test_df[kept_cols].to_csv(test_path, index=False)

# ---- Build reproducible metadata that reflects the NEW pipeline ----
meta = {
    "seed": SEED,
    "test_size": TEST_SIZE,
    "kept_columns": sorted(list(kept_cols)),
    "dropped_columns": sorted(list(dropped_cols)),
    "normalization": normalization_info,
    "log_transform": {"columns": sorted(list(logged_cols))},
    "notes": [
        "CarAge dropped as redundant with Year",
        "PricePerKm excluded from modeling to avoid leakage",
        "Options text features removed per evaluation feedback (kept core 14 columns)"
    ]
}
# Rebuilding `meta` from scratch would throw away the rare-category record
# (thresholds + kept Brand/Model values) made earlier in this session; carry it
# over so the grouping stays documented in the saved file.
if "rare_info" in locals():
    meta["rare_category"] = rare_info

# Save metadata
with open(PROCESSED_DIR / "preprocess_meta.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Saved files:")
print(" -", INTERIM_FILE)
print(" -", train_path)
print(" -", test_path)
print(" -", PROCESSED_DIR / "preprocess_meta.json")


Saved files:
 - C:\Users\User\Desktop\Projects\Car_Price\data\interim\cleaned.csv
 - C:\Users\User\Desktop\Projects\Car_Price\data\processed\train.csv
 - C:\Users\User\Desktop\Projects\Car_Price\data\processed\test.csv
 - C:\Users\User\Desktop\Projects\Car_Price\data\processed\preprocess_meta.json


Quick sanity report

In [18]:
# ---- Simplified report for final 14-column version ----
# Winsorized columns are listed only when the cutoffs dict exists in this session.
_winsorized = list(winsor_cutoffs.keys()) if "winsor_cutoffs" in locals() else []
_report_notes = [
    "Dropped CarAge (redundant with Year)",
    "Dropped PricePerKm (data leakage risk)",
    "Removed Options-related engineered features per evaluator feedback",
    "Kept core 14 columns for modeling",
]
report = dict(
    rows_train=len(train_df),
    rows_test=len(test_df),
    kept_columns=sorted(list(kept_cols)),
    dropped_columns=sorted(list(dropped_cols)),
    winsorized=_winsorized,
    target=TARGET,
    notes=_report_notes,
)
report


{'rows_train': 40000,
 'rows_test': 10000,
 'kept_columns': ['BodyType',
  'Brand',
  'Condition',
  'DriveType',
  'EngineSize(L)',
  'FuelEfficiency(L/100km)',
  'FuelType',
  'Horsepower',
  'Mileage(km)',
  'Model',
  'Price($)',
  'Torque',
  'Transmission',
  'Year'],
 'dropped_columns': ['AccidentHistory'],
 'winsorized': ['Price($)',
  'Mileage(km)',
  'Horsepower',
  'EngineSize(L)',
  'FuelEfficiency(L/100km)'],
 'target': 'Price($)',
 'notes': ['Dropped CarAge (redundant with Year)',
  'Dropped PricePerKm (data leakage risk)',
  'Removed Options-related engineered features per evaluator feedback',
  'Kept core 14 columns for modeling']}

Final Check

In [19]:
# Shape of the in-memory cleaned frame (not of the saved splits).
final_shape = df.shape
print("Final dataset shape:", final_shape)

Final dataset shape: (50000, 15)


In [20]:
# Count of NaN cells per column in the cleaned frame.
missing_per_column = df.isna().sum()
print("Missing values per column:\n", missing_per_column)

Missing values per column:
 Brand                      0
Model                      0
Year                       0
Condition                  0
Mileage(km)                0
EngineSize(L)              0
FuelType                   0
Horsepower                 0
Torque                     0
Transmission               0
DriveType                  0
BodyType                   0
AccidentHistory            0
FuelEfficiency(L/100km)    0
Price($)                   0
dtype: int64


In [21]:
# Number of rows that repeat an earlier row exactly.
n_duplicate_rows = df.duplicated().sum()
print("Duplicate rows:", n_duplicate_rows)

Duplicate rows: 0


In [22]:
# Dtype of every column after all preprocessing steps.
column_types = df.dtypes
print("Column types:\n", column_types)

Column types:
 Brand                       object
Model                       object
Year                         int64
Condition                   object
Mileage(km)                float64
EngineSize(L)              float64
FuelType                    object
Horsepower                 float64
Torque                       int64
Transmission                object
DriveType                   object
BodyType                    object
AccidentHistory             object
FuelEfficiency(L/100km)    float64
Price($)                   float64
dtype: object


In [23]:
# Preview the first ten rows of the cleaned frame.
df.head(n=10)

Unnamed: 0,Brand,Model,Year,Condition,Mileage(km),EngineSize(L),FuelType,Horsepower,Torque,Transmission,DriveType,BodyType,AccidentHistory,FuelEfficiency(L/100km),Price($)
0,Porsche,Panamera,2008,used,256395.0,3.3,gasoline,6.242223,395,manual,awd,sedan,no,11.96,13884.0
1,Audi,A6,2023,used,20433.0,2.2,diesel,5.713733,270,manual,fwd,sedan,yes,8.74,38888.0
2,BMW,X5,2022,used,52328.0,3.2,gasoline,5.993961,388,automatic,awd,suv,yes,15.68,33074.0
3,Hyundai,Tucson,2019,used,91878.0,1.6,hybrid,5.236442,219,automatic,fwd,suv,no,9.45,12966.0
4,Fiat,500,2012,damaged,192331.0,1.1,gasoline,4.51086,112,automatic,fwd,hatchback,no,7.16,2670.0
5,Porsche,911 Carrera,2018,used,110968.0,3.3,gasoline,6.338594,392,automatic,awd,coupe,no,12.69,47830.0
6,Mercedes-Benz,S-Class,2019,used,82607.0,4.9,hybrid,6.234411,750,manual,rwd,sedan,yes,17.8001,51189.0
7,Porsche,Panamera,2014,used,163074.0,3.5,gasoline,6.242223,428,manual,rwd,sedan,no,13.06,27296.0
8,Audi,Q7,2007,used,274471.0,2.0,gasoline,5.755742,277,automatic,awd,suv,yes,10.38,5389.0
9,Ford,Mustang,2019,used,95498.0,4.6,gasoline,5.899897,588,manual,rwd,coupe,yes,14.9,10704.0
