Load data from "/backend/dump_cars.json".

In [2]:
import json
import pandas as pd

# Load JSON dump
with open("/Users/belyaevmikhail/VCCode/car-market-place/backend/dump_cars.json", "r") as f:
    raw_data = json.load(f)

# Flatten into DataFrame
df = pd.json_normalize([item["fields"] for item in raw_data if "fields" in item])

# Basic check
print(f"Total: {len(df)} rows")
df.head()

Total: 8885 rows


Unnamed: 0,brand,model,year,price,description,created_at,mileage,car_ad_id,reference_url,description_detail,...,additional_options,location,owner_name,owner_member_since,owner_last_seen,owner_profile_url,owner_tel_number,owner_type,body_type,owner_count
0,Toyota,Camry,2022,30000.0,A comfortable sedan.,2024-12-17T07:00:00,,,,,...,,,,,,,,,,
1,Chevrolet,Malibu 1,2013,12000.0,"93,000 km odometer, light scratch on the body.",2024-12-17T07:00:00,,,,,...,,,,,,,,,,
2,Chevrolet,Lacetti,2012,7376.36,ласети 1.6 сотилади,2024-12-15T14:00:00,245000.0,,,,...,,,,,,,,,,
3,Chevrolet,Lacetti,2012,6435.75,Lasseti 2012 2 pz,2024-12-13T14:00:00,304000.0,,,,...,,,,,,,,,,
4,Chevrolet,Lacetti,2012,6930.81,Laseti sotiladi yili 2012,2024-11-22T14:00:00,347.0,,,,...,,,,,,,,,,


Clean the data:

In [3]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from datetime import datetime, timedelta

print("🚀 Starting data cleaning pipeline...")
print("🔢 Initial rows:", len(df))

# Step 1: Convert to numeric
df["price"] = pd.to_numeric(df["price"], errors="coerce")
df["year"] = pd.to_numeric(df["year"], errors="coerce")
df["mileage"] = pd.to_numeric(df["mileage"], errors="coerce")

# Step 2: Drop rows missing price/year/mileage/car_ad_id
before = len(df)
df = df.dropna(subset=["price", "year", "mileage", "car_ad_id"])
print(f"✅ Step 2: Dropped {before - len(df)} rows with missing key fields → {len(df)} remain")

# Step 3: Drop duplicates by car_ad_id
before = len(df)
df = df.drop_duplicates(subset="car_ad_id")
print(f"✅ Step 3: Dropped {before - len(df)} duplicate ads → {len(df)} remain")

# Step 4: Drop rows with old 'created_at' (older than 30 days)
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
last_month = datetime.now() - timedelta(days=30)
before = len(df)
df = df[df["created_at"] >= last_month]
print(f"✅ Step 4: Filtered to last 30 days → Dropped {before - len(df)} rows → {len(df)} remain")

# Step 5: Detect & drop price outliers using linear regression on year + mileage
X = df[["year", "mileage"]]
y = df["price"]
model = LinearRegression()
model.fit(X, y)
df["price_predicted"] = model.predict(X)
df["residual"] = np.abs(df["price"] - df["price_predicted"])

threshold = df["residual"].mean() + 3 * df["residual"].std()
before = len(df)
df = df[df["residual"] < threshold]
print(f"✅ Step 5: Removed {before - len(df)} price outliers → {len(df)} remain")

# Step 6: Drop temp columns
df.drop(columns=["price_predicted", "residual"], inplace=True)

# Final summary
print("🎯 Final dataset shape:", df.shape)
print("🧼 Missing values per column:\n", df.isnull().sum())

🚀 Starting data cleaning pipeline...
🔢 Initial rows: 8885
✅ Step 2: Dropped 90 rows with missing key fields → 8795 remain
✅ Step 3: Dropped 0 duplicate ads → 8795 remain
✅ Step 4: Filtered to last 30 days → Dropped 647 rows → 8148 remain
✅ Step 5: Removed 186 price outliers → 7962 remain
🎯 Final dataset shape: (7962, 26)
🧼 Missing values per column:
 brand                    0
model                    0
year                     0
price                    0
description              0
created_at               0
mileage                  0
car_ad_id                0
reference_url            0
description_detail      24
gear_type               56
color                 1025
vehicle_type          7962
fuel_type             3486
condition             3191
customer_paid_tax     7962
additional_options    2953
location                 0
owner_name              24
owner_member_since      24
owner_last_seen         24
owner_profile_url       76
owner_tel_number      7962
owner_type            1316

Save as CSV for training:

In [5]:
df.to_csv("cars_cleaned.csv", index=False)
print("📁 Cleaned data saved to cars_cleaned.csv")

📁 Cleaned data saved to cars_cleaned.csv


Train a better ML model (with more features):

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import joblib
import pandas as pd

# Step 1: Select useful features
feature_cols = ["year", "mileage", "brand", "model", "gear_type", "color", "fuel_type", "body_type"]
df_ml = df[feature_cols + ["price"]].dropna()

# Step 2: Define numerical and categorical columns
num_cols = ["year", "mileage"]
cat_cols = ["brand", "model", "gear_type", "color", "fuel_type", "body_type"]

# Step 3: Build preprocessing pipeline
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
], remainder="passthrough")  # keeps numerical columns

# Step 4: Full pipeline with model
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Step 5: Train-test split
X = df_ml[feature_cols]
y = df_ml["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train
pipeline.fit(X_train, y_train)

# Optional: Evaluate
score = pipeline.score(X_test, y_test)  
print(f"🎯 Model R² Score: {score:.4f}")

🎯 Model R² Score: 0.9214


Save the model as .pkl:

In [8]:
joblib.dump(pipeline, "model_output/car_price_model_v2.pkl")
print("✅ Model saved to model_output/car_price_model_v2.pkl")

✅ Model saved to model_output/car_price_model_v2.pkl
