In [1]:
import pandas as pd
import json
import math
import random
from collections import Counter
import re

In [2]:
# Upload the dataset through google.colab's upload helper. The next cell uses
# the keys of `uploaded` as file names to read.
# NOTE(review): presumably this cell only works inside a Colab runtime — confirm.
from google.colab import files
uploaded = files.upload()

Saving carmax_cleaned.csv to carmax_cleaned.csv


In [3]:
# Guard against an empty upload result: indexing the key list would otherwise
# fail with an IndexError that does not explain what went wrong.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select the CSV.")

# The first key of `uploaded` is used as the path of the CSV to load.
filename = next(iter(uploaded))
raw = pd.read_csv(filename)

In [4]:
# Keep only Toyota Camry rows, working on a copy so `raw` is left untouched.
is_toyota_camry = raw["makeName"].eq("Toyota") & raw["modelName"].eq("Camry")
camry = raw.loc[is_toyota_camry].copy()

# Require a present, strictly positive price ...
has_positive_price = camry["price"].notna() & camry["price"].gt(0)
camry = camry.loc[has_positive_price]

# ... and a present mileage reading.
has_mileage = camry["unitMileage/value"].notna()
camry = camry.loc[has_mileage]

In [5]:
# Structural summary (row count, column count, dtypes) of the filtered frame.
camry.info()

<class 'pandas.core.frame.DataFrame'>
Index: 777 entries, 10 to 5345
Columns: 166 entries, accidentCount to vin
dtypes: float64(25), int64(13), object(128)
memory usage: 1013.7+ KB


In [6]:
# Year that vehicle age is measured against. Named instead of inlined so the
# assumption is visible and the age feature does not silently depend on when
# the notebook happens to be re-run.
REFERENCE_YEAR = 2025

camry["age"] = REFERENCE_YEAR - camry["carYear"]
# Shorter alias for the mileage column used by the feature-building cell.
camry["mileage"] = camry["unitMileage/value"]

In [7]:
# Use the "distance" column (coerced to numbers) when it exists and has at
# least one non-missing value; otherwise fall back to an all-NaN column.
has_distance = "distance" in camry.columns and camry["distance"].notna().any()
if not has_distance:
    camry["listing_distance_miles"] = float("nan")
else:
    camry["listing_distance_miles"] = pd.to_numeric(camry["distance"], errors="coerce")

In [8]:
# Trim-related flags
# Missing trims become the literal "Unknown" before any matching is done.
trim_names = camry["trimName"].fillna("Unknown")
camry["trimName"] = trim_names
# "LE" must appear as a standalone word (the \b anchors), not inside a longer token.
camry["le_flag"] = trim_names.str.contains(r"\bLE\b", na=False).astype(int)
camry["hybrid_flag"] = trim_names.str.contains("Hybrid", na=False).astype(int)

# Exterior color (top 10 + Other)
K_COLOR = 10
if "exteriorColorName" in camry.columns:
    # Fill missing colors once and use that same series for both the frequency
    # ranking and the bucketing. Ranking on the filled series but bucketing on
    # the unfilled one would send every missing color to "Other", even when
    # "Unknown" is itself one of the top categories.
    color_names = camry["exteriorColorName"].fillna("Unknown")
    top_colors = color_names.value_counts().nlargest(K_COLOR).index.tolist()
    camry["exteriorColorCat"] = color_names.where(color_names.isin(top_colors), other="Other")
else:
    # No color column at all: every row gets the same placeholder category.
    camry["exteriorColorCat"] = "Unknown"

# Trim (top 10 + Other)
K_TRIM = 10
trim_frequency = camry["trimName"].value_counts()
top_trims = trim_frequency.nlargest(K_TRIM).index.tolist()
# Trims outside the most frequent K_TRIM are collapsed into a single "Other" bucket.
is_top_trim = camry["trimName"].isin(top_trims)
camry["trimCat"] = camry["trimName"].where(is_top_trim, other="Other")

# Options (options/0..9) -> pick top 15 frequent options as binary flags
# Option columns are those named exactly "options/<digits>".
option_pattern = re.compile(r"options/\d+")
option_cols = [c for c in camry.columns if option_pattern.fullmatch(c)]
def norm_opt(s: str) -> str:
    """Return `s` trimmed, lower-cased, and with each whitespace run collapsed to one space."""
    return re.sub(r"\s+", " ", s.strip().lower())

# Gather every non-missing option value (as text) across the option columns,
# column by column.
all_opts = []
for opt_col in option_cols:
    all_opts.extend(camry[opt_col].dropna().astype(str).tolist())

# Count normalized option names and keep the M_OPTS most frequent ones.
normed_opts = [norm_opt(s) for s in all_opts if isinstance(s, str)]
opt_counts = Counter(normed_opts)
M_OPTS = 15
top_opts = [name for name, _count in opt_counts.most_common(M_OPTS)]

# Normalize every option cell once (missing cells stay missing) instead of
# re-normalizing each row's cells again for every one of the top options.
normed_option_cells = camry[option_cols].copy()
for c in option_cols:
    normed_option_cells[c] = camry[c].map(lambda v: norm_opt(str(v)), na_action="ignore")

# Column names produced in this pass, used to detect two different options
# that sanitize to the same `opt_...` column name.
created_opt_cols = set()
for opt in top_opts:
    col_name = f"opt_{re.sub(r'[^a-z0-9]+','_', opt)}"
    # 1 when any option slot in the row holds this option, otherwise 0.
    has_opt = normed_option_cells.eq(opt).any(axis=1).astype(int)
    if col_name in created_opt_cols:
        # Name collision: merge with the flag already written in this pass
        # rather than letting the later option overwrite the earlier one.
        has_opt = has_opt | camry[col_name]
    camry[col_name] = has_opt
    created_opt_cols.add(col_name)

# One-hot encodings for color & trim
color_dummies = pd.get_dummies(camry["exteriorColorCat"], prefix="color")
trim_dummies = pd.get_dummies(camry["trimCat"], prefix="trim")

# Assemble features
numeric_cols = ["age", "mileage", "listing_distance_miles"]
# Binary inputs: the two trim flags plus every generated `opt_*` option flag.
option_flag_cols = [c for c in camry.columns if c.startswith("opt_")]
binary_cols = ["le_flag", "hybrid_flag"] + option_flag_cols

# Column order in X_df: numeric, binary, color dummies, trim dummies.
feature_parts = [
    camry[numeric_cols],
    camry[binary_cols],
    color_dummies,
    trim_dummies,
]
X_df = pd.concat(feature_parts, axis=1)

# Target: listing price, copied so it is independent of `camry`.
y_df = camry["price"].copy()

# Clean numeric cols: drop all-NaN numeric columns, fill others w/ median
# NOTE(review): the median is taken over every row of X_df, i.e. before the
# train/test split done in a later cell — confirm that is intended.
numeric_cols_clean = []
for c in numeric_cols:
    if c not in X_df.columns:
        continue
    coerced = pd.to_numeric(X_df[c], errors="coerce")
    if coerced.isna().all():
        # Nothing usable in this column: remove it from the feature frame.
        X_df = X_df.drop(columns=[c])
        continue
    X_df[c] = coerced.fillna(coerced.median())
    numeric_cols_clean.append(c)

# Scale numeric features (z-score) — helps gradient descent
# NOTE(review): mean/std are computed over every row of X_df, i.e. before the
# train/test split done in a later cell — confirm that is intended.
scaler_stats = {}
for c in numeric_cols_clean:
    column = X_df[c]
    mu = column.mean()
    sd = column.std(ddof=0)
    # A constant (or undefined) spread would divide by zero/NaN; scale by 1 instead.
    if pd.isna(sd) or sd == 0:
        sd = 1.0
    scaler_stats[c] = {"mean": float(mu), "std": float(sd)}
    X_df[c] = (column - mu) / sd

# Final feature names, in column order.
feature_cols = list(X_df.columns)



In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Shuffle and hold out 20% of the rows as a test set; random_state pins the
# shuffle so the split is the same on every run.
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    X_df, y_df, test_size=0.2, random_state=42, shuffle=True
)

In [11]:
# Feature names in training-column order; a later cell sizes `beta` from the
# length of this list.
feature_cols = X_train_df.columns.tolist()

In [12]:
def df_to_matrix_and_vector(X_df, y_df):
    """Convert a feature frame and a target series to plain Python floats.

    Returns a tuple `(X, y)`: `X` is a list of rows, each a list of floats in
    the frame's column order; `y` is a list of floats in the series' order.
    """
    matrix = []
    for record in X_df.values.tolist():
        matrix.append([float(cell) for cell in record])
    vector = list(map(float, y_df.values.tolist()))
    return matrix, vector

# Plain-Python copies of both splits: nested float lists for the features and
# flat float lists for the targets.
X_train, y_train = df_to_matrix_and_vector(X_train_df, y_train_df)
X_test,  y_test  = df_to_matrix_and_vector(X_test_df,  y_test_df)


In [13]:
# One weight per feature plus a leading intercept weight. The feature count is
# taken from the training frame itself, ignoring an "intercept" column if a
# later cell has already inserted one. Deriving it from `feature_cols` would
# give a vector that is one too long if that list were rebuilt after the
# insertion.
n_features = sum(1 for col in X_train_df.columns if col != "intercept")
beta = [0.0] * (n_features + 1)
# Start the intercept at the mean training price; all other weights start at 0.
beta[0] = float(y_train_df.mean())

In [14]:
# Prepend a constant 1.0 "intercept" column to each split, skipping any frame
# that already has one so the cell can be re-run safely.
for frame in (X_train_df, X_test_df):
    if "intercept" not in frame.columns:
        frame.insert(0, "intercept", 1.0)

In [16]:
def loss(X_train_df, beta, y_train_df):
    """Return the halved mean squared error of a linear model.

    Computes ``sum((X @ beta - y) ** 2) / (2 * m)`` where ``m`` is the number
    of rows. Columns of ``X_train_df`` are matched to ``beta`` by position and
    rows are matched to ``y_train_df`` by position (index labels are ignored).

    Parameters
    ----------
    X_train_df : pandas.DataFrame
        Feature matrix; every column must be convertible to float.
    beta : sequence of float
        One weight per column of ``X_train_df``.
    y_train_df : pandas.Series
        Target values, one per row of ``X_train_df``.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the frame has no rows, or the lengths of ``beta`` / ``y_train_df``
        do not match the frame's shape.
    """
    # Convert once to a dense float array; per-element `.iloc` access inside
    # nested Python loops is far slower for the same arithmetic.
    X = X_train_df.to_numpy(dtype=float)
    y = y_train_df.to_numpy(dtype=float)
    m, n_cols = X.shape

    if m == 0:
        raise ValueError("loss is undefined for a frame with no rows")
    if n_cols != len(beta):
        raise ValueError(f"beta has {len(beta)} weights but the frame has {n_cols} columns")
    if len(y) != m:
        raise ValueError(f"got {len(y)} targets for {m} rows")

    residuals = X.dot(beta) - y
    return float(residuals.dot(residuals) / (2.0 * m))

In [17]:
def predictions(beta, X_df):
    """Return the linear-model prediction ``X @ beta`` for every row.

    Columns of ``X_df`` are matched to ``beta`` by position.

    Parameters
    ----------
    beta : sequence of float
        One weight per column of ``X_df``.
    X_df : pandas.DataFrame
        Feature matrix; every column must be convertible to float.

    Returns
    -------
    list of float
        One prediction per row, in row order. Empty when the frame has no rows.

    Raises
    ------
    ValueError
        If ``len(beta)`` differs from the number of columns. A shorter ``beta``
        is not allowed to silently ignore trailing columns.
    """
    # Convert once to a dense float array; per-element `.iloc` access inside
    # nested Python loops is far slower for the same arithmetic.
    X = X_df.to_numpy(dtype=float)
    if X.shape[0] == 0:
        return []
    if X.shape[1] != len(beta):
        raise ValueError(f"beta has {len(beta)} weights but the frame has {X.shape[1]} columns")
    return X.dot(beta).tolist()


In [18]:
def compute_gradients(X_df, y_df, beta):
    """Return the gradient of the halved mean squared error w.r.t. ``beta``.

    For residuals ``r = X @ beta - y`` over ``m`` rows, entry ``j`` of the
    result is ``sum_i(r_i * X[i, j]) / m``. Columns are matched to ``beta`` by
    position and rows are matched to ``y_df`` by position.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature matrix; every column must be convertible to float.
    y_df : pandas.Series
        Target values, one per row of ``X_df``.
    beta : sequence of float
        Current weights, one per column of ``X_df``.

    Returns
    -------
    list of float
        One partial derivative per weight.

    Raises
    ------
    ValueError
        If the frame has no rows, or the lengths of ``beta`` / ``y_df`` do not
        match the frame's shape.
    """
    # Convert once to dense float arrays; per-element `.iloc` access inside
    # nested Python loops is far slower for the same arithmetic.
    X = X_df.to_numpy(dtype=float)
    y = y_df.to_numpy(dtype=float)
    m, n_cols = X.shape

    if m == 0:
        raise ValueError("gradients are undefined for a frame with no rows")
    if n_cols != len(beta):
        raise ValueError(f"beta has {len(beta)} weights but the frame has {n_cols} columns")
    if len(y) != m:
        raise ValueError(f"got {len(y)} targets for {m} rows")

    residuals = X.dot(beta) - y
    return (X.T.dot(residuals) / m).tolist()

In [19]:
def gradient_descent(X_df, y_df, beta, lr=0.03, epochs=3000, verbose_every=500):
    """Fit linear-model weights with full-batch gradient descent.

    Each epoch computes the gradient with `compute_gradients` and moves every
    weight by ``-lr * gradient``.

    Parameters
    ----------
    X_df, y_df
        Training features and targets, passed through to `compute_gradients`
        and `loss` unchanged.
    beta : sequence of float
        Starting weights. The sequence is copied, so the caller's object is
        not modified.
    lr : float
        Step size.
    epochs : int
        Number of update steps.
    verbose_every : int
        Checkpoint interval. A falsy value (0 / None) disables checkpoints.

    Returns
    -------
    (list of float, list of (int, float))
        The updated weights, and ``(epoch, loss)`` checkpoints recorded at
        epoch 1, at every multiple of ``verbose_every``, and at the last epoch.
    """
    # Work on a private copy so the caller's starting weights are left intact.
    beta = list(beta)
    history = []
    for epoch in range(1, epochs + 1):
        grads = compute_gradients(X_df, y_df, beta)
        for j, grad in enumerate(grads):
            beta[j] -= lr * grad

        is_checkpoint = epoch == 1 or epoch == epochs or (
            bool(verbose_every) and epoch % verbose_every == 0
        )
        if verbose_every and is_checkpoint:
            history.append((epoch, loss(X_df, beta, y_df)))
    return beta, history

In [20]:
def mean_squared_error_series(y_df, y_hat_list):
    """Return the mean of squared differences between predictions and targets.

    `y_hat_list` is a list of predictions; `y_df` is read positionally (via
    `.iloc`) at the same indices. The mean is taken over `len(y_hat_list)`.
    """
    count = len(y_hat_list)
    squared_sum = 0.0
    for position, y_hat in enumerate(y_hat_list):
        diff = y_hat - float(y_df.iloc[position])
        squared_sum += diff * diff
    return squared_sum / count

In [None]:
# Fit the weights on the training split. `beta` is rebound to the fitted
# weights and `history` holds the recorded (epoch, loss) checkpoints.
beta, history = gradient_descent(X_train_df, y_train_df, beta, lr=0.03, epochs=3000, verbose_every=500)

In [None]:
# Score the fitted weights on the training split ...
train_preds = predictions(beta, X_train_df)
train_mse = mean_squared_error_series(y_train_df, train_preds)

# ... and on the held-out split.
test_preds = predictions(beta, X_test_df)
test_mse = mean_squared_error_series(y_test_df, test_preds)

# First three and last three recorded (epoch, loss) checkpoints.
history_excerpt = history[:3] + history[-3:]

print(f"Train MSE: {train_mse:,.2f}")
print(f" Test MSE: {test_mse:,.2f}")
print("History (first & last few):", history_excerpt)