# Sklearn regularized regression with target scaling

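This notebook fits polynomial feature expansions (degrees 1–3) with feature scaling and Ridge regularization, and evaluates each degree via 10-fold (~90/10) cross-validation; the Ridge penalty `alpha` is chosen by `RidgeCV` using an inner 5-fold search. The target (`y`) is scaled via `TransformedTargetRegressor`, so training occurs in standardized space while predictions and the reported MSE are in the original target units. The notebook logs train/validation metrics per fold, picks the degree with the lowest mean validation MSE, refits that configuration on the full dataset, and saves the fitted best model plus logs for downstream visualization.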

In [1]:
from pathlib import Path
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Filesystem layout: the project root is taken to be the parent of the current
# working directory; input data and model artifacts live beneath it.
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath("input", "price-data", "cleaned_combined_price_data.csv")
OUTPUT_DIR = PROJECT_ROOT.joinpath("working", "price_model")

# Create the artifact directory up front so later cells can write without checking.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a misconfigured working directory is obvious.
print(f"Project root: {PROJECT_ROOT}")
print(f"Data path: {DATA_PATH}")
print(f"Output dir: {OUTPUT_DIR}")


Project root: /kaggle
Data path: /kaggle/input/price-data/cleaned_combined_price_data.csv
Output dir: /kaggle/working/price_model


In [2]:
# Read the cleaned price data and derive an all-float feature matrix and target

df = pd.read_csv(DATA_PATH)

# Currency-formatted columns: keep only digits, "." and "-", then parse as numbers.
# NOTE(review): GROSS_TAX may itself be derived from the assessed value; confirm it
# is a legitimate predictor of TOTAL_VALUE before relying on the reported errors.
for col in ["GROSS_TAX"]:
    stripped = df[col].astype(str).str.replace(r"[^0-9.-]", "", regex=True)
    # Cells left empty after stripping become NaN; anything still unparseable is coerced to NaN.
    df[col] = pd.to_numeric(stripped.replace("", np.nan), errors="coerce")

TARGET_COL = "TOTAL_VALUE"
# Every column other than the target is used as a feature.
feature_cols = [name for name in df.columns if name != TARGET_COL]

# Drop rows missing the target or any feature so the float casts below cannot hit NaN.
required_cols = [TARGET_COL] + feature_cols
clean_df = df.dropna(subset=required_cols).reset_index(drop=True)
X = clean_df.loc[:, feature_cols].astype(float)
y = clean_df.loc[:, TARGET_COL].astype(float)

print(f"Dataset after cleaning: {clean_df.shape[0]} rows, {X.shape[1]} features")
print(X.head())


Dataset after cleaning: 387636 rows, 20 features
   ZIPCODE  OWN_OCC  GROSS_TAX  GROSS_AREA  LIVING_AREA  NUM_BLDGS    LUC  \
0   2128.0      1.0    7676.00      3353.0       2202.0        1.0  105.0   
1   2128.0      1.0    7947.02      3299.0       2307.0        1.0  105.0   
2   2128.0     -1.0    7794.44      3392.0       2268.0        1.0  105.0   
3   2128.0     -1.0    7126.49      3108.0       2028.0        1.0  105.0   
4   2128.0      1.0    7620.52      3700.0       2546.0        1.0  104.0   

   RES_FLOOR  YR_LATEST_DEVELOPED  BED_RMS  FULL_BTH  HLF_BTH  TT_RMS  \
0        3.0               1900.0      6.0       3.0      0.0    12.0   
1        3.0               2000.0      3.0       3.0      0.0     9.0   
2        3.0               1985.0      5.0       3.0      2.0    13.0   
3        3.0               1991.0      5.0       3.0      0.0    11.0   
4        3.0               1978.0      6.0       3.0      2.0    13.0   

   KITCHEN_TYPE  HEAT_TYPE  AC_TYPE  NUM_PARKING 

In [3]:
# Cross-validate degrees 1–3 using PolynomialFeatures + RidgeCV (with feature/target scaling) and log train/validation metrics

def make_regressor(
    degree: int,
    alphas=(0.01, 0.1, 1.0, 10.0, 100.0),
    cv=5,
) -> TransformedTargetRegressor:
    """Build an unfitted polynomial Ridge regressor that trains on a standardized target.

    The feature pipeline expands the inputs into polynomial terms up to
    ``degree`` (without a bias column), standardizes the expanded features, and
    fits a ``RidgeCV`` that selects its penalty from ``alphas``. The pipeline is
    wrapped in a ``TransformedTargetRegressor`` whose target transformer is a
    ``StandardScaler``.

    Parameters
    ----------
    degree : int
        Maximum polynomial degree passed to ``PolynomialFeatures``.
    alphas : sequence of float
        Candidate regularization strengths passed to ``RidgeCV``.
    cv : int, default 5
        Inner cross-validation setting forwarded unchanged to ``RidgeCV(cv=...)``.
        The default reproduces the previously hard-coded 5-fold search.

    Returns
    -------
    TransformedTargetRegressor
        A new, unfitted estimator.
    """
    feature_pipeline = Pipeline(
        [
            ("poly", PolynomialFeatures(degree=degree, include_bias=False)),
            ("scale", StandardScaler()),
            ("model", RidgeCV(alphas=alphas, cv=cv, fit_intercept=True)),
        ]
    )
    return TransformedTargetRegressor(
        regressor=feature_pipeline,
        transformer=StandardScaler(),
        check_inverse=False,
    )

def evaluate_degrees(
    X: pd.DataFrame,
    y: pd.Series,
    degrees=(1, 2, 3),
    n_splits: int = 10,
    seed: int = 42,
    alphas=(0.01, 0.1, 1.0, 10.0, 100.0),
):
    """Cross-validate one regressor per polynomial degree and collect MSE logs.

    For each entry of ``degrees`` a regressor is built with ``make_regressor``
    and refit on every training split of a shuffled ``KFold``. Train and
    validation MSE are computed from ``predict`` output, and the alpha selected
    by the fitted ``"model"`` step is recorded. One progress line is printed
    per fold.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
    y : pd.Series
        Target values aligned with ``X``.
    degrees : iterable of int
        Polynomial degrees to evaluate.
    n_splits : int
        Number of outer folds.
    seed : int
        ``random_state`` for the shuffled splitter.
    alphas : sequence of float
        Alpha grid forwarded to ``make_regressor``.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``summary_df`` with one row per degree (mean/std of train and validation
        MSE, mean and mode of the selected alphas) and ``fold_df`` with one row
        per (degree, fold).
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    degree_log = []
    fold_log = []

    for degree in degrees:
        model = make_regressor(degree, alphas=alphas)
        degree_folds = []

        for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X), start=1):
            X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
            X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

            model.fit(X_fit, y_fit)
            # Score the held-out rows first, then the rows the model was fit on.
            holdout_mse = mean_squared_error(y_holdout, model.predict(X_holdout))
            fit_mse = mean_squared_error(y_fit, model.predict(X_fit))
            chosen_alpha = float(model.regressor_.named_steps["model"].alpha_)

            degree_folds.append(
                {
                    "degree": degree,
                    "fold": fold_no,
                    "train_mse": float(fit_mse),
                    "val_mse": float(holdout_mse),
                    "alpha": chosen_alpha,
                    "train_size": int(len(fit_rows)),
                    "val_size": int(len(holdout_rows)),
                }
            )
            print(
                f"Degree {degree} | Fold {fold_no}/{n_splits} | "
                f"train MSE: {fit_mse:.2f} | val MSE: {holdout_mse:.2f} | alpha: {chosen_alpha}"
            )

        # Aggregate this degree's folds (population std, i.e. NumPy's default ddof=0).
        fold_log.extend(degree_folds)
        val_scores = [row["val_mse"] for row in degree_folds]
        train_scores = [row["train_mse"] for row in degree_folds]
        picked_alphas = [row["alpha"] for row in degree_folds]
        degree_log.append(
            {
                "degree": degree,
                "val_mse_mean": float(np.mean(val_scores)),
                "val_mse_std": float(np.std(val_scores)),
                "train_mse_mean": float(np.mean(train_scores)),
                "train_mse_std": float(np.std(train_scores)),
                "alpha_mean": float(np.mean(picked_alphas)),
                "alpha_mode": float(pd.Series(picked_alphas).mode().iloc[0]),
            }
        )

    return pd.DataFrame(degree_log), pd.DataFrame(fold_log)

# Run the sweep with evaluate_degrees' default settings and show the per-degree table.
summary_df, fold_df = evaluate_degrees(X=X, y=y)
print("Summary metrics by degree:", summary_df, sep="\n")


Degree 1 | Fold 1/10 | train MSE: 100639622.89 | val MSE: 77093255.08 | alpha: 10.0
Degree 1 | Fold 2/10 | train MSE: 96472255.15 | val MSE: 114562025.96 | alpha: 10.0
Degree 1 | Fold 3/10 | train MSE: 100697870.24 | val MSE: 76545533.44 | alpha: 10.0
Degree 1 | Fold 4/10 | train MSE: 98798881.79 | val MSE: 93598427.29 | alpha: 10.0
Degree 1 | Fold 5/10 | train MSE: 95899086.48 | val MSE: 119762103.99 | alpha: 10.0
Degree 1 | Fold 6/10 | train MSE: 98559224.36 | val MSE: 95802128.16 | alpha: 10.0
Degree 1 | Fold 7/10 | train MSE: 93772787.76 | val MSE: 139070143.92 | alpha: 10.0
Degree 1 | Fold 8/10 | train MSE: 99732906.78 | val MSE: 85322397.97 | alpha: 10.0
Degree 1 | Fold 9/10 | train MSE: 100478239.90 | val MSE: 78497175.78 | alpha: 10.0
Degree 1 | Fold 10/10 | train MSE: 97655183.22 | val MSE: 103960300.81 | alpha: 1.0
Degree 2 | Fold 1/10 | train MSE: 84292036.64 | val MSE: 74652418.46 | alpha: 0.01
Degree 2 | Fold 2/10 | train MSE: 82556159.86 | val MSE: 98588160.21 | alpha: 1.

In [4]:
# Select best model (lowest mean validation MSE), refit on full data, and persist artifacts

# Save aggregate logs
summary_path = OUTPUT_DIR / "ridge_poly_cv_summary.csv"
fold_path = OUTPUT_DIR / "ridge_poly_cv_folds.csv"
summary_df.to_csv(summary_path, index=False)
fold_df.to_csv(fold_path, index=False)

# Identify best degree by lowest val_mse_mean
best_idx = summary_df["val_mse_mean"].idxmin()
best_row = summary_df.loc[best_idx]
best_degree = int(best_row["degree"])
print(f"Best degree (by mean val MSE): {best_degree}")

# Refit best regressor on all data. RidgeCV re-selects alpha during this fit,
# so the value can differ from the per-fold alphas logged during cross-validation.
alphas_grid = (0.01, 0.1, 1.0, 10.0, 100.0)
best_regressor = make_regressor(best_degree, alphas=alphas_grid)
best_regressor.fit(X, y)
best_ridge = best_regressor.regressor_.named_steps["model"]
best_alpha = float(best_ridge.alpha_)
print(f"Best alpha after full-data fit: {best_alpha}")

# Persist best model
# NOTE: unpickling generally needs a compatible scikit-learn version; only load
# this file from a trusted location.
best_model_path = OUTPUT_DIR / "ridge_poly_best_model.pkl"
with best_model_path.open("wb") as f:
    pickle.dump(best_regressor, f)

# Persist best-only fold metrics
best_fold_df = fold_df[fold_df["degree"] == best_degree].reset_index(drop=True)
best_fold_path = OUTPUT_DIR / "ridge_poly_best_folds.csv"
best_fold_df.to_csv(best_fold_path, index=False)

# Read the CV settings back from what was actually run instead of repeating
# literals, so the JSON cannot drift from the fold log or the model configuration.
outer_cv_folds = int(fold_df["fold"].nunique())
# RidgeCV keeps its constructor argument as `.cv`; record it only when it is a plain fold count.
inner_cv_folds = best_ridge.cv if isinstance(best_ridge.cv, int) else None

# Persist JSON summary
best_json_path = OUTPUT_DIR / "ridge_poly_best_model.json"
best_payload = {
    "degrees_tested": sorted(summary_df["degree"].tolist()),
    "alphas_grid": list(alphas_grid),
    "outer_cv_folds": outer_cv_folds,
    "inner_cv_folds": inner_cv_folds,
    "dataset_rows": int(X.shape[0]),
    "dataset_features": int(X.shape[1]),
    "best": {
        "degree": best_degree,
        "alpha": best_alpha,
        "val_mse_mean": float(best_row["val_mse_mean"]),
        "val_mse_std": float(best_row["val_mse_std"]),
        "train_mse_mean": float(best_row["train_mse_mean"]),
        "train_mse_std": float(best_row["train_mse_std"]),
    },
    "summary": summary_df.to_dict(orient="records"),
    "folds_best": best_fold_df.to_dict(orient="records"),
}
pd.Series(best_payload).to_json(best_json_path, indent=2)

print("Saved artifacts:")
print(f"- Summary CSV:       {summary_path}")
print(f"- Fold CSV:          {fold_path}")
print(f"- Best folds CSV:    {best_fold_path}")
print(f"- Best model pickle: {best_model_path}")
print(f"- Best model JSON:   {best_json_path}")


Best degree (by mean val MSE): 2
Best alpha after full-data fit: 0.1
Saved artifacts:
- Summary CSV:       /kaggle/working/price_model/ridge_poly_cv_summary.csv
- Fold CSV:          /kaggle/working/price_model/ridge_poly_cv_folds.csv
- Best folds CSV:    /kaggle/working/price_model/ridge_poly_best_folds.csv
- Best model pickle: /kaggle/working/price_model/ridge_poly_best_model.pkl
- Best model JSON:   /kaggle/working/price_model/ridge_poly_best_model.json


In [5]:
# Score every cleaned row with the final model and record the signed percentage error

# best_regressor was already fit on the full dataset, so these are in-sample predictions.
preds_all = best_regressor.predict(X)
results_df = clean_df.assign(PRED_TOTAL_VALUE=preds_all)

# Positive PCT_DIFF means the prediction is above the recorded TOTAL_VALUE.
signed_error = results_df["PRED_TOTAL_VALUE"] - results_df["TOTAL_VALUE"]
results_df["PCT_DIFF"] = signed_error.div(results_df["TOTAL_VALUE"]).mul(100)

preds_path = OUTPUT_DIR / "ridge_poly_predictions.csv"
results_df.to_csv(preds_path, index=False)

print(f"Predictions saved to: {preds_path}")
preview_cols = ["TOTAL_VALUE", "PRED_TOTAL_VALUE", "PCT_DIFF"]
print(results_df[preview_cols].head())


Predictions saved to: /kaggle/working/price_model/ridge_poly_predictions.csv
   TOTAL_VALUE  PRED_TOTAL_VALUE  PCT_DIFF
0     719400.0     713110.440757 -0.874278
1     744800.0     737586.239413 -0.968550
2     730500.0     726837.574258 -0.501359
3     667900.0     662024.647242 -0.879676
4     714200.0     709944.438946 -0.595850
