In [46]:
!pip install -r ../requirements.txt --quiet

In [47]:
import os
import json

# Read the Kaggle credentials from the local secrets file and export them as
# environment variables (presumably consumed by the Kaggle CLI used for the
# submission step; confirm against the tooling in use).
with open("../.secrets/kaggle.json") as secrets_file:
    creds = json.load(secrets_file)

# Same order as the JSON fields are needed: username first, then the API key.
for env_name, field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
    os.environ[env_name] = creds[field]

In [48]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Dataset locations; the paths are relative, so they resolve against the
# current working directory of the kernel.
DATA_DIR = Path("..", "data")
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
TEST_PATH = DATA_DIR.joinpath("test.csv")

In [49]:
# Read both CSV splits and use the "Id" column as the row index.
train_df = pd.read_csv(TRAIN_PATH).set_index("Id")
test_df = pd.read_csv(TEST_PATH).set_index("Id")

# Separate the "SalePrice" target from the remaining feature columns.
X = train_df.drop(columns=["SalePrice"])
y = train_df["SalePrice"]

# Partition the feature columns by dtype: numeric vs. everything else.
numeric_features = list(X.select_dtypes(include="number").columns)
categorical_features = list(X.select_dtypes(exclude="number").columns)

# Displayed as the cell result: (number of numeric, number of non-numeric) columns.
len(numeric_features), len(categorical_features)


(36, 43)

- I split the columns into numeric vs. categorical so I can run different cleaners on them (numbers get scaled, categories get one-hot encoded).
- This baseline only uses a single Ridge regressor, so all the heavy lifting happens in the preprocessing block below.


In [None]:
# Numeric columns: fill missing values with the column median, then standardize
# so the Ridge penalty sees every feature on a comparable scale.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent label, then
# one-hot encode into a dense array. handle_unknown="ignore" keeps transform
# from failing on categories that were not present during fit.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own sub-pipeline; the numeric block is
# listed first, followed by the categorical block.
column_routes = [
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Starter model: preprocessing followed by a Ridge regressor with alpha=10.
baseline_steps = [
    ("preprocess", preprocessor),
    ("regressor", Ridge(alpha=10.0)),
]
baseline_pipeline = Pipeline(steps=baseline_steps)

# Reserve 20% of the rows (fixed seed, so the split is repeatable) to check
# scores locally before submitting anything to Kaggle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion and predict the held-out rows.
baseline_pipeline.fit(X_train, y_train)
valid_preds = baseline_pipeline.predict(X_valid)

# Validation metrics for the fitted pipeline.
mse = mean_squared_error(y_true=y_valid, y_pred=valid_preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_valid, y_pred=valid_preds)

# Reference point: predict the training-set mean price for every validation
# row, to show how much the pipeline improves on a constant guess.
naive_pred = np.full(y_valid.shape, y_train.mean(), dtype=float)
naive_mse = mean_squared_error(y_true=y_valid, y_pred=naive_pred)
naive_rmse = np.sqrt(naive_mse)

{"rmse": rmse, "r2": r2, "naive_rmse": naive_rmse}


{'rmse': np.float64(30642.20118234258),
 'r2': 0.8775874105042153,
 'naive_rmse': np.float64(87619.03450611533)}

### Kaggle log-RMSE validation
Kaggle scores submissions using the RMSE between the logarithms of predicted and observed prices. This cell reports that metric for the validation split so we can anticipate leaderboard behaviour.


In [None]:
# The metric takes the logarithm of each price, so clamp predictions to >= 1
# first: a non-positive prediction would otherwise produce NaN/-inf.
safe_valid_preds = np.clip(valid_preds, 1, None)
safe_naive_preds = np.clip(naive_pred, 1, None)

# Log-transform the observed prices once and reuse them for both comparisons.
log_y_valid = np.log(y_valid)

log_rmse = np.sqrt(mean_squared_error(log_y_valid, np.log(safe_valid_preds)))
log_naive_rmse = np.sqrt(mean_squared_error(log_y_valid, np.log(safe_naive_preds)))

# The mean-price predictor is labelled "naive" (matching "naive_rmse" in the
# previous cell); "baseline" in this notebook refers to the Ridge pipeline.
{"kaggle_log_rmse": log_rmse, "naive_log_rmse": log_naive_rmse}


In [None]:
# Refit an unfitted copy of the baseline pipeline on every labelled row, so the
# submitted model uses the validation rows too and the validated
# `baseline_pipeline` object is left untouched.
final_pipeline = clone(baseline_pipeline)
final_pipeline.fit(X, y)

# Predict on the test set. Apply the same >= 1 clamp used when computing the
# validation log-RMSE: a linear model on raw prices can emit non-positive
# values, and their logarithm is undefined under the log-based metric.
test_predictions = np.clip(final_pipeline.predict(test_df), 1, None)
submission_df = pd.DataFrame({"Id": test_df.index, "SalePrice": test_predictions})

# Write the submission file without the positional index column.
submission_path = DATA_DIR / "baseline_submission.csv"
submission_df.to_csv(submission_path, index=False)

submission_path, submission_df.head()


(PosixPath('../data/baseline_submission.csv'),
      Id      SalePrice
 0  1461  103172.739208
 1  1462  151293.867088
 2  1463  173090.469195
 3  1464  192257.716720
 4  1465  202619.866494)

In [52]:
# !kaggle competitions submit -c house-prices-advanced-regression-techniques -f ../data/baseline_submission.csv -m "Baseline"

100%|██████████████████████████████████████| 33.7k/33.7k [00:00<00:00, 71.9kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques

![](../images/baseline.png)