In [None]:
# Ames Housing Price Prediction – LASSO Regression

This notebook demonstrates a structured machine learning workflow using regularized regression for housing price prediction.

## Modeling Strategy

The workflow consists of the following steps:
- Data cleaning
- Feature encoding
- Outlier handling
- Log-transformed regression target
- LASSO regularization with cross-validation
- Evaluation on original scale

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")

## 1. Load Data

In [None]:
# Read both CSVs and tag every row with its origin (1 = train, 0 = test) so
# the two sets can be preprocessed together and separated again afterwards.
train = pd.read_csv("train.csv").assign(train_flag=1)
test = pd.read_csv("test.csv").assign(train_flag=0)

# Stack train on top of test with a fresh 0..n-1 index.
df = pd.concat([train, test], ignore_index=True)

print("Combined shape:", df.shape)

## 2. Missing Value Handling

In [None]:
def handle_missing_values(df, target_col="SalePrice"):
    """Impute missing feature values using statistics from training rows only.

    Steps, in order:
      1. Fill a fixed list of categorical columns with the label "None".
      2. Fill ``LotFrontage`` with the training median of the row's
         ``Neighborhood``, falling back to the overall training median.
      3. Fill every ``Garage*`` / ``Bsmt*`` column with 0 (numeric columns)
         or "None" (all other columns).
      4. Fill whatever is still missing with the per-column training mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined frame with a ``train_flag`` column (1 marks training rows).
        ``Neighborhood`` is required when ``LotFrontage`` is present.
        The input frame is not modified.
    target_col : str, default "SalePrice"
        Column that is never mode-imputed, so rows without a known target
        keep NaN instead of receiving the most frequent training value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature columns imputed.
    """
    # Work on a copy so the caller's frame stays intact and re-running the
    # calling cell always starts from the same input.
    df = df.copy()

    train_mask = df["train_flag"] == 1

    # Specific categorical fills
    fill_none_cols = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"]
    for col in fill_none_cols:
        if col in df.columns:
            df[col] = df[col].fillna("None")

    # LotFrontage by neighborhood median (computed on training rows only)
    if "LotFrontage" in df.columns:
        lf_med = df.loc[train_mask].groupby("Neighborhood")["LotFrontage"].median()
        df["LotFrontage"] = df["LotFrontage"].fillna(df["Neighborhood"].map(lf_med))
        # Neighborhoods with no training median fall back to the global one.
        df["LotFrontage"] = df["LotFrontage"].fillna(df.loc[train_mask, "LotFrontage"].median())

    # Garage and basement column families share the same rule.
    for prefix in ("Garage", "Bsmt"):
        for col in [c for c in df.columns if c.startswith(prefix)]:
            # Test for numeric rather than `dtype == "object"` so text columns
            # stored with a dedicated string dtype still receive "None".
            fill_value = 0 if pd.api.types.is_numeric_dtype(df[col]) else "None"
            df[col] = df[col].fillna(fill_value)

    # Fill remaining with mode (train only). With no training rows there is
    # no mode to take, so this step is skipped instead of raising IndexError.
    train_modes = df.loc[train_mask].mode()
    if not train_modes.empty:
        # Drop the target so unknown targets are not replaced by a made-up value.
        mode_row = train_modes.iloc[0].drop(labels=[target_col], errors="ignore")
        df = df.fillna(mode_row)

    # Report on features only; the target is expected to be missing for
    # non-training rows.
    feature_cols = df.columns.difference([target_col])
    print("Remaining missing:", df[feature_cols].isna().sum().sum())
    return df

## 3. Ordinal Encoding

In [None]:
def apply_ordinal_mappings(df):
    """Replace ordered quality/finish labels with integer ranks.

    Each column listed in ``mapping_dict`` that exists in ``df`` is mapped
    through its label-to-rank dictionary. Labels that are absent from the
    dictionary become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw string labels. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ordinal columns encoded as numbers.
    """
    quality_map_5 = {"Po":0,"Fa":1,"TA":2,"Gd":3,"Ex":4}
    quality_map_6 = {"None":0,"Po":1,"Fa":2,"TA":3,"Gd":4,"Ex":5}
    exposure_map = {"None":0,"No":1,"Mn":2,"Av":3,"Gd":4}
    fin_map = {"None":0,"Unf":1,"LwQ":2,"Rec":3,"BLQ":4,"ALQ":5,"GLQ":6}
    garage_finish_map = {"None":0,"Unf":1,"RFn":2,"Fin":3}

    mapping_dict = {
        "ExterQual": quality_map_5,
        "ExterCond": quality_map_5,
        "BsmtQual": quality_map_6,
        "BsmtCond": quality_map_6,
        "BsmtExposure": exposure_map,
        "BsmtFinType1": fin_map,
        "BsmtFinType2": fin_map,
        "HeatingQC": quality_map_5,
        "KitchenQual": quality_map_5,
        "GarageFinish": garage_finish_map,
        "GarageQual": quality_map_6,
        "GarageCond": quality_map_6,
        "FireplaceQu": quality_map_6,
    }

    # Copy first so the caller's frame keeps its original labels.
    df = df.copy()

    for col, mapping in mapping_dict.items():
        if col not in df.columns:
            continue
        # A numeric column has already been encoded. Mapping it again would
        # look up integers in a string-keyed dict and wipe the column to NaN,
        # which is what used to happen when the calling cell was run twice.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        df[col] = df[col].map(mapping)

    return df

## 4. Outlier Removal

In [None]:
def remove_outliers(df):
    """Drop training rows whose size-related features exceed fixed limits.

    A row is dropped when it is a training row (``train_flag == 1``) and at
    least one of the columns in ``upper_limits`` is strictly greater than its
    limit. Non-training rows are always kept. The number of dropped rows is
    printed, and a new frame is returned.
    """
    is_train = df["train_flag"] == 1

    # Column -> largest value still considered plausible.
    upper_limits = {
        "LotFrontage": 200,
        "1stFlrSF": 4000,
        "LotArea": 100000,
        "BsmtFinSF1": 4000,
        "TotalBsmtSF": 5000,
        "GrLivArea": 4000,
    }

    # OR together one "too large" mask per column.
    is_extreme = pd.Series(False, index=df.index)
    for column, limit in upper_limits.items():
        is_extreme = is_extreme | (df[column] > limit)

    outlier_index = df.loc[is_train & is_extreme].index
    print("Outliers removed:", len(outlier_index))

    return df.drop(index=outlier_index)

## 5. Apply Preprocessing

In [None]:
# Run the three preprocessing stages in order: imputation, ordinal encoding,
# then removal of outlying training rows.
df = (
    df
    .pipe(handle_missing_values)
    .pipe(apply_ordinal_mappings)
    .pipe(remove_outliers)
)

## 6. Target Transformation

In [None]:
# Split the rows by origin flag.
is_train_row = df["train_flag"] == 1
is_test_row = df["train_flag"] == 0

# The model is fitted on log(1 + SalePrice); predictions are mapped back with
# expm1 at evaluation time.
y = np.log1p(df.loc[is_train_row, "SalePrice"])

# SalePrice values carried by the non-training rows, left on the original scale.
y_test_real = df.loc[is_test_row, "SalePrice"]

## 7. Feature Preparation

In [None]:
# Remove the target and the row identifier; either may be absent.
df_model = df.drop(columns=["SalePrice", "Id"], errors="ignore")

# One-hot encode every remaining object-typed column, keeping all levels.
categorical_cols = df_model.select_dtypes(include=["object"]).columns
df_model = pd.get_dummies(df_model, columns=categorical_cols, drop_first=False)

# Separate the design matrices by origin flag, then discard the flag itself.
X = df_model.loc[df_model["train_flag"] == 1].drop(columns="train_flag")
testX = df_model.loc[df_model["train_flag"] == 0].drop(columns="train_flag")

## 8. Train/Test Split

In [None]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## 9. Pipeline + GridSearchCV

In [None]:
# Standardise the features before the LASSO so the L1 penalty treats every
# coefficient on the same scale.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("lasso", Lasso(max_iter=10000)),
    ]
)

# 50 candidate penalties, log-spaced between 1e-4 and 1.
param_grid = {"lasso__alpha": np.logspace(start=-4, stop=0, num=50)}

# Shuffled 5-fold cross-validation with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=cv,
)
grid.fit(X_train, y_train)

print("Best alpha:", grid.best_params_)

## 10. Evaluation

In [None]:
# Pipeline selected by the grid search.
best_model = grid.best_estimator_

# Predictions come out in log1p space because the model was fitted on the
# log1p-transformed target.
y_pred_log = best_model.predict(X_test)

# Undo the log1p transform on both predictions and held-out targets so the
# metrics are reported on the original scale.
y_pred = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test)

rmse = np.sqrt(mean_squared_error(y_true=y_test_original, y_pred=y_pred))
r2 = r2_score(y_true=y_test_original, y_pred=y_pred)

print("RMSE:", rmse)
print("R2:", r2)

## 11. Feature Importance

In [None]:
# Label each fitted LASSO coefficient with its feature name.
lasso_step = best_model.named_steps["lasso"]
coefs = pd.Series(lasso_step.coef_, index=X.columns)

# Features whose coefficient was not shrunk to exactly zero.
selected_features = coefs.loc[coefs.ne(0)]

print("Selected features:", len(selected_features))

# Ten coefficients with the largest absolute value.
selected_features.sort_values(key=np.abs, ascending=False).head(10)