In [2]:
# common imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import (
    accuracy_score, roc_auc_score, classification_report,
    mean_squared_error, mean_absolute_error, r2_score,
    silhouette_score, davies_bouldin_score
)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# optional: make plots inline if in notebook
# %matplotlib inline


In [3]:
# 2.1 Load data
# For demo purposes this uses the StatLib housing dataset; the Kaggle
# competition CSV can be substituted if you have API access.
houses = pd.read_csv(
    'https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv',
    na_values='?',  # treat a literal '?' as missing
)

# 2.2 Preprocessing & feature engineering
houses = houses.dropna(subset=['median_house_value'])  # target must be present

# The models are trained on log1p(target), so every metric printed by the
# evaluation loop below is expressed in log1p units, not in the original
# units of median_house_value. Apply np.expm1 to predictions to map back.
y = np.log1p(houses['median_house_value'])
X = houses.drop(columns='median_house_value')

# Route every feature column to exactly one branch of the preprocessor.
# Selecting by exact dtype names ('int64', 'float64', 'object') would leave
# columns of any other dtype in neither list, and ColumnTransformer drops
# unlisted columns by default (remainder='drop'). Here: anything numeric is
# scaled, everything else is one-hot encoded.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # fill gaps with the column median
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # 'ignore' encodes categories unseen during fit as all-zero rows
    # instead of raising at predict time.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

# 2.3 Models to compare
regressors = {
    'Ridge': Ridge(alpha=1.0, random_state=42),
    'Lasso': Lasso(alpha=0.01, random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# 2.4 Train/test split
# test_size=0.25 is train_test_split's default, spelled out for the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# 2.5 Fit & evaluate
# NOTE(review): the same `preprocessor` instance is placed in every pipeline
# and is re-fit on X_train for each model; after the loop it holds the state
# from the last fit, and `pipe` refers to the last pipeline only.
for name, reg in regressors.items():
    pipe = Pipeline([
        ('pre', preprocessor),
        ('reg', reg),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # All three metrics are computed on the log1p-transformed target.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae  = mean_absolute_error(y_test, y_pred)
    r2   = r2_score(y_test, y_pred)
    print(f"{name}: RMSE={rmse:.3f}, MAE={mae:.3f}, R2={r2:.3f}")


Ridge: RMSE=0.334, MAE=0.253, R2=0.659
Lasso: RMSE=0.341, MAE=0.261, R2=0.644
RandomForest: RMSE=0.232, MAE=0.156, R2=0.836
GradientBoosting: RMSE=0.269, MAE=0.195, R2=0.780
