In [111]:
# Import packages
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold, ParameterGrid

# Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor

# Model evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.base import clone

### Read and Prepare data

In [114]:
df = pd.read_csv('final_cleaned_data.csv')

In [116]:
df.head()

Unnamed: 0,City,State,Merge_Key_x,State_FIPS,Place_FIPS,NAME,Total_Population_2021,Median_Household_Income_2021,Median_Housing_Value_2021,Owner_Occupied_Units_2021,...,leisure_bleachers,leisure_garden,leisure_ice_rink,leisure_park,leisure_pitch,leisure_sports_centre,leisure_stadium,other,fortune,sbs
0,Abilene,TX,"abilene,TX",48.0,1000.0,Abilene city,125474.0,54493.0,137800.0,24821.0,...,0,0,0,48,0,0,6,11,0,11
1,Akron,OH,"akron,OH",39.0,1000.0,Akron city,191483.0,42129.0,87100.0,41753.0,...,0,0,0,238,0,0,15,14,2,21
2,Alameda,CA,"alameda,CA",6.0,562.0,Alameda city,78320.0,113339.0,972800.0,14477.0,...,0,0,0,82,0,0,2,11,0,6
3,Albany,GA,"albany,GA",13.0,1052.0,Albany city,70748.0,40036.0,101200.0,11479.0,...,0,0,0,47,0,0,2,1,0,12
4,Albany,NY,"albany,NY",36.0,1000.0,Albany city,99402.0,52583.0,186500.0,16200.0,...,0,0,0,211,0,0,9,22,0,21


In [118]:
state_to_region = {
    # Northeast
    "ME": "Northeast", "NH": "Northeast", "VT": "Northeast", "MA": "Northeast",
    "RI": "Northeast", "CT": "Northeast", "NY": "Northeast", "NJ": "Northeast",
    "PA": "Northeast",
    # Midwest
    "OH": "Midwest", "IN": "Midwest", "IL": "Midwest", "MI": "Midwest",
    "WI": "Midwest", "MN": "Midwest", "IA": "Midwest", "MO": "Midwest",
    "ND": "Midwest", "SD": "Midwest", "NE": "Midwest", "KS": "Midwest",
    # South
    "DE": "South", "MD": "South", "DC": "South", "VA": "South", "WV": "South",
    "NC": "South", "SC": "South", "GA": "South", "FL": "South",
    "KY": "South", "TN": "South", "MS": "South", "AL": "South",
    "OK": "South", "TX": "South", "AR": "South", "LA": "South",
    # West
    "MT": "West", "ID": "West", "WY": "West", "CO": "West", "NM": "West",
    "AZ": "West", "UT": "West", "NV": "West", "WA": "West", "OR": "West",
    "CA": "West", "AK": "West", "HI": "West",
}

df["region"] = df["State"].map(state_to_region).fillna("Other")

In [120]:
df.head()

Unnamed: 0,City,State,Merge_Key_x,State_FIPS,Place_FIPS,NAME,Total_Population_2021,Median_Household_Income_2021,Median_Housing_Value_2021,Owner_Occupied_Units_2021,...,leisure_garden,leisure_ice_rink,leisure_park,leisure_pitch,leisure_sports_centre,leisure_stadium,other,fortune,sbs,region
0,Abilene,TX,"abilene,TX",48.0,1000.0,Abilene city,125474.0,54493.0,137800.0,24821.0,...,0,0,48,0,0,6,11,0,11,South
1,Akron,OH,"akron,OH",39.0,1000.0,Akron city,191483.0,42129.0,87100.0,41753.0,...,0,0,238,0,0,15,14,2,21,Midwest
2,Alameda,CA,"alameda,CA",6.0,562.0,Alameda city,78320.0,113339.0,972800.0,14477.0,...,0,0,82,0,0,2,11,0,6,West
3,Albany,GA,"albany,GA",13.0,1052.0,Albany city,70748.0,40036.0,101200.0,11479.0,...,0,0,47,0,0,2,1,0,12,South
4,Albany,NY,"albany,NY",36.0,1000.0,Albany city,99402.0,52583.0,186500.0,16200.0,...,0,0,211,0,0,9,22,0,21,Northeast


In [122]:
# Drop descriptive columns
drop_cols = ['City','NAME', 'Merge_Key_x', 'Place_FIPS', 'State_FIPS']
df = df.drop(columns=drop_cols, errors='ignore')

# Drop past prices columns
price_cols_to_drop = ['Median_Housing_Value_2021', 'Median_Housing_Value_2022']
df = df.drop(columns=price_cols_to_drop, errors='ignore')

# Define target variable
target = 'Median_Housing_Value_2023'
geo_features = ['State','region']

# Exclude ALL 2023 predictor columns but KEEP the target
df = df[[col for col in df.columns if (not col.endswith("_2023")) or (col == target)]]

'''numeric_features = [
    col for col in df.columns
    if col not in geo_features and col != target and pd.api.types.is_numeric_dtype(df[col])
]'''

numeric_features = [
    col for col in df.columns
    if col != target and pd.api.types.is_numeric_dtype(df[col])
]

categorical_features = geo_features  # your two new categorical variables

# Final modeling dataset (keeping both numeric and categorical)
df_model = df[numeric_features + categorical_features + [target]].dropna()

# Define X and y
X = df_model[numeric_features + categorical_features]
y = df_model[target]

In [124]:
X.columns[150:200]

Index(['amenity_parking', 'amenity_place_of_worship', 'amenity_planetarium',
       'amenity_pub', 'amenity_public_building', 'amenity_public_facility',
       'amenity_restaurant', 'amenity_school', 'amenity_shelter',
       'amenity_shelter;ping pong tables', 'amenity_social_centre',
       'amenity_social_facility', 'amenity_studio', 'amenity_theatre',
       'amenity_toilets', 'amenity_university', 'historic_site',
       'leisure_amusement_arcade', 'leisure_bleachers', 'leisure_garden',
       'leisure_ice_rink', 'leisure_park', 'leisure_pitch',
       'leisure_sports_centre', 'leisure_stadium', 'other', 'fortune', 'sbs',
       'State', 'region'],
      dtype='object')

### Regression (linear regression, ridge, lasso)

In [133]:
categorical_feats = ["State", "region"]
numeric_feats = [col for col in X.columns if col not in categorical_feats]

numeric_transformer = Pipeline([
    ("poly", PolynomialFeatures(include_bias=False)),
    ("scaler", StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_feats)#,
        #("cat", categorical_transformer, categorical_feats)
    ]
)


In [135]:
# Train-test set split (80%/20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1738)

In [137]:
numeric_transformer = Pipeline([
    ("poly", PolynomialFeatures(include_bias=False)),
    ("scaler", StandardScaler())
])

In [139]:
# Set up cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=1738)

# Define Base Pipelines for Each Model (include: PolynomialFeatures, StandardScaler, Regressor)

# Linear Regression
lin_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("reg", LinearRegression())
])

# Ridge Regression
ridge_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("reg", Ridge(random_state=42))
])


# Lasso Regression
lasso_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("reg", Lasso(max_iter=30000, random_state=1738))
])


# Define hyperparameter Grids
models_and_params = {
    "LinearRegression": {
        "pipeline": lin_pipeline,
        "param_grid": {
            "preprocess__num__poly__degree": [1, 2],
            "reg__fit_intercept": [True]
        }
    },
    "Ridge": {
        "pipeline": ridge_pipeline,
        "param_grid": {
            "preprocess__num__poly__degree": [1, 2],
            "reg__alpha": [200, 500, 1000],
            "reg__fit_intercept": [True]
        }
    },
    "Lasso": {
        "pipeline": lasso_pipeline,
        "param_grid": {
            "preprocess__num__poly__degree": [1, 2],
            "reg__alpha": [1000, 2000, 5000],
            "reg__max_iter": [15000, 30000],
            "reg__fit_intercept": [True]
        },
        "n_jobs": 1
    }
}

# Cross-Validation Tuning with GridSearchCV
best_models = {}
cv_results_rows = []

for name, cfg in models_and_params.items():
    print(f"\n===== Tuning {name} =====")
    pipe = cfg["pipeline"]
    param_grid = cfg["param_grid"]

    n_jobs = cfg.get("n_jobs", -1)  # default -1 unless overridden

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=n_jobs,
        return_train_score=True
    )
    
    grid.fit(X_train, y_train)
    
    print("Best params:", grid.best_params_)
    print("Best CV R²:", grid.best_score_)
    
    best_models[name] = grid
    
    # Store the best cross-validation results
    cv_results_rows.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "CV_R2": grid.best_score_
    })

cv_summary_df = pd.DataFrame(cv_results_rows)
print("\n=== Cross-Validation Summary (Best per Model) ===")
cv_summary_df


===== Tuning LinearRegression =====
Best params: {'preprocess__num__poly__degree': 1, 'reg__fit_intercept': True}
Best CV R²: -0.1885434956405057

===== Tuning Ridge =====
Best params: {'preprocess__num__poly__degree': 1, 'reg__alpha': 200, 'reg__fit_intercept': True}
Best CV R²: 0.581961354849611

===== Tuning Lasso =====
Best params: {'preprocess__num__poly__degree': 1, 'reg__alpha': 5000, 'reg__fit_intercept': True, 'reg__max_iter': 15000}
Best CV R²: 0.5640362501390396

=== Cross-Validation Summary (Best per Model) ===


Unnamed: 0,Model,Best Params,CV_R2
0,LinearRegression,"{'preprocess__num__poly__degree': 1, 'reg__fit...",-0.188543
1,Ridge,"{'preprocess__num__poly__degree': 1, 'reg__alp...",0.581961
2,Lasso,"{'preprocess__num__poly__degree': 1, 'reg__alp...",0.564036


In [140]:
# Evaluate best models on test set
test_rows = []

for name, grid in best_models.items():
    best_estimator = grid.best_estimator_
    y_pred = best_estimator.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    
    # Pull out a few key params for readability
    best_params = grid.best_params_
    degree = best_params.get("poly__degree", None)
    alpha = best_params.get("reg__alpha", None)
    fit_intercept = best_params.get("reg__fit_intercept", None)
    
    test_rows.append({
        "Model": name,
        "Degree": degree,
        "Alpha": alpha,
        "Fit Intercept": fit_intercept,
        "Test_R2": r2,
        "Test_RMSE": rmse,
        "Test_MAE": mae
    })

test_results_df = pd.DataFrame(test_rows)
test_results_df = test_results_df[[
    "Model", "Degree", "Alpha", "Fit Intercept", "Test_R2", "Test_RMSE", "Test_MAE"
]].round(3)

print("\n=== Test Set Performance (Best per Model) ===")
test_results_df


=== Test Set Performance (Best per Model) ===


Unnamed: 0,Model,Degree,Alpha,Fit Intercept,Test_R2,Test_RMSE,Test_MAE
0,LinearRegression,,,True,0.54,231057.504,156542.075
1,Ridge,,200.0,True,0.64,204410.118,140229.616
2,Lasso,,5000.0,True,0.65,201592.395,138282.666


### Random Forest

In [143]:
# Preprocessor for RF: numeric passthrough, categorical one-hot
rf_preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_feats),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_feats),
    ]
)

rf_pipeline = Pipeline([
    ("preprocess", rf_preprocessor),
    ("rf", RandomForestRegressor(random_state=1738))
])

rf_param_grid = {
    "rf__n_estimators": [200, 400, 600],
    "rf__max_depth": [10, 20, 50, None],
    "rf__min_samples_split": [5, 10, 20],
    "rf__min_samples_leaf": [2, 4, 10],
    "rf__max_features": ["sqrt", "log2"],
}

rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1,
    verbose=1
)

rf_grid.fit(X_train, y_train)

best_rf_cv_r2 = rf_grid.best_score_
best_rf_params = rf_grid.best_params_

print("\nBest Random Forest Params =", best_rf_params)
print("Best CV R² =", best_rf_cv_r2)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Best Random Forest Params = {'rf__max_depth': 50, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 10, 'rf__n_estimators': 600}
Best CV R² = 0.7264921664874631


In [144]:
best_rf = rf_grid.best_estimator_
feature_importances = best_rf.feature_importances_
feat_names = X_train.columns  # works if X_train is a DataFrame

fi_df = pd.DataFrame({
    "feature": feat_names,
    "importance": feature_importances
}).sort_values("importance", ascending=False)

print("\nTop features by importance:")
print(fi_df.head(20))

AttributeError: 'Pipeline' object has no attribute 'feature_importances_'

In [147]:
# Predict on test data
best_rf = rf_grid.best_estimator_
y_pred_test = best_rf.predict(X_test)

# Metrics
test_r2 = r2_score(y_test, y_pred_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_pred_test)

print("\nTest set performance for Random Forest Regressor:")
print(f"R²      : {test_r2:.4f}")
print(f"MSE     : {test_mse:.4f}")
print(f"RMSE    : {test_rmse:.4f}")
print(f"MAE     : {test_mae:.4f}")


Test set performance for Random Forest Regressor:
R²      : 0.7683
MSE     : 26902372055.7770
RMSE    : 164019.4258
MAE     : 103802.0891


### XGBoost

In [45]:
xgb_pipeline = Pipeline([
    ("reg", XGBRegressor(
        objective='reg:squarederror',
        tree_method="hist",
        random_state=33,
        n_jobs=1
    ))
])

xgb_param_grid = {
    "reg__n_estimators": [300, 600],        # 2
    "reg__max_depth": [4, 6],               # 2
    "reg__learning_rate": [0.05, 0.1],      # 2
    "reg__subsample": [0.8],                # 1
    "reg__colsample_bytree": [0.8],         # 1
    "reg__reg_lambda": [1, 5],              # L2 regularization
    "reg__reg_alpha": [0, 0.1],             # L1 regularization
}

models_and_params["XGBoost"] = {
    "pipeline": xgb_pipeline,
    "param_grid": xgb_param_grid,
}


In [47]:
xgb_grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    cv=5,
    scoring="r2",
    n_jobs=1,            # very important for XGBoost stability
    return_train_score=True,
    verbose=1
)

xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'reg__colsample_bytree': [0.8], 'reg__learning_rate': [0.05, 0.1], 'reg__max_depth': [4, 6], 'reg__n_estimators': [300, 600], ...}"
,scoring,'r2'
,n_jobs,1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [55]:
best_xgb = xgb_grid.best_estimator_

y_pred = best_xgb.predict(X_test)

from sklearn.metrics import r2_score

# Metrics
test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_pred)

print("\nTest set performance for XGBoost Regressor:")
print(f"R²      : {test_r2:.4f}")
print(f"MSE     : {test_mse:.4f}")
print(f"RMSE    : {test_rmse:.4f}")
print(f"MAE     : {test_mae:.4f}")


Test set performance for XGBoost Regressor:
R²      : 0.6422
MSE     : 19714842327.6312
RMSE    : 140409.5521
MAE     : 91153.2339


### Regression and Random Forest with Selected Features

In [59]:
top_k = 30

top_features = (
    fi_df
    .sort_values("importance", ascending=False)
    .head(top_k)["feature"]
    .tolist()
)

print("Top features:")
print(top_features)

Top features:
['Median_Household_Income_2022', 'Median_Household_Income_2021', 'Bachelors_Or_Higher_Rate_2022', 'Bachelors_Or_Higher_Rate_2021', 'amenity_theatre', 'leisure_garden', 'n_cafe_cuisine_bubble_tea', 'leisure_park', 'amenity_arts_centre', 'n_cuisine_japanese', 'Bachelors_Degree_Count_2021', 'Owner_Occupied_Units_2022', 'Masters_Degree_Count_2022', 'Masters_Degree_Count_2021', 'Owner_Occupied_Units_2021', 'Unemployment_Rate_2021', 'n_cuisine_ramen', 'Bachelors_Degree_Count_2022', 'other', 'n_cuisine_sushi', 'sbs', 'Unemployed_Count_2022', 'n_cuisine_american', 'Unemployment_Rate_2022', 'n_cafe', 'n_cafe_amenity_cafe', 'Total_Population_2022', 'n_cafe_brand', 'n_venue_stadium', 'n_venue_bar']


In [63]:
X_top = X[top_features]

X_train_top, X_test_top, y_train, y_test = train_test_split(
    X_top, y, test_size=0.2, random_state=33
)

In [71]:
best_rf_params = rf_grid.best_params_  # from your previous GridSearch

rf_30 = RandomForestRegressor(
    **best_rf_params,
    random_state=33,
    n_jobs=-1
)

rf_30.fit(X_train_top, y_train)


y_pred_30 = rf_30.predict(X_test_top)

print("\nRF with top 30 features:")
print("R²   :", r2_score(y_test, y_pred_30))
print("RMSE :", np.sqrt(mean_squared_error(y_test, y_pred_30)))
print("MAE  :", mean_absolute_error(y_test, y_pred_30))


RF with top 30 features:
R²   : 0.6125161464876412
RMSE : 146109.41425505767
MAE  : 102420.68806912318


In [74]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Train-test split (reuse X, y or X_top, y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=33
)

# Cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=33)

# ----- Pipelines -----

# Linear Regression
lin_pipeline = Pipeline([
    ("poly", PolynomialFeatures(include_bias=False)),
    ("scaler", StandardScaler()),
    ("reg", LinearRegression())
])

# Ridge Regression
ridge_pipeline = Pipeline([
    ("poly", PolynomialFeatures(include_bias=False)),
    ("scaler", StandardScaler()),
    ("reg", Ridge(random_state=42))
])

# Lasso Regression
lasso_pipeline = Pipeline([
    ("poly", PolynomialFeatures(include_bias=False)),
    ("scaler", StandardScaler()),
    ("reg", Lasso(max_iter=30000, random_state=33))
])

# ----- Hyperparameter grids + per-model n_jobs -----

models_and_params = {
    "LinearRegression": {
        "pipeline": lin_pipeline,
        "param_grid": {
            "poly__degree": [1, 2],
            "reg__fit_intercept": [True]
        },
        "n_jobs": -1
    },
    "Ridge": {
        "pipeline": ridge_pipeline,
        "param_grid": {
            "poly__degree": [1, 2],
            "reg__alpha": [200, 500, 1000],
            "reg__fit_intercept": [True]
        },
        "n_jobs": -1
    },
    "Lasso": {
        "pipeline": lasso_pipeline,
        "param_grid": {
            "poly__degree": [1, 2],
            "reg__alpha": [1000, 2000, 5000],
            "reg__max_iter": [15000, 30000],
            "reg__fit_intercept": [True]
        },
        # 👇 Important: keep Lasso single-threaded to avoid worker crashes
        "n_jobs": 1
    }
}

best_models = {}
cv_results_rows = []

for name, cfg in models_and_params.items():
    print(f"\n===== Tuning {name} =====")
    
    pipe = cfg["pipeline"]
    param_grid = cfg["param_grid"]
    n_jobs = cfg.get("n_jobs", -1)

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=cv,             # use your KFold object
        scoring="r2",
        n_jobs=n_jobs,
        return_train_score=True,
        verbose=1
    )
    
    grid.fit(X_train, y_train)
    
    print("Best params:", grid.best_params_)
    print("Best CV R²:", grid.best_score_)
    
    best_models[name] = grid
    
    cv_results_rows.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "CV_R2": grid.best_score_
    })

cv_summary_df = pd.DataFrame(cv_results_rows)
print("\n=== Cross-Validation Summary (Best per Model) ===")
print(cv_summary_df)



===== Tuning LinearRegression =====
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params: {'poly__degree': 1, 'reg__fit_intercept': True}
Best CV R²: -0.35164142209771887

===== Tuning Ridge =====
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best params: {'poly__degree': 1, 'reg__alpha': 200, 'reg__fit_intercept': True}
Best CV R²: 0.5961244076143114

===== Tuning Lasso =====
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best params: {'poly__degree': 1, 'reg__alpha': 5000, 'reg__fit_intercept': True, 'reg__max_iter': 15000}
Best CV R²: 0.6123101611065807

=== Cross-Validation Summary (Best per Model) ===
              Model                                        Best Params  \
0  LinearRegression    {'poly__degree': 1, 'reg__fit_intercept': True}   
1             Ridge  {'poly__degree': 1, 'reg__alpha': 200, 'reg__f...   
2             Lasso  {'poly__degree': 1, 'reg__alpha': 5000, 'reg__...   

      CV_R2  
0 -0.351641  
1  0.596124  

In [75]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

# Evaluate best models on test set
test_rows = []

for name, grid in best_models.items():
    print(f"\n=== Evaluating {name} on test set ===")
    
    best_estimator = grid.best_estimator_
    y_pred = best_estimator.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    
    # Best params from GridSearch
    best_params = grid.best_params_
    
    # Safely pull a few key params (works even if some don't exist for future models)
    degree = best_params.get("poly__degree")
    alpha = best_params.get("reg__alpha")
    fit_intercept = best_params.get("reg__fit_intercept")
    
    test_rows.append({
        "Model": name,
        "Degree": degree,
        "Alpha": alpha,
        "Fit Intercept": fit_intercept,
        "Test_R2": r2,
        "Test_RMSE": rmse,
        "Test_MAE": mae
    })

test_results_df = pd.DataFrame(test_rows)

# Order & round columns
test_results_df = test_results_df[[
    "Model", "Degree", "Alpha", "Fit Intercept",
    "Test_R2", "Test_RMSE", "Test_MAE"
]].round(3)

# Sort by Test_R2 (best at top)
test_results_df = test_results_df.sort_values("Test_R2", ascending=False)

print("\n=== Test Set Performance (Best per Model) ===")
print(test_results_df)


=== Evaluating LinearRegression on test set ===

=== Evaluating Ridge on test set ===

=== Evaluating Lasso on test set ===

=== Test Set Performance (Best per Model) ===
              Model  Degree   Alpha  Fit Intercept  Test_R2   Test_RMSE  \
2             Lasso       1  5000.0           True    0.539  159370.562   
1             Ridge       1   200.0           True    0.511  164122.206   
0  LinearRegression       1     NaN           True    0.115  220837.255   

     Test_MAE  
2  115558.078  
1  118696.579  
0  139762.120  


In [79]:
test_results_df

Unnamed: 0,Model,Degree,Alpha,Fit Intercept,Test_R2,Test_RMSE,Test_MAE
2,Lasso,1,5000.0,True,0.539,159370.562,115558.078
1,Ridge,1,200.0,True,0.511,164122.206,118696.579
0,LinearRegression,1,,True,0.115,220837.255,139762.12
