In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Fetch the California housing data; the returned object exposes the feature
# matrix, the target vector and the feature names as attributes.
california_housing = fetch_california_housing()

# The regression target is used as-is; the features are wrapped in a DataFrame
# so that columns can be addressed by name further down.
housing_labels = california_housing.target
housing = pd.DataFrame(
    data=california_housing.data,
    columns=california_housing.feature_names,
)

# Hold out 20% of the rows as a test set. The fixed seed makes the split
# identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    housing,
    housing_labels,
    random_state=42,
    test_size=0.2,
)

# Every column of this dataset is numeric, so all of them are routed through
# the numeric branch of the preprocessor.
num_columns = list(X_train.columns)

# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# A single-branch ColumnTransformer; it only has the numeric pipeline because
# there are no other kinds of feature to handle.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_columns),
])

# Learn the imputation/scaling statistics from the training rows and apply them.
# NOTE(review): the statistics are fitted on all of X_train here, before the
# cross-validated search splits it into folds, so each validation fold has
# already contributed to the preprocessing. Confirm this is acceptable.
housing_prepared = preprocessor.fit_transform(X_train)

# Search space for the random forest, expressed as two independent sub-grids.
param_grid = [
    # Sub-grid 1: 3 x 4 = 12 candidates, bootstrap left at the estimator default.
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8],
    },
    # Sub-grid 2: 2 x 3 = 6 candidates with bootstrap explicitly set to False.
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

# Seeded base estimator so that each candidate forest is reproducible.
forest_reg = RandomForestRegressor(random_state=42)

# 18 candidates x 5 folds = 90 training rounds. The scorer is *negated* MSE,
# so downstream code has to flip the sign before taking a square root.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

# housing_prepared was built from X_train, so its labels are y_train
# (not the full housing_labels array, which also covers the test rows).
grid_search.fit(housing_prepared, y_train)

# Keep a handle on the estimator that the search selected.
best_forest_reg = grid_search.best_estimator_

# Report the winning combination. best_score_ is a negated MSE, so it is
# negated again and square-rooted to express it as an RMSE.
print("Best parameters:", grid_search.best_params_)
print("Best RMSE score:", np.sqrt(-grid_search.best_score_))
print("Best estimator:", best_forest_reg)

Best parameters: {'max_features': 2, 'n_estimators': 30}
Best RMSE score: 0.5080549938463517
Best estimator: RandomForestRegressor(max_features=2, n_estimators=30, random_state=42)


In [2]:
# Display the hyperparameter combination with the best mean cross-validation score.
grid_search.best_params_

{'max_features': 2, 'n_estimators': 30}

In [3]:
# Display the estimator selected by the grid search (configured with the best parameters).
grid_search.best_estimator_

In [4]:
# QUESTION 2: Look at the score of each hyperparameter combination tested during the grid search
cvres = grid_search.cv_results_

# mean_test_score holds negated MSE values (one per candidate); flip the sign
# and take the root once for the whole array to get an RMSE per candidate.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])

# Print each candidate's RMSE next to the parameters that produced it.
for rmse, params in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, params)

0.6245338646699863 {'max_features': 2, 'n_estimators': 3}
0.5374726025780652 {'max_features': 2, 'n_estimators': 10}
0.5080549938463517 {'max_features': 2, 'n_estimators': 30}
0.5974207063820612 {'max_features': 4, 'n_estimators': 3}
0.5280476980276422 {'max_features': 4, 'n_estimators': 10}
0.5102263064320371 {'max_features': 4, 'n_estimators': 30}
0.5962161278592699 {'max_features': 6, 'n_estimators': 3}
0.5337805355017322 {'max_features': 6, 'n_estimators': 10}
0.5159476822709024 {'max_features': 6, 'n_estimators': 30}
0.6005957897242895 {'max_features': 8, 'n_estimators': 3}
0.5372972486044904 {'max_features': 8, 'n_estimators': 10}
0.5191274707888076 {'max_features': 8, 'n_estimators': 30}
0.6062264596768497 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.5244907698347514 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.5944833660963391 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.5248593174756421 {'bootstrap': False, 'max_features': 3, 'n