# Problem A

In [1]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Fetch the California housing dataset bundle and pull out the feature
# matrix and the regression target as separate names.
data = fetch_california_housing()
X = data.data
y = data.target

In [3]:
# Shuffled 5-fold splitter with a fixed seed, so every model evaluated with it
# sees the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Baseline: a single regression tree with at least 10 samples per leaf.
cart_regressor = DecisionTreeRegressor(
    min_samples_leaf=10,
    random_state=42,
)

# AdaBoost ensemble of 50 regression trees configured with the same leaf size
# as the baseline tree.
adaboost_regressor = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(min_samples_leaf=10),
    n_estimators=50,
    random_state=42,
)

In [4]:
def evaluate_model(model, X, y, cv=None):
    """Cross-validate ``model`` on ``(X, y)`` and return its mean MAE and mean R^2.

    Both metrics are collected in a single ``cross_validate`` pass, so each
    fold is fitted once rather than once per metric.

    Parameters
    ----------
    model
        Estimator handed to ``cross_validate``.
    X, y
        Features and targets handed to ``cross_validate``.
    cv
        Optional cross-validation splitter. When ``None`` (the default), the
        module-level ``kf`` is used.

    Returns
    -------
    tuple
        ``(mean_mae, mean_r2)``: each metric averaged over the folds.
    """
    if cv is None:
        cv = kf

    scores = cross_validate(
        model, X, y,
        cv=cv,
        scoring=['neg_mean_absolute_error', 'r2'],
    )

    # The scorer reports negated MAE; flip the sign to get the plain error.
    mae_scores = -scores['test_neg_mean_absolute_error']
    r2_scores = scores['test_r2']
    return np.mean(mae_scores), np.mean(r2_scores)

# Score both models first, then print the report in one place.
cart_mae, cart_r2 = evaluate_model(cart_regressor, X, y)
adaboost_mae, adaboost_r2 = evaluate_model(adaboost_regressor, X, y)

# Single regression tree.
print("CART Mean Absolute Error:", cart_mae)
print("CART R^2:", cart_r2)

# AdaBoost ensemble.
print("AdaBoost Mean Absolute Error:", adaboost_mae)
print("AdaBoost R^2:", adaboost_r2)

CART Mean Absolute Error: 0.4051236145928673
CART R^2: 0.7212586032129119
AdaBoost Mean Absolute Error: 0.3188597803296035
AdaBoost R^2: 0.8279258204387515


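The best-performing model is AdaBoost with CART as its base estimator: it achieves both the lower cross-validated mean absolute error (about 0.319 vs. 0.405) and the higher R^2 (about 0.828 vs. 0.721).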

# Problem B

In [5]:
# Hold out 20% of the data; the grid search only ever sees the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random-forest hyper-parameter candidates (1 x 2 x 3 = 6 combinations).
param_grid = {
    'max_features': [1/3],
    'min_samples_leaf': [2, 10],
    'n_estimators': [10, 50, 100]
}

rf_regressor = RandomForestRegressor(random_state=42)

# Use a dedicated name for the 10-fold splitter. Rebinding the global `kf`
# here would silently change the folds used by `evaluate_model` (which reads
# `kf` at call time) if the Problem A cell were re-run after this one.
rf_kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Track both metrics for every candidate; the candidate with the best mean R^2
# is refit on the full training set and exposed as `best_estimator_`.
grid_search = GridSearchCV(
    estimator=rf_regressor,
    param_grid=param_grid,
    cv=rf_kf,
    scoring=['neg_mean_absolute_error', 'r2'],
    refit='r2',
)
grid_search.fit(X_train, y_train)

In [6]:
# `cv_results_` holds one entry per hyper-parameter candidate in the grid.
cv_results = grid_search.cv_results_
# The scorer reports negated MAE; flip the sign to get the plain error.
cv_mae = -cv_results['mean_test_neg_mean_absolute_error']
cv_r2 = cv_results['mean_test_r2']

# Grid-wide averages: the mean over ALL candidates, not the score of any
# single model.
print("CV Mean Absolute Error:", np.mean(cv_mae))
print("CV R^2:", np.mean(cv_r2))

# Cross-validated scores of the selected candidate, i.e. the one that was
# refit as `best_estimator_`.
best_idx = grid_search.best_index_
print("Best Params:", grid_search.best_params_)
print("Best Model CV Mean Absolute Error:", cv_mae[best_idx])
print("Best Model CV R^2:", cv_r2[best_idx])

CV Mean Absolute Error: 0.357734335241464
CV R^2: 0.7951117055716747


In [7]:
# Score the refit winner of the grid search on the held-out test split.
best_model = grid_search.best_estimator_

y_test_pred = best_model.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
print("Test R^2 of Best Model:", test_r2)

Test R^2 of Best Model: 0.8058488889183437
