In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fetch the California Housing dataset and wrap its arrays in labelled
# pandas objects: X holds the feature columns, y holds the regression target.
california = fetch_california_housing()
X = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
)
y = pd.Series(data=california.target, name='MedHouseVal')

# Report (n_rows, n_features) of the feature table.
print(f"Dataset shape: {X.shape}")

# ---------------------
# 2-Way Split Example
# ---------------------
print("\n--- 2-Way Split (Train/Test) ---")

# Hold out 20% of the rows for testing; the remaining 80% are used for fitting.
# The fixed random_state makes the shuffle (and so the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded random forest with otherwise default hyperparameters.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model_2way = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out test rows.
y_pred_test = model_2way.predict(X_test)
mse_test, r2_test = (
    mean_squared_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)

print(
    f"Test MSE: {mse_test:.3f}",
    f"Test R2: {r2_test:.3f}",
    sep="\n",
)

# ---------------------
# 3-Way Split Example
# ---------------------
print("\n--- 3-Way Split (Train/Validation/Test) ---")


def _score_and_report(model, X_eval, y_eval, label):
    """Predict on one evaluation split, print its MSE and R2, and return them.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_eval
        Features of the split to score.
    y_eval
        True targets aligned with ``X_eval``.
    label : str
        Prefix for the two printed lines (e.g. ``"Validation"``).

    Returns
    -------
    tuple
        ``(predictions, mse, r2)`` for the given split.
    """
    predictions = model.predict(X_eval)
    mse = mean_squared_error(y_eval, predictions)
    r2 = r2_score(y_eval, predictions)
    print(f"{label} MSE: {mse:.3f}")
    print(f"{label} R2: {r2:.3f}")
    return predictions, mse, r2


# NOTE: from here on X_train, X_test, y_train, y_test, y_pred_test, mse_test and
# r2_test are rebound, replacing the values assigned in the 2-way example above.

# Step 1: split off test set (20%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: split the remaining 80% into train and validation.
# test_size=0.125 is a fraction of that remainder: 0.125 * 0.8 = 10% of the
# full dataset for validation, leaving 70% of the full dataset for training.
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.125, random_state=42)

# Sanity check: the three splits together must account for every row of X.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "splits do not cover the dataset"

print(f"Train size: {X_train.shape[0]} samples")
print(f"Validation size: {X_val.shape[0]} samples")
print(f"Test size: {X_test.shape[0]} samples")

# Train model on train set only; validation and test rows stay unseen.
model_3way = RandomForestRegressor(random_state=42)
model_3way.fit(X_train, y_train)

# Evaluate on validation set
y_pred_val, mse_val, r2_val = _score_and_report(model_3way, X_val, y_val, "Validation")

# Evaluate on test set
y_pred_test, mse_test, r2_test = _score_and_report(model_3way, X_test, y_test, "Test")


Dataset shape: (20640, 8)

--- 2-Way Split (Train/Test) ---
Test MSE: 0.255
Test R2: 0.805

--- 3-Way Split (Train/Validation/Test) ---
Train size: 14448 samples
Validation size: 2064 samples
Test size: 4128 samples
Validation MSE: 0.271
Validation R2: 0.807
Test MSE: 0.258
Test R2: 0.803
