In [1]:
import numpy as np
from sklearn.datasets import fetch_california_housing

In [2]:
# Fetch the California housing data as plain arrays: X holds the features, y the targets.
# Only the first six feature columns are kept:
#   ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']
# The remaining two columns (['Latitude', 'Longitude']) are dropped.
feature_columns = [0, 1, 2, 3, 4, 5]
X, y = fetch_california_housing(return_X_y=True)
X = X[:, feature_columns]

# Report how many rows / columns survived the selection, and how many targets there are.
n_observations, n_variables = X.shape
n_targets = y.shape[0]
print(f'The California Housing dataset includes {n_observations} observations on {n_variables} variables')
print(f'The California Housing dataset includes {n_targets} target prices')

The California Housing dataset includes 20640 observations on 6 variables
The California Housing dataset includes 20640 target prices


In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [4]:
# Partition the observations into a training part and a validation part.
# random_state is fixed so the same partition is produced on every run.
# NOTE(review): test_size=0.8 puts 80% of the rows in the validation set and
# leaves only 20% for fitting -- confirm this ratio is intentional.
X_train, X_validate, y_train, y_validate = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=42,
)

In [5]:
# Sweep the number of neighbours k = 1, 2, ..., 99. For each k, fit a k-NN
# regressor on the training split and record its mean absolute error on the
# validation split. Entry i of mae_list therefore corresponds to k = i + 1.
mae_list = []

for k in range(1, 100):
    # fit() returns the fitted estimator, so construction and fitting can be chained.
    model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    predictions = model.predict(X_validate)
    mae = mean_absolute_error(y_validate, predictions)
    mae_list += [mae]

# Array form of the scores, convenient for min / argmin lookups.
mae_array = np.array(mae_list)

In [6]:
# Pick out the lowest validation MAE and the k that produced it.
# mae_array[i] holds the score for k = i + 1 (the sweep starts at k = 1),
# hence the +1 when converting the argmin position into a neighbour count.
min_mae = mae_array.min()
best_k = mae_array.argmin() + 1
print(f'The minimum Mean Absolute Error: {min_mae}')
print(f'The minimum Mean Absolute Error achieved using k: {best_k} neighbors')

The minimum Mean Absolute Error: 0.8858199258547894
The minimum Mean Absolute Error achieved using k: 7 neighbors


In [None]:
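# Important: the target variable (y) is the median house value of each California district, expressed in units of $100,000.
# As returned by fetch_california_housing it is NOT log-transformed, so the MAE reported above is also in units of $100,000.
# Skewness: house prices are often right-skewed, with most values being lower and a few extremely high values (outliers).
# A logarithmic transform of the target is a common way to reduce that skewness, but it would have to be applied explicitly.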