In [None]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = fetch_california_housing()
X = data.data
y = data.target

print("X:", X.shape, "y:", y.shape)

X: (20640, 8) y: (20640,)


In [None]:
X_train_pre, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=6)
X_val_pre, X_test_pre, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=6)

In [284]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_pre)
X_val = scaler.fit_transform(X_val_pre)
X_test = scaler.fit_transform(X_test_pre)

Linear Regression NumPy implementation. Going off pure memory.

In [285]:
w = np.random.random((1, X.shape[1]))
b = np.random.random(1)

In [286]:
epochs = 100
bs = 32
lr = 0.001
for epoch in range(epochs):
    c = 0
    total_loss = 0
    for i in range(int(np.ceil(X_train.shape[0] / bs))):
        X_t, y_t = X_train[c:c+bs], y_train[c:c+bs]
        pred = (X_t @ w.T + b).flatten()
        error = pred - y_t
        m = X_t.shape[0]
        loss = np.sum((error)**2) / (2*m)
        d_w = X_t.T @ error / m
        d_b = np.mean(error)
        w -= d_w * lr
        b -= d_b * lr
        total_loss += loss
        c += bs

    val_pred = (X_val @ w.T + b).flatten()
    val_loss = np.sum((val_pred - y_val) ** 2) / (2*X_val.shape[0])
    if ((epoch+1)%10 == 0):
        print(f"Epoch {epoch+1} train loss: {total_loss / int(np.ceil(X_train.shape[0] / bs))}, validation loss: {val_loss}")

Epoch 10 train loss: 0.39042243170439517, validation loss: 0.39684310274783147
Epoch 20 train loss: 0.3261430192194603, validation loss: 0.335007735944341
Epoch 30 train loss: 0.2945364827849276, validation loss: 0.3049144387007147
Epoch 40 train loss: 0.27836311279017867, validation loss: 0.28940786701309557
Epoch 50 train loss: 0.2698235894286857, validation loss: 0.28108069119696444
Epoch 60 train loss: 0.26516278375421404, validation loss: 0.27640356933333443
Epoch 70 train loss: 0.26253373149109044, validation loss: 0.2736506833513652
Epoch 80 train loss: 0.26100437759462186, validation loss: 0.2719545589335429
Epoch 90 train loss: 0.2600902220640573, validation loss: 0.2708646032598866
Epoch 100 train loss: 0.25953115041499936, validation loss: 0.2701377992903288


In [287]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
test_pred = (X_test @ w.T + b).flatten()
mse = mean_squared_error(y_test, test_pred)
mae = mean_absolute_error(y_test, test_pred)
r2 = r2_score(y_test, test_pred)
rmse = np.sqrt(mse)


print("\nTest metrics:")
print(f"MSE : {mse}")
print(f"RMSE: {rmse}")
print(f"MAE : {mae}")
print(f"R^2 : {r2}")


Test metrics:
MSE : 0.5247961188490804
RMSE: 0.7244281322871721
MAE : 0.527560945663497
R^2 : 0.6060097891710796


On average the predictions are off by around $52k judging from the MAE, but larger errors push the average up to around $72k (RMSE). ~0.61 R^2 score is in line with what linear regression should achieve.

Replication with scikit learn:

In [288]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)
test_pred = model.predict(X_test)
mse = mean_squared_error(y_test, test_pred)
mae = mean_absolute_error(y_test, test_pred)
r2 = r2_score(y_test, test_pred)
rmse = np.sqrt(mse)


print("\nTest metrics:")
print(f"MSE : {mse}")
print(f"RMSE: {rmse}")
print(f"MAE : {mae}")
print(f"R^2 : {r2}")



Test metrics:
MSE : 0.5220256448859191
RMSE: 0.7225134219417098
MAE : 0.5292048549193693
R^2 : 0.6080897199892339


Results are pretty much identical.