In [2]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = fetch_california_housing()
X = data.data
y = data.target

print("X:", X.shape, "y:", y.shape)

X: (20640, 8) y: (20640,)


In [3]:
X_train_pre, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=6)
X_val_pre, X_test_pre, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=6)

In [4]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_pre)
X_val = scaler.fit_transform(X_val_pre)
X_test = scaler.fit_transform(X_test_pre)

Linear Regression NumPy implementation. Going off pure memory.

In [5]:
w = np.random.random((1, X.shape[1]))
b = np.random.random(1)

In [12]:
epochs = 100
bs = 32
lr = 0.001
M = X_train.shape[0]
for epoch in range(epochs):
    c = 0
    total_loss = 0
    for i in range(int(np.ceil(M / bs))):
        x, y = X_train[c:c+bs], y_train[c:c+bs].reshape(-1, 1)
        m = x.shape[0]
        pred = (x @ w.T + b)
        error = pred - y
        loss = np.sum(error**2) / (2*m)
        d_w = (error.T @ x) / m
        d_b = np.mean(error)

        w -= d_w * lr
        b -= d_b * lr

        total_loss += loss
        c += bs

    val_pred = (X_val @ w.T + b).flatten()
    val_loss = np.sum((val_pred - y_val) ** 2) / (2*X_val.shape[0])
    if ((epoch + 1) % 10 == 0):
        print(f"Epoch {epoch+1} train loss: {total_loss / int(np.ceil(M / bs))}, validation loss: {val_loss}")

Epoch 10 train loss: 0.25921308884586125, validation loss: 0.26976110344240606
Epoch 20 train loss: 0.2589865625183806, validation loss: 0.26937578326733497
Epoch 30 train loss: 0.25883931996418846, validation loss: 0.26909614868476195
Epoch 40 train loss: 0.2587431676803246, validation loss: 0.2688903834562444
Epoch 50 train loss: 0.258680110884709, validation loss: 0.26873705227856326
Epoch 60 train loss: 0.25863858511476123, validation loss: 0.26862148209680925
Epoch 70 train loss: 0.2586111192753893, validation loss: 0.26853347814474254
Epoch 80 train loss: 0.2585928667438851, validation loss: 0.26846585568174586
Epoch 90 train loss: 0.2585806726653501, validation loss: 0.26841348125371234
Epoch 100 train loss: 0.25857247721653676, validation loss: 0.26837263802614414


In [287]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
test_pred = (X_test @ w.T + b).flatten()
mse = mean_squared_error(y_test, test_pred)
mae = mean_absolute_error(y_test, test_pred)
r2 = r2_score(y_test, test_pred)
rmse = np.sqrt(mse)


print("\nTest metrics:")
print(f"MSE : {mse}")
print(f"RMSE: {rmse}")
print(f"MAE : {mae}")
print(f"R^2 : {r2}")


Test metrics:
MSE : 0.5247961188490804
RMSE: 0.7244281322871721
MAE : 0.527560945663497
R^2 : 0.6060097891710796


On average the predictions are off by around $52k judging from the MAE, but larger errors push the average up to around $72k (RMSE). ~0.61 R^2 score is in line with what linear regression should achieve.

Replication with scikit learn:

In [288]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)
test_pred = model.predict(X_test)
mse = mean_squared_error(y_test, test_pred)
mae = mean_absolute_error(y_test, test_pred)
r2 = r2_score(y_test, test_pred)
rmse = np.sqrt(mse)


print("\nTest metrics:")
print(f"MSE : {mse}")
print(f"RMSE: {rmse}")
print(f"MAE : {mae}")
print(f"R^2 : {r2}")



Test metrics:
MSE : 0.5220256448859191
RMSE: 0.7225134219417098
MAE : 0.5292048549193693
R^2 : 0.6080897199892339


Results are pretty much identical.