# Multiple Linear Regression
- This notebook contains the code to implement the Multiple Linear Regression algorithm from scratch using the `Normal Equation` method

## Importing Libraries

In [30]:
# data manipulation
import numpy as np
import pandas as pd

# dataset
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# timing
from timeit import default_timer as timer

## Linear Regression Class
- This class can implement both Simple and Multiple Linear Regression algorithms naturally
- In this notebook however, we will use it to implement the Multiple Linear Regression algorithm

In [31]:
class CustomLinearRegression:

  def __init__(self):
    pass


  def fit(self, X, y):
    # this method will train the Linear Regression model

    n, p = X.shape
    b_ = np.ones((n, 1))
    X_b = np.hstack((b_, X))
    y = y.reshape(-1, 1)

    start_time = timer()
    theta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y
    end_time = timer()

    theta = theta.ravel()
    self.b = theta[0]
    self.w = theta[1:]

    print("> Successfully trained Linear Regression Model...")
    print("> Training Summary:")
    print(f"  - {'Time Taken':<12}: {end_time - start_time:,.3f}s")
    print(f"  - {'Bias':<12}: {self.b}")
    print(f"  - {'Coefficients':<12}: {self.w}")

    return self


  def predict(self, X):
    # this method will be used for making the predictions
    return np.dot(X, self.w) + self.b

## Convenience Functions

### Splitting the Data

In [32]:
def split_data(X, y, test_frac=0.2):
  total_size = X.shape[0]
  test_size = int(test_frac * total_size)
  indices = np.random.permutation(total_size)
  train_indices = indices[:-test_size]
  test_indices = indices[-test_size:]
  X_train = X[train_indices]
  y_train = y[train_indices]
  X_test = X[test_indices]
  y_test = y[test_indices]
  return (X_train, X_test, y_train, y_test)

### Evaluation Metrics

In [33]:
# mean absolute error
def mae(y_true, y_pred):
  error = y_true - y_pred
  return np.mean(np.abs(error))

In [34]:
# mean squared error
def mse(y_true, y_pred):
  error = y_true - y_pred
  return np.mean(error ** 2)

In [35]:
# root mean squared error
def rmse(y_true, y_pred):
  return mse(y_true, y_pred) ** 0.5

In [36]:
# r-squared value
def r_sq(y_true, y_pred):
  tss = ((y_true - y_train_mean) ** 2).sum()
  rss = ((y_true - y_pred) ** 2).sum()
  return 1 - (rss / tss)

In [37]:
# adjusted r-squared value
def adj_r_sq(y_true, y_pred):
  r2 = r_sq(y_true, y_pred)
  return 1 - ((1 - r2) * (n_samples - 1) / (n_samples - n_features - 1))

## Getting the Data

In [38]:
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = split_data(X.values, y.values, test_frac=0.3)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(14448, 8) (14448,)
(6192, 8) (6192,)


In [39]:
# used in evaluation metrics

y_train_mean = np.mean(y_train)
n_samples, n_features = X_train.shape

In [40]:
# Training the Simple Linear Regression Model

mlr = CustomLinearRegression()
mlr.fit(X_train, y_train)

> Successfully trained Linear Regression Model...
> Training Summary:
  - Time Taken  : 0.009s
  - Bias        : -36.06254199329332
  - Coefficients: [ 4.39020289e-01  9.30359674e-03 -1.10365879e-01  6.17153292e-01
 -6.14197441e-06 -3.92725650e-03 -4.12020537e-01 -4.24733614e-01]


<__main__.CustomLinearRegression at 0x7a2834806800>

In [41]:
# predictionsx

y_pred = mlr.predict(X_test)
y_pred

array([2.69476611, 2.10450363, 2.63163767, ..., 1.95968944, 0.40145493,
       2.2828926 ])

In [42]:
# evaluating the model

mlr_mae = mae(y_test, y_pred)
mlr_mse = mse(y_test, y_pred)
mlr_rmse = rmse(y_test, y_pred)
mlr_r2 = r_sq(y_test, y_pred)
mlr_adj_r2 = adj_r_sq(y_test, y_pred)

print(f"> MAE = {mlr_mae}")
print(f"> MSE = {mlr_mse}")
print(f"> RMSE = {mlr_rmse}")
print(f"> R2 = {mlr_r2}")
print(f"> Adjusted R2 = {mlr_adj_r2}")

> MAE = 0.5309649890461048
> MSE = 0.5354595053490411
> RMSE = 0.7317509858886704
> R2 = 0.604605853585777
> Adjusted R2 = 0.6043867834859562
