In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Utils

In [None]:
def min_max_normalize(array):

    """
        Rescales the given array with scikit-learn's MinMaxScaler.

        A fresh scaler is created on every call, fitted on `array`
        and immediately applied to that same `array`.

        Inputs:

            - array: data to rescale (passed straight to MinMaxScaler)

        Output:

            - the transformed array returned by the scaler
    """

    fitted_scaler = MinMaxScaler().fit(array)
    normalized = fitted_scaler.transform(array)
    return normalized

# Linear Regression Model Class

In [None]:
class LinearRegressionModel:

    """
        Linear regression trained with batch gradient descent.

        Attributes:

            - w: weights      (numpy array of shape (1, n)), None until fit() is called
            - b: bias         (scalar), None until fit() is called
            - cost_history:   (list) cost recorded at every training iteration
    """

    def __init__(self):
        self.w = None
        self.b = None
        self.cost_history = []

    def hypothesis(self, w, b, x):

        """
            Inputs:

                - x: features   (numpy array of shape (m, n))
                - w: weights    (numpy array of shape (1, n))
                - b: bias       (scalar)

            Output:

                - h: hypothesis (numpy array of shape (1, m))
                     where h = w . x^T + b
        """

        h = np.dot(w, x.T) + b
        return h

    def cost(self, y, h):

        """
            Inputs:

                - y: targets    (numpy array of shape (m, 1))
                - h: hypothesis (numpy array of shape (1, m))

            Output:

                - c: cost       (float)
                     where c = (1 / (2 * m)) * sum((h - y)^2)
                     i.e. half of the mean squared error
        """

        c = (1 / (2 * h.shape[1])) * np.sum((h - y.T) ** 2)
        return c

    def gradient(self, y, h, x):

        """
            Inputs: 

                - y: targets    (numpy array of shape (m, 1))
                - h: hypothesis (numpy array of shape (1, m))
                - x: features   (numpy array of shape (m, n))

            Output:

                - g: gradient of the cost (dict)
                                ["w"] = gradient w.r.t. weights (numpy array of shape (1, n))
                                ["b"] = gradient w.r.t. bias    (scalar)
                     g["w"] = (1 / m) * (h - y) . x
                     g["b"] = (1 / m) * sum(h - y)
        """

        g = {}
        g["w"] = (1 / h.shape[1]) * np.dot((h - y.T), x)
        g["b"] = (1 / h.shape[1]) * np.sum(h - y.T)
        return g

    def gradient_descent(self, x_train, y_train, learning_rate, num_iterations):
        
        """
            Runs batch gradient descent starting from zero weights and zero bias.

            Inputs:

                - x_train: features     (numpy array of shape (m, n))
                - y_train: targets      (numpy array of shape (m, 1))
                - learning_rate:        (float)
                - num_iterations:       (int)

            Output:

                - (dict) 
                d["w"] = weights      (numpy array of shape (1, n))
                d["b"] = bias         (scalar)
                d["c"] = cost_history (list), one entry per iteration
        """

        # Parameters Initialization
        w = np.zeros(shape=(1, x_train.shape[1]))
        b = 0

        # Run Gradient Descent while keeping track of the Cost
        cost_history = []
        print("Training Model...")
        for i in range(num_iterations):

            # Obtain Predictions with Current Weights
            h = self.hypothesis(w, b, x_train)

            # Update Weights
            g = self.gradient(y_train, h, x_train)
            w = w - learning_rate * g["w"]
            b = b - learning_rate * g["b"]

            # Add Iteration's Cost to Cost History
            # (h was computed before the update, so this is the cost of the
            #  parameters the iteration started with)
            c = self.cost(y_train, h)
            cost_history.append(c)
        
        # Hand the learned parameters back to the caller (fit stores them)
        print("Training Complete.")
        return {"w": w, "b": b, "c": cost_history}

    def fit(self, x_train, y_train, learning_rate, num_iterations):

        """
            Model's fit function.

            Inputs:

                - x_train        (numpy array of shape (num_dataPoints, num_features))
                - y_train        (numpy array of shape (num_dataPoints, 1))
                - learning rate  (float)
                - num_iterations (int)

            Output:

                - None (Updates model's parameters)

        """

        # Run Gradient Descent
        d = self.gradient_descent(x_train, y_train, learning_rate, num_iterations)

        # Update Model's Parameters
        self.w = d["w"]
        self.b = d["b"]
        self.cost_history = d["c"]

    def _require_fitted(self):

        """
            Raises a RuntimeError if the model has no parameters yet,
            instead of letting numpy fail on a None weight vector.
        """

        if self.w is None or self.b is None:
            raise RuntimeError("Model is not fitted yet. Call fit() first.")

    def plot_cost(self):

        """
            Plots the cost function over iterations.

            Prints a hint instead of plotting when no cost has been recorded yet.
        """

        # The history lives on the instance; a bare `cost_history` is undefined here
        if len(self.cost_history) == 0:
            print("Can't plot cost function. Fit the model first.")
        else: 
            plt.plot(self.cost_history, '-')
            plt.title("Cost Function")
            plt.xlabel("Iteration")
            plt.ylabel("Cost")
            plt.show()      

    def validate(self, x_val, y_val):

        """
            Inputs:

                - x_val (numpy array of shape (num_dataPoints, num_features))
                - y_val (numpy array of shape (num_dataPoints, 1))

            Output:

                - None. Prints the Mean Squared Error of the model on the
                  validation data, i.e. mean((h - y)^2). This is the plain
                  mean of squared residuals, not the halved training cost.

            Raises:

                - RuntimeError if the model has not been fitted.
        """
        self._require_fitted()
        predictions = self.hypothesis(self.w, self.b, x_val)

        # predictions is (1, m) and y_val is (m, 1): transpose so they align
        mse = np.mean((predictions - y_val.T) ** 2)
        print(f"Mean Squared Error = {mse}")

    def predict(self, x_test):

        """
            Inputs:
                
                - x_test (numpy array of shape (num_dataPoints, num_features))

            Output:

                - Model's predictions (numpy array of shape (1, num_dataPoints))

            Raises:

                - RuntimeError if the model has not been fitted.

        """
        self._require_fitted()
        predictions = self.hypothesis(self.w, self.b, x_test)
        return predictions

# Loading the Dataset

In [None]:
# Load each split from its own file. The test split must not be read from the
# training CSV, otherwise any evaluation on it just re-measures the training fit.
train_df = pd.read_csv("sample_data/california_housing_train.csv")
test_df = pd.read_csv("sample_data/california_housing_test.csv")

In [None]:
# Summarize the training frame: column names, non-null counts and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17000 non-null  float64
 1   latitude            17000 non-null  float64
 2   housing_median_age  17000 non-null  float64
 3   total_rooms         17000 non-null  float64
 4   total_bedrooms      17000 non-null  float64
 5   population          17000 non-null  float64
 6   households          17000 non-null  float64
 7   median_income       17000 non-null  float64
 8   median_house_value  17000 non-null  float64
dtypes: float64(9)
memory usage: 1.2 MB


# Dataset Preparation

In [None]:
target_label = "median_house_value"

# Drop Target Column
x_train = train_df.drop(target_label, axis=1)
x_test = test_df.drop(target_label, axis=1)

# Convert pandas dataframe to numpy array
x_train = np.array(x_train)
x_test = np.array(x_test)

# Normalize array
# The scaler is fitted on the training features only and then reused for the
# test features, so both splits are mapped with the same per-column min/max.
# Fitting a second scaler on the test split would put it in a different
# feature space than the one the model was trained on.
feature_scaler = MinMaxScaler()
x_train = feature_scaler.fit_transform(x_train)
x_test = feature_scaler.transform(x_test)

print(f"x_train.shape: {x_train.shape}")
print(f"x_test.shape: {x_test.shape}")

x_train.shape: (17000, 8)
x_test.shape: (17000, 8)


In [None]:
# Pull the target column out of each frame and turn it into a
# column vector of shape (num_dataPoints, 1)
y_train = np.array(train_df[target_label]).reshape(-1, 1)
y_test = np.array(test_df[target_label]).reshape(-1, 1)

print(f"y_train.shape: {y_train.shape}")
print(f"y_test.shape: {y_test.shape}")

y_train.shape: (17000, 1)
y_test.shape: (17000, 1)


# Creating and Fitting the Model

In [None]:
# Model Hyperparameters
learning_rate = 1
num_iterations = 20000

# Build the model, train it with gradient descent and show the learned weights
model = LinearRegressionModel()
model.fit(
    x_train,
    y_train,
    learning_rate=learning_rate,
    num_iterations=num_iterations,
)
print(model.w)

Training Model...
Training Complete.
[[ -433309.80034907  -403817.54328963    58760.24668404  -319839.13639213
    761247.67166261 -1333259.43715122   256146.17723551   587658.78586377]]


# Comparing the Model with Sklearn's Model

In [None]:
# Reference fit with scikit-learn on the same training data;
# its coefficients are printed for comparison with the weights above
sk_model = LinearRegression().fit(x_train, y_train)
print(sk_model.coef_)

[[ -433121.95806617  -403930.58376103    58685.44155621  -317828.95977251
    758127.21007451 -1373240.89961211   276296.33152023   587360.59262494]]
