In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [None]:
# Read the training set and keep only the three columns this notebook uses.
dataframe = pd.read_csv("./Data/train.csv").loc[:, ['LotFrontage', 'LotArea', 'SalePrice']]
dataframe

In [None]:
def loss_function(y, x, m, b):
    """Return the squared error between ``y`` and the line ``m * x + b``.

    Only arithmetic operators are used, so the function accepts plain numbers
    as well as NumPy arrays (for arrays the result is element-wise).
    """
    predicted = m * x + b
    residual = y - predicted
    return residual ** 2

In [None]:
def reject_outliers(lot_area, sale_price, outlierConstant,
                    lower_percentile=35, upper_percentile=65):
    """Drop the samples whose ``lot_area`` falls outside a percentile-based band.

    The accepted band is ``[lower - spread, upper + spread]`` where ``lower``
    and ``upper`` are the requested percentiles of ``lot_area`` and
    ``spread = (upper - lower) * outlierConstant``. Rejected positions are
    removed from both arrays so the (lot_area, sale_price) pairs stay aligned.

    NOTE: the defaults (35 / 65) are narrower than the classic inter-quartile
    range (25 / 75). They are kept as defaults for backward compatibility; pass
    ``lower_percentile=25, upper_percentile=75`` for the textbook IQR rule.

    Parameters
    ----------
    lot_area : array-like
        Values the band is computed from and tested against.
    sale_price : array-like
        Companion values; filtered at the same positions as ``lot_area``.
    outlierConstant : float
        Multiplier applied to the percentile spread to widen the band.
    lower_percentile, upper_percentile : float, optional
        Percentiles (0-100) that define the core of the band.

    Returns
    -------
    tuple of np.ndarray
        ``(filtered_lot_area, filtered_sale_price)`` as new arrays; the inputs
        are not modified.
    """
    values = np.asarray(lot_area)
    if values.size == 0:
        # No percentile can be derived from empty input and there is nothing
        # to reject, so hand back copies of what was passed in.
        return np.array(lot_area), np.array(sale_price)

    lower_bound = np.percentile(values, lower_percentile)
    upper_bound = np.percentile(values, upper_percentile)
    spread = (upper_bound - lower_bound) * outlierConstant

    # NaN compares False on both sides, so NaN entries count as outliers.
    inside_band = (values >= lower_bound - spread) & (values <= upper_bound + spread)
    outlier_positions = np.flatnonzero(~inside_band)

    return np.delete(lot_area, outlier_positions), np.delete(sale_price, outlier_positions)

In [None]:
def gradient_descent(m_now, b_now, lotAreaPoints, salePricePoints,  learningRate):
    """Perform one batch gradient-descent step for the line ``y = m * x + b``.

    The gradients are those of the mean squared error over all samples:
    ``dL/dm = -2 * mean(x * (y - (m * x + b)))`` and
    ``dL/db = -2 * mean(y - (m * x + b))``.

    Parameters
    ----------
    m_now, b_now : float
        Current slope and intercept.
    lotAreaPoints, salePricePoints : array-like
        Inputs ``x`` and targets ``y``; must be non-empty and of equal shape.
    learningRate : float
        Step size applied to both gradients.

    Returns
    -------
    tuple
        ``(m, b, loss_value)``: the updated slope, the updated intercept and
        the mean squared error of the updated line over the whole data set.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    x = np.asarray(lotAreaPoints, dtype=float)
    y = np.asarray(salePricePoints, dtype=float)
    if x.size == 0:
        raise ValueError("gradient_descent needs at least one data point")
    if x.shape != y.shape:
        raise ValueError("lotAreaPoints and salePricePoints must have the same shape")

    residuals = y - (m_now * x + b_now)
    m_gradient = -2 * np.mean(x * residuals)
    b_gradient = -2 * np.mean(residuals)

    m = m_now - m_gradient * learningRate
    b = b_now - b_gradient * learningRate

    # Average the loss over every sample. Evaluating it on a single point
    # (e.g. whichever index a loop happened to end on) says nothing about
    # how well the line fits the data as a whole.
    loss_value = np.mean(loss_function(y, x, m, b))

    return m, b, loss_value

In [None]:
def get_values(lot_areas: np.ndarray, sale_prices: np.ndarray, epochs: int, learning_rate: float):
    """Fit slope and intercept by running ``gradient_descent`` ``epochs`` times.

    Both parameters start at 0. Returns ``(m, b, loss_values)`` where
    ``loss_values`` holds the loss reported by each step, in order.
    """
    slope, intercept = 0, 0
    history = []

    for _epoch in range(epochs):
        step_result = gradient_descent(slope, intercept, lot_areas, sale_prices, learning_rate)
        slope, intercept, step_loss = step_result
        history.append(step_loss)

    return slope, intercept, history

In [None]:
def plot_loss(epochs: int, loss_values: np.ndarray):
    """Plot the training loss against the epoch index on a new figure.

    Parameters
    ----------
    epochs : int
        Number of epochs; the x-axis runs over ``range(epochs)``, so it must
        equal ``len(loss_values)``.
    loss_values : sequence of float
        One loss value per epoch, in training order.
    """
    figure, loss = plt.subplots()
    # The figure label is artist metadata and is not drawn on the canvas, so
    # the visible title and axis labels are set on the axes below.
    figure.set_label("Loss")
    loss.plot(range(epochs), loss_values)
    loss.set_title("Loss")
    loss.set_xlabel("Epoch")
    loss.set_ylabel("Loss")

In [None]:
def get_my_predictions(lot_areas, sale_prices, epochs, learning_rate):
    """Fit a line with ``get_values`` and evaluate it on ``lot_areas``.

    Returns ``(predictions, loss_values)``: a list with one fitted value per
    entry of ``lot_areas`` and the per-epoch losses reported by the fit.
    """
    slope, intercept, loss_history = get_values(lot_areas, sale_prices, epochs, learning_rate)

    fitted = []
    for area in lot_areas:
        fitted.append(slope * area + intercept)

    return fitted, loss_history

In [None]:
def get_sk_predictions(lot_areas, sale_prices):
    """Fit scikit-learn's ``LinearRegression`` and predict on the training inputs.

    ``lot_areas`` is used both to fit the model and as the prediction input;
    the return value is whatever ``predict`` produces for it.
    """
    model = LinearRegression()
    model.fit(lot_areas, sale_prices)
    return model.predict(lot_areas)

In [None]:
# Hyper-parameters for the hand-written gradient descent.
learning_rate = 0.0001
epochs = 150

# Log-transform both variables before fitting.
scaled_lot_area = np.log2(np.array(dataframe['LotArea']))
scaled_sale_price = np.log10(np.array(dataframe['SalePrice']))

(scaled_lot_area, scaled_sale_price) = reject_outliers(scaled_lot_area, scaled_sale_price, 1.5)
my_predictions, loss_values = get_my_predictions(scaled_lot_area, scaled_sale_price, epochs, learning_rate)
# scikit-learn wants X as (n_samples, n_features). The target stays 1-D so
# predict() also returns a 1-D array, matching the shape of my_predictions.
sk_predictions = get_sk_predictions(scaled_lot_area.reshape(-1, 1), scaled_sale_price)

plt.scatter(scaled_lot_area, scaled_sale_price, label='Data')
plt.plot(scaled_lot_area, my_predictions, color='red', label='Gradient descent')
plt.plot(scaled_lot_area, sk_predictions, color='orange', label='scikit-learn')
plt.xlabel('log2(LotArea)')
plt.ylabel('log10(SalePrice)')
plt.legend()

plot_loss(epochs, loss_values)

In [None]:
# Side-by-side table of the inputs and the fitted values of both models.
# Every dict value must be 1-D for the DataFrame constructor, so the
# scikit-learn predictions are flattened (a no-op if they are already 1-D).
predictions_dataframe = pd.DataFrame({
    'Lot Area': scaled_lot_area,
    'Sale Price': scaled_sale_price,
    'Sale Price (My)': my_predictions,
    'Sale Price (SK)': np.ravel(sk_predictions)
})