# Normal Equations Implementation

In [36]:
import numpy as np
import pandas as pd

In [37]:
# X = [1, 2, 3, 4], Y = [1, 3, 3, 5]

X = np.array([[1], [2], [3], [4]])

B = np.ones((len(X), 1))


print(X)
print(B)

[[1]
 [2]
 [3]
 [4]]
[[1.]
 [1.]
 [1.]
 [1.]]


In [38]:
X = np.hstack((B, X))
X

array([[1., 1.],
       [1., 2.],
       [1., 3.],
       [1., 4.]])

In [39]:
Y = np.array([[1], [3], [3], [5]])
Y

array([[1],
       [3],
       [3],
       [5]])

In [40]:
XT_X = np.matmul(X.T, X)
XT_X

array([[ 4., 10.],
       [10., 30.]])

In [41]:
XT_Y = np.matmul(X.T, Y)
XT_Y

array([[12.],
       [36.]])

In [42]:
W = np.matmul(np.linalg.inv(XT_X), XT_Y)
W

array([[0. ],
       [1.2]])

In [43]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X, Y)

In [44]:
model.coef_

array([[0. , 1.2]])

In [45]:
model.intercept_

array([-4.4408921e-16])

In [46]:
data = pd.read_csv('data/vw.csv')
data.head(5)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,T-Roc,2019,25000,Automatic,13904,Diesel,145,49.6,2.0
1,T-Roc,2019,26883,Automatic,4562,Diesel,145,49.6,2.0
2,T-Roc,2019,20000,Manual,7414,Diesel,145,50.4,2.0
3,T-Roc,2019,33492,Automatic,4825,Petrol,145,32.5,2.0
4,T-Roc,2019,22900,Semi-Auto,6500,Petrol,150,39.8,1.5


In [47]:
data['Intercept'] = 1
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,Intercept
0,T-Roc,2019,25000,Automatic,13904,Diesel,145,49.6,2.0,1
1,T-Roc,2019,26883,Automatic,4562,Diesel,145,49.6,2.0,1
2,T-Roc,2019,20000,Manual,7414,Diesel,145,50.4,2.0,1
3,T-Roc,2019,33492,Automatic,4825,Petrol,145,32.5,2.0,1
4,T-Roc,2019,22900,Semi-Auto,6500,Petrol,150,39.8,1.5,1
...,...,...,...,...,...,...,...,...,...,...
15152,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,1
15153,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,1
15154,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,1
15155,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,1


In [52]:
X = data[['Intercept',  "year", "mileage", "tax", "mpg", "engineSize"]].values
Y = data['price'].values

print(X)
print(Y)

[[1.0000e+00 2.0190e+03 1.3904e+04 1.4500e+02 4.9600e+01 2.0000e+00]
 [1.0000e+00 2.0190e+03 4.5620e+03 1.4500e+02 4.9600e+01 2.0000e+00]
 [1.0000e+00 2.0190e+03 7.4140e+03 1.4500e+02 5.0400e+01 2.0000e+00]
 ...
 [1.0000e+00 2.0090e+03 7.0000e+04 2.0000e+02 4.2000e+01 1.4000e+00]
 [1.0000e+00 2.0060e+03 8.2704e+04 1.5000e+02 4.6300e+01 1.2000e+00]
 [1.0000e+00 2.0070e+03 7.4000e+04 1.4500e+02 4.6300e+01 1.2000e+00]]
[25000 26883 20000 ...  1590  1250  2295]


In [49]:
XT_X = np.matmul(X.T, X)
XT_X

array([[1.51570000e+04, 3.05755460e+07, 3.34860352e+08, 1.70886500e+06,
        8.14739600e+05, 2.42617000e+04],
       [3.05755460e+07, 6.16787611e+10, 6.74997154e+11, 3.44794038e+09,
        1.64341203e+09, 4.89421260e+07],
       [3.34860352e+08, 6.74997154e+11, 1.41769393e+13, 3.10339421e+10,
        1.94382889e+10, 5.52363226e+08],
       [1.70886500e+06, 3.44794038e+09, 3.10339421e+10, 2.53744075e+08,
        8.50523325e+07, 2.86466900e+06],
       [8.14739600e+05, 1.64341203e+09, 1.94382889e+10, 8.50523325e+07,
        4.66156570e+07, 1.27770438e+06],
       [2.42617000e+04, 4.89421260e+07, 5.52363226e+08, 2.86466900e+06,
        1.27770438e+06, 4.20662100e+04]])

In [50]:
XT_Y = np.matmul(X.T, Y)
XT_Y

array([2.55228001e+08, 5.15008505e+11, 4.35208077e+12, 3.23790954e+10,
       1.29227042e+10, 4.39751701e+08])

In [51]:
W = np.matmul(np.linalg.inv(XT_X), XT_Y)
W

array([-2.91490872e+06,  1.44873859e+03, -8.40294434e-02,  4.41568545e+00,
       -7.74879714e+01,  9.24299894e+03])

In [54]:
# Verification of this Weights can be done by Ordinary Least Squares (OLS)

import statsmodels.api as sm

regressor = sm.OLS(Y, X).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.756
Model:                            OLS   Adj. R-squared:                  0.756
Method:                 Least Squares   F-statistic:                     9392.
Date:                Sat, 23 Mar 2024   Prob (F-statistic):               0.00
Time:                        15:53:56   Log-Likelihood:            -1.4656e+05
No. Observations:               15157   AIC:                         2.931e+05
Df Residuals:                   15151   BIC:                         2.932e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.915e+06   4.83e+04    -60.354      0.0