# Student information

Name: Hoang Tuan Tu

ID: 21000709

In [None]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [None]:
# Load the raw text file, skipping the header line
with open("vidu3_lin_reg.txt") as f:
    raw_lines = f.readlines()[1:]

# Parse every whitespace-separated line into floats and stack the rows
# into a 2-D numpy array
data = np.array(
    [[float(token) for token in line.strip().split()] for line in raw_lines]
)

# Drop the first column, which holds the row index
data = data[:, 1:]

print(data)

In [None]:
# Split the table into the input features and the label.
# The label is the last column and is kept 2-D (one column).
n_columns = len(data[0])
label = data[:, n_columns - 1:]
data = data[:, :n_columns - 1]

print(data[:5])

In [None]:
# Split into train and test sets:
# the first 80 rows are used for training, the remaining rows for testing
X_train, X_test = data[:80], data[80:]
Y_train, Y_test = label[:80], label[80:]

In [None]:
# Preview of the training set: first five feature rows, then first five labels
print(X_train[:5], Y_train[:5], sep="\n")

In [None]:
# Preview of the test set: first five feature rows, then first five labels
print(X_test[:5], Y_test[:5], sep="\n")

In [None]:
# Define function
def qr_householder(A):
    """Compute a full QR decomposition of a real matrix by Householder reflections.

    Parameters
    ----------
    A : array_like, shape (M, N)
        Real matrix to factorise. It is not modified; the work is done on a
        floating-point copy, so integer input is accepted.

    Returns
    -------
    Q : ndarray, shape (M, M)
        Orthogonal matrix.
    R : ndarray, shape (M, N)
        Upper-triangular (upper-trapezoidal when M != N) matrix such that
        ``Q @ R`` reproduces ``A`` up to rounding error.
    """
    # Float copy: in-place updates on an integer array would silently truncate.
    R = np.array(A, dtype=float)
    M, N = R.shape

    # Accumulates the product of the reflections H_k ... H_1, so that
    # (accumulated Q) @ A == R; its transpose is the Q factor.
    Q = np.identity(M)

    # Only min(M, N) columns have a sub-diagonal part that can be eliminated.
    for n in range(min(M, N)):
        # The vector to reflect must come from the partially reduced R, not
        # from the original A; otherwise later columns are not zeroed.
        x = R[n:, n]
        norm_x = np.linalg.norm(x)
        if norm_x == 0.0:
            # Column is already zero on and below the diagonal.
            continue

        # ro = -sign(x0) * ||x||; copysign treats a zero x0 as positive so the
        # reflector stays well defined (np.sign(0) would give ro = 0).
        ro = -np.copysign(norm_x, x[0])

        # Householder vector v = x - ro * e1 (copy: x is a view into R).
        v = x.copy()
        v[0] -= ro
        scale = 2.0 / (v @ v)

        # Apply H = I - scale * v v^T to the trailing rows of R and Q as a
        # rank-1 update instead of forming the outer product per column.
        R[n:, :] -= scale * np.outer(v, v @ R[n:, :])
        Q[n:, :] -= scale * np.outer(v, v @ Q[n:, :])

    return Q.T, R

def linear_regression(x_data, y_data):
    """Fit least-squares weights using the QR factors of the design matrix.

    A column of ones is prepended to ``x_data``, so the first entry of the
    returned weights is the intercept and the rest are the coefficients.
    The weights are computed as ``pinv(R) @ Q.T @ y_data``.
    """
    # Design matrix: bias column followed by the features
    bias_column = np.ones((x_data.shape[0], 1))
    design = np.concatenate((bias_column, x_data), axis=1)

    # QR decomposition of the design matrix
    q_factor, r_factor = qr_householder(design)

    # Pseudo-inverse of R, then apply the formula w = R^+ Q^T y
    r_pinv = np.linalg.pinv(r_factor)
    return r_pinv.dot(q_factor.T).dot(y_data)

In [None]:
# Fit on the training set. The weights come back as one column, so the
# transpose turned into a list is a single row: [intercept, coef_1, ...]
w = linear_regression(X_train, Y_train).T.tolist()
intercept, *coef = w[0]
print('Intercept:', intercept)
print("Coefficient: ", coef)

In [None]:
# Predict with test data.
# y = intercept + X @ coef, evaluated for every test row in one
# matrix-vector product instead of nested Python loops.
x = np.array(X_test)

y_pred = intercept + x @ np.asarray(coef)

print(y_pred)

In [None]:
# scikit-learn solution, fitted on the same training set for comparison
model = LinearRegression().fit(X_train, Y_train)

# Coefficients first, then the intercept
print(model.coef_[0], model.intercept_[0], sep="\n")

In [None]:
# Predict on the test set with the scikit-learn model;
# print the single output column as a flat vector
pred = model.predict(X_test)
print(pred[:, 0])

In [None]:
# Side-by-side table: own predictions, scikit-learn predictions, true labels
comparison = {
    "My Solution": y_pred,
    "Sklearn Solution": pred[:, 0],
    "Label": Y_test[:, 0],
}
df = pd.DataFrame(comparison)
print(df)

In [None]:
# Calculate the Mean Squared Error on the test set.
# Y_test is a column of shape (n, 1) while y_pred is flat with shape (n,);
# subtracting them directly broadcasts to an (n, n) matrix, so the labels are
# flattened first to get one residual per sample.
residuals = np.ravel(Y_test) - y_pred
mse = np.mean(residuals ** 2)
print(mse)

In [None]:
# Calculate the variance of the error term on the test set.
# The labels are flattened so the subtraction yields one residual per sample
# rather than an (n, n) broadcast matrix, and the residuals are centred on
# their own mean (not on the MSE, which is a mean of squares).
# Divides by n, like the original formula (population variance).
residuals = np.ravel(Y_test) - y_pred
var = np.mean((residuals - residuals.mean()) ** 2)
print(var)