In [1]:
import numpy as np
import csv
import random
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Input file names: a CSV variant and the plain-text variant of the auto-mpg data.
# The notebook's read_csv call loads `txtfile`.
filename, txtfile = 'auto-mpg.csv', 'auto-mpg.txt'

In [3]:
# Column labels applied to the header-less data file, in file order.
# NOTE(review): 'hoursepower' looks like a misspelling of 'horsepower', but other
# cells access the column under this exact name, so it is kept as-is here.
col_names = [
    'mpg',
    'cylinders',
    'displacement',
    'hoursepower',
    'weight',
    'acceleration',
    'model_year',
    'origin',
    'car_name',
]

In [4]:
# Load the header-less, whitespace-delimited data file; '?' entries are read as NaN.
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated since pandas 2.2.
pandas_data = pd.read_csv(txtfile, header=None, sep=r'\s+', names=col_names, na_values=['?'])

# Column means used for imputation (Series.mean skips NaN by default).
hp_mean = pandas_data.hoursepower.mean()
weight_mean = pandas_data.weight.mean()

# Replace the missing values in hp and weight with the mean of those columns.
# NOTE(review): the means are computed over the full data set, i.e. before the
# train/test split, so test rows influence the imputed values.
values = {'hoursepower': hp_mean, 'weight': weight_mean}
pandas_data = pandas_data.fillna(value=values)

In [5]:
# Fixed seed so the train/test split (and every number derived from it) is the
# same on each Restart & Run All; change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
# One uniform draw in [0, 1) per row of the dataset; the mask is True where the
# draw is below 0.8, giving an approximate (not exact) 80% / 20% split.
msk = split_rng.random(len(pandas_data)) < 0.8
# Rows where the mask is True form the train set.
train = pandas_data[msk]
# The remaining rows form the test set.
test = pandas_data[~msk]

In [7]:
#X and Y are the two given data arrays
def norm_function(X, Y):
    """Fit the line ``Y ~= b0 + b1 * X`` with the normal equation.

    Solves ``(H^T H) B = H^T z`` where ``H`` is the design matrix whose rows are
    ``[1, X[i]]`` and ``z`` holds the targets.

    Parameters
    ----------
    X : array-like, one-dimensional
        Predictor values. Lists, NumPy arrays and pandas Series are accepted;
        values are taken positionally, so a Series index is ignored.
    Y : array-like
        Target values; the first axis must have the same length as ``X``.

    Returns
    -------
    numpy.ndarray
        The weights ``[b0, b1]`` (intercept, slope).

    Raises
    ------
    ValueError
        If ``X`` is not one-dimensional, if the inputs are empty, or if ``X``
        and ``Y`` differ in length.
    numpy.linalg.LinAlgError
        If ``H^T H`` is singular (e.g. a single sample or all ``X`` equal).
    """
    # Coerce to plain float arrays so indexing is positional regardless of
    # the container type passed in.
    x = np.asarray(X, dtype=float)
    z = np.asarray(Y, dtype=float)

    if x.ndim != 1:
        raise ValueError("X must be one-dimensional")
    if x.shape[0] == 0:
        raise ValueError("X and Y must not be empty")
    if z.ndim == 0 or z.shape[0] != x.shape[0]:
        raise ValueError("X and Y must have the same length")

    # Design matrix: a column of ones (intercept term) next to the predictor.
    H = np.column_stack((np.ones_like(x), x))

    # normal equation: B = (H^T H)^(-1) H^T z
    hTh = H.T.dot(H)
    hTz = H.T.dot(z)
    # Solve the linear system instead of forming the explicit inverse.
    b = np.linalg.solve(hTh, hTz)

    # return the calculated weights b0 and b1
    return b

In [8]:
# Fit weight as a linear function of horsepower on the training rows:
# predictor = hoursepower column, target = weight column.
hp_train = np.array(train.hoursepower.values)
weight_train = np.array(train.weight.values)
weights = norm_function(hp_train, weight_train)
# weights holds [b0, b1] as returned by norm_function.
print(weights)

[994.31468006  18.97227421]
