In [1]:
import numpy as np
import random
import csv

# Feel free to import other packages if needed,
# as long as they are supported by the CSL machines.

In [2]:
def get_dataset(filename):
    """
    Load a CSV file into a 2-D float array.

    The first row (the header) and the first field of every remaining row
    are discarded; every other field is converted with ``float``. Empty
    lines in the file are skipped.

    Args:
        filename: path of a UTF-8 encoded CSV file.

    Returns:
        numpy.ndarray of shape (n_rows, n_fields - 1). A file without any
        data rows yields an empty array of shape (0, 0).

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a retained field is not a valid float, or the rows
            do not all have the same number of fields.
    """
    # The context manager closes the file even when parsing raises, and
    # newline='' is the mode the csv module documents for reader input.
    with open(filename, encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # drop the header row (no-op for an empty file)
        # csv.reader yields [] for blank lines; skip those instead of crashing.
        rows = [[float(field) for field in record[1:]]
                for record in reader if record]
    n_cols = len(rows[0]) if rows else 0
    return np.array(rows, dtype=float).reshape(len(rows), n_cols)

In [3]:
def print_stats(dataset, col):
    """
    Print summary statistics for one column of `dataset`.

    Three lines are written to stdout: the number of rows, the column mean,
    and the sample standard deviation (n - 1 in the denominator); the last
    two are formatted with two decimals.

    Args:
        dataset: sequence of rows, each indexable by `col`.
        col: index of the column to summarise.

    Returns:
        None
    """
    column = [row[col] for row in dataset]
    count = len(dataset)

    # Plain left-to-right accumulation keeps the floating-point result stable.
    total = 0
    for value in column:
        total += value
    mean = total / count

    squared_error = 0
    for value in column:
        squared_error += (value - mean)**2
    std_dev = (squared_error / (count - 1))**0.5

    print('{}\n{:.2f}\n{:.2f}'.format(count, mean, std_dev))

In [4]:
def regression(dataset, cols, betas):
    """
    Mean squared error of a linear model evaluated on `dataset`.

    The model predicts column 0 of each row as
    ``betas[0] + sum(betas[j + 1] * row[cols[j]])``.

    Args:
        dataset: 2-D array-like of numbers; column 0 holds the target.
        cols: sequence of column indices used as features.
        betas: intercept followed by one coefficient per entry of `cols`.
            Any array-like with ``len(cols) + 1`` elements is accepted,
            including a column vector of shape (k, 1).

    Returns:
        float: the mean of the squared residuals over all rows.

    Raises:
        ValueError: if `dataset` has no rows, or `betas` does not have
            ``len(cols) + 1`` elements.
    """
    data = np.asarray(dataset, dtype=float)
    # Flatten so the products below are 1-D; this avoids calling float() on
    # a one-element array, which NumPy deprecated in 1.25.
    weights = np.asarray(betas, dtype=float).reshape(-1)
    if data.shape[0] == 0:
        raise ValueError('dataset must contain at least one row')

    predictions = weights[0] + data[:, list(cols)] @ weights[1:]
    residuals = predictions - data[:, 0]
    return float(np.mean(residuals ** 2))

In [5]:
def gradient_descent(dataset, cols, betas):
    """
    Gradient of the mean squared error with respect to each beta.

    For the model ``betas[0] + sum(betas[j + 1] * row[cols[j]])`` predicting
    column 0, component ``b`` of the result is
    ``2 / n * sum(residual_i * x_ib)`` where ``x_i0`` is 1 (the intercept).

    Args:
        dataset: 2-D array-like of numbers; column 0 holds the target.
        cols: sequence of column indices used as features.
        betas: intercept followed by one coefficient per entry of `cols`.

    Returns:
        numpy.ndarray: 1-D float array with one partial derivative per beta.

    Raises:
        ValueError: if `dataset` has no rows, or `betas` does not have
            ``len(cols) + 1`` elements.
    """
    data = np.asarray(dataset, dtype=float)
    weights = np.asarray(betas, dtype=float).reshape(-1)
    n_rows = data.shape[0]
    if n_rows == 0:
        raise ValueError('dataset must contain at least one row')

    # Design matrix: a leading column of ones for the intercept term.
    design = np.column_stack((np.ones(n_rows), data[:, list(cols)]))
    # Residuals are shared by every partial derivative, so compute them once
    # instead of once per beta.
    residuals = design @ weights - data[:, 0]
    return 2.0 * (design.T @ residuals) / n_rows

In [6]:
def iterate_gradient(dataset, cols, betas, T, eta):
    """
    Run T steps of batch gradient descent and print a trace.

    After every step one line is printed: the step number, the current MSE
    and every beta, each number with two decimals and followed by a space.
    `betas` is updated in place, element by element.

    Args:
        dataset: rows of numbers; column 0 holds the target.
        cols: column indices used as features.
        betas: mutable sequence of starting coefficients (intercept first).
        T: number of iterations to perform.
        eta: learning rate.

    Returns:
        None
    """
    for step in range(1, T + 1):
        slopes = gradient_descent(dataset, cols, betas)
        for k in range(len(betas)):
            betas[k] = betas[k] - eta * slopes[k]

        current_mse = regression(dataset, cols, betas)
        fields = ['{} {:.2f}'.format(step, current_mse)]
        fields.extend('{:.2f}'.format(coef) for coef in betas)
        # Each field is followed by a single space, then the line ends.
        print(' '.join(fields), end=' \n')

In [7]:
def compute_betas(dataset, cols):
    """
    Closed-form least-squares fit of column 0 on the columns in `cols`.

    Solves the normal equations ``(X^T X)^-1 X^T y`` where X has a leading
    column of ones for the intercept.

    Args:
        dataset: rows of numbers; column 0 holds the target.
        cols: column indices used as features.

    Returns:
        tuple: ``(mse, beta_0, beta_1, ..., beta_m)`` where `mse` is the
        value returned by `regression` for the fitted betas.
    """
    # One design row per data row: the constant 1 followed by the features.
    design = np.array([[1] + [row[c] for c in cols] for row in dataset])
    target = np.array([row[0] for row in dataset]).reshape(len(dataset), 1)

    design_t = np.transpose(design)
    gram_inverse = np.linalg.inv(np.dot(design_t, design))
    solution = np.dot(np.dot(gram_inverse, design_t), target)

    # regression receives the (k, 1) column vector, before flattening.
    mse = regression(dataset, cols, solution)
    return (mse, *solution.flatten())


In [8]:
def predict(dataset, cols, features):
    """
    Predict the target for one feature vector using the closed-form fit.

    Args:
        dataset: rows of numbers; column 0 holds the target.
        cols: column indices used as features when fitting.
        features: one value per entry of `cols`, in the same order.
            The argument is not modified.

    Returns:
        float: ``beta_0 + sum(beta_j * features[j - 1])``.

    Raises:
        ValueError: if `features` does not have ``len(cols)`` elements.
    """
    # compute_betas returns (mse, beta_0, ..., beta_m); drop the leading MSE.
    weights = np.asarray(compute_betas(dataset, cols)[1:], dtype=float)
    # Build a new vector with the intercept term instead of inserting into
    # the caller's list, so repeated calls with the same `features` work
    # and array inputs are accepted too.
    inputs = np.concatenate(([1.0], np.asarray(features, dtype=float).reshape(-1)))
    # Both operands are 1-D, so the dot product is already a scalar.
    return float(np.dot(inputs, weights))

In [9]:
def random_index_generator(min_val, max_val, seed=42):
    """
    DO NOT MODIFY THIS FUNCTION.
    DO NOT CHANGE THE SEED.
    This generator picks a random value between min_val and max_val,
    seeded by 42.
    """
    # Generator bodies run lazily: the module-level `random` state is reseeded
    # on the first next() call, not when the generator object is created.
    random.seed(seed)
    while True:
        # randrange excludes the upper bound: values lie in [min_val, max_val).
        yield random.randrange(min_val, max_val)


In [10]:
def sgd(dataset, cols, betas, T, eta):
    """
    Run T steps of stochastic gradient descent and print a trace.

    Each step takes one row chosen by `random_index_generator` (seed 42),
    moves every beta against that row's squared-error gradient, and prints
    the step number, the MSE over the whole dataset and every beta, each
    number with two decimals and followed by a space. `betas` is updated in
    place, element by element.

    Args:
        dataset: rows of numbers; column 0 holds the target.
        cols: column indices used as features.
        betas: mutable sequence of starting coefficients (intercept first).
        T: number of iterations to perform.
        eta: learning rate.

    Returns:
        None
    """
    picker = random_index_generator(0, len(dataset), seed=42)
    for step in range(1, T + 1):
        sample = dataset[next(picker)]
        inputs = np.array([1] + [sample[c] for c in cols])

        # The residual is the same for every component of the gradient.
        error = np.dot(inputs, betas) - sample[0]
        slopes = [error * 2 * inputs[k] for k in range(len(betas))]
        for k in range(len(betas)):
            betas[k] = betas[k] - eta * slopes[k]

        current_mse = regression(dataset, cols, betas)
        fields = ['{} {:.2f}'.format(step, current_mse)]
        fields.extend('{:.2f}'.format(coef) for coef in betas)
        # Each field is followed by a single space, then the line ends.
        print(' '.join(fields), end=' \n')