# Linear Regression

In [418]:
# Libraries
import pandas as pd
import numpy as np
import bokeh
from bokeh.plotting import figure, show, output_notebook
output_notebook()
from pandas.api.types import is_numeric_dtype

In [419]:
# Read the semicolon-separated student-mat CSV into a DataFrame
dataset = pd.read_csv("../dataset/student-mat.csv", sep=";")

In [420]:
# Select the target column and the feature columns
target = "G3"
skiped_features = []

# Start from every column except the target; np.setdiff1d then removes the
# skipped names and returns the remaining ones sorted and unique.
features = dataset.columns.tolist()
features.remove(target)
features = np.setdiff1d(features, skiped_features)

In [421]:
# Convert nominal features and/or target to numeric values

# This function takes a dataset as its parameter and, for every nominal feature,
# builds a dictionary that maps each original nominal value of the feature to a
# new numeric value. Features that are already numeric are skipped. The function
# returns a dictionary whose keys are the nominal features of the input dataset and
# whose values are the nested dictionaries of value mappings described above.
def nom_to_num(dataset):
    """Build value-to-integer mappings for the non-numeric columns of *dataset*.

    Args:
        dataset: pandas DataFrame to inspect. It is not modified.

    Returns:
        dict mapping each non-numeric column name to a nested dict
        ``{original value: integer code}``. Codes are assigned in the sorted
        order returned by ``np.unique``, starting at 0. Numeric columns are
        not included.
    """
    mappings = {}
    for feature in dataset.columns:
        column = dataset[feature]
        # Decide from the column dtype rather than from the row labelled 0,
        # so frames with a non-default index (or no rows) are handled too.
        if is_numeric_dtype(column):
            continue
        nominals = np.unique(column)
        mappings[feature] = dict(zip(nominals, np.arange(len(nominals))))
    return mappings

# The function below converts the given dataset into one that contains only numerical values
def to_numeric(dataset):
    """Return a copy of *dataset* with nominal columns replaced by integer codes.

    The codes come from ``nom_to_num``; columns it does not report are left
    unchanged. The input frame itself is not modified.
    """
    converted = dataset.copy()
    for feature, value_codes in nom_to_num(converted).items():
        # Every value of the column is a key of value_codes, because the
        # mapping was built from this very column.
        converted[feature] = converted[feature].map(value_codes)
    return converted

# Rebind dataset to the copy in which nominal columns hold integer codes
dataset = to_numeric(dataset)

In [422]:
def standardize(dataset):
    """Return a mean-normalized copy of *dataset*.

    Each column is transformed to ``(value - mean) / (max - min)``.
    A constant column has a range of zero; its range is treated as 1 so the
    column becomes all zeros instead of NaN (0/0).

    Args:
        dataset: pandas DataFrame with numeric columns. It is not modified.

    Returns:
        A new DataFrame with the same index and columns.
    """
    value_range = dataset.max() - dataset.min()
    # Guard against division by zero for constant columns.
    value_range = value_range.where(value_range != 0, 1)
    return (dataset - dataset.mean()) / value_range

# Rescale every column, the target G3 included, since the whole frame is passed
dataset = standardize(dataset)

In [423]:
# # Add square features
# X = np.concatenate((X,X**2), axis=1)

In [424]:
# Prepare datasets: feature matrix X and target vector Y as NumPy arrays
X = dataset[features].values
Y = dataset[target].values

# Rows are split by position (no shuffling):
# first 60% training, next 20% cross-validation, last 20% test.
dataset_length = len(X)
cv_start_index = int(0.6*dataset_length)
test_start_index = int(0.8*dataset_length)
split_points = [cv_start_index, test_start_index]

X_training, X_cv, X_test = np.split(X, split_points)
Y_training, Y_cv, Y_test = np.split(Y, split_points)

In [425]:
# Hypothesis function
def h(THETA, X):
    """Linear hypothesis: return the predictions ``X . THETA``.

    X holds one sample per row; THETA holds one weight per column of X.
    """
    predictions = X.dot(THETA)
    return predictions

In [426]:
# Cost function
def J(THETA, X, Y, LAMBDA):
    """Regularized squared-error cost of the parameters THETA on (X, Y).

    Computes ``(sum of squared residuals + LAMBDA * sum(THETA**2)) / (2 * len(X))``.
    Every component of THETA is included in the penalty term.
    """
    residuals = h(THETA, X) - Y
    squared_error = np.sum(residuals**2)
    penalty = LAMBDA * np.sum(THETA**2)
    return 1/(2*len(X)) * (squared_error + penalty)

# Derivative of the cost function with respect to the i-th THETA parameter
def dJ(THETA, X, Y, LAMBDA, i):
    """Partial derivative of the cost J with respect to ``THETA[i]``.

    Computes ``(sum(residuals * X[:, i]) + LAMBDA * THETA[i]) / len(X)``,
    where the residuals are ``h(THETA, X) - Y``.
    """
    residuals = h(THETA, X) - Y
    data_term = np.sum(residuals*X[:,i])
    penalty_term = LAMBDA * THETA[i]
    return 1/(len(X)) * (data_term + penalty_term)

In [427]:
# Gradient Descent
def gradient_descent(X, Y, a, LAMBDA):
    """Minimize the regularized cost J by batch gradient descent.

    Starts from an all-zero THETA and repeatedly applies a simultaneous
    update of every parameter. A step is accepted only if it strictly lowers
    the cost; the first step that does not ends the search.

    Args:
        X: 2-D array with one sample per row.
        Y: 1-D array of targets, one per row of X.
        a: learning rate.
        LAMBDA: regularization parameter passed to J and dJ.

    Returns:
        Tuple ``(THETA, costs)``: the last accepted parameters and the list
        of costs recorded once per iteration, whose final entry is the cost
        of the returned THETA.
    """
    costs = []
    THETA = np.zeros(X.shape[1])
    current_cost = J(THETA, X, Y, LAMBDA)
    while True:
        costs.append(current_cost)
        # Evaluate all partial derivatives at the current THETA so the update
        # is simultaneous.
        gradient = np.array([dJ(THETA, X, Y, LAMBDA, i) for i in range(len(THETA))])
        NEW_THETA = THETA - a*gradient
        new_cost = J(NEW_THETA, X, Y, LAMBDA)
        # Require a strict decrease: accepting an equal cost would loop
        # forever once the cost stops changing (e.g. a zero gradient).
        if not new_cost < current_cost:
            return THETA, costs
        THETA, current_cost = NEW_THETA, new_cost

In [443]:
%%time
a = 0.01 # Learning rate
LAMBDA = 10 # Regularization parameter
THETA, costs = gradient_descent(X_training, Y_training, a, LAMBDA)
# Show the learned parameters (one weight per feature column)
print(THETA)

[-0.00442697  0.00999092  0.0088568   0.06706327  0.08015343  0.0195475
  0.01516268 -0.00587735 -0.00655391 -0.00087929  0.01673184  0.01689368
 -0.01141962 -0.04139694  0.00767325  0.00940427 -0.01629806  0.004866
 -0.02073699 -0.00398711 -0.01420926  0.01540552  0.01027979  0.00588808
  0.01351926  0.01339571 -0.02283363 -0.00136894 -0.01516256  0.01651471
  0.00357814 -0.01254951]
CPU times: user 15.1 s, sys: 63.7 ms, total: 15.2 s
Wall time: 15.2 s


In [444]:
# Plot the cost recorded at each iteration, with a red reference line at the
# final cost.
p = figure(title = "Cost function value against iterations",
           x_axis_type="log", x_axis_label="Iterations",
           y_axis_label="Cost function value",
          width=500, height=500)
# Number iterations from 1: the x axis is logarithmic, so 0 has no position on it.
iterations = list(range(1, len(costs) + 1))
p.line(iterations, costs)
# Pass the final cost as a sequence matching `iterations` so the reference
# line has an explicit y value for every x value.
p.line(iterations, [costs[-1]] * len(costs), line_color="red")
show(p)

In [445]:
# Cost on the cross-validation split.
# NOTE(review): LAMBDA is passed through, so this value includes the
# regularization penalty and is not the pure prediction error — confirm intent.
J(THETA, X_cv, Y_cv, LAMBDA)

0.022409150796286356

In [446]:
# Cost on the test split.
# NOTE(review): LAMBDA is passed through, so this value includes the
# regularization penalty and is not the pure prediction error — confirm intent.
J(THETA, X_test, Y_test, LAMBDA)

0.026081044651253833