# Linear Regression

In [30]:
# Libraries
import pandas as pd
import numpy as np
import bokeh
from bokeh.plotting import figure, show, output_notebook
output_notebook()
from bokeh.layouts import row
from pandas.api.types import is_numeric_dtype

In [31]:
# Load data
dataset = pd.read_csv("../dataset/student-mat.csv", delimiter=";")

In [32]:
# Choose features and target
target = "G3"
skiped_features = []

features = list(dataset.columns)
features.remove(target)
features = np.setdiff1d(features, skiped_features)

In [33]:
# Convert nominal features and/or target to numeric values

# This function get a dataset as parameter and for every nominal feature makes
# a dictionary which maps every original nominal value of the feature to the 
# new numeric value. Original numeric features are skiped. The function returns
# dictionary where keys are nominal features from the input dataset and values are nested dictionares
# of feature values mappings as described above.
def nom_to_num(dataset):
    raw_dictionary = []
    for feature in dataset.columns:
        if not is_numeric_dtype(type(dataset.loc[0,feature])):
            nominals = np.unique(dataset.loc[:,feature])
            enumeration = np.arange(len(nominals))
            mapping = dict(list(zip(nominals, enumeration)))
            raw_dictionary.append((feature, mapping))
    return dict(raw_dictionary)

# Below function converts given dataset to a dataset with only numerical values
def to_numeric(dataset):
    dataset = dataset.copy()
    mapping = nom_to_num(dataset)
    features = list(mapping.keys())
    dataset[features] = dataset[features].apply(lambda col: col.map(lambda val: mapping[col.name][val]))
    return dataset

dataset = to_numeric(dataset)

In [34]:
def standardize(dataset):
    dataset = dataset.copy()
    dataset = (dataset - dataset.apply(np.mean))/(dataset.apply(np.max) - dataset.apply(np.min))
    return dataset

dataset = standardize(dataset)

In [35]:
# # Add square features
# X = np.concatenate((X,X**2), axis=1)

In [36]:
# Prepare datasets
X = dataset.loc[:,features].values
Y = dataset.loc[:,target].values

dataset_length = len(X)
cv_start_index = int(0.6*dataset_length)
test_start_index = int(0.8*dataset_length)

X_training = X[:cv_start_index]
Y_training = Y[:cv_start_index]
X_cv = X[cv_start_index:test_start_index]
Y_cv = Y[cv_start_index:test_start_index]
X_test = X[test_start_index:]
Y_test = Y[test_start_index:]

In [37]:
features

array(['Dalc', 'Fedu', 'Fjob', 'G1', 'G2', 'Medu', 'Mjob', 'Pstatus',
       'Walc', 'absences', 'activities', 'address', 'age', 'failures',
       'famrel', 'famsize', 'famsup', 'freetime', 'goout', 'guardian',
       'health', 'higher', 'internet', 'nursery', 'paid', 'reason',
       'romantic', 'school', 'schoolsup', 'sex', 'studytime',
       'traveltime'], dtype='<U10')

In [38]:
# Hypothesis function
def h(THETA, X):
    return X.dot(THETA)

In [39]:
# Cost function
def J(THETA, X, Y, LAMBDA):
    return 1/(2*len(X)) * (np.sum((h(THETA, X)-Y)**2) + LAMBDA * np.sum(THETA**2))

# Derrivative of cost function respects to i-th THETA parameter
def dJ(THETA, X, Y, LAMBDA, i):
    return 1/(len(X)) * (np.sum((h(THETA, X)-Y)*X[:,i]) + LAMBDA * THETA[i])

In [40]:
# Gradient Descent
def gradient_descent(X, Y, a, LAMBDA):
    costs = []
    THETA = np.zeros(X.shape[1])
    while True:
        NEW_THETA = np.copy(THETA)
        for i in range(len(THETA)):
            NEW_THETA[i] = THETA[i] - a*dJ(THETA, X, Y, LAMBDA, i)
        costs.append(J(THETA, X, Y, LAMBDA))
        if J(NEW_THETA, X, Y, LAMBDA) <= J(THETA, X, Y, LAMBDA):
            THETA = NEW_THETA
        else:
            return THETA, costs

In [49]:
%%time
a = 0.1 # Learning rate
LAMBDA = 10 # Regularization parameter
THETA, costs = gradient_descent(X_training, Y_training, a, LAMBDA)
print(THETA)

[-0.01373747 -0.01163843  0.02116659  0.23587211  0.34326232  0.02504813
  0.01378351 -0.02204264  0.01734251  0.00425708  0.02040056  0.02447455
 -0.02403447 -0.08367963  0.03270197  0.01034443 -0.01660961  0.01555428
 -0.0547056  -0.00676678 -0.02794757  0.03118802  0.00292406 -0.00098843
  0.02252796  0.01482841 -0.03185651 -0.01421146 -0.01417877  0.01269531
 -0.00072033 -0.02348018]
CPU times: user 3.12 s, sys: 3.83 ms, total: 3.13 s
Wall time: 3.16 s


In [50]:
p = figure(title = "Cost function value against iterations",
           x_axis_type="log", x_axis_label="Iterations",
           y_axis_label="Cost function value",
          width=500, height=500)
iterations = list(range(0,len(costs)))
p.line(iterations, costs)
p.line(iterations, costs[-1], line_color="red")

def accuracy(THETA, X, Y, LAMBDA):
    J_MAX = 2.0
    return J(THETA, X, Y, LAMBDA)/J_MAX

from bokeh.models import LinearAxis, Range1d
    
J(THETA, X_cv, Y_cv, LAMBDA)
J(THETA, X_test, Y_test, LAMBDA)

accuracies = np.array(costs)/2.0
p1 = figure(title = "Cost function percentage against iterations",
           x_axis_type="log", x_axis_label="Iterations",
           y_axis_label="Cost function percentage",
          width=500, height=500)
iterations = list(range(0,len(costs)))
p1.yaxis.formatter = NumeralTickFormatter(format='0.f %')
p1.line(iterations, accuracies)
p1.line(iterations, accuracies[-1], line_color="red")

show(row(p,p1))

In [51]:
# Importance of the features
absolute_theta = np.absolute(THETA)
normalized_theta = absolute_theta/np.sum(absolute_theta)
features_importance = pd.DataFrame(data=[normalized_theta], columns=features)
sorted_features_importance = features_importance.sort_values(by=0,axis=1)
sorted_importance = sorted_features_importance.loc[0].values
sorted_features = sorted_features_importance.columns.values

from bokeh.models import NumeralTickFormatter


p1 = figure(title = "Importance of features",
            x_axis_label="Features", y_axis_label="Importance",
            x_range=sorted_features,
          width=800, height=500)
p1.yaxis.formatter = NumeralTickFormatter(format='0 %')
p1.xaxis.major_label_orientation = "vertical"
p1.vbar(x=sorted_features, top=sorted_importance, width=1,
       fill_color='orange', line_color='black')
show(p1)

# Weights of the features
features_weights = pd.DataFrame(data=[THETA], columns=features)
sorted_features_weights = features_weights.sort_values(by=0,axis=1)
sorted_weights = sorted_features_weights.loc[0].values
sorted_features = sorted_features_weights.columns.values

p2 = figure(title = "Features weights",
            x_axis_label="Features", y_axis_label="Weights",
            x_range=sorted_features,
          width=800, height=500)
p2.xaxis.major_label_orientation = "vertical"
p2.vbar(x=sorted_features, top=sorted_weights, width=1,
       fill_color='blue', line_color='black')
show(p2)