# Multi-variate Linear Regression on Boston Dataset

In [None]:
# This code implements multivariate linear regression on the Boston dataset from scratch

# ===================================Importing libraries====================================
import numpy as np                              #for handling arrays and matrices
import pandas as pd                             #for working with datasets and dataframes
from matplotlib import pyplot as plt            #for visualisation purposes
from sklearn.datasets import load_boston        #importing the boston dataset from scikit-learn  

from sklearn.linear_model import LinearRegression  #to compare our results with standard library results
from sklearn.model_selection import train_test_split #to split the dataset into test and train 

In [None]:
# loading dataset from sklearn
# NOTE(review): the `?` tokens throughout this notebook look like deliberate
# fill-in blanks; a cell is not valid Python until its blanks are replaced.
# NOTE(review): `load_boston` (imported at the top) was removed in scikit-learn 1.2,
# so this notebook needs an older scikit-learn release to run.
dataset = ?()         #loading the contents of the dictionary-like dataset object into 'dataset'
print(?.?())           #displaying the keys of the dictionary dataset

#=====================================DATASET INFORMATION===================================
# Keys of the dataset referred to in this notebook:
#data          - input data (one row per sample)
#target        - result of each sample, stored in the target variable
#feature_names - attribute names/column names of the input data
#DESCR         - description of the dataset

print(?['feature_names'])     #printing the names of the features

print(?.?.?)           #printing the shape (dimensional size) of the input data
print(?.?.?)         #printing the size of target variable
print(dataset.?)                #describing the dataset

In [None]:
# storing features in x and target variable in y
x = dataset.?
y = dataset.?

# presumably turns y into a column of shape (n_samples, 1): the arguments passed
# are (y.shape[0], 1); the method name itself is left blank
y = y.?(y.shape[0], 1)

# converting into a dataframe
df = pd.?(x)
# label the columns from an attribute of the dataset (blank; presumably the feature names)
df.columns = dataset.?
# preview the first rows of the frame
df.head()

In [None]:
# Append the target as a 'PRICE' column so features and target share one frame.
df['PRICE'] = y

# ============ correlation of every column with the target variable ===============
# np.corrcoef treats each ROW as one variable, hence the transpose. PRICE is the
# last column of df, so the last row of the correlation matrix holds the
# correlation of each column with PRICE (the final entry is PRICE with itself).
print(np.corrcoef(df.T)[-1])

df.head()

In [None]:
# show the column labels of df (the feature columns plus the 'PRICE' column added earlier)
df.columns

In [None]:
# ============================ normalising the dataset=========================
# Standardise every column of df (features and PRICE alike): subtract the column
# mean, then divide by the column standard deviation.
# NOTE(review): the design matrix X built further down is assembled from the raw
# `x` array, and the split uses the raw `y`, so the normalised values in df are
# not what the model is trained on.
df = df.sub(df.mean()).div(df.std())
df.head()

In [None]:
#=============================== MULTI-VARIATE LINEAR REGRESSION ========================

# ==================== INITIALISING PARAMETERS AND HYPER-PARAMETERS ====================
# Quantities used by the gradient-descent fit below:
#  1. theta - the weights of the model (learned parameters, not a hyper-parameter)
#  2. epoch - hyper-parameter: number of times the update is run to minimise the loss
#  3. alpha - hyper-parameter: the learning rate


# initialising theta with zeros 
theta = np.?((1, x.shape[1] + 1)) #its dimensions are (1,14) because of the presence of a bias term (intercept)
print(theta.?)
print(theta)

#Both epoch and alpha can be changed and tested on different numbers 
# to minimise loss at a different rate(Advisable)
epoch = ?
alpha = ? 

# creating bias vector x0: one entry per row of x, requested shape (n_samples, 1)
x0 = np.?((x.shape[0], 1))

# forming input variable: x0 and x are combined along axis 1 (columns), with x0 first,
# so X has one more column than x, matching the width of theta
X = np.?((x0, x), axis = 1)
print(X.shape)

In [None]:
# ======================splitting the data into train and test===========================

def train_test_splitt(dataset, ratio = 0.7):
    """Split a 2-D array into train/test features and targets, keeping row order.

    The last column of ``dataset`` is treated as the target and every other
    column as a feature. Rows are not shuffled: the first part of the array
    becomes the training set and the remainder the test set.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array with features in the leading columns and the target in the
        last column.
    ratio : float, default 0.7
        Share of rows used for training. Values up to 1 are read as a
        fraction; values above 1 are read as a percentage (70 means 70%).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. The targets keep a trailing
        axis of length 1, i.e. they are column vectors.
    """
    # Size the split from the array that was passed in. The previous version
    # used the global dataframe `df`, so it failed (or split at the wrong
    # row) whenever `dataset` was not the same length as that global.
    m = len(dataset)

    # If the ratio is given as a percentage, scale it down to a fraction.
    fraction = ratio * 0.01 if ratio > 1 else ratio
    n_train = int(fraction * m)

    x_train = dataset[:n_train, :-1]
    x_test = dataset[n_train:, :-1]

    # `-1:` (rather than `-1`) keeps the targets two-dimensional.
    y_train = dataset[:n_train, -1:]
    y_test = dataset[n_train:, -1:]

    return x_train, x_test, y_train, y_test

# Split with scikit-learn's train_test_split: 30% of the rows are held out for
# testing, and random_state=42 fixes the seed so the split is reproducible.
# An alternative kept here for reference (presumably meant for the hand-written
# splitter above, which takes features and target in one array):
#   data = np.concatenate((X, y), axis = 1)
#   x_train, x_test, y_train, y_test = train_test_split(data)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Make sure both target arrays are column vectors of shape (n_samples, 1).
y_train = y_train.reshape((len(y_train), 1))
y_test = y_test.reshape((len(y_test), 1))

# Print the shape of the full frame followed by the shape of each split,
# one per line.
print(
    *(part.shape for part in (df, x_train, x_test, y_train, y_test)),
    sep="\n",
)

In [None]:
# ================================ DEFINING COST FUNCTION ================================
# cost_function(X, y, theta): cost of the weights `theta` on the inputs X with targets y.
# Returns a reduction (left blank) of the squared residuals divided by 2 * len(X),
# i.e. by twice the number of rows of X.
def cost_function(X, y, theta):
    # hypothesis: combines X with the transposed weights (operation left blank)
    h = X.?(theta.T)
    # residual between hypothesis and target
    loss = h - y
    return np.?(loss ** 2)/ (2 * len(X))

# For testing the function
# (the call takes X, y and theta, matching the signature defined above)
cost = ?(x_train, y_train, theta)
print(cost)

In [None]:
# ============================ DEFINING GRADIENT DESCENT =========================
# grad_descent(X, y, theta, alpha): performs ONE update of the weights and returns
# the new theta. The update builds a new value (theta - ...) and rebinds the local
# name, so the caller has to use the returned value.
def grad_descent(X, y, theta, alpha):
    # hypothesis: combines X with the transposed weights (operation left blank)
    h = X.?(theta.T)
    # left blank; cost_function above defines its loss as h - y
    loss = ?
    # gradient term: the transpose of a (blank) quantity multiplied into X
    dj = (?.T).dot(X)
    # step against the gradient, scaled by the learning rate and averaged over the rows of X
    theta = theta - (alpha/len(X)) * dj
    return theta

# For testing the function 
# cost before the update ...
cost = ?(x_train, y_train, theta)
print("Cost before: ", cost)

# ... a single update with a very small learning rate (1e-7) ...
theta = ?(x_train, y_train, theta, 0.0000001)

# ... and the cost after it, for comparison
cost = ?(x_train, y_train, theta)
print("Cost after: ", cost)

In [None]:
# ========================== DEFINING OUR LINEAR REGRESSION =========================
# linear_reg(epoch, X, y, theta, alpha): training loop. Each pass computes a new
# theta, then computes and prints the resulting cost; the final theta is returned.
# The loop header is left blank (presumably it repeats the body `epoch` times).
def linear_reg(epoch, X, y, theta, alpha):
    ?:
        
        #calculate new theta
        theta = ?(X, y, theta, alpha)
        
        #compute new loss
        loss = ?(X, y, theta)
        print("Cost function: ", loss)
        
    return theta

# train on the training split with the epoch and alpha chosen earlier;
# this overwrites the global theta with the fitted weights
theta = ?(epoch, x_train, y_train, theta, alpha)

In [None]:
# ========================= TESTING THE DATA ========================
# predict(x_test, theta): hypothesis for the given inputs, obtained by combining
# x_test with the transposed weights (operation left blank).
def predict(x_test, theta):
    return x_test.?(theta.T)

# predictions for the held-out test split, using the fitted theta
y_pred =  predict(x_test, theta)

In [None]:
# ====================== Find error in the predicted values=============
def mean_squared_error(h, y):
    """Return the mean squared error between predictions ``h`` and targets ``y``.

    The sample count is read from the first dimension of ``y``. Every squared
    difference is divided by that count before the terms are summed.
    """
    n_samples = y.shape[0]
    scaled_sq_err = ((h - y) ** 2) / n_samples
    return np.sum(scaled_sq_err)

# for testing the function
# mean_squared_error(y_pred, y_test)

In [None]:
# ================= Plotting hypothesis value vs actual value for train set=========
# hypothesis for the training inputs (operation left blank, same form as in predict)
h = x_train.?(theta.T)
# predicted values on the x-axis, actual values on the y-axis, drawn as blue points
plt.plot(h, y_train, 'b.')
plt.ylabel('Actual value')
plt.xlabel('Predicted value')
plt.show()

# two error figures for the training set; the argument lists match the signatures of
# mean_squared_error(h, y) and cost_function(X, y, theta) respectively (names left blank)
print(?(h, y_train))
print(?(x_train, y_train, theta))

In [None]:
# =============== Plotting hypothesis value vs actual value for test set =======

# same figure as for the training set, now with the test predictions y_pred against y_test;
# the plotting and axis-label calls are left blank
plt.?(y_pred, y_test, 'b.')
plt.?('Actual value')
plt.?('Predicted value')
plt.show()

# two error figures for the test set; the argument lists match the signatures of
# mean_squared_error(h, y) and cost_function(X, y, theta) respectively (names left blank)
print(?(y_pred, y_test))
print(?(x_test, y_test, theta))

In [None]:
# ============= COMPARING WITH SKLEARN'S LINEAR REGRESSION MODEL ===============

# scikit-learn's LinearRegression with default settings, given the same training split
# NOTE(review): x_train still carries the leading bias column x0 added when X was formed
lm = LinearRegression()
lm.?(x_train, y_train)

# the library model's output for the test inputs
y_predict = lm.?(x_test)

# same kind of figure as for our own model; the plotting and axis-label calls are left blank
plt.?(y_predict, y_test, 'b.')
plt.?('Actual value')
plt.?('Predicted value')
plt.show()

# a method of the fitted model evaluated on the test split (name left blank),
# followed by an error figure between y_test and the library's predictions
print(lm.?(x_test, y_test))
print(?(y_test, y_predict))

### Comparing the error achieved by our algorithm with that of the built-in library's linear regression, we see there is a difference between the models.