This file has code for Linear Regression using gradient descent and its accuracy will be compared with sklearn using mean squared error

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
#Getting boston data from sklearn and creating a dataframe
boston = load_boston()
X = pd.DataFrame(boston.data, columns = boston.feature_names)
Y = boston.target
Y = pd.DataFrame(Y).rename(columns={0:'Target'})
df = pd.concat([X, Y], axis=1)
df.head() #Data before standardization

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
#Data standardization using StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df)
df = pd.DataFrame(data=data_scaled, columns=df.columns)

#Split train and test data
X_train = df.iloc[:int(len(df)*0.8), :-1]
Y_train = df.iloc[:int(len(df)*0.8), -1:]
X_test = df.iloc[int(len(df)*0.8):, :-1]
Y_test = df.iloc[int(len(df)*0.8):, -1:]
df.head() #Data after standardization

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Target
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562,0.159686
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439,-0.101524
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727,1.324247
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517,1.182758
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501,1.487503


In [4]:
#Creating a class for Linear Regression
class LinearRegression:
    def prediction_error(self,X,y,m,c):
        y_pred = []
        sum_of_errors = []
        for i in range(0, X.shape[0]):
            prediction = 0
            for j in range(0, X.shape[1]):
                prediction = prediction + (m[j] * X.iloc[i, j])
            prediction = prediction + c
            y_pred.append(prediction)
            sum_of_errors.append((y.iloc[i,0]-prediction)**2) #Finding the difference between actual and predicted values
        return np.array(y_pred), sum(sum_of_errors) / (2 * X.shape[0])
    
    def gradient_descent(self,X,y,y_pred):
        #First order differentiation to calculate m and c
        diff_m = []
        diff_c = 0
        for i in range(0, X.shape[1]):
            grad_m = 0
            for j in range(0, X.shape[0]):
                grad_m = (-2/X.shape[0]) *  (grad_m + (y.iloc[j,0] - y_pred[j]) * X.iloc[j, i])
                if i==0:
                    diff_c = (-2/X.shape[0]) *  (diff_c + (y.iloc[j,0] - y_pred[j]))
            diff_m.append(grad_m)
        return np.array(diff_m), diff_c

    def fit(self, X, Y):
        #Assigning initial value as 0 to m (slope) and c (intercept)
        m = np.zeros(X.shape[1]) #Calculating m value for each column in the dataframe
        c = 0
        #Learning rate and iterations values are crucial to converge the error value to 0. These can be changed based on the data. 
        learning_rate=0.2
        iterations=50 
        for i in range(0, iterations):
            y_pred, error = self.prediction_error(X, Y, m, c) #Calculating the error between actual and predicted values
            #print("{},{}".format(i, error))
            diff_m, diff_c = self.gradient_descent(X, Y, y_pred)
            m = m - (diff_m * learning_rate)
            c = c - (diff_c * learning_rate)
        self.m=m
        self.c=c
        self.Y=Y
        return m,c
    
    def predict(self, X):
        #Predicting the output by passing m and c values to prediction_error function
        y_pred, sum_error = self.prediction_error(X, self.Y, self.m, self.c)
        return np.array(y_pred)
        
model_Lin = LinearRegression()
model_Lin.fit(X_train, Y_train)
Y_pred_Lin = model_Lin.predict(X_test)

#Mean Squared Error to find the accuracy
print("Mean Sqaured Error using custom code: ",mean_squared_error(Y_pred_Lin, Y_test))

Mean Sqaured Error using custom code:  0.2352024777959336


In [5]:
from sklearn.linear_model import LinearRegression
model_sk = LinearRegression()
model_sk.fit(X_train, Y_train)
Y_pred_sk = model_sk.predict(X_test)

#Mean Squared Error to find the accuracy
print("Mean Sqaured Error using Sklearn: ",mean_squared_error(Y_pred_sk, Y_test))

Mean Sqaured Error using Sklearn:  0.38853393897891725


This above results show that the first model with 50 iterations (0.23) has mean sqaured error less than the sklearn model (0.38). That means, the first model is performing better than the sklearn model. Increasing iterations will further reduce the mean squared error of the first model