# Linear Regression from scratch

In [250]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
%pylab inline
# Source CSV for the 2013 movies dataset.
# NOTE(review): this URL carries an access token in its query string. Tokens should
# not be committed with the notebook -- confirm whether it is still needed and, if so,
# supply it from outside the notebook instead.
DATA_URL='https://media.githubusercontent.com/media/thisismetis/nyc18_ds15/master/content/challenges/challenges-data/2013_movies.csv?token=AiNjP8C5uzUCk8KRRTMbh7CsmEIgk78rks5bbJmOwA%3D%3D'
df=pd.read_csv(DATA_URL)
        

Populating the interactive namespace from numpy and matplotlib


In [251]:
# Feature preparation:
#   * keep only the release month from ReleaseDate (parsed once, vectorised via .dt);
#     the release year was previously computed and then immediately dropped, so it is
#     no longer built at all
#   * discard the free-text / raw date columns that are not used as predictors
#   * drop rows with any missing value
#   * one-hot encode the remaining categorical columns
df['month']=pd.to_datetime(df.ReleaseDate).dt.month
df=df.drop(columns=['Title','Director','ReleaseDate'])
df=df.dropna()
df=pd.get_dummies(df)

In [259]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

Unnamed: 0,Budget,DomesticTotalGross,Runtime,month,Rating_PG,Rating_PG-13,Rating_R
0,130000000.0,424668047,146,11,0,1,0
1,200000000.0,409013994,129,5,0,1,0
2,150000000.0,400738009,108,11,1,0,0
3,76000000.0,368061265,98,7,1,0,0
4,225000000.0,291045518,143,6,0,1,0


In this exercise, I want to predict a movie's domestic total gross from its budget, runtime, release month, and rating (one-hot encoded).

In [252]:
# Predictors are every column except the target; the target is the domestic gross.
x=df.drop(columns='DomesticTotalGross')
y=df['DomesticTotalGross']


In [260]:
# Preview the first five rows of the predictor frame.
x.head(n=5)

Unnamed: 0,Budget,Runtime,month,Rating_PG,Rating_PG-13,Rating_R
0,130000000.0,146,11,0,1,0
1,200000000.0,129,5,0,1,0
2,150000000.0,108,11,1,0,0
3,76000000.0,98,7,1,0,0
4,225000000.0,143,6,0,1,0


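Let's split the dataset into training and test sets (75% / 25%). We also min-max scale every feature to the range [0, 1] so that gradient descent converges faster; the scaler is fitted on the training set only and then applied to the test set. Finally, we append a column of ones to each set so that the last weight acts as the intercept.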

In [253]:

# Hold out a quarter of the rows for testing; the fixed seed keeps the split reproducible.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.25,random_state=42)

# Learn the per-feature min/max from the training rows only, then reuse them for the test rows.
scaler=MinMaxScaler(feature_range=(0,1))
scaled_train=scaler.fit_transform(xtrain)
scaled_test=scaler.transform(xtest)

# Append a trailing column of ones to each matrix so the final weight plays the role of the intercept.
xtrain,xtest=(
    np.concatenate((scaled,np.ones([len(scaled),1])),axis=1)
    for scaled in (scaled_train,scaled_test)
)

# Building Linear Regression class

In [236]:
class LR():
    """Linear regression fitted by batch gradient descent on the half mean-squared-error loss.

    No intercept is added automatically: append a column of ones to the predictors
    if an intercept term is wanted.

    Attributes set by ``fit``:
        weights   -- 1-D array holding one coefficient per predictor column.
        count     -- number of gradient-descent updates that were performed.
        converged -- True when the gradient tolerance was met within ``max_iter`` updates.
    Attribute set by ``score``:
        R2        -- the most recently computed coefficient of determination.
    """
    def __init__(self,verbose=False,):
        """verbose: when truthy, ``fit`` prints a progress line every 20 iterations."""
        self.verbose=verbose

    def fit(self,predictors,response,learning_rate,tol=.001,max_iter=100000):
        """Estimate the weights by gradient descent, starting from a vector of ones.

        Parameters:
            predictors    -- 2-D array-like of shape (n_samples, n_features).
            response      -- 1-D array-like of length n_samples.
            learning_rate -- step size applied to the gradient at every update.
            tol           -- stop once the mean absolute gradient falls below this value.
            max_iter      -- upper bound on the number of updates; guards against a loop
                             that never ends when the learning rate is too small.

        Returns:
            A human-readable status string containing the number of iterations.

        Raises:
            ValueError -- if the inputs have inconsistent shapes, or if the gradient
                          becomes NaN/inf (the learning rate is too large and the
                          descent diverged).

        A RuntimeWarning is emitted when ``max_iter`` is reached before convergence.
        """
        import warnings  # local import: keeps this cell runnable on its own

        predictors=np.asarray(predictors,dtype=float)
        response=np.asarray(response,dtype=float)
        if predictors.ndim!=2:
            raise ValueError("predictors must be 2-dimensional (n_samples, n_features)")
        n_samples,n_features=predictors.shape
        if n_samples==0:
            raise ValueError("predictors must contain at least one sample")
        if response.shape!=(n_samples,):
            raise ValueError("response must be 1-dimensional with one value per sample")

        weights=np.ones(n_features)
        count=0
        converged=False

        # The loss sum((actual-predicted)^2)/(2n) is convex in the weights, so it is
        # minimised by driving every partial derivative as close to 0 as possible.
        while count<max_iter:
            # x1*b1 + x2*b2 + ... + xk*bk for every observation
            predictions=np.dot(predictors,weights)
            errors=response-predictions

            # Gradient descent step:
            #   1. the partial derivative of the loss w.r.t. each weight is
            #      sum(-x_j * errors), where errors = actual - predicted
            #   2. divide by the number of samples
            #   3. multiply by the learning rate
            #   4. subtract the result from the weights
            gradient=np.dot(-predictors.T,errors)
            gradient/=n_samples
            if not np.all(np.isfinite(gradient)):
                # Without this check a NaN gradient would silently end the loop
                # and leave NaN weights behind.
                raise ValueError(
                    f"gradient became non-finite after {count} iterations; "
                    "the learning rate is probably too large"
                )
            weights-=gradient*learning_rate
            count+=1

            if self.verbose and count%20==0:
                # The loss is only needed for reporting, so compute it on demand.
                MSE=np.sum(errors**2)/(2*n_samples)
                print(f'MSE: {round(MSE,0)} | iteration: {count} | avg_gradient {round(np.mean(gradient),3)}')

            if np.mean(np.abs(gradient))<tol:
                converged=True
                break

        self.weights=weights
        self.count=count
        self.converged=converged

        if not converged:
            warnings.warn(
                f"gradient descent stopped after max_iter={max_iter} iterations "
                "without reaching the tolerance",
                RuntimeWarning,
            )
            return f"Fit stopped without converging: {count} iterations"
        return f"Fit complete: {count} iterations"

    def predict(self,test):
        """Return the fitted linear combination ``test . weights`` for each row of ``test``."""
        return np.dot(test,self.weights)

    def score(self,predictors,response):
        """Return (and store in ``self.R2``) the R^2 of the fitted weights on the given data."""
        response=np.asarray(response,dtype=float)
        SST=np.sum((response-np.mean(response))**2)
        predictions=np.dot(predictors,self.weights)
        SSE=np.sum((response-predictions)**2)
        self.R2=1-SSE/SST
        return self.R2

    def coef(self):
        """Return the fitted weight vector."""
        return self.weights

In [255]:
# Train the from-scratch model on the scaled training data, logging progress as it goes.
lr=LR(verbose=True)
lr.fit(xtrain,ytrain,learning_rate=1)

MSE: 6662817695107725.0 | iteration: 20 | avg_gradient 16164826.116
MSE: 5478201701665878.0 | iteration: 40 | avg_gradient 5217600.259
MSE: 5348620246736620.0 | iteration: 60 | avg_gradient 1685523.252
MSE: 5332785607357017.0 | iteration: 80 | avg_gradient 546066.503
MSE: 5330165748636634.0 | iteration: 100 | avg_gradient 178156.062
MSE: 5329445327750424.0 | iteration: 120 | avg_gradient 59046.238
MSE: 5329154192098413.0 | iteration: 140 | avg_gradient 20229.135
MSE: 5329017797327546.0 | iteration: 160 | avg_gradient 7388.057
MSE: 5328951240364446.0 | iteration: 180 | avg_gradient 3002.99
MSE: 5328918424414219.0 | iteration: 200 | avg_gradient 1410.043
MSE: 5328902201349122.0 | iteration: 220 | avg_gradient 767.518
MSE: 5328894175518715.0 | iteration: 240 | avg_gradient 468.433
MSE: 5328890204213202.0 | iteration: 260 | avg_gradient 306.83
MSE: 5328888239036337.0 | iteration: 280 | avg_gradient 208.67
MSE: 5328887266563490.0 | iteration: 300 | avg_gradient 144.537
MSE: 5328886785330238

'Fit complete: 1088 iterations'

In [256]:
# Report the fitted weights and the training-set R^2 of the from-scratch model.
print(
    f'Coefficients: {lr.coef()}',
    f'R2 Value: {lr.score(xtrain,ytrain)}',
    sep='\n',
)

Coefficients: [ 1.54955606e+08  1.99773865e+07 -3.34279751e+06  3.06537983e+07
 -3.60884752e+06  7.57615502e+06  3.46211038e+07]
R2 Value: 0.27163893217302315


## Let's see how this compares with scikit-learn:

In [257]:
from sklearn.linear_model import LinearRegression
# The training matrix already carries a column of ones, so the library model must not add its own intercept.
z=LinearRegression(fit_intercept=False)
z.fit(X=xtrain,y=ytrain)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [258]:
# Report the reference model's weights and training-set R^2 for comparison.
print(
    f'Coefficients: {z.coef_}',
    f'R2 Value: {z.score(xtrain,ytrain)}',
    sep='\n',
)

Coefficients: [ 1.54955606e+08  1.99773863e+07 -3.34279748e+06  3.06537978e+07
 -3.60884801e+06  7.57615455e+06  3.46211043e+07]
R2 Value: 0.27163893217302315
