# Multiple Linear Regression

## Importing the libraries

In [132]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [133]:
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [134]:
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

In [135]:
print(y)

[192261.83 191792.06 191050.39 182901.99 166187.94 156991.12 156122.51
 155752.6  152211.77 149759.96 146121.95 144259.4  141585.52 134307.35
 132602.65 129917.04 126992.93 125370.37 124266.9  122776.86 118474.03
 111313.02 110352.25 108733.99 108552.04 107404.34 105733.54 105008.31
 103282.38 101004.64  99937.59  97483.56  97427.84  96778.92  96712.8
  96479.51  90708.19  89949.14  81229.06  81005.76  78239.91  77798.83
  71498.49  69758.98  65200.33  64926.08  49490.75  42559.73  35673.41
  14681.4 ]


## Encoding categorical data

In [136]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [137]:
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

## Splitting the dataset into the Training set and Test set

In [138]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Multiple Linear Regression model on the Training set

In [139]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

## Predicting the Test set results

In [140]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
#reshape function allows you to change vector from displaying horizontally by default to vertically with rows of height (len(y_pred)) and a length of 1.

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


## Backward Elimination for Statistical Significance
Not required when using scikit learn since applied for you

In [141]:
import statsmodels.api as sm
#X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)
#add new column to the array to represent the missing dummy variable that is left out during the Hot Encoding. 
#This is normally considered b0x0 or the constant as part of the regression equation

print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

## Backward Elimination Continued

In [143]:
X_opt = np.array(X[:, [0, 1, 2, 3, 4, 5]], dtype=float)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Tue, 12 Apr 2022",Prob (F-statistic):,1.34e-27
Time:,04:48:07,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x2,5.032e+04,7251.767,6.940,0.000,3.57e+04,6.49e+04
x3,5.008e+04,6952.587,7.204,0.000,3.61e+04,6.41e+04
x4,0.8060,0.046,17.369,0.000,0.712,0.900
x5,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x6,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,2450000.0


In [None]:
header = []
i=0
#for row in dataset:
#    header.append(row)
for row in dataset:
    if i < 5:
        header.append(row)
        i+=1
    
print(header)
print(X_opt)

['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit']
[[1.00e+00 0.00e+00 0.00e+00 1.00e+00 1.65e+05 1.37e+05 4.72e+05]
 [1.00e+00 1.00e+00 0.00e+00 0.00e+00 1.63e+05 1.51e+05 4.44e+05]
 [1.00e+00 0.00e+00 1.00e+00 0.00e+00 1.53e+05 1.01e+05 4.08e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.00e+00 1.44e+05 1.19e+05 3.83e+05]
 [1.00e+00 0.00e+00 1.00e+00 0.00e+00 1.42e+05 9.14e+04 3.66e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.00e+00 1.32e+05 9.98e+04 3.63e+05]
 [1.00e+00 1.00e+00 0.00e+00 0.00e+00 1.35e+05 1.47e+05 1.28e+05]
 [1.00e+00 0.00e+00 1.00e+00 0.00e+00 1.30e+05 1.46e+05 3.24e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.00e+00 1.21e+05 1.49e+05 3.12e+05]
 [1.00e+00 1.00e+00 0.00e+00 0.00e+00 1.23e+05 1.09e+05 3.05e+05]
 [1.00e+00 0.00e+00 1.00e+00 0.00e+00 1.02e+05 1.11e+05 2.29e+05]
 [1.00e+00 1.00e+00 0.00e+00 0.00e+00 1.01e+05 9.18e+04 2.50e+05]
 [1.00e+00 0.00e+00 1.00e+00 0.00e+00 9.39e+04 1.27e+05 2.50e+05]
 [1.00e+00 1.00e+00 0.00e+00 0.00e+00 9.20e+04 1.35e+05 2.53e+05]
 [1.00

## Remove index with P Value above Significance Level

In [144]:
X_opt = np.array(X[:, [0, 1, 2, 3, 4]], dtype=float)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,205.0
Date:,"Tue, 12 Apr 2022",Prob (F-statistic):,2.9e-28
Time:,04:48:24,Log-Likelihood:,-526.75
No. Observations:,50,AIC:,1064.0
Df Residuals:,45,BIC:,1073.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,5.46e+04,6371.060,8.571,0.000,4.18e+04,6.74e+04
x2,5.57e+04,6502.532,8.565,0.000,4.26e+04,6.88e+04
x3,5.457e+04,6445.883,8.465,0.000,4.16e+04,6.75e+04
x4,0.8609,0.031,27.665,0.000,0.798,0.924
x5,-0.0527,0.050,-1.045,0.301,-0.154,0.049

0,1,2,3
Omnibus:,14.275,Durbin-Watson:,1.197
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.26
Skew:,-0.953,Prob(JB):,6.57e-05
Kurtosis:,5.369,Cond. No.,1150000.0


In [146]:
print(header)

['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit']


In [145]:
X_opt = np.array(X[:, [0, 1, 2, 3]], dtype=float)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,272.4
Date:,"Tue, 12 Apr 2022",Prob (F-statistic):,2.76e-29
Time:,04:48:40,Log-Likelihood:,-527.35
No. Observations:,50,AIC:,1063.0
Df Residuals:,46,BIC:,1070.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,4.875e+04,3040.118,16.036,0.000,4.26e+04,5.49e+04
x2,4.991e+04,3422.664,14.584,0.000,4.3e+04,5.68e+04
x3,4.876e+04,3275.140,14.888,0.000,4.22e+04,5.54e+04
x4,0.8530,0.030,28.226,0.000,0.792,0.914

0,1,2,3
Omnibus:,13.418,Durbin-Watson:,1.122
Prob(Omnibus):,0.001,Jarque-Bera (JB):,17.605
Skew:,-0.907,Prob(JB):,0.00015
Kurtosis:,5.271,Cond. No.,290000.0
