# Multiple Linear Regression

## Importing the libraries

In [101]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [102]:
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

## Encoding categorical data

In [103]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [104]:
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

In [105]:
# X = X[:, 1:]

## Splitting the dataset into the Training set and Test set

In [106]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Multiple Linear Regression model on the Training set

In [107]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)


LinearRegression()

## Predicting the Test set results

In [108]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))


[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


## Building an optimal model for Backward Elimination

In [120]:
import statsmodels.api as sm
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)
X_opt = X[:, [0, 1, 2, 3, 4, 5, 6]]
X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [121]:
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.024
Model:,OLS,Adj. R-squared:,-0.018
Method:,Least Squares,F-statistic:,0.5748
Date:,"Fri, 11 Mar 2022",Prob (F-statistic):,0.567
Time:,09:33:10,Log-Likelihood:,-600.05
No. Observations:,50,AIC:,1206.0
Df Residuals:,47,BIC:,1212.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.275e+04,1972.327,11.535,0.000,1.88e+04,2.67e+04
x1,2.275e+04,1972.327,11.535,0.000,1.88e+04,2.67e+04
x2,2.275e+04,1972.327,11.535,0.000,1.88e+04,2.67e+04
x3,2.275e+04,1972.327,11.535,0.000,1.88e+04,2.67e+04
x4,2.275e+04,1972.327,11.535,0.000,1.88e+04,2.67e+04
x5,-9851.2712,1.39e+04,-0.706,0.483,-3.79e+04,1.82e+04
x6,5017.5779,1.42e+04,0.354,0.725,-2.35e+04,3.35e+04

0,1,2,3
Omnibus:,0.111,Durbin-Watson:,0.081
Prob(Omnibus):,0.946,Jarque-Bera (JB):,0.207
Skew:,0.104,Prob(JB):,0.902
Kurtosis:,2.762,Cond. No.,7.22e+32


In [122]:
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.021
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.043
Date:,"Fri, 11 Mar 2022",Prob (F-statistic):,0.312
Time:,09:33:49,Log-Likelihood:,-600.12
No. Observations:,50,AIC:,1204.0
Df Residuals:,48,BIC:,1208.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x1,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x2,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x3,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x4,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x5,-1.228e+04,1.2e+04,-1.021,0.312,-3.65e+04,1.19e+04

0,1,2,3
Omnibus:,0.079,Durbin-Watson:,0.073
Prob(Omnibus):,0.961,Jarque-Bera (JB):,0.19
Skew:,0.087,Prob(JB):,0.909
Kurtosis:,2.753,Cond. No.,1.85e+35


In [123]:
X_opt = X[:, [0, 1, 2, 3, 4]]
X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

  return self.ess/self.df_model


0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,inf
Date:,"Fri, 11 Mar 2022",Prob (F-statistic):,
Time:,09:34:07,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04
x1,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04
x2,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04
x3,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04
x4,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,3.81e+48


In [124]:
y_pred = regressor.predict(X_opt)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))


ValueError: ignored