In [52]:
#import standard libraries
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import pylab as pl
from scipy import stats

import statsmodels.api as sm  #for backward Elimination

#import model and metrics
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [43]:
# Load the startup data: every column except the last is a feature,
# the last column is the regression target.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].to_numpy()  # feature matrix
y = dataset.iloc[:, -1].to_numpy()   # target vector

In [44]:
# Preview the first five rows of the raw feature matrix
X[:5]

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida']], dtype=object)

In [45]:
# One-hot encode the categorical column at index 3. drop='first' keeps only
# n-1 dummy columns (the first category becomes the all-zeros baseline), and
# remainder='passthrough' carries the other columns through unchanged.
# NOTE: this cell overwrites X, so run it exactly once after loading the data;
# running it again would encode whatever now sits at index 3.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [3])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))


In [46]:
# The one-hot dummy columns are placed first in the transformed array,
# followed by the passthrough columns
print(X)

[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 1.0 131876.9 99814.71 362861.36]
 [0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 0.0 101913.08 110594.11 229160.95]
 [0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 93863.75 127320.38 249839.44]
 [0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 114523.61 122616.84 261776.23]
 [0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 1.0 86419.7 153514.11 0.0]
 [0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 1.0 78389.47 153773.43 299737.29]
 [1.0 0.0 73994.56 122782.75 303319.26]
 [1.0 0.0 67532.53 105751.03 304768.73]
 [0.0 1.0 77044.01 99281.34 140574.81]
 [0

In [47]:
# Split the dataset into the training and test sets (25% held out for
# testing); the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [48]:
# Train the multiple linear regression model on the training set
regr = LinearRegression()
regr.fit(X_train, y_train)

LinearRegression()

In [49]:
# Predict the test set results
y_pred = regr.predict(X_test)
y_pred

array([104440.73, 132253.82, 132872.07,  71707.79, 178678.99, 115078.13,
        66093.93,  98759.73, 114113.6 , 167979.49,  95786.77,  87785.35,
       110455.98])

In [51]:
# Show the predictions next to the true values: predicted values from the
# model in the left column, real values in the right column.
np.set_printoptions(precision=2)  # only 2 decimals after the point
print(np.column_stack((y_pred, y_test)))

[[104440.73 103282.38]
 [132253.82 144259.4 ]
 [132872.07 146121.95]
 [ 71707.79  77798.83]
 [178678.99 191050.39]
 [115078.13 105008.31]
 [ 66093.93  81229.06]
 [ 98759.73  97483.56]
 [114113.6  110352.25]
 [167979.49 166187.94]
 [ 95786.77  96778.92]
 [ 87785.35  96479.51]
 [110455.98 105733.54]]


#### Significance level - Backward elimination
Backward elimination is a feature-selection technique. It removes the features that do not have a statistically significant effect on the dependent variable (the predicted output).

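You need to add a column of ones at the start of the feature matrix; it represents the intercept (constant) term of the model.
Each feature's p-value is compared with the chosen significance level (here 0.05). A high p-value means the feature is not statistically significant, so you delete features one at a time: always the one with the highest p-value above the significance level, refitting the model after each removal.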

In [53]:
# Building the optimal model using Backward Elimination.
# Prepend a column of ones to X; it becomes the constant (intercept) term
# of the OLS fit. The number of rows is taken from X itself instead of
# being hard-coded, so the cell works for any dataset size.
# NOTE: this cell overwrites X, so run it exactly once; running it again
# would prepend a second column of ones.
X = np.append(arr=np.ones((X.shape[0], 1)), values=X, axis=1)
print(X)

[[1.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 1.0 0.0 153441.51 101145.55 407934.54]
 [1.0 0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 1.0 0.0 142107.34 91391.77 366168.42]
 [1.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 1.0 0.0 130298.13 145530.06 323876.68]
 [1.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 1.0 0.0 119943.24 156547.42 256512.92]
 [1.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [1.0 0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 1.0 0.0 91749.16 114175.79 294919.57]
 [1.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [1.0 0.0 1.0 78389.47 153773.43 299737.29]
 [1.0 1.0 0.0 73994.56 122782.75 3

In [55]:
# Starting point of backward elimination: keep all six columns
# (intercept, two state dummies, three spend columns) as a float array.
X_opt = X[:, [0, 1, 2, 3, 4, 5]].astype(float)
X_opt

array([[1.00e+00, 0.00e+00, 1.00e+00, 1.65e+05, 1.37e+05, 4.72e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 1.63e+05, 1.51e+05, 4.44e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 1.53e+05, 1.01e+05, 4.08e+05],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.44e+05, 1.19e+05, 3.83e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 1.42e+05, 9.14e+04, 3.66e+05],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.32e+05, 9.98e+04, 3.63e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 1.35e+05, 1.47e+05, 1.28e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 1.30e+05, 1.46e+05, 3.24e+05],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.21e+05, 1.49e+05, 3.12e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 1.23e+05, 1.09e+05, 3.05e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 1.02e+05, 1.11e+05, 2.29e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 1.01e+05, 9.18e+04, 2.50e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 9.39e+04, 1.27e+05, 2.50e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 9.20e+04, 1.35e+05, 2.53e+05],
       [1.00e+00, 1.00e+00, 0.00e+

In [56]:
# Fit OLS on the current feature set and inspect the p-values (P>|t|):
# the feature with the highest p-value above the 0.05 significance level is
# the next one to delete, because it has the least significance for y.

model = sm.OLS(endog = y, exog = X_opt)
regressor_OLS = model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Tue, 01 Jun 2021",Prob (F-statistic):,1.34e-27
Time:,17:22:10,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


#### Explain
R-squared - it tells how good the fit is; it ranges between 0 and 1 (the closer to 1, the better).

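Adj. R-squared - R-squared with a penalising factor for the number of independent variables. Unlike R-squared, which never decreases when a variable is added, it increases only when the added variable improves the model more than would be expected by chance, and decreases otherwise. If its value keeps increasing as you remove unnecessary features, go ahead with the reduced model; if it drops, stop and revert the last removal.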

F statistic - it is used to compare two variances and is always greater than 0. It is formulated as $v_1^2 / v_2^2$. In regression, it is the ratio of the explained to the unexplained variance of the model. AIC and BIC – AIC stands for Akaike’s information criterion and BIC stands for Bayesian information criterion. Both of these parameters depend on the likelihood function L.

Skew - informs about the data symmetry about the mean

Kurtosis - it measures the shape of the distribution, i.e. how much of the data lies close to the mean compared with far away from the mean (in the tails).

Omnibus - D’Agostino’s test. It provides a combined statistical test for the presence of skewness and kurtosis.

Log-likelihood - it is the log of the likelihood function.

In [61]:
# Reduced feature set: keep only column 0 (intercept), column 3 (R&D spend)
# and column 5 (marketing spend); the two state dummies and the
# administration column are left out. Then refit OLS and show the summary.
X_opt = X[:, [0, 3, 5]].astype(float)
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 01 Jun 2021",Prob (F-statistic):,2.1600000000000003e-31
Time:,18:55:40,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [75]:
# Make a single prediction (for example RD_Spend = 160000, Adm Spend = 130000,
# Marketing Spend = 300000 and State = 'California').
# The input follows the column order of the encoded X: the two state dummy
# columns first, then the three spend columns. Both dummies are 0 because
# California is the dropped (baseline) category.
print(regr.predict([[0, 0, 160000, 130000, 300000]]))

[182024.02]


In [81]:
# Fitted coefficients, in the column order of X (two state dummies, then the
# three spend columns), followed by the intercept of the linear model
print(regr.coef_ )
print(regr.intercept_ )

[-5.22e+02  1.12e+01  7.81e-01  4.58e-02  3.35e-02]
41049.23809436479


The equation of the multiple linear regression model is

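Profit = -522 * DummyState_1 + 11.2 * DummyState_2 + 0.781 * RD_Spend + 0.0458 * Administration + 0.0335 * Marketing_Spend + 41049.24

Here DummyState_1 = 1 for Florida and DummyState_2 = 1 for New York; California is the dropped baseline category (both dummies are 0). The values are the rounded `coef_` and `intercept_` printed above.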


Note: to get these coefficients, read the `coef_` and `intercept_` attributes of the fitted regressor object.