### Step 1 : Basic preprocessing and encoding

In [1]:
# import the necessary libraries 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 

In [2]:
# Load the 50-startups dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='50_Startups.csv')

In [3]:
# Preview the first five rows to check that the columns loaded as expected.
df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Separate the target (y) from the predictors (x).
# 'State' is still a string column at this point and is encoded in a later cell.
y = df.loc[:, 'Profit']
x = df.loc[:, ['R&D Spend', 'Administration', 'Marketing Spend', 'State']]

In [5]:
# 'State' holds strings, so one-hot encode it into State_* indicator columns.
# dtype=int is requested explicitly: pandas >= 2.0 emits boolean dummies by
# default, and a frame that mixes float and bool columns becomes an object
# array when converted with NumPy, which statsmodels' OLS refuses to fit.
# With integer dummies the frame converts to a plain float array instead.
x = pd.get_dummies(x, dtype=int)
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


### Step 2 : Splitting the data into training and testing sets and making predictions

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [7]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regressor. fit_intercept=True is the library default,
# spelled out here to make clear that scikit-learn adds the intercept itself.
lm = LinearRegression(fit_intercept=True)

In [8]:
# Fit on the training split, then predict profit for the held-out rows.
# fit() returns the estimator itself, so the two calls can be chained.
pred = lm.fit(x_train, y_train).predict(x_test)

### Step 3 : Using the backward elimination technique

In [10]:
import statsmodels.regression.linear_model as sm 

In [11]:
# statsmodels' OLS does not add an intercept on its own, so prepend a column
# of ones to act as the constant term (it becomes column 0).
# The row count is taken from x instead of being hard-coded as 50, so the
# cell keeps working if the dataset changes size.
# Caveats: np.append returns a plain ndarray, so the column names are lost
# from here on, and re-running this cell prepends another column of ones.
x = np.append(arr=np.ones((len(x), 1), dtype=int),
              values=x, axis=1)

In [12]:
# Backward elimination, round 1: fit on the constant (column 0) plus
# columns 1-5. Column 6, the last State dummy, is not selected, which keeps
# the dummies from being perfectly collinear with the constant.
# Rule for every round: with a significance level of 0.05, find the
# predictor with the largest p-value and drop it if that value exceeds 0.05.
x_opt = np.take(x, [0, 1, 2, 3, 4, 5], axis=1)
ols = sm.OLS(y, x_opt).fit()
ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Thu, 28 Jan 2021",Prob (F-statistic):,1.34e-27
Time:,23:14:46,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.008e+04,6952.587,7.204,0.000,3.61e+04,6.41e+04
x1,0.8060,0.046,17.369,0.000,0.712,0.900
x2,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x3,0.0270,0.017,1.574,0.123,-0.008,0.062
x4,41.8870,3256.039,0.013,0.990,-6520.229,6604.003
x5,240.6758,3338.857,0.072,0.943,-6488.349,6969.701

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1470000.0


In [13]:
# Round 2: column 4 (State_California, shown as x4) had the largest p-value
# in the previous fit (0.990 > 0.05), so refit without it.
x_opt = np.take(x, [0, 1, 2, 3, 5], axis=1)
ols = sm.OLS(y, x_opt).fit()
ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Thu, 28 Jan 2021",Prob (F-statistic):,8.49e-29
Time:,23:15:19,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,0.8060,0.046,17.606,0.000,0.714,0.898
x2,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x3,0.0270,0.017,1.592,0.118,-0.007,0.061
x4,220.1585,2900.536,0.076,0.940,-5621.821,6062.138

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [14]:
# Round 3: column 5 (State_Florida, shown as x4 in the previous summary) had
# the largest p-value (0.940 > 0.05), so refit without it.
x_opt = np.take(x, [0, 1, 2, 3], axis=1)
ols = sm.OLS(y, x_opt).fit()
ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Thu, 28 Jan 2021",Prob (F-statistic):,4.53e-30
Time:,23:15:33,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [15]:
# Round 4: refit without column 3 (Marketing Spend, shown as x3 in the
# previous summary, p = 0.105).
# NOTE(review): in that summary the largest p-value actually belongs to
# column 2 (Administration, x2, p = 0.602), so strict backward elimination
# would drop column 2 here and keep column 3, i.e. select [0, 1, 3]. The
# selection is left unchanged so that it matches the output recorded below.
x_opt = np.take(x, [0, 1, 2], axis=1)
ols = sm.OLS(y, x_opt).fit()
ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,426.8
Date:,"Thu, 28 Jan 2021",Prob (F-statistic):,7.29e-31
Time:,23:15:52,Log-Likelihood:,-526.83
No. Observations:,50,AIC:,1060.0
Df Residuals:,47,BIC:,1065.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.489e+04,6016.718,9.122,0.000,4.28e+04,6.7e+04
x1,0.8621,0.030,28.589,0.000,0.801,0.923
x2,-0.0530,0.049,-1.073,0.289,-0.152,0.046

0,1,2,3
Omnibus:,14.678,Durbin-Watson:,1.189
Prob(Omnibus):,0.001,Jarque-Bera (JB):,20.449
Skew:,-0.961,Prob(JB):,3.63e-05
Kurtosis:,5.474,Cond. No.,665000.0


In [16]:
# Round 5: column 2 (Administration, shown as x2) had the largest p-value in
# the previous fit (0.289 > 0.05), so refit with only the constant and
# column 1 (R&D Spend). Every p-value in the resulting summary is below
# 0.05, so the elimination stops here.
x_opt = np.take(x, [0, 1], axis=1)
ols = sm.OLS(y, x_opt).fit()
ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Thu, 28 Jan 2021",Prob (F-statistic):,3.5000000000000004e-32
Time:,23:16:10,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
