In [1]:
# Importing the libraries
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 

In [2]:
# Load the raw dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='best_invest.csv')

In [3]:
# Preview the first five rows of the freshly loaded data
print(df.head(n=5))

R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [4]:
# Add a leading column of ones named 'Constant'; it is the intercept term
# (labelled 'Const (0)') in the statsmodels OLS fits later in this notebook.
# DataFrame.insert mutates df in place and raises ValueError when the column
# already exists, so the guard keeps this cell safe to re-run.
if 'Constant' not in df.columns:
    df.insert(0, 'Constant', 1)

In [5]:
# Preview the first five rows to confirm the 'Constant' column is now first
print(df.head(n=5))

Constant  R&D Spend  Administration  Marketing Spend       State     Profit
0         1  165349.20       136897.80        471784.10    New York  192261.83
1         1  162597.70       151377.59        443898.53  California  191792.06
2         1  153441.51       101145.55        407934.54     Florida  191050.39
3         1  144372.41       118671.85        383199.62    New York  182901.99
4         1  142107.34        91391.77        366168.42     Florida  166187.94


In [6]:
# One-hot encode the categorical 'State' column. drop_first=True drops the
# first category, so the remaining dummies are not collinear with the
# 'Constant' column.
# dtype=int forces 0/1 integer dummies: pandas >= 2.0 defaults to bool, which
# would turn `.values` on this mixed frame into an object array that the
# OLS fits further down cannot work with.
df_dummies = pd.get_dummies(df, columns=['State'], drop_first=True, dtype=int)


In [7]:
# Preview the first five rows of the one-hot encoded frame
print(df_dummies.head(n=5))

Constant  R&D Spend  Administration  Marketing Spend     Profit  \
0         1  165349.20       136897.80        471784.10  192261.83   
1         1  162597.70       151377.59        443898.53  191792.06   
2         1  153441.51       101145.55        407934.54  191050.39   
3         1  144372.41       118671.85        383199.62  182901.99   
4         1  142107.34        91391.77        366168.42  166187.94   

   State_Florida  State_New York  
0              0               1  
1              0               0  
2              1               0  
3              0               1  
4              1               0  


In [8]:
# Split into the feature matrix X (every column except 'Profit', so it still
# contains the 'Constant' column) and the target y.
# filter(regex='Profit', axis=1) keeps every column whose name contains
# 'Profit' and returns a DataFrame, so y stays 2-D (one column).
# float64 is requested explicitly so X and y are plain numeric arrays even
# when the dummy columns are bool (the pandas >= 2.0 default) rather than 0/1.
X = df_dummies.drop(columns=['Profit']).to_numpy(dtype=float)
y = df_dummies.filter(regex='Profit', axis=1).to_numpy(dtype=float)

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split repeatable across runs
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Fit a multiple linear regression on the training split
# (the bare last expression displays the fitted estimator's repr)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# Predict the target for the held-out test rows
y_pred = regressor.predict(X=X_test)

In [14]:
# Backward elimination, starting point: fit statsmodels OLS on all six
# columns of X (constant included) and inspect the per-coefficient p-values
import statsmodels.api as sm
regressor_OLS = sm.OLS(y, X).fit()
regressor_OLS.summary(
    xname=[
        'Const (0)',
        'R&D (1)',
        'Admin (2)',
        'Marketing (3)',
        'State_Florida (4)',
        'State_New York (5)',
    ]
)

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sat, 28 Mar 2020",Prob (F-statistic):,1.34e-27
Time:,21:17:11,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Const (0),5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
R&D (1),0.8060,0.046,17.369,0.000,0.712,0.900
Admin (2),-0.0270,0.052,-0.517,0.608,-0.132,0.078
Marketing (3),0.0270,0.017,1.574,0.123,-0.008,0.062
State_Florida (4),198.7888,3371.007,0.059,0.953,-6595.030,6992.607
State_New York (5),-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [17]:
# Backward elimination: refit with 'State_New York' (column index 5) removed
X_opt = X[:, [0, 1, 2, 3, 4]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary(
    xname=[
        'Const (0)',
        'R&D (1)',
        'Admin (2)',
        'Marketing (3)',
        'State_Florida (4)',
    ]
)

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Sat, 28 Mar 2020",Prob (F-statistic):,8.49e-29
Time:,21:27:29,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Const (0),5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
R&D (1),0.8060,0.046,17.606,0.000,0.714,0.898
Admin (2),-0.0270,0.052,-0.523,0.604,-0.131,0.077
Marketing (3),0.0270,0.017,1.592,0.118,-0.007,0.061
State_Florida (4),220.1585,2900.536,0.076,0.940,-5621.821,6062.138

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [18]:
# Backward elimination: refit with 'State_Florida' (column index 4) removed as well
X_opt = X[:, [0, 1, 2, 3]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary(
    xname=[
        'Const (0)',
        'R&D (1)',
        'Admin (2)',
        'Marketing (3)',
    ]
)

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sat, 28 Mar 2020",Prob (F-statistic):,4.53e-30
Time:,21:29:47,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Const (0),5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
R&D (1),0.8057,0.045,17.846,0.000,0.715,0.897
Admin (2),-0.0268,0.051,-0.526,0.602,-0.130,0.076
Marketing (3),0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [20]:
# Backward elimination: refit with 'Admin' (column index 2) removed as well
X_opt = X[:, [0, 1, 3]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary(
    xname=[
        'Const (0)',
        'R&D (1)',
        'Marketing (3)',
    ]
)

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Sat, 28 Mar 2020",Prob (F-statistic):,2.1600000000000003e-31
Time:,21:31:11,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Const (0),4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
R&D (1),0.7966,0.041,19.266,0.000,0.713,0.880
Marketing (3),0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [21]:
# Backward elimination: refit with 'Marketing' (column index 3 of X) removed,
# leaving only the constant and R&D spend
X_opt = X[:, [0, 1]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary(
    xname=[
        'Const (0)',
        'R&D (1)',
    ]
)

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sat, 28 Mar 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,21:32:08,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Const (0),4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
R&D (1),0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
