### Unlike simple linear regression, multiple linear regression takes several features as independent variables

In [57]:
# Data preprocessing for multiple linear regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Importing the dataset: every column except the last is a feature, column 4 is the target
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# One-hot encoding the categorical column at index 3.
# OneHotEncoder(categorical_features = ...) was deprecated in scikit-learn 0.20 and removed
# in 0.22; ColumnTransformer is the supported way to encode a single column.
# OneHotEncoder accepts the raw category values directly, so no LabelEncoder pass is needed.
# The encoded dummy columns come first and the untouched columns follow (remainder),
# which is the same column order the old categorical_features option produced.
onehotencoder = OneHotEncoder()
columntransformer = ColumnTransformer(
    transformers = [('onehot', onehotencoder, [3])],
    remainder = 'passthrough',
    sparse_threshold = 0,  # always return a dense array
)
# The passthrough columns come from an object array, so cast the result to float explicitly
X = np.asarray(columntransformer.fit_transform(X), dtype = float)

#Avoiding the dummy variable trap (keeping n-1 dummy columns by dropping the first one)
X = X[:,1:]


# Splitting the dataset into the Training set and Test set (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [58]:
# Train an ordinary least-squares regressor on the training split
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
# fit() hands back the fitted estimator, which the notebook shows as this cell's output
regressor.fit(X = X_train, y = y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [59]:
# Predict the profit of every test row from the feature columns the model was trained on
y_pred = regressor.predict(X_test)

# Print actual and predicted profits with one blank line between them
print('Real Profits {}'.format(y_test), end = '\n\n')
print('Predicted Profits {}'.format(y_pred))

Real Profits [103282.38 144259.4  146121.95  77798.83 191050.39 105008.31  81229.06
  97483.56 110352.25 166187.94]

Predicted Profits [103015.20159796 132582.27760815 132447.73845175  71976.09851258
 178537.48221056 116161.24230166  67851.69209676  98791.73374687
 113969.43533013 167921.06569551]


In [60]:
# Residuals: actual profit minus predicted profit, one value per test row
a = np.subtract(y_test, y_pred)
print('The Difference between actual and predicted results', a, sep = '\n')

The Difference between actual and predicted results
[   267.17840204  11677.12239185  13674.21154825   5822.73148742
  12512.90778944 -11152.93230166  13377.36790324  -1308.17374687
  -3617.18533013  -1733.12569551]


### We have included all the independent variables in our ML model, but do we need to do so?
### Some features may be of no use to the model; they only increase its time complexity and can reduce its effectiveness.
### Hence we will now begin eliminating unwanted features from our set of independent variables.
## Building the optimal model using backward elimination
### Backward elimination starts with all independent variables in the model and then removes, one at a time, the variables that are not statistically significant

## Pseudocode (Backward Elimination)
### 1) Select a significance level (SL) for variables to stay in the model (e.g. SL = 0.05)
### 2) Fit the model with all the possible predictors (independent variables)
### 3) Consider the predictor with the highest P-value; if its P-value > SL (0.05, i.e. 5%), go to step 4; otherwise stop, the model is final
### 4) Remove that predictor
### 5) Fit the model without this variable, then go back to step 3

In [52]:
# statsmodels' OLS does not add an intercept on its own, so X needs a leading column of
# 1's (the x0 = 1 that multiplies b0).
# OLS is taken from statsmodels.api: current statsmodels releases no longer export the
# upper-case OLS class from statsmodels.formula.api (only the formula-based `ols`).
import statsmodels.api as sm

# add_constant sizes the column of ones from X itself (no hard-coded 50 rows) and puts it
# first. has_constant = 'skip' returns X unchanged when it already contains a constant
# column, so re-running this cell cannot stack a second intercept column.
X = sm.add_constant(X, prepend = True, has_constant = 'skip')

In [53]:
# X_opt is the matrix of columns currently kept in the model.
# 1) Start backward elimination with every column of X (index 0 is the column of ones)
#    and inspect the P-value of each one in the summary.
X_opt = X[:, list(range(6))]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Thu, 09 May 2019",Prob (F-statistic):,1.34e-27
Time:,14:22:04,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [29]:
# 2) Drop column 2 of X (x2 in the previous summary, highest P-value: 0.990) and refit
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt)
regressor_OLS = regressor_OLS.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Thu, 09 May 2019",Prob (F-statistic):,8.49e-29
Time:,14:04:20,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [30]:
# 3) Drop column 1 of X as well (x1 in the previous summary, highest P-value: 0.940) and refit
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(exog = X_opt, endog = y).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Thu, 09 May 2019",Prob (F-statistic):,4.53e-30
Time:,14:04:41,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [54]:
# 4) Drop column 4 of X (x2 in the previous summary, highest P-value: 0.602) and refit
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(y, exog = X_opt)
regressor_OLS = regressor_OLS.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Thu, 09 May 2019",Prob (F-statistic):,2.1600000000000003e-31
Time:,14:22:11,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [55]:
# 5) Drop column 5 of X (x2 in the previous summary; its P-value of 0.060 is still above 0.05).
#    Only the column of ones (0) and column 3 remain in the model.
X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Thu, 09 May 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,14:22:20,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


### We now know that, at the 5% significance level, R&D spend is the only statistically significant predictor of profit: it is the sole member of the optimal set of features, so we drop Admin spend, Marketing spend and the state features.

In [63]:
# Retrain the linear model on the single feature kept by backward elimination

# Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the data: column 0 (R&D spend) is the only feature, column 4 is the target
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, [0]].values  # list index keeps X two-dimensional
y = dataset.iloc[:, 4].values

# The categorical column is not selected, so no encoding step is needed in this cell

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Fit the model on the training split
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict the test-set profits from the single feature
y_pred = regressor.predict(X_test)

# Report actual values, predictions and their differences, separated by blank lines
print('Real Profits {}'.format(y_test), end = '\n\n')
print('Predicted Profits {}'.format(y_pred), end = '\n\n')
print('The Difference between actual and predicted results')
print(y_test - y_pred)

Real Profits [103282.38 144259.4  146121.95  77798.83 191050.39 105008.31  81229.06
  97483.56 110352.25 166187.94]

Predicted Profits [104667.27805998 134150.83410578 135207.80019517  72170.54428856
 179090.58602508 109824.77386586  65644.27773757 100481.43277139
 111431.75202432 169438.14843539]

The Difference between actual and predicted results
[-1384.89805998 10108.56589422 10914.14980483  5628.28571144
 11959.80397492 -4816.46386586 15584.78226243 -2997.87277139
 -1079.50202432 -3250.20843539]
