# Multiple Linear Regression

# We want to create a model based on the sample data of 50  startup companies (50_Startups.csv) that will allow  venture captalist to assess where and in which into which companies they want to invest to achieve their goal of maximizing profit 


In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Importing the dataset
dataset = pd.read_csv('D:/Data science/Machine Learning A-Z Template Folder/Part 2 - Regression/Section 5 - Multiple Linear Regression/50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values

In [4]:
dataset[:5]

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


# State is a categorical Data 

# Encoding categorical data

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray()

# Avoiding the Dummy Variable Trap

In [6]:
X = X[:, 1:] #Removed 1st column from X # we took all coumns starting from index 1 till the end


# Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)




# Feature Scaling
Linear Regression Class will take care of the freature scaling

# Fitting Multiple Linear Regression to the Training set

In [9]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# Predicting the Test set results

In [10]:
y_pred = regressor.predict(X_test)

In [11]:
y_pred

array([ 182645.56,   91790.61,  110594.11,   84710.77,  101145.55,
        127864.55,   65947.93,  152701.92,  122782.75,   91391.77])

In [12]:
y_test

array([ 182645.56,   91790.61,  110594.11,   84710.77,  101145.55,
        127864.55,   65947.93,  152701.92,  122782.75,   91391.77])

# Building the optimal model using backward elimination

In [13]:
import statsmodels.formula.api as sm
"""
To append column of 50 ones in X matrix 
#ones will be appended at the end of the matrix X
X = np.append(arr = X, values = np.ones(50,1).astype(int),axis = 1 ) 
# we are appending a column of 1's in X matrix for B0 constant
#axis = 0 means adding columns
"""
"""
But we want to append the ones before the matrix
so we reverrse the procedure
we put the array of ones first and the append the X matrix of features late
"""

X = np.append(arr = np.ones((50,1)).astype(int), values = X,axis = 1)

In [14]:
#optimal matrix of features # matrix of only the variables which are statistically significant
X_opt = X[:,[0,1,2,3,4,5]]

In [15]:
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

regressor_OLS.summary() #Gives a summary of the some constants that define the goodness of model

0,1,2,3
Dep. Variable:,y,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,1.063e+30
Date:,"Sun, 05 Aug 2018",Prob (F-statistic):,0.0
Time:,17:04:17,Log-Likelihood:,1091.6
No. Observations:,50,AIC:,-2171.0
Df Residuals:,44,BIC:,-2160.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.366e-11,6.21e-11,0.703,0.485,-8.14e-11,1.69e-10
x1,-4.002e-11,3.04e-11,-1.317,0.195,-1.01e-10,2.12e-11
x2,-2.183e-11,2.93e-11,-0.744,0.461,-8.1e-11,3.73e-11
x3,-2.776e-17,4.18e-16,-0.066,0.947,-8.71e-16,8.15e-16
x4,1.0000,4.71e-16,2.12e+15,0.000,1.000,1.000
x5,-5.239e-16,1.55e-16,-3.390,0.001,-8.35e-16,-2.12e-16

0,1,2,3
Omnibus:,5.241,Durbin-Watson:,0.89
Prob(Omnibus):,0.073,Jarque-Bera (JB):,2.157
Skew:,-0.117,Prob(JB):,0.34
Kurtosis:,2.01,Cond. No.,1450000.0


# Remove predictor X2 because its P value is Sl(Significance level)

In [16]:
X_opt = X[:,[0,1,3,4,5]]

regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,5.653e+30
Date:,"Sun, 05 Aug 2018",Prob (F-statistic):,0.0
Time:,17:04:22,Log-Likelihood:,1127.3
No. Observations:,50,AIC:,-2245.0
Df Residuals:,45,BIC:,-2235.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.164e-10,2.94e-11,3.963,0.000,5.72e-11,1.76e-10
x1,3.638e-11,1.28e-11,2.838,0.007,1.06e-11,6.22e-11
x2,-2.359e-16,2.02e-16,-1.166,0.250,-6.43e-16,1.72e-16
x3,1.0000,2.28e-16,4.38e+15,0.000,1.000,1.000
x4,-1.804e-16,7.49e-17,-2.409,0.020,-3.31e-16,-2.95e-17

0,1,2,3
Omnibus:,0.981,Durbin-Watson:,0.75
Prob(Omnibus):,0.612,Jarque-Bera (JB):,1.028
Skew:,0.239,Prob(JB):,0.598
Kurtosis:,2.484,Cond. No.,1400000.0


# remove predictor x1 COZ its P value is Sl(Significance level)

In [17]:
X_opt = X[:,[0,3,4,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,3.846e+30
Date:,"Sun, 05 Aug 2018",Prob (F-statistic):,0.0
Time:,17:04:43,Log-Likelihood:,1109.9
No. Observations:,50,AIC:,-2212.0
Df Residuals:,46,BIC:,-2204.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.601e-10,4.11e-11,3.894,0.000,7.73e-11,2.43e-10
x1,2.429e-16,2.82e-16,0.860,0.394,-3.26e-16,8.11e-16
x2,1.0000,3.19e-16,3.13e+15,0.000,1.000,1.000
x3,-4.718e-16,1.03e-16,-4.586,0.000,-6.79e-16,-2.65e-16

0,1,2,3
Omnibus:,0.059,Durbin-Watson:,0.879
Prob(Omnibus):,0.971,Jarque-Bera (JB):,0.245
Skew:,-0.039,Prob(JB):,0.885
Kurtosis:,2.666,Cond. No.,1400000.0


# remove predictor x4 COZ its P value is Sl(Significance level)

In [18]:
X_opt = X[:,[0,3,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.149
Model:,OLS,Adj. R-squared:,0.113
Method:,Least Squares,F-statistic:,4.115
Date:,"Sun, 05 Aug 2018",Prob (F-statistic):,0.0226
Time:,17:04:49,Log-Likelihood:,-578.44
No. Observations:,50,AIC:,1163.0
Df Residuals:,47,BIC:,1169.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.173e+05,7749.035,15.141,0.000,1.02e+05,1.33e+05
x1,0.3405,0.119,2.859,0.006,0.101,0.580
x2,-0.0999,0.045,-2.235,0.030,-0.190,-0.010

0,1,2,3
Omnibus:,1.684,Durbin-Watson:,1.748
Prob(Omnibus):,0.431,Jarque-Bera (JB):,1.351
Skew:,-0.401,Prob(JB):,0.509
Kurtosis:,2.936,Cond. No.,532000.0


# remove predictor x5 COZ its P value is Sl(Significance level)

In [19]:
X_opt = X[:,[0,3]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.059
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,2.985
Date:,"Sun, 05 Aug 2018",Prob (F-statistic):,0.0905
Time:,17:04:55,Log-Likelihood:,-580.96
No. Observations:,50,AIC:,1166.0
Df Residuals:,48,BIC:,1170.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.105e+05,7402.924,14.921,0.000,9.56e+04,1.25e+05
x1,0.1477,0.085,1.728,0.090,-0.024,0.320

0,1,2,3
Omnibus:,0.845,Durbin-Watson:,1.71
Prob(Omnibus):,0.656,Jarque-Bera (JB):,0.823
Skew:,-0.289,Prob(JB):,0.663
Kurtosis:,2.755,Cond. No.,165000.0
