In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


# Importing Data

In [3]:
dataset = pd.read_csv('drive/My Drive/Machine Learning A_to_Z/Linear Regression/50_Startups.csv')
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [4]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4].values

# Encoding independent variable

In [5]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
labelencoder_X  =LabelEncoder()

X[:,3] = labelencoder_X.fit_transform(X[:,3])
onehotencoder = OneHotEncoder()

X = onehotencoder.fit_transform(X).toarray()


# Avoiding dummy variable trap

In [6]:
X = X[: , 1:]

# Splitting dataset into training and test set

In [7]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2, random_state=0)

# Fitting multiple linear regression into training set

In [8]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

regressor.fit(X_train , y_train)

#Predicting test set results

In [9]:
y_pred = regressor.predict(X_test)
y_pred

array([110421.259933  ,  99635.0731779 , 110421.259933  , 110421.259933  ,
       110421.259933  , 119829.39601805, 119829.39601805, 119829.39601805,
       110421.259933  , 110421.259933  ])

# Building optimal model using backward elimination

In [10]:
import statsmodels.api as sm
x = np.append(arr = X , values=np.ones((50,1)).astype(int) , axis=1)
X_opt = X[: , [0,1,2,3,4,5]]
regressor_OLS = sm.OLS(endog= y , exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.033
Model:,OLS,Adj. R-squared (uncentered):,-0.098
Method:,Least Squares,F-statistic:,0.2539
Date:,"Sun, 20 Aug 2023",Prob (F-statistic):,0.955
Time:,15:40:34,Log-Likelihood:,-654.4
No. Observations:,50,AIC:,1321.0
Df Residuals:,44,BIC:,1332.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,3.567e+04,1.25e+05,0.286,0.776,-2.15e+05,2.87e+05
x2,6.493e+04,1.25e+05,0.521,0.605,-1.86e+05,3.16e+05
x3,4.949e+04,1.25e+05,0.397,0.693,-2.02e+05,3.01e+05
x4,6.976e+04,1.25e+05,0.560,0.578,-1.81e+05,3.21e+05
x5,8.123e+04,1.25e+05,0.652,0.518,-1.7e+05,3.32e+05
x6,6.52e+04,1.25e+05,0.523,0.603,-1.86e+05,3.16e+05

0,1,2,3
Omnibus:,3.633,Durbin-Watson:,0.036
Prob(Omnibus):,0.163,Jarque-Bera (JB):,3.22
Skew:,-0.62,Prob(JB):,0.2
Kurtosis:,2.916,Cond. No.,1.0


In [11]:
# remove the index with highest p value
X_opt = X[: , [1,2,3,4,5]]
regressor_OLS = sm.OLS(endog= y , exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.032
Model:,OLS,Adj. R-squared (uncentered):,-0.076
Method:,Least Squares,F-statistic:,0.2942
Date:,"Sun, 20 Aug 2023",Prob (F-statistic):,0.914
Time:,15:49:03,Log-Likelihood:,-654.45
No. Observations:,50,AIC:,1319.0
Df Residuals:,45,BIC:,1328.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,6.493e+04,1.23e+05,0.526,0.601,-1.83e+05,3.13e+05
x2,4.949e+04,1.23e+05,0.401,0.690,-1.99e+05,2.98e+05
x3,6.976e+04,1.23e+05,0.566,0.574,-1.79e+05,3.18e+05
x4,8.123e+04,1.23e+05,0.659,0.514,-1.67e+05,3.3e+05
x5,6.52e+04,1.23e+05,0.529,0.600,-1.83e+05,3.14e+05

0,1,2,3
Omnibus:,3.284,Durbin-Watson:,0.033
Prob(Omnibus):,0.194,Jarque-Bera (JB):,2.838
Skew:,-0.583,Prob(JB):,0.242
Kurtosis:,2.943,Cond. No.,1.0


In [12]:
# remove the index with highest p value
X_opt = X[: , [1,3,4,5]]
regressor_OLS = sm.OLS(endog= y , exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.028
Model:,OLS,Adj. R-squared (uncentered):,-0.056
Method:,Least Squares,F-statistic:,0.3336
Date:,"Sun, 20 Aug 2023",Prob (F-statistic):,0.854
Time:,15:55:05,Log-Likelihood:,-654.54
No. Observations:,50,AIC:,1317.0
Df Residuals:,46,BIC:,1325.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,6.493e+04,1.22e+05,0.531,0.598,-1.81e+05,3.11e+05
x2,6.976e+04,1.22e+05,0.571,0.571,-1.76e+05,3.16e+05
x3,8.123e+04,1.22e+05,0.665,0.510,-1.65e+05,3.27e+05
x4,6.52e+04,1.22e+05,0.534,0.596,-1.81e+05,3.11e+05

0,1,2,3
Omnibus:,2.894,Durbin-Watson:,0.034
Prob(Omnibus):,0.235,Jarque-Bera (JB):,2.385
Skew:,-0.535,Prob(JB):,0.303
Kurtosis:,2.999,Cond. No.,1.0
