## Multiple Linear regression
### This is a straightforward All-In model training practice.


In [1]:
import numpy as np
import matplotlib.pyplot as plt     # for plotting
import pandas as pd                # for data handling

In [2]:
# Importing the dataset
# The CSV path is relative, so it is resolved against the notebook's working directory.
df = pd.read_csv('./50_Startups.csv')
# Preview the first five rows to check the columns that were loaded.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Splitting the dataset into the independent and dependent variables.
# Features are every column except the last; the target is the last column
# (Profit). Both slices are anchored to the end of the frame, so they cannot
# drift apart if the number of columns ever changes.
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values


In [4]:
# Inspect the raw feature matrix (three numeric columns plus the State string).
# It is displayed directly rather than through an `x_test` alias: that name is
# reassigned by train_test_split further down, so binding the full matrix to
# it here was misleading.
x

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

#### Now I have 3 choices for encoding the data
1. using pandas before splitting
2. using pandas after splitting
3. using scikit-learn to encode

#### Encode using scikit-learn

In [5]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

### Label Encoding 
###### Label Encoding is a technique that is used to convert categorical columns into numerical ones so that they can be fitted by machine learning models which only take numerical data. It is an important pre-processing step in a machine-learning project

In [6]:
# LabelEncoder_x = LabelEncoder()
# x[:, 3] = LabelEncoder_x.fit_transform(x[:, 3]) 


In [7]:
# x

#### One Hot Encoding 
###### One hot encoding is a technique that we use to represent categorical variables as numerical values in a machine learning model.

In [8]:
# Initialize OneHotEncoder
# sparse_output=False makes the encoder return a dense NumPy array rather than
# a SciPy sparse matrix, so the result can be stacked next to the numeric columns.
encoder = OneHotEncoder(sparse_output=False)

In [9]:
# Pull out the State column (the last column of x) as an (n_samples, 1) array,
# which is the 2-D layout the encoder is given below.
last_column = x[:, -1:]
last_column

array([['New York'],
       ['California'],
       ['Florida'],
       ['New York'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['Florida'],
       ['California'],
       ['Florida'],
       ['California'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['New York'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['New York'],
       ['Florida'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['Florida'],
       ['New York'],
       ['Florida'],
       ['New York'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['Florida'],
       ['California'],
       ['New York'],
       ['Florida'],
       ['California'],
       ['New York'],
       ['California'],
       ['California'],
       ['Florida'],
       ['California'],
       ['New York'],
       ['California'],
       ['New York'],
       ['Florida'],

In [10]:
# Encode the last column
# fit_transform learns the distinct states and returns one indicator column per
# state. Judging by the output below, the columns are ordered
# California, Florida, New York.
encoded_city = encoder.fit_transform(last_column)
encoded_city

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0

In [11]:
# Rebuild the feature matrix: the numeric columns of x (everything except
# State) followed by the one-hot State columns, joined column-wise.
encoded_x = np.concatenate((x[:, :-1], encoded_city), axis=1)
encoded_x

array([[165349.2, 136897.8, 471784.1, 0.0, 0.0, 1.0],
       [162597.7, 151377.59, 443898.53, 1.0, 0.0, 0.0],
       [153441.51, 101145.55, 407934.54, 0.0, 1.0, 0.0],
       [144372.41, 118671.85, 383199.62, 0.0, 0.0, 1.0],
       [142107.34, 91391.77, 366168.42, 0.0, 1.0, 0.0],
       [131876.9, 99814.71, 362861.36, 0.0, 0.0, 1.0],
       [134615.46, 147198.87, 127716.82, 1.0, 0.0, 0.0],
       [130298.13, 145530.06, 323876.68, 0.0, 1.0, 0.0],
       [120542.52, 148718.95, 311613.29, 0.0, 0.0, 1.0],
       [123334.88, 108679.17, 304981.62, 1.0, 0.0, 0.0],
       [101913.08, 110594.11, 229160.95, 0.0, 1.0, 0.0],
       [100671.96, 91790.61, 249744.55, 1.0, 0.0, 0.0],
       [93863.75, 127320.38, 249839.44, 0.0, 1.0, 0.0],
       [91992.39, 135495.07, 252664.93, 1.0, 0.0, 0.0],
       [119943.24, 156547.42, 256512.92, 0.0, 1.0, 0.0],
       [114523.61, 122616.84, 261776.23, 0.0, 0.0, 1.0],
       [78013.11, 121597.55, 264346.06, 1.0, 0.0, 0.0],
       [94657.16, 145077.58, 282574.31, 0.

### Model Train

In [12]:
# Avoiding the Dummy Variable Trap
# Keep every column except the last one-hot column; with an intercept in the
# model, the dropped dummy would be redundant (it is fully determined by the
# remaining dummies).
final_x = encoded_x[:, :-1]
final_x

array([[165349.2, 136897.8, 471784.1, 0.0, 0.0],
       [162597.7, 151377.59, 443898.53, 1.0, 0.0],
       [153441.51, 101145.55, 407934.54, 0.0, 1.0],
       [144372.41, 118671.85, 383199.62, 0.0, 0.0],
       [142107.34, 91391.77, 366168.42, 0.0, 1.0],
       [131876.9, 99814.71, 362861.36, 0.0, 0.0],
       [134615.46, 147198.87, 127716.82, 1.0, 0.0],
       [130298.13, 145530.06, 323876.68, 0.0, 1.0],
       [120542.52, 148718.95, 311613.29, 0.0, 0.0],
       [123334.88, 108679.17, 304981.62, 1.0, 0.0],
       [101913.08, 110594.11, 229160.95, 0.0, 1.0],
       [100671.96, 91790.61, 249744.55, 1.0, 0.0],
       [93863.75, 127320.38, 249839.44, 0.0, 1.0],
       [91992.39, 135495.07, 252664.93, 1.0, 0.0],
       [119943.24, 156547.42, 256512.92, 0.0, 1.0],
       [114523.61, 122616.84, 261776.23, 0.0, 0.0],
       [78013.11, 121597.55, 264346.06, 1.0, 0.0],
       [94657.16, 145077.58, 282574.31, 0.0, 0.0],
       [91749.16, 114175.79, 294919.57, 0.0, 1.0],
       [86419.7, 153514.1

In [13]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the split is reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    final_x,
    y,
    test_size=0.2,
    random_state=0,
)

###### Fitting Model

In [14]:
from sklearn.linear_model import LinearRegression
# "All-In" model: fit a linear regression on the training split using every
# feature column of x_train.
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [15]:
# Predicting the test set results
# x_test is the held-out split produced by train_test_split.
y_pred = regressor.predict(x_test)

In [16]:
# Display the predictions for the held-out rows.
y_pred

array([103015.20159776, 132582.27760831, 132447.73845184,  71976.09851266,
       178537.4822107 , 116161.24230157,  67851.69209689,  98791.73374679,
       113969.43533008, 167921.06569569])

### Up to this point, this is the same as a normal ML model


##### What is Backward Elimination in multiple linear regression?
##### The algorithm is as follows:
1. Select a significance level to stay in the model (e.g. SL = 0.05).
2. Fit the model with all possible parameters.
3. Consider the parameter with the highest P-value. If P > SL, go to step 4; otherwise the model is final.
4. Remove that parameter, refit the model without it, and return to step 3.

In [28]:
# Building the optimal model using Backward Elimination
import statsmodels.api as sm_api

In [20]:
# Column of ones that serves as the intercept term for statsmodels' OLS, which
# does not add one on its own. The row count is taken from x_train instead of a
# hard-coded 40, so it stays correct if the dataset or test_size changes.
x_constant = np.ones((x_train.shape[0], 1), dtype=int)
x_constant

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [21]:
# Prepend the intercept column to the training features.
# Guarded so that re-running this cell does not stack a second column of ones
# (which would shift every column index used later): the append only happens
# while x_train still has the same width as final_x.
if x_train.shape[1] == final_x.shape[1]:
    x_train = np.append(arr = x_constant, values = x_train, axis = 1)

In [22]:
# Inspect the training matrix; column 0 now holds the intercept column of ones.
x_train

array([[1, 55493.95, 103057.49, 214634.81, 0.0, 1.0],
       [1, 46014.02, 85047.44, 205517.64, 0.0, 0.0],
       [1, 75328.87, 144135.98, 134050.07, 0.0, 1.0],
       [1, 46426.07, 157693.92, 210797.67, 1.0, 0.0],
       [1, 91749.16, 114175.79, 294919.57, 0.0, 1.0],
       [1, 130298.13, 145530.06, 323876.68, 0.0, 1.0],
       [1, 119943.24, 156547.42, 256512.92, 0.0, 1.0],
       [1, 1000.23, 124153.04, 1903.93, 0.0, 0.0],
       [1, 542.05, 51743.15, 0.0, 0.0, 0.0],
       [1, 65605.48, 153032.06, 107138.38, 0.0, 0.0],
       [1, 114523.61, 122616.84, 261776.23, 0.0, 0.0],
       [1, 61994.48, 115641.28, 91131.24, 0.0, 1.0],
       [1, 63408.86, 129219.61, 46085.25, 1.0, 0.0],
       [1, 78013.11, 121597.55, 264346.06, 1.0, 0.0],
       [1, 23640.93, 96189.63, 148001.11, 1.0, 0.0],
       [1, 76253.86, 113867.3, 298664.47, 1.0, 0.0],
       [1, 15505.73, 127382.3, 35534.17, 0.0, 0.0],
       [1, 120542.52, 148718.95, 311613.29, 0.0, 0.0],
       [1, 91992.39, 135495.07, 252664.93, 

In [26]:
import statsmodels as sm
# Record the installed statsmodels version alongside the results.
print(sm.__version__)

0.14.1


In [43]:
'''
Right now, we have two ways for selecting parameters:
'''

# Start backward elimination from the full design matrix: the intercept
# column (0) plus all five predictors (1-5). x_train has object dtype, so the
# selection is converted to float before it is handed to statsmodels.
x_train_opt = x_train[:, [0, 1, 2, 3, 4, 5]].astype(float)
# x_train_opt

# Or

# x_train_opt = np.array(x_train[:, [0, 1, 2, 3, 4, 5]], dtype=float)
# x_train_opt


In [44]:
# Fit OLS on the full design matrix and show the complete summary; the P>|t|
# column decides which predictor is eliminated next.
regressor_OLS = sm_api.OLS(y_train, x_train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,129.7
Date:,"Fri, 29 Dec 2023",Prob (F-statistic):,3.91e-21
Time:,01:11:06,Log-Likelihood:,-421.1
No. Observations:,40,AIC:,854.2
Df Residuals:,34,BIC:,864.3
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.325e+04,8315.816,5.201,0.000,2.64e+04,6.02e+04
x1,0.7735,0.055,14.025,0.000,0.661,0.886
x2,0.0329,0.066,0.495,0.624,-0.102,0.168
x3,0.0366,0.019,1.884,0.068,-0.003,0.076
x4,-699.3691,3661.563,-0.191,0.850,-8140.560,6741.822
x5,-1658.6532,4209.221,-0.394,0.696,-1.02e+04,6895.513

0,1,2,3
Omnibus:,15.823,Durbin-Watson:,2.468
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.231
Skew:,-1.094,Prob(JB):,9.03e-06
Kurtosis:,6.025,Cond. No.,1480000.0


In [48]:
# Elimination round 1: in the full model, x_train column 4 (reported as x4)
# had the highest p-value (0.850 > 0.05), so it is dropped and the model refit.
x_train_opt = x_train[:, [0, 1, 2, 3, 5]].astype(float)
regressor_OLS = sm_api.OLS(y_train, x_train_opt).fit()
regressor_OLS.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.292e+04,8020.397,5.352,0.000,2.66e+04,5.92e+04
x1,0.7754,0.053,14.498,0.000,0.667,0.884
x2,0.0319,0.065,0.488,0.629,-0.101,0.165
x3,0.0363,0.019,1.902,0.065,-0.002,0.075
x4,-1272.1608,3639.780,-0.350,0.729,-8661.308,6116.986


In [49]:
# Elimination round 2: x_train column 5 (reported as x4 in the previous fit)
# had the highest p-value (0.729 > 0.05), so it is dropped as well.
x_train_opt = x_train[:, [0, 1, 2, 3]].astype(float)
regressor_OLS = sm_api.OLS(y_train, x_train_opt).fit()
regressor_OLS.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.299e+04,7919.773,5.428,0.000,2.69e+04,5.91e+04
x1,0.7788,0.052,15.003,0.000,0.674,0.884
x2,0.0294,0.064,0.458,0.650,-0.101,0.160
x3,0.0347,0.018,1.896,0.066,-0.002,0.072


In [50]:
# Elimination round 3: x_train column 2 (reported as x2 in the previous fit)
# had the highest p-value (0.650 > 0.05), so it is removed.
x_train_opt = x_train[:, [0, 1, 3]].astype(float)
regressor_OLS = sm_api.OLS(y_train, x_train_opt).fit()
regressor_OLS.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.635e+04,2971.236,15.598,0.000,4.03e+04,5.24e+04
x1,0.7886,0.047,16.846,0.000,0.694,0.883
x2,0.0326,0.018,1.860,0.071,-0.003,0.068


In [51]:
# Elimination round 4: x_train column 3 (reported as x2 in the previous fit)
# still had p = 0.071 > 0.05, so it is removed too, leaving only the intercept
# and x_train column 1.
x_train_opt = x_train[:, [0, 1]].astype(float)
regressor_OLS = sm_api.OLS(y_train, x_train_opt).fit()
regressor_OLS.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.842e+04,2842.717,17.032,0.000,4.27e+04,5.42e+04
x1,0.8516,0.033,25.542,0.000,0.784,0.919


In [57]:
# Peek at the first five rows of the two surviving columns (intercept and column 1).
x_train[:5, :2]

array([[1, 55493.95],
       [1, 46014.02],
       [1, 75328.87],
       [1, 46426.07],
       [1, 91749.16]], dtype=object)

In [55]:
# Re-display the raw data to map the surviving x_train column back to its
# feature name (presumably R&D Spend, the first feature column).
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


##### So, according to backward elimination, R&D Spend is the only predictor that remains statistically significant (P < 0.05), making it the strongest predictor of Profit.
