# Multiple Linear Regression

## Importing the libraries

In [4]:
#importing the usual libraries: numpy, matplotlib, pandas
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [5]:
# Read the startup data from the CSV file in the working directory.
dataset = pd.read_csv('50_Startups.csv')
# Predictor matrix X = every column except the last; dependent-variable
# vector y = the last column. Both are extracted as NumPy arrays.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [8]:
# Number of rows (observations) in the dataset.
len(dataset)

50

In [9]:
# Number of columns in the dataset.
len(dataset.columns)

5

In [10]:
# Print the predictor matrix to inspect it before encoding; in the printed rows the
# last entry is the state name, stored as a string.
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

In [13]:
# Print the dependent-variable (DV) vector to inspect it.
print(y)

[192261.83 191792.06 191050.39 182901.99 166187.94 156991.12 156122.51
 155752.6  152211.77 149759.96 146121.95 144259.4  141585.52 134307.35
 132602.65 129917.04 126992.93 125370.37 124266.9  122776.86 118474.03
 111313.02 110352.25 108733.99 108552.04 107404.34 105733.54 105008.31
 103282.38 101004.64  99937.59  97483.56  97427.84  96778.92  96712.8
  96479.51  90708.19  89949.14  81229.06  81005.76  78239.91  77798.83
  71498.49  69758.98  65200.33  64926.08  49490.75  42559.73  35673.41
  14681.4 ]


## Encoding categorical data

In [14]:
# Turn the categorical state column into dummy (one-hot) variables with scikit-learn.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# The state names sit at column index 3 (the fourth column of X). That column is
# one-hot encoded; remainder='passthrough' keeps the other columns unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
# Fit the transformer on the predictor matrix and replace X with the encoded array.
X = np.array(ct.fit_transform(X))

# No feature scaling is needed for multiple linear regression. The explanatory
# variables are almost always on different scales (measured in different units),
# but that is not a problem: each coefficient (beta) is estimated so that it
# converts the units of its explanatory variable into the units of the response
# variable.

# The regression assumptions are typically not checked up front in this workflow:
# if, for example, the relationships are not linear, the model will simply perform
# poorly in comparison to other models.

In [15]:
# Print X again to confirm the encoding worked: each printed row should now begin
# with the 0/1 dummy columns, followed by the numeric columns.
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

## Splitting the dataset into the Training set and Test set

In [17]:
# Split the data into training and test sets with scikit-learn.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (80/20 split); random_state=0 fixes the
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [18]:
# Fit a multiple linear regression with scikit-learn's LinearRegression.
from sklearn.linear_model import LinearRegression
# Create the regressor with its default settings.
# NOTE(review): X keeps all three state dummy columns, and LinearRegression does not
# drop one of them. Predictions should be unaffected (the least-squares solver
# tolerates the redundant column), but the individual dummy coefficients are then
# not uniquely determined -- confirm before interpreting them, or encode the state
# with one category dropped.
regressor = LinearRegression()
# Estimate the coefficients from the training set.
regressor.fit(X_train, y_train)

# Backward elimination is not applied in this notebook.
# NOTE(review): LinearRegression does not select features; it fits a coefficient for
# every column of X_train. If uninformative predictors should be removed, that has
# to be done as a separate step -- confirm whether it is needed here.

LinearRegression()

## Predicting the Test set results

In [26]:
# Apply the regression fitted on the training data to the held-out test set.
y_pred = regressor.predict(X_test)
# Show printed floats with 2 decimal places.
np.set_printoptions(precision=2)
# Display the predictions (left column) next to the actual values (right column).
# np.column_stack takes a tuple of vectors of the same length, turns each 1-D
# vector into a column, and joins the columns side by side, giving one row per
# test observation.
print(np.column_stack((y_pred, y_test)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]
