# Multiple Linear Regression

## Importing the libraries

In [4]:
import pandas as pd
import numpy as np

## Importing the dataset

In [5]:
data = pd.read_csv("50_Startups.csv")
inputs = data.iloc[:,:-1].values
targets = data.iloc[:,-1].values


## Encoding categorical data

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoder',OneHotEncoder(),[-1])], remainder = "passthrough")
inputs = np.array(ct.fit_transform(inputs))

In [7]:
inputs

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

## Splitting the dataset into the Training set and Test set

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, test_size = .2, random_state = 1)

In [9]:
x_test

array([[0.0, 0.0, 1.0, 72107.6, 127864.55, 353183.81],
       [0.0, 0.0, 1.0, 46014.02, 85047.44, 205517.64],
       [1.0, 0.0, 0.0, 28754.33, 118546.05, 172795.67],
       [0.0, 0.0, 1.0, 20229.59, 65947.93, 185265.1],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 0.0, 1.0, 542.05, 51743.15, 0.0],
       [0.0, 0.0, 1.0, 65605.48, 153032.06, 107138.38],
       [0.0, 1.0, 0.0, 1315.46, 115816.21, 297114.46],
       [0.0, 0.0, 1.0, 61136.38, 152701.92, 88218.23]], dtype=object)

## Training the Multiple Linear Regression model on the Training set

In [10]:
## This class we will use to build the model will automatically pass the dummy trap and 
## will implement backwards elimination
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the Test set results

In [11]:
y_pred = regressor.predict(x_test)
np.set_printoptions(precision=2)

In [32]:
y_pred = y_pred.reshape(len(y_pred),1)
y_pred

array([[114664.42],
       [ 90593.16],
       [ 75692.84],
       [ 70221.89],
       [179790.26],
       [171576.92],
       [ 49753.59],
       [102276.66],
       [ 58649.38],
       [ 98272.03]])

In [61]:
y_test = y_test.reshape(len(y_test),1)
y_test

array([[105008.31],
       [ 96479.51],
       [ 78239.91],
       [ 81229.06],
       [191050.39],
       [182901.99],
       [ 35673.41],
       [101004.64],
       [ 49490.75],
       [ 97483.56]])

In [55]:
dif_array = np.concatenate((y_test.reshape(len(y_test),1),y_pred.reshape(len(y_pred),1)),axis=1)

In [76]:
df = pd.DataFrame(data = dif_array, columns=["Real","Predicted"])
df["Difference"] = df.Predicted - df.Real
df.round(2)

Unnamed: 0,Real,Predicted,Difference
0,105008.31,114664.42,9656.11
1,96479.51,90593.16,-5886.35
2,78239.91,75692.84,-2547.07
3,81229.06,70221.89,-11007.17
4,191050.39,179790.26,-11260.13
5,182901.99,171576.92,-11325.07
6,35673.41,49753.59,14080.18
7,101004.64,102276.66,1272.02
8,49490.75,58649.38,9158.63
9,97483.56,98272.03,788.47
