# Multiple Linear Regression

In [1]:
# Importing the libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importing the dataset
# Reads '50_Startups.csv' (path relative to the current working directory)
# into a pandas DataFrame that the following cells slice into features/target.
dataset = pd.read_csv('50_Startups.csv')

In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Feature matrix = first four columns, target vector = fifth column
# (per the preview above: three spend columns + State, then Profit).
X, y = dataset.iloc[:, :4].values, dataset.iloc[:, 4].values

# Echo both shapes as the cell output for a quick sanity check.
X.shape, y.shape

((50, 4), (50,))

In [5]:
# Encoding categorical data
# One-hot encode the categorical column at index 3 (State, per the preview
# above) and pass the remaining numeric columns through unchanged.
#
# `OneHotEncoder(categorical_features=...)` has been removed from scikit-learn,
# so the original call raises a TypeError on current releases.
# ColumnTransformer is the supported replacement; OneHotEncoder also accepts
# string categories directly, so a separate LabelEncoder pass is not needed.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

column_transformer = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [3])],
    remainder='passthrough',  # keep the non-categorical columns
    sparse_threshold=0,       # always return a dense array (replaces .toarray())
)

# Output layout matches the old encoder: the dummy columns come first,
# followed by the passthrough columns in their original order. X starts out
# as an object array (mixed strings/numbers), so cast the result to float.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

In [None]:
# Avoiding the Dummy Variable Trap
# Drop the first column of X (one of the one-hot dummy columns produced by the
# encoding step): with an intercept in the model, keeping every dummy would
# make the columns linearly dependent.
X = X[:, 1:]

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Show the shapes of the four split arrays to verify the train/test partition.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Train an ordinary least-squares model on the training split.
from sklearn.linear_model import LinearRegression

# `fit` returns the estimator itself, so construction and fitting can be chained.
mul_reg = LinearRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
mul_reg

In [None]:
# Predicting the Test set results
# y_pred holds the fitted model's predictions for the rows of X_test.
y_pred = mul_reg.predict(X_test)

In [None]:
# Model quality on both splits. For a regressor, `score` returns the
# coefficient of determination (R^2), not a classification accuracy.
train_score, test_score = (
    mul_reg.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
train_score, test_score

In [None]:
# Building the optimal model using Backward Elimination
# `OLS` is exposed by `statsmodels.api`. The lowercase formula interface in
# `statsmodels.formula.api` no longer provides an `OLS` attribute on current
# releases, so `sm.OLS` would fail with the original import.
import statsmodels.api as sm

# Adding X0 to X as y = b0*X0 + b1*X1 + .... (where X0 = 1)
# statsmodels' OLS does not add an intercept by itself, so a column of ones is
# prepended. The row count is taken from X instead of being hard-coded to 50.
# NOTE: re-running this cell prepends another column of ones each time.
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)

# Fitting the model to X_opt(with all columns)
# Column 0 is the intercept; the rest are the encoded/numeric predictors.
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()

# The summary table is the cell output; its per-coefficient p-values drive
# which column is removed in the next elimination step.
regressor_OLS.summary()

In [None]:
# Backward elimination: refit without column 2 (keeps columns 0, 1, 3, 4, 5).
X_opt = np.take(X, [0, 1, 3, 4, 5], axis=1)
regressor_OLS = sm.OLS(y, X_opt).fit()  # positional order is (endog, exog)
regressor_OLS.summary()

In [None]:
# Backward elimination: refit without column 1 as well (keeps columns 0, 3, 4, 5).
X_opt = np.take(X, [0, 3, 4, 5], axis=1)
regressor_OLS = sm.OLS(y, X_opt).fit()  # positional order is (endog, exog)
regressor_OLS.summary()

In [None]:
# Backward elimination: refit without column 4 as well (keeps columns 0, 3, 5).
X_opt = np.take(X, [0, 3, 5], axis=1)
regressor_OLS = sm.OLS(y, X_opt).fit()  # positional order is (endog, exog)
regressor_OLS.summary()

In [None]:
# Backward elimination: final refit with only the intercept (column 0) and column 3.
X_opt = np.take(X, [0, 3], axis=1)
regressor_OLS = sm.OLS(y, X_opt).fit()  # positional order is (endog, exog)
regressor_OLS.summary()

In [None]:
# Refit the scikit-learn model on the reduced feature set chosen by backward
# elimination and report its scores for comparison with the full model.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Optimal dataset.
# NOTE: X_opt still contains the column of ones added for statsmodels.
# LinearRegression fits its own intercept by default, so that column is
# redundant here, but it is kept to leave the results unchanged.
X = X_opt

# Same 80/20 split and seed as the first model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit on the training split, then predict the held-out rows.
mul_reg = LinearRegression().fit(X_train, y_train)
y_pred = mul_reg.predict(X_test)

# `score` is the coefficient of determination (R^2) for regressors.
train_score, test_score = (
    mul_reg.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
train_score, test_score