# Multiple Regression

In [1]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the startups data from disk and preview the first five rows
csv_path = 'AA50_Startups.csv'
dataset = pd.read_csv(csv_path)
dataset.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Pairwise Pearson correlations between the numeric columns.
# `State` holds strings, so restrict to numeric dtypes explicitly: pandas >= 2.0
# no longer drops non-numeric columns silently in `corr()` and raises instead.
dataset.select_dtypes(include='number').corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.724248,0.9729
Administration,0.241955,1.0,-0.032154,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.747766
Profit,0.9729,0.200717,0.747766,1.0


In [4]:
# List the column labels so the feature and target names used below can be copied exactly
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [5]:
# Predictors are the three spend columns (State is left out); the target is Profit
feature_cols = ['R&D Spend', 'Administration', 'Marketing Spend']
X = dataset.loc[:, feature_cols]
y = dataset.loc[:, 'Profit']
(X.shape, y.shape)

((50, 3), (50,))

In [6]:
# Prepend a column of ones so that the first fitted coefficient acts as the intercept.
# Guarded on X still being the DataFrame built in the previous cell: once this cell
# has run, X is an ndarray, so running it again leaves X alone instead of stacking
# a second column of ones onto the design matrix.
if isinstance(X, pd.DataFrame):
    X = np.concatenate((np.ones((len(X), 1)), X), axis=1)
X.shape

(50, 4)

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split
split = train_test_split(X, y, test_size=0.2, random_state=101)
X_train, X_test, y_train, y_test = split

In [8]:
# Ordinary least squares via the normal equations:
#     c = (X^T X)^(-1) X^T y
# The linear system (X^T X) c = X^T y is solved directly rather than forming the
# explicit inverse; this yields the same coefficients with less rounding error.
gram = np.dot(X_train.T, X_train)      # X^T X
moment = np.dot(X_train.T, y_train)    # X^T y
coeff = np.linalg.solve(gram, moment)  # coeff[0] is the intercept (column 0 of X is all ones)
coeff

array([4.59256271e+04, 7.79946897e-01, 1.74002997e-02, 3.60211219e-02])

In [9]:
# Predicted profit for every held-out row: test design matrix times the fitted coefficients
y_pred = X_test.dot(coeff)
y_pred

array([ 88287.41062646, 151438.84017595, 120537.82823503,  99289.67306481,
       116699.81127514, 191366.93722057, 112014.78869329,  59669.24898719,
        71369.19151394,  48282.09608943])

In [10]:
# Actual held-out Profit values, shown for comparison with the predictions in y_pred
y_test

37     89949.14
14    132602.65
21    111313.02
32     97427.84
22    110352.25
1     191792.06
26    105733.54
46     49490.75
42     71498.49
47     42559.73
Name: Profit, dtype: float64

In [19]:
# Coefficient of determination on the held-out rows: R^2 = 1 - u / v
n_test = float(len(y_test))
residuals = y_test - y_pred            # prediction errors
deviations = y_test - y_test.mean()    # actual values relative to their own mean
u = np.sum(residuals ** 2) / n_test    # mean squared prediction error
v = np.sum(deviations ** 2) / n_test   # mean squared deviation of the actual values
r2 = 1 - u / v
r2

0.9599179765801995

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [21]:
# Mean absolute error between actual and predicted profit on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
mae

6066.865715986234

In [22]:
# Mean squared error between actual and predicted profit on the held-out rows
mse = mean_squared_error(y_test, y_pred)
mse

66241729.6248004

In [23]:
# Root mean squared error: square root of the MSE computed above, in the same units as Profit
rmse = mse ** 0.5
rmse

8138.902237083353