### Loading the 50 Startups Dataset

In [1]:
# Suppress all warnings by installing a global "ignore" filter
import warnings
warnings.filterwarnings("ignore")

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Read the CSV, report its dimensions, then preview the first 5 rows
dataset = pd.read_csv('50_Startups.csv')
print("Rows", len(dataset), "Columns", len(dataset.columns))
dataset.head()

Rows 50 Columns 5


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [2]:
# List the column labels as a plain Python list
print("Columns: ", dataset.columns.tolist())

Columns:  ['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit']


### Converting Categorical Columns to Numerical Columns

<pre>
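Since a categorical column cannot be processed directly by the model, we convert categorical columns into numerical ones.
There are two kinds of categorical data, and the conversion depends on which kind the column holds:
a. ordinal, where the categories have a natural order (e.g. First, Second or Monday, Tuesday)
b. nominal, where the categories have no order (e.g. Green, Blue, Red, Purple)
The State column is nominal, so it is converted into one indicator (dummy) column per category below.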
</pre>

In [3]:
# Without drop_first, each category gets its own indicator column,
# so one of them is redundant: it can be derived from the others
pd.get_dummies(data=dataset).head(n=3)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,False,False,True
1,162597.7,151377.59,443898.53,191792.06,True,False,False
2,153441.51,101145.55,407934.54,191050.39,False,True,False


In [4]:
# drop_first=True discards the first indicator column of each categorical
# variable; its value can still be recovered from the remaining columns
dataset = pd.get_dummies(data=dataset, drop_first=True)
dataset.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,False,True
1,162597.7,151377.59,443898.53,191792.06,False,False
2,153441.51,101145.55,407934.54,191050.39,True,False
3,144372.41,118671.85,383199.62,182901.99,False,True
4,142107.34,91391.77,366168.42,166187.94,True,False


### Assigning Input and Output Variables (5 i/p & 1 o/p)

In [5]:
# Input features: the three spend columns plus the two State indicator columns
independant = dataset.loc[
    :,
    ['R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida', 'State_New York'],
]
independant.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,False,True
1,162597.7,151377.59,443898.53,False,False
2,153441.51,101145.55,407934.54,True,False
3,144372.41,118671.85,383199.62,False,True
4,142107.34,91391.77,366168.42,True,False


In [6]:
# Target: Profit, kept as a single-column DataFrame rather than a Series
dependant = dataset.loc[:, ['Profit']]
dependant.head(5)

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


### Split of Data into Train and Test

In [7]:
# Hold out one third of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    independant,
    dependant,
    test_size=1/3,
    random_state=43,
)

In [8]:
# Report the size of the training inputs and preview the first two rows
print("X_train", "Rows:", len(X_train), "Columns:", len(X_train.columns))
X_train.head(n=2)

X_train Rows: 33 Columns: 5


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
7,130298.13,145530.06,323876.68,True,False
22,73994.56,122782.75,303319.26,True,False


In [9]:
# Report the size of the test inputs and preview the first two rows
print("X_test", "Rows:", len(X_test), "Columns:", len(X_test.columns))
X_test.head(n=2)

X_test Rows: 17 Columns: 5


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
19,86419.7,153514.11,0.0,False,True
36,28663.76,127056.21,201126.82,True,False


In [10]:
# Report the size of the training targets and preview the first two rows
print("y_train", "Rows:", len(y_train), "Columns:", len(y_train.columns))
y_train.head(n=2)

y_train Rows: 33 Columns: 1


Unnamed: 0,Profit
7,155752.6
22,110352.25


In [11]:
# Report the size of the test targets and preview the first two rows
print("y_test", "Rows:", len(y_test), "Columns:", len(y_test.columns))
y_test.head(n=2)

y_test Rows: 17 Columns: 1


Unnamed: 0,Profit
19,122776.86
36,90708.19



### Training Model using Train Data (X_train, y_train)

$y = (a_1x_1 + a_2x_2 + a_3x_3 + a_4x_4 + a_5x_5) + b_0$

In [12]:
# Fit an ordinary least-squares linear model on the training split
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [13]:
# Pull the learned parameters off the fitted model, then report them.
# Both attributes are indexed with [0] to take the entry for the single target.
weight = regressor.coef_
bias = regressor.intercept_

print("Weights of the model (a1-a5)={}".format(weight[0]))
print("Intercept of the model (b0)={}".format(bias[0]))

Weights of the model (a1-a5)=[ 8.37383196e-01 -5.05365809e-02  2.78515029e-02  5.93192452e+02
 -2.44037336e+03]
Intercept of the model (b0)=50453.52936165679


In [14]:
# Predict on the held-out inputs, report the shape, and preview three predictions
y_pred = regressor.predict(X_test)
print("y_pred", "Rows:", len(y_pred), "Columns:", y_pred.shape[1])
pd.DataFrame(data=y_pred, columns=['y_pred']).head(n=3)

y_pred Rows: 17 Columns: 1


Unnamed: 0,y_pred
0,112621.482331
1,74229.970552
2,100417.477544


#### Model Evaluation
https://scikit-learn.org/stable/modules/model_evaluation.html

In [15]:
# Score the test-set predictions with several regression metrics.
from sklearn.metrics import (
    max_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score,
)

mae = mean_absolute_error(y_test, y_pred)
# NOTE: MSLE takes logarithms, so scikit-learn rejects negative targets or predictions.
msle = mean_squared_log_error(y_test, y_pred)
# Named `mape` (not `map`) so the built-in map() is not shadowed for the
# rest of the kernel session.
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
max_er = max_error(y_test, y_pred)
r_score = r2_score(y_test, y_pred)

print("Mean absolute error:", mae)
print("Mean squared logarithmic error:", msle)
print("Mean absolute percentage error:", mape)
print("Mean squared error:", mse)
print("Maximum error:", max_er)
print("R2 Score:", r_score)

Mean absolute error: 6992.7371999944535
Mean squared logarithmic error: 0.01517856633861719
Mean absolute percentage error: 0.06860542892746677
Mean squared error: 87214479.88883439
Maximum error: 22296.59104555419
R2 Score: 0.9177499385783591


#### Saving Trained Model

In [16]:
# Persist the fitted regressor to disk so it can be reloaded without retraining.
import pickle

filename = "Multiple_Linear_Trained_Model.sav"
# The context manager flushes and closes the file as soon as the dump
# finishes, instead of leaving an open handle for the garbage collector.
with open(filename, "wb") as model_file:
    pickle.dump(regressor, model_file)