In [164]:
# Importing Libraries for Multiple Linear Regression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [165]:
# Load the 50_Startups.csv dataset into a DataFrame.
# NOTE(review): the path is absolute (looks like a Colab /content directory) - adjust when running elsewhere.

df = pd.read_csv(filepath_or_buffer='/content/50_Startups.csv')

In [166]:
# Checking the shape of the data, reported as (rows, columns)

df.shape

(50, 5)

In [167]:
# Preview the first 5 records of the data

df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [168]:
# Preview the last 5 records of the data

df.tail(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
45,1000.23,124153.04,1903.93,New York,64926.08
46,1315.46,115816.21,297114.46,Florida,49490.75
47,0.0,135426.92,0.0,California,42559.73
48,542.05,51743.15,0.0,New York,35673.41
49,0.0,116983.8,45173.06,California,14681.4


In [169]:
# Preview 5 randomly chosen records of the data.
# A fixed random_state makes the same rows appear on every Restart & Run All,
# so the displayed sample stays reproducible.

df.sample(5, random_state=42)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
44,22177.74,154806.14,28334.72,California,65200.33
28,66051.52,182645.56,118148.2,Florida,103282.38
45,1000.23,124153.04,1903.93,New York,64926.08
15,114523.61,122616.84,261776.23,New York,129917.04
1,162597.7,151377.59,443898.53,California,191792.06


In [170]:
# Displaying the DataFrame loaded from the 50_Startups.csv file

df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [171]:
# List the distinct category labels present in the State column

df.loc[:, 'State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [172]:
# Count how many records fall under each State label

df.loc[:, 'State'].value_counts()

Unnamed: 0_level_0,count
State,Unnamed: 1_level_1
New York,17
California,17
Florida,16


In [173]:
# Encode the categorical State column as integer codes:
# New York -> 0, California -> 1, Florida -> 2.
# The dtype guard makes this cell safe to re-run: mapping an already-encoded column would
# turn every value into NaN and the int cast would then raise.
# NOTE(review): integer codes impose an artificial order on a nominal feature; one-hot
# encoding is usually a better fit for linear regression, but later cells assume exactly
# four feature columns, so the encoding itself is left as-is.

if not pd.api.types.is_integer_dtype(df['State']):
    df['State'] = df['State'].map({'New York' : 0, 'California' : 1, 'Florida' : 2}).astype(int)

In [174]:
# Preview the first 5 records again, now that State holds numeric codes

df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,1,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,0,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [175]:
# Split the frame into features and target:
#   X -> independent variables (every column except the last)
#   Y -> dependent variable (the last column, Profit)

X, Y = df.iloc[:, :-1], df.iloc[:, -1]

In [176]:
# Displaying X, the independent (feature) columns

X

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,0
1,162597.7,151377.59,443898.53,1
2,153441.51,101145.55,407934.54,2
3,144372.41,118671.85,383199.62,0
4,142107.34,91391.77,366168.42,2
5,131876.9,99814.71,362861.36,0
6,134615.46,147198.87,127716.82,1
7,130298.13,145530.06,323876.68,2
8,120542.52,148718.95,311613.29,0
9,123334.88,108679.17,304981.62,1


In [177]:
# Displaying Y, the dependent (target) column: Profit

Y

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94
5,156991.12
6,156122.51
7,155752.6
8,152211.77
9,149759.96


In [178]:
# Importing train_test_split, used to divide the data into training and testing sets

from sklearn.model_selection import train_test_split

In [179]:
# Split the data: 70 % of the rows for training, 30 % held out for testing.
# random_state is fixed so the same split is produced on every run.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [180]:
# Number of rows in the training features and training target (should match)

X_train.shape[0], Y_train.shape[0]

(35, 35)

In [181]:
# Number of rows in the testing features and testing target (should match)

X_test.shape[0], Y_test.shape[0]

(15, 15)

`Since we have 4 independent columns (features), the model has one coefficient per feature:`

`y = m1x1 + m2x2 + m3x3 + m4x4 + c`

In [182]:
# Train the algorithm: fit a linear regression on the training split

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained
reg = LinearRegression().fit(X_train, Y_train)
reg

In [183]:
# Multiple Linear Regression -> y = m1x1 + m2x2 + m3x3 + m4x4 + c
# Showing the fitted m values (one coefficient per feature column)

reg.coef_

array([ 8.05287266e-01, -9.09759187e-02,  2.76195629e-02,  8.47949958e+02])

In [184]:
# Multiple Linear Regression -> y = m1x1 + m2x2 + m3x3 + m4x4 + c
# Showing the fitted c value (the intercept)

reg.intercept_

np.float64(56350.35199301147)

- `Data -> Algorithm -> Model`
- `Algorithm -> y = m1x1 + m2x2 + m3x3 + m4x4 + c, i.e. y = mx + c with one m per feature`
- `Model -> y = 8.05287266e-01 * x1 - 9.09759187e-02 * x2 + 2.76195629e-02 * x3 + 8.47949958e+02 * x4 + 56350.35199301147`

**Training Data**

In [185]:
# Build a table for the training split: the X_train feature columns plus the actual
# profit (Y_train) for each row.
# DataFrame.copy() defaults to deep=True, so this is an independent copy and adding
# columns to training_data does not modify X_train.

training_data = X_train.copy()
training_data['Actual_Profit_Values'] = Y_train

In [186]:
# Displaying the training table: X_train features alongside Actual_Profit_Values (Y_train)

training_data

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Actual_Profit_Values
6,134615.46,147198.87,127716.82,1,156122.51
41,27892.92,84710.77,164470.71,2,77798.83
46,1315.46,115816.21,297114.46,2,49490.75
47,0.0,135426.92,0.0,1,42559.73
15,114523.61,122616.84,261776.23,0,129917.04
9,123334.88,108679.17,304981.62,1,149759.96
16,78013.11,121597.55,264346.06,1,126992.93
24,77044.01,99281.34,140574.81,0,108552.04
34,46426.07,157693.92,210797.67,1,96712.8
31,61136.38,152701.92,88218.23,0,97483.56


**Train Performance**

In [187]:
 # Worked example: reproduce by hand the model's prediction for the first row of the
 # training data using y = m1*x1 + m2*x2 + m3*x3 + m4*x4 + c.
 # The coefficients, intercept and row values are read from the fitted model and from
 # X_train instead of being pasted in as rounded literals, so the example stays correct
 # if the data or the split changes.

 m1, m2, m3, m4 = reg.coef_
 x1, x2, x3, x4 = X_train.iloc[0]
 p = float(m1 * x1 + m2 * x2 + m3 * x3 + m4 * x4 + reg.intercept_)
 p

155738.34800926992

In [188]:
# Run the fitted model 'reg' over the training features and keep its predictions

Y_train_pred = reg.predict(X=X_train)

In [189]:
# Attach the model's training predictions next to the actual profits for comparison

training_data = training_data.assign(Values_From_Model=Y_train_pred)

In [190]:
# Displaying the training table to compare Actual_Profit_Values with Values_From_Model

training_data

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Actual_Profit_Values,Values_From_Model
6,134615.46,147198.87,127716.82,1,156122.51,155738.348062
41,27892.92,84710.77,164470.71,2,77798.83,77344.034203
46,1315.46,115816.21,297114.46,2,49490.75,56775.260509
47,0.0,135426.92,0.0,1,42559.73,44877.713483
15,114523.61,122616.84,261776.23,0,129917.04,144649.722214
9,123334.88,108679.17,304981.62,1,149759.96,155054.582024
16,78013.11,121597.55,264346.06,1,126992.93,116259.939857
24,77044.01,99281.34,140574.81,0,108552.04,113243.31589
34,46426.07,157693.92,210797.67,1,96712.8,86060.41521
31,61136.38,152701.92,88218.23,0,97483.56,94127.051812


In [191]:
# Importing mean_squared_error (used as the loss) and r2_score (R^2, reported as accuracy) from sklearn.metrics

from sklearn.metrics import mean_squared_error,r2_score

In [192]:
# Print the mean squared error between actual and predicted training profits.
# The output label previously read "Loos"; it is corrected to "Loss".

print(f'Training Loss : {mean_squared_error(Y_train,Y_train_pred)}')

Training Loos : 81587538.15953389


In [193]:
# Report the R^2 score on the training split (labelled "Accuracy" in the output)

train_r2 = r2_score(Y_train, Y_train_pred)
print(f'Training Accuracy : {train_r2}')

Training Accuracy : 0.9511278731764929


**Test Performance**

In [194]:
# Run the fitted model 'reg' over the held-out test features and keep its predictions

Y_test_pred = reg.predict(X=X_test)

In [195]:
# Print the mean squared error between actual and predicted test profits.
# The output label previously read "Loos"; it is corrected to "Loss".

print(f'Testing Loss : {mean_squared_error(Y_test,Y_test_pred)}')

Testing Loos : 84327239.72717616


In [196]:
# Report the R^2 score on the test split (labelled "Accuracy" in the output)

test_r2 = r2_score(Y_test, Y_test_pred)
print(f'Testing Accuracy : {test_r2}')

Testing Accuracy : 0.9400659697736398


**I want to Predict My own data**

In [197]:
# Checking the X_train column names; their order is the order of x1..x4 in y = m1x1 + m2x2 + m3x3 + m4x4 + c

X_train.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State'], dtype='object')

In [198]:
# Manual prediction for a made-up startup using y = m1x1 + m2x2 + m3x3 + m4x4 + c.
# Inputs follow the X_train column order:
#   R&D Spend = 5000, Administration = 12000, Marketing Spend = 18000, State = 2
# The coefficients and intercept come from the fitted model rather than rounded,
# pasted-in literals, so the result always matches the current fit.

own_features = [5000, 12000, 18000, 2]
outcome = float(np.dot(reg.coef_, own_features) + reg.intercept_)
outcome

61478.129346811475

In [199]:
# Importing warnings library to ignore or filter warnings

import warnings
warnings.filterwarnings('ignore')

In [200]:
# Predict the profit for user-supplied inputs.
# The inputs are wrapped in a one-row DataFrame carrying the training column names, so
# scikit-learn can match features by name instead of receiving an unnamed list.
# Note: the percentage printed is the R^2 score on the test split, not a confidence
# for this individual prediction.

user_input = pd.DataFrame([[5000 , 12000 , 18000 ,2]], columns=X_train.columns)
out = reg.predict(user_input)
print(f'Prediction Value was : {out[0]} : with {r2_score(Y_test,Y_test_pred) * 100} % Accuracy')

Prediction Value was : 61478.12934815336 : with 94.00659697736398 % Accuracy


**Save The Model**

In [201]:
import pickle     # Standard-library object serialization, used below to save and reload the model

In [202]:
# Serialise the fitted model 'reg' to MLR_Model.pkl
# Mode 'wb' -> write in binary format (pickle produces bytes, not text)

with open('MLR_Model.pkl', mode='wb') as model_file:
    pickle.dump(reg, model_file)

In [203]:
# Load the saved model back from MLR_Model.pkl into 'm' to check the file round-trips
# Mode 'rb' -> read in binary format
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.

with open('MLR_Model.pkl', mode='rb') as model_file:
    m = pickle.load(model_file)

In [204]:
# Predict with the model reloaded from disk ('m') for the same user-supplied inputs.
# The inputs are wrapped in a one-row DataFrame using the feature names stored on the
# fitted model, so features are matched by name instead of arriving as an unnamed list.
# The R^2 score is computed from m's own predictions on the test split, so the number
# printed describes the reloaded model and not the in-memory 'reg'.

user_input = pd.DataFrame([[5000 , 12000 , 18000 ,2]], columns=m.feature_names_in_)
out = m.predict(user_input)
print(f'Prediction Value was : {out[0]} : with {r2_score(Y_test, m.predict(X_test)) * 100} % Accuracy')

Prediction Value was : 61478.12934815336 : with 94.00659697736398 % Accuracy
