#### Multiple Linear Regression

##### Step 1: Data preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
dataset = pd.read_csv("Datasets/50_Startups.csv")
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
dataset.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


##### Step 2: Encoding categorical variable

In [5]:
dataset["State"].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [7]:
data_new = pd.get_dummies(dataset,columns=["State"])
data_new.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


##### Step 3: Splitting dataset into training/testing sets

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(data_new, data_new[["Profit"]], test_size=0.2, random_state=42)

##### Step 4: Fitting the model

In [14]:
model = LinearRegression()
model.fit(X_train, Y_train)

##### Step 5: Checking Accuracy of the model

In [15]:
model.score(X_test, Y_test)

1.0

##### Step 6: Predicting the "Profit" for test set

In [12]:
Y_pred = model.predict(X_test)
Y_pred

array([[134307.35],
       [ 81005.76],
       [ 99937.59],
       [ 64926.08],
       [125370.37],
       [ 35673.41],
       [105733.54],
       [107404.34],
       [ 97427.84],
       [122776.86]])