# Prediction Using Multiple Linear Regression

### Importing Required Packages and Importing Dataset

In [1]:
#import required packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
#import required dataset into pandas dataframe
data = pd.read_csv('E:\Datasets\Restaurant_Profit_Data.csv')

In [3]:
#printing dataset shape and features
columns_present = list(data)
print("Columns present : ", columns_present)
print("Dataset Shape : ", data.shape)
#printing dataset observations
data.head()

Columns present :  ['Miscellaneous_Expenses', 'Food_Innovation_Spend', 'Advertising', 'City', 'Profit']
Dataset Shape :  (50, 5)


Unnamed: 0,Miscellaneous_Expenses,Food_Innovation_Spend,Advertising,City,Profit
0,138671.8,167497.2,475918.1,Chicago,202443.83
1,153151.59,164745.7,448032.53,Mumbai,201974.06
2,102919.55,155589.51,412068.54,Tokyo,201232.39
3,120445.85,146520.41,387333.62,Chicago,193083.99
4,93165.77,144255.34,370302.42,Tokyo,176369.94


In [4]:
#obtaining information on dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Miscellaneous_Expenses  50 non-null     float64
 1   Food_Innovation_Spend   50 non-null     float64
 2   Advertising             50 non-null     float64
 3   City                    50 non-null     object 
 4   Profit                  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


### Preprocessing Data

**Preprocessing Values for City Column**

In [5]:
#generate dummies for city column
city_dummies = pd.get_dummies(data['City'])
city_dummies.head()

Unnamed: 0,Chicago,Mumbai,Tokyo
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,0,1


In [6]:
#eliminate the dummy values for one city (Chicago in this case)
city_dummies.drop(city_dummies.columns[0],axis=1,inplace=True)
city_dummies.head()

Unnamed: 0,Mumbai,Tokyo
0,0,0
1,1,0
2,0,1
3,0,0
4,0,1


**Final Preprocessed Data**

In [7]:
#form final dataframe to be used
data = pd.concat([data,city_dummies],axis=1)
data.drop('City',axis=1,inplace=True)
data.head()

Unnamed: 0,Miscellaneous_Expenses,Food_Innovation_Spend,Advertising,Profit,Mumbai,Tokyo
0,138671.8,167497.2,475918.1,202443.83,0,0
1,153151.59,164745.7,448032.53,201974.06,1,0
2,102919.55,155589.51,412068.54,201232.39,0,1
3,120445.85,146520.41,387333.62,193083.99,0,0
4,93165.77,144255.34,370302.42,176369.94,0,1


### Splitting the Data

In [8]:
#separate independent and independent features
X = data.drop(['Profit'],axis=1)
y = data.drop(list(X),axis=1)

In [9]:
#display independent features
X.head()

Unnamed: 0,Miscellaneous_Expenses,Food_Innovation_Spend,Advertising,Mumbai,Tokyo
0,138671.8,167497.2,475918.1,0,0
1,153151.59,164745.7,448032.53,1,0
2,102919.55,155589.51,412068.54,0,1
3,120445.85,146520.41,387333.62,0,0
4,93165.77,144255.34,370302.42,0,1


In [10]:
#display dependent feature
y.head()

Unnamed: 0,Profit
0,202443.83
1,201974.06
2,201232.39
3,193083.99
4,176369.94


In [11]:
#Splitting the dataset into test data and train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)
y_test.head()

Unnamed: 0,Profit
6,166304.51
36,100890.19
37,100131.14
28,113464.38
43,79940.98


### Fitting the Regression Model and Predict Values

In [12]:
#fitting Multiple Linear Regression to the training set
MLR = LinearRegression()
MLR.fit(X_train, y_train)

LinearRegression()

In [13]:
#predicting the test set results
y_predict = MLR.predict(X_test)
y_predict = pd.DataFrame(y_predict)

**Results for Multiple Linear Regression Model**

In [14]:
#view results for the model
r_square =  r2_score(y_true=y_test,y_pred=y_predict)
print("Coefficient of Determination or R^2 value : ", r_square)

Coefficient of Determination or R^2 value :  0.9100064859824895


In [15]:
#model evaluation using adjusted R-Square
'''
we have 5 column in Multiple Linear Regression Equation : 
Profit = a0+a1*Miscellaneous_Expenses+a2*Food_Innovation_Spend+ a3*Advertising+a4*Mumbai+a5*Tokyo
here n = no. of observations and p = no. of independent variables
'''
n = 50
p = 5
Adj_r_square = 1-(1-r_square)*(n-1)/(n-p-1)
print('Adjusted R-Square Error:', Adj_r_square)

Adjusted R-Square Error: 0.8997799502986814
