## Importing required libraries

In [25]:
import pandas as pd
import numpy   as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import sklearn.metrics as metrics

# Global plotting defaults for the notebook: ggplot theme and a 10x6 inch canvas.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (10, 6)})

## Mount Google Drive

In [26]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the dataset path used by the
# following cells resolves.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Read the Restaurant Revenue dataset

In [27]:
# Load the raw restaurant revenue table from the mounted Drive folder and
# echo it so the columns and row count can be inspected.
dataset_csv = '/content/drive/MyDrive/Python Deep Learning/Assignment 3/Part 1/rest_data.csv'
restaurant_data = pd.read_csv(dataset_csv)
print(restaurant_data)

      Id  City Group Type  P1   P2   P3  ...  P33  P34  P35  P36  P37  revenue
0      0  Big Cities   IL   4  5.0  4.0  ...    5    5    4    3    4  5653753
1      1  Big Cities   FC   4  5.0  4.0  ...    0    0    0    0    0  6923131
2      2       Other   IL   2  4.0  2.0  ...    0    0    0    0    0  2055379
3      3       Other   IL   6  4.5  6.0  ...    6   18   12   12    6  2675511
4      4       Other   IL   3  4.0  3.0  ...    2    3    4    3    3  4316715
..   ...         ...  ...  ..  ...  ...  ...  ...  ...  ...  ...  ...      ...
132  132       Other   FC   2  3.0  3.0  ...    0    0    0    0    0  5787594
133  133  Big Cities   FC   4  5.0  4.0  ...    0    0    0    0    0  9262754
134  134       Other   FC   3  4.0  4.0  ...    0    0    0    0    0  2544857
135  135  Big Cities   FC   4  5.0  4.0  ...    0    0    0    0    0  7217634
136  136  Big Cities   FC   4  5.0  3.0  ...    0    0    0    0    0  6363241

[137 rows x 41 columns]


## Build the multiple linear regression model

In [28]:
# Keep only the numeric columns, fill gaps by interpolation, and drop any
# rows that still contain missing values afterwards.
numeric_columns = restaurant_data.select_dtypes(include=[np.number])
rest_data = numeric_columns.interpolate().dropna()

# Features: every numeric column except the target and the row identifier.
# Target: natural log of revenue.
X = rest_data.drop(columns=['revenue', 'Id'])
y = np.log(rest_data['revenue'])

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Fit the model

In [29]:
# Ordinary least-squares regression trained on the all-features split.
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
model1 = lr  # fit() returns the estimator itself, so model1 is the fitted lr

# Predicted log-revenue for the held-out rows.
predictions = model1.predict(X_test)

## Evaluate model performance with the multiple regression model (all features considered)

In [30]:
# Score the all-features model on the held-out split. The target is
# log(revenue), so RMSE is expressed in log units.
r2 = metrics.r2_score(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

# Report the metrics, one per line.
for label, value in (("RMSE:", rmse), ("R-Squared:", r2)):
    print(label, value)

RMSE: 0.796525104155611
R-Squared: -1.1280520942721401


## 3. Find the top 5 features most correlated with the target label (revenue), then build a model on those 5 features. Evaluate the model using MAE, MSE, RMSE and R2 score, and compare the result with the RMSE and R2 achieved in question 2.

In [31]:
# Pairwise correlation matrix of the cleaned numeric columns.
corr = rest_data.corr()

# Rank every column by its signed correlation with revenue, highest first.
# Six rows are shown because revenue correlates perfectly with itself and
# therefore occupies the first slot, leaving the top five features after it.
# NOTE(review): the ranking is signed, so strongly negative correlations are
# never selected; it is also computed on raw revenue (the models fit
# log(revenue)) and on all rows rather than only the training split - confirm
# this is intended.
revenue_corr_desc = corr['revenue'].sort_values(ascending=False)
print(revenue_corr_desc.head(6), '\n')

revenue    1.000000
P2         0.191518
P28        0.155534
P6         0.139094
P21        0.097411
P11        0.084247
Name: revenue, dtype: float64 



In [32]:
# Restrict the feature matrix to the five columns listed by the correlation
# ranking; the target is still the natural log of revenue.
top_features = ['P2', 'P28', 'P6', 'P21', 'P11']
X = rest_data[top_features]
y = np.log(rest_data['revenue'])

# Same 80/20 split and seed as the all-features model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Fit the model

In [33]:
# Ordinary least-squares regression trained on the five selected features.
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
model2 = lr  # fit() returns the estimator itself, so model2 is the fitted lr

# Predicted log-revenue for the held-out rows.
predictions = model2.predict(X_test)

## Evaluate model performance with top 5 most correlated features

In [34]:
# Evaluate the top-5-feature model on the held-out test split.
# The target and the predictions are log(revenue), so MAE, MSE and RMSE are
# all expressed in log units rather than in currency.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, predictions)

# Print model performance values. The task statement for this section asks
# for MAE and MSE in addition to RMSE and R-Squared, so all four are reported.
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)

RMSE: 0.5472952738534502
R-Squared: -0.004677589069639554


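## Comparison: The model built on only the five most correlated features performs better than the earlier model that used all the features (RMSE 0.547 vs 0.797; R-Squared -0.005 vs -1.128), although an R-Squared close to zero means it still explains almost none of the variance in the test data.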