## Importing the essential libraries over here

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

## Importing the dataset over here

In [3]:
data=pd.read_csv("restaurant_data.csv")

In [4]:
data.head()

Unnamed: 0,Name,Location,Cuisine,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,Parking Availability,Weekend Reservations,Weekday Reservations,Revenue
0,Restaurant 0,Rural,Japanese,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,Yes,13,4,638945.52
1,Restaurant 1,Downtown,Mexican,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,Yes,48,6,490207.83
2,Restaurant 2,Rural,Italian,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,No,27,14,541368.62
3,Restaurant 3,Rural,Italian,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,Yes,9,17,404556.8
4,Restaurant 4,Downtown,Japanese,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,No,37,26,1491046.35


## Taking care of duplicate observations over here

In [5]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [6]:
data.isnull().sum()

Name                      0
Location                  0
Cuisine                   0
Rating                    0
Seating Capacity          0
Average Meal Price        0
Marketing Budget          0
Social Media Followers    0
Chef Experience Years     0
Number of Reviews         0
Avg Review Length         0
Ambience Score            0
Service Quality Score     0
Parking Availability      0
Weekend Reservations      0
Weekday Reservations      0
Revenue                   0
dtype: int64

## Filtering all the numerical features over here

In [7]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='O']
for feature in numerical_features:
  print(feature)

Rating
Seating Capacity
Average Meal Price
Marketing Budget
Social Media Followers
Chef Experience Years
Number of Reviews
Avg Review Length
Ambience Score
Service Quality Score
Weekend Reservations
Weekday Reservations
Revenue


In [8]:
data[numerical_features]

Unnamed: 0,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,Weekend Reservations,Weekday Reservations,Revenue
0,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,13,4,638945.52
1,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,48,6,490207.83
2,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,27,14,541368.62
3,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,9,17,404556.80
4,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,37,26,1491046.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8363,3.4,54,34.85,1102,11298,11,380,253.919515,9.5,5.0,37,0,434653.45
8364,3.7,49,36.88,1988,20432,9,713,175.590195,2.7,2.6,37,21,414977.92
8365,4.7,88,46.87,5949,63945,6,436,222.953647,4.8,1.7,83,21,930395.87
8366,3.1,31,44.53,707,7170,1,729,178.482851,6.1,2.1,6,21,311493.48


## Filteirng all the categorical features over here

In [9]:
cat_features=[feature for feature in data.columns if data[feature].dtype=='O']
for feature in cat_features:
  print(feature)

Name
Location
Cuisine
Parking Availability


In [10]:
data[cat_features]

Unnamed: 0,Name,Location,Cuisine,Parking Availability
0,Restaurant 0,Rural,Japanese,Yes
1,Restaurant 1,Downtown,Mexican,Yes
2,Restaurant 2,Rural,Italian,No
3,Restaurant 3,Rural,Italian,Yes
4,Restaurant 4,Downtown,Japanese,No
...,...,...,...,...
8363,Restaurant 8363,Suburban,Indian,Yes
8364,Restaurant 8364,Rural,Indian,No
8365,Restaurant 8365,Downtown,Italian,Yes
8366,Restaurant 8366,Rural,American,No


## Encoding the categorical features over here

In [11]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [12]:
data

Unnamed: 0,Name,Location,Cuisine,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,Parking Availability,Weekend Reservations,Weekday Reservations,Revenue
0,0,0,0,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,0,13,4,638945.52
1,1,1,1,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,0,48,6,490207.83
2,2,0,2,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,1,27,14,541368.62
3,3,0,2,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,0,9,17,404556.80
4,4,1,0,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,1,37,26,1491046.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8363,8363,2,3,3.4,54,34.85,1102,11298,11,380,253.919515,9.5,5.0,0,37,0,434653.45
8364,8364,0,3,3.7,49,36.88,1988,20432,9,713,175.590195,2.7,2.6,1,37,21,414977.92
8365,8365,1,2,4.7,88,46.87,5949,63945,6,436,222.953647,4.8,1.7,0,83,21,930395.87
8366,8366,0,5,3.1,31,44.53,707,7170,1,729,178.482851,6.1,2.1,1,6,21,311493.48


In [13]:
data['RATING']=data['Rating']
data.drop('Rating',axis=1,inplace=True)

In [14]:
data.head(2)

Unnamed: 0,Name,Location,Cuisine,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,Parking Availability,Weekend Reservations,Weekday Reservations,Revenue,RATING
0,0,0,0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,0,13,4,638945.52,4.0
1,1,1,1,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,0,48,6,490207.83,3.2


## Creating the feature and labels over here

In [15]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing set to avoid the problem of overfitting over here

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training set over here

In [17]:
from sklearn.ensemble import RandomForestRegressor
regressor=RandomForestRegressor(n_estimators=10,random_state=0)
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing dataset over here

In [18]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[4.1  3.2 ]
 [4.3  3.1 ]
 [3.68 3.8 ]
 ...
 [3.76 3.4 ]
 [3.94 4.7 ]
 [3.89 4.8 ]]


In [20]:
data_diff = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

In [21]:
data_diff

Unnamed: 0,Actual,Predicted
0,3.2,4.10
1,3.1,4.30
2,3.8,3.68
3,4.4,4.07
4,4.4,3.84
...,...,...
1669,3.4,4.37
1670,4.0,4.08
1671,3.4,3.76
1672,4.7,3.94
