# <span style='color:Red'> Task1: Predict Restaurnat Ratings </span>

### <span style='color:Blue'> Objective: </span> Build a machine learning model to predict the aggregate rating of a restaurant based on other features.

In [1]:
## basic import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import pickle

## preprocessing import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

## modeling import
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

## evaluation metrics import
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error

In [2]:
df = pd.read_csv('Dataset.csv')
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [3]:
df.shape

(9551, 21)

#### checking missing values

In [4]:
df.isna().sum()

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64

In [5]:
## ==> only cuisine has null values(only 9)
## way 1. drop these rows form the dataset(not as much important for predict the ratings, and low in numbers)
## way 2. you have to replace the most frequent cuisine values for that location / city

df['Cuisines'] = df.groupby('City')['Cuisines'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Unknown'))


#### checking duplicate values

In [6]:
df.duplicated().sum()

0

#### check datatype

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9551 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu 

#### Drop Irrelevant columns

In [8]:
## drop irrelevant columns
df = df.drop(columns=['Restaurant ID','Restaurant Name','Country Code','Longitude','Latitude','City','Address','Locality','Locality Verbose','Currency','Switch to order menu','Price range','Rating color','Rating text'])
df.head()

Unnamed: 0,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Is delivering now,Aggregate rating,Votes
0,"French, Japanese, Desserts",1100,Yes,No,No,4.8,314
1,Japanese,1200,Yes,No,No,4.5,591
2,"Seafood, Asian, Filipino, Indian",4000,Yes,No,No,4.4,270
3,"Japanese, Sushi",1500,No,No,No,4.9,365
4,"Japanese, Korean",1500,Yes,No,No,4.8,229


In [9]:
## preprocessing of categorical column with 2 values {yes:1 and no:0}

df['Has Table booking'] = df['Has Table booking'].map({'Yes': 1, 'No': 0})
df['Has Online delivery'] = df['Has Online delivery'].map({'Yes': 1, 'No': 0})
df['Is delivering now'] = df['Is delivering now'].map({'Yes': 1, 'No': 0})

In [10]:
df.head()

Unnamed: 0,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Is delivering now,Aggregate rating,Votes
0,"French, Japanese, Desserts",1100,1,0,0,4.8,314
1,Japanese,1200,1,0,0,4.5,591
2,"Seafood, Asian, Filipino, Indian",4000,1,0,0,4.4,270
3,"Japanese, Sushi",1500,0,0,0,4.9,365
4,"Japanese, Korean",1500,1,0,0,4.8,229


#### Modified Cuisine Column as General Cuisine

In [12]:
def map_cuisine_to_category(cuisine):
    
    indian_cuisines = [
        "curry", "south indian", "andhra", "maharashtrian", "north indian", "naga",
        "gujarati", "kashmiri", "hyderabadi", "bengali", "goan", "awadhi", "rajasthani",
        "chettinad", "mithai", "lucknowi", "mangalorean", "malwani", "indian", "biryani",
        "oriya", "modern indian", "bihari", "assamese", "north eastern", "healthy food", "vegetarian", "indian"
    ]
    east_asian_cuisines = [
        "chinese", "japanese", "korean", "mongolian", "hong kong cuisine", "cantonese",
        "ramen", "teriyaki", "taiwanese"
    ]
    southeast_asian_cuisines = [
        "thai", "vietnamese", "singaporean", "filipino", "indonesian", "burmese",
        "peranakan", "malaysian", "laotian", "cambodian", "sri lankan"
    ]
    south_asian_cuisines = [
        "nepalese", "afghan", "pakistani", "maldivian", "thakali"
    ]
    central_asian_cuisines = [
        "kazakh", "uzbek", "turkmen", "manti", "shashlik"
    ]
    middle_eastern_cuisines = [
        "armenian", "iranian", "persian", "turkish", "lebanese", "arabian",
        "yemeni", "palestinian", "israeli", "syrian", "iraqi", "jordanian",
        "borek", "kebab", "izgara"
    ]
    western_european_cuisines = [
        "french", "italian", "spanish", "greek", "portuguese", "german", "belgian",
        "irish", "british", "scottish", "mediterranean", "continental", "european", "western",
        "austrian", "swiss", "dutch"
    ]
    eastern_european_cuisines = [
        "polish", "hungarian", "czech"
    ]
    northern_european_cuisines = [
        "danish", "swedish", "finnish"
    ]
    north_american_cuisines = [
        "tex-mex", "american", "diner", "steak", "pub food", "charcoal grill", "new american",
        "bar food", "deli", "canadian", "cajun", "hawaiian"
    ]
    latin_american_cuisines = [
        "mexican", "brazilian", "argentine", "peruvian", "cuban", "latin american",
        "mineira", "southern", "caribbean", "soul food", "chilean", "colombian", "venezuelan",
        "ecuadorian", "uruguayan"
    ]
    african_cuisines = [
        "south african", "durban", "ethiopian", "nigerian", "ghanaian", "kenyan"
    ]
    north_african_cuisines = [
        "moroccan", "algerian", "tunisian", "egyptian", "libyan", "sudanese"
    ]
    australian_and_pacific_cuisines = [
        "australian", "modern australian", "kiwi", "new zealand", "fijian", "polynesian",
        "papua new guinean"
    ]
    international_and_fusion_cuisines = [
        "fusion", "international", "contemporary", "raw meats", "world cuisine",
        "cuisine varies", "street food"
    ]
    desserts_and_bakery = [
        "desserts", "ice cream", "bakery", "patisserie", "mithai"
    ]
    seafood = [
        "seafood", "fish and chips", "sushi"
    ]
    beverages = [
        "tea", "coffee and tea", "juices", "beverages", "drinks only", "bubble tea",
        "smoothies", "shakes"
    ]
    fastfood = [
        "pizza", "burger", "fast food", "sandwich", "turkish pizza", "breakfast",
        "restaurant cafe", "grill", "wraps", "hotdogs", "gourmet fast food", "finger food"
    ]

    if cuisine in indian_cuisines:
        return 'indian'
    elif cuisine in east_asian_cuisines:
        return 'east_asian'
    elif cuisine in southeast_asian_cuisines:
        return 'southeast_asian'
    elif cuisine in south_asian_cuisines:
        return 'south_asian'
    elif cuisine in central_asian_cuisines:
        return 'central_asian'
    elif cuisine in middle_eastern_cuisines:
        return 'middle_eastern'
    elif cuisine in western_european_cuisines:
        return 'western_european'
    elif cuisine in eastern_european_cuisines:
        return 'eastern_european'
    elif cuisine in northern_european_cuisines:
        return 'northern_european'
    elif cuisine in north_american_cuisines:
        return 'north_american'
    elif cuisine in latin_american_cuisines:
        return 'latin_american'
    elif cuisine in african_cuisines:
        return 'african'
    elif cuisine in north_african_cuisines:
        return 'north_african'
    elif cuisine in australian_and_pacific_cuisines:
        return 'australian_and_pacific'
    elif cuisine in international_and_fusion_cuisines:
        return 'international_and_fusion_cuisines'
    elif cuisine in desserts_and_bakery:
        return 'desserts_and_bakery'
    elif cuisine in seafood:
        return 'seafood'
    elif cuisine in beverages:
        return 'beverages'
    elif cuisine in fastfood:
        return 'fastfood'
    else:
        return 'other'

In [13]:
def preprocess_cuisines(row):
    cuisines = [cuisine.strip().lower() for cuisine in row.split(', ')]
    generalized_cuisines = [map_cuisine_to_category(cuisine) for cuisine in cuisines]
    return ', '.join(set(generalized_cuisines))

df['Cuisine_varieties'] = df['Cuisines'].apply(preprocess_cuisines)
df = df.drop(columns=['Cuisines'], axis=1)
df.head()

Unnamed: 0,Average Cost for two,Has Table booking,Has Online delivery,Is delivering now,Aggregate rating,Votes,Cuisine_varieties
0,1100,1,0,0,4.8,314,"east_asian, western_european, desserts_and_bakery"
1,1200,1,0,0,4.5,591,east_asian
2,4000,1,0,0,4.4,270,"indian, seafood, southeast_asian, other"
3,1500,0,0,0,4.9,365,"east_asian, seafood"
4,1500,1,0,0,4.8,229,east_asian


### Preparing X and y Variables

In [15]:
X = df.drop(columns=['Aggregate rating'], axis = 1)
y = df['Aggregate rating']

In [16]:
X.shape, y.shape

((9551, 6), (9551,))

# Feature Engineering

### 1. Encoded "Cuisines" column values

In [17]:
mlb = MultiLabelBinarizer()

def one_hot_encoded_cuisine(df, cuisine_column, save_path="OneHot_Encoder(mlb).pkl", is_training=True):
    
    global mlb

    df["cuisines_list"] = df[cuisine_column].apply(lambda x: [c.strip() for c in x.split(', ')])

    if is_training:
        one_hot = mlb.fit_transform(df["cuisines_list"])
        with open(save_path, "wb") as f:
            pickle.dump(mlb, f)
    else:
        with open(save_path, "rb") as f:
            mlb = pickle.load(f)

        one_hot = mlb.transform(df["cuisines_list"])

    one_hot_df = pd.DataFrame(one_hot, columns=mlb.classes_, index=df.index)

    df = df.drop(columns=[cuisine_column, 'cuisines_list'], axis=1)

    return pd.concat([df, one_hot_df], axis=1)


In [18]:
X = one_hot_encoded_cuisine(X, 'Cuisine_varieties',is_training=True)
X.shape
X.head()


Unnamed: 0,Average Cost for two,Has Table booking,Has Online delivery,Is delivering now,Votes,african,australian_and_pacific,beverages,desserts_and_bakery,east_asian,...,international_and_fusion_cuisines,latin_american,middle_eastern,north_african,north_american,other,seafood,south_asian,southeast_asian,western_european
0,1100,1,0,0,314,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,1200,1,0,0,591,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,4000,1,0,0,270,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
3,1500,0,0,0,365,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,1500,1,0,0,229,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### 2. Standard Scaling of "Average cost for two" column

In [19]:
scaler = StandardScaler()
X['Average Cost for two'] = scaler.fit_transform(X[['Average Cost for two']])

In [None]:
with open('StandardScaler(Avg_rat_for_2).pkl', 'wb') as file:
    pickle.dump(scaler,file)

### Seperating train and test data

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((7640, 22), (1911, 22))

### Create function for Evaluation of Model after Training

In [22]:
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae,rmse,r2_square

In [23]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 1.3349
- Mean Absolute Error: 1.1132
- R2 Score: 0.2269
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.3333
- Mean Absolute Error: 1.1220
- R2 Score: 0.2190


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0599
- Mean Absolute Error: 0.0130
- R2 Score: 0.9984
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.4548
- Mean Absolute Error: 0.2990
- R2 Score: 0.9091


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 0.1330
- Mean Absolute Error: 0.0854
- R2 Score: 0.9923
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.3377
- Mean Absolute Error: 0.2217
- R2 Score: 0.9499


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 0.3666
- Mean Absolute Error: 0.2455
- R2 Score: 0.9417


## perfomance metrics of all models

In [24]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Random Forest Regressor,0.949902
3,AdaBoost Regressor,0.942563
1,Decision Tree,0.909133
0,Linear Regression,0.218969


=> Here we get<span style="color: red"> <b>RandomForestRegressor()</b> </span>as best model for training and Testing both

# Random Forest Regressor Model

### Hyper parameter tuning of Random Forest Regressor

In [25]:
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

random_cv_models = [('RF',RandomForestRegressor(),rf_params)]

from sklearn.model_selection import RandomizedSearchCV

model_params = {}
for name, model, params in random_cv_models:
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                n_jobs=-1)
    
    random.fit(X_train,y_train)
    model_params[name] = random.best_params_

for model_name in model_params:
    print(model_params[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators': 1000, 'min_samples_split': 8, 'max_features': 7, 'max_depth': 15}


## Trained the model based on best parameters

In [26]:
model = RandomForestRegressor(n_estimators= 1000, min_samples_split= 8, max_features= 7, max_depth= 15,n_jobs=-1)
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

model_train_mae, model_train_rmse, model_train_r2_score = evaluate_model(y_train,y_train_pred)
model_test_mae, model_test_rmse, model_test_r2_score = evaluate_model(y_test,y_test_pred)

print('RF Regressor performance for Training set')
print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
print("- R2 Score: {:.4f}".format(model_train_r2_score))

print('----------------------------------')
    
print('RF Regressor performance for Test set')
print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
print("- R2 Score: {:.4f}".format(model_test_r2_score))


RF Regressor performance for Training set
- Root Mean Squared Error: 0.2467
- Mean Absolute Error: 0.1633
- R2 Score: 0.9736
----------------------------------
RF Regressor performance for Test set
- Root Mean Squared Error: 0.3138
- Mean Absolute Error: 0.2099
- R2 Score: 0.9567


In [27]:
## save the model
import pickle
with open('Restaurant_Rating_Predictor.pkl','wb') as file:
    pickle.dump(model,file)


## Test the Model Manually

In [None]:
with open('Restaurant_Rating_Predictor.pkl','rb') as file:
    loaded_model = pickle.load(file)

with open('StandardScaler(Avg_rat_for_2).pkl','rb') as s:
    load_scaler = pickle.load(s)

test_data = {
    "Average Cost for two":2000,
    "Has Table booking":0,
    "Has Online delivery":0,
    "Is delivering now":0,
    "Votes":3000,
    "Cuisines":"French, Japanese, Desserts"
}	

test_df = pd.DataFrame([test_data])
test_df = one_hot_encoded_cuisine(test_df,'Cuisines',is_training=False)
test_df['Average Cost for two'] = scaler.transform(test_df[['Average Cost for two']])

predicted_rating = loaded_model.predict(test_df)
print(f"Predicted rating: {predicted_rating[0]:.2f}")

Predicted rating: 4.26


## Analysis

"Votes" are very important feature 

"Average Cost for two" are also important feature

"Has Table booking, Has Online delivery, Is delivering now" are not much important features

"Cuisine" is also not that much important for predicting ratings
