In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split   #Data preprocessing
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression , Lasso , Ridge
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor , AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


In [2]:
df = pd.read_csv('data/zomato_cleaned.csv')
df

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost2plates,type
0,Jalsa,Yes,Yes,4.100000,487.0,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,Buffet
1,Spice Elephant,Yes,No,4.100000,487.0,Banashankari,Casual Dining,others,800.0,Buffet
2,San Churro Cafe,Yes,No,3.800000,487.0,Banashankari,others,others,800.0,Buffet
3,Addhuri Udupi Bhojana,No,No,3.700000,88.0,Banashankari,Quick Bites,"South Indian, North Indian",300.0,Buffet
4,Grand Village,No,No,3.800000,166.0,Basavanagudi,Casual Dining,others,600.0,Buffet
...,...,...,...,...,...,...,...,...,...,...
51037,Best Brews - Four Points by Sheraton Bengaluru...,No,No,3.600000,27.0,Whitefield,others,Continental,1500.0,Pubs and bars
51038,Vinod Bar And Restaurant,No,No,3.700142,0.0,Whitefield,others,Finger Food,600.0,Pubs and bars
51039,Plunge - Sheraton Grand Bengaluru Whitefield H...,No,No,3.700142,0.0,Whitefield,others,Finger Food,2000.0,Pubs and bars
51040,Chime - Sheraton Grand Bengaluru Whitefield Ho...,No,Yes,4.300000,236.0,others,others,Finger Food,2500.0,Pubs and bars


In [3]:
df.columns

Index(['name', 'online_order', 'book_table', 'rate', 'votes', 'location',
       'rest_type', 'cuisines', 'cost2plates', 'type'],
      dtype='object')

In [4]:
df = df.drop(columns=['name'] , axis=1)

In [5]:
df.head()

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost2plates,type
0,Yes,Yes,4.1,487.0,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,Buffet
1,Yes,No,4.1,487.0,Banashankari,Casual Dining,others,800.0,Buffet
2,Yes,No,3.8,487.0,Banashankari,others,others,800.0,Buffet
3,No,No,3.7,88.0,Banashankari,Quick Bites,"South Indian, North Indian",300.0,Buffet
4,No,No,3.8,166.0,Basavanagudi,Casual Dining,others,600.0,Buffet


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rate,51042.0,3.701424,0.395197,1.8,3.5,3.700142,3.9,4.9
votes,51042.0,131.785961,172.607422,0.0,7.0,41.0,199.0,487.0
cost2plates,51042.0,555.782983,439.293333,40.0,300.0,400.0,700.0,6000.0


In [14]:
num_features = ['votes', 'cost2plates']
cat_features = ['online_order', 'book_table', 'rest_type', 'type', 'location' , 'cuisines']

num_pipeline = Pipeline(steps=[
                ("imputing", SimpleImputer(strategy="mean")),
                ("scaling", StandardScaler())
            ])

            # Categorical pipeline: Imputation and Encoding
cat_pipeline = Pipeline(steps=[
                ("imputing", SimpleImputer(strategy="most_frequent")),
                ("encoding", OneHotEncoder(handle_unknown='ignore'))
            ])

            # Combine pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
                ("num_features", num_pipeline, num_features),
                ("cat_features", cat_pipeline, cat_features)
            ])

In [15]:
preprocessor

In [16]:
X = df.drop(columns=['rate'] , axis=1)
y = df['rate']

X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=22)

In [17]:
preprocessor.fit(X_train)

In [19]:
# Transform training and testing data
X_train_scaled = preprocessor.transform(X_train).toarray()
y_train_scaled = np.array(y_train)
x_test_scaled = preprocessor.transform(X_test).toarray()
y_test_scaled = np.array(y_test)


In [20]:
X_train_scaled,y_train_scaled

(array([[-0.68150417, -0.46886069,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.76277895, -0.81130676,  1.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.60147638,  1.01507228,  1.        , ...,  0.        ,
          0.        ,  1.        ],
        ...,
        [ 2.06442252,  1.92826181,  1.        , ...,  0.        ,
          0.        ,  1.        ],
        [ 2.06442252,  0.33018014,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 1.68126996,  1.01507228,  1.        , ...,  0.        ,
          0.        ,  1.        ]]),
 array([3.6       , 3.70014186, 3.9       , ..., 4.2       , 4.1       ,
        4.        ]))

#### Create an Evaluate Function to give all metrics after model Training

In [22]:
def evaluate_model(true,predicted):
    mse = mean_squared_error(true,predicted)
    mae = mean_absolute_error(true,predicted)
    r_squared = r2_score(true,predicted)

    return mse , mae , r_squared

In [32]:
models = {
    'LinearRegression' : LinearRegression(),
    'Lasso' : Lasso(),
    'Ridge' : Ridge(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'SVR':SVR(),
    'RandomForestRegressor':RandomForestRegressor(),
    'XGBRegressor':XGBRegressor(),
    'GradientBoostingRegressor':GradientBoostingRegressor(),
    'CatBoostRegressor':CatBoostRegressor(verbose=False)
}

In [33]:
models

{'LinearRegression': LinearRegression(),
 'Lasso': Lasso(),
 'Ridge': Ridge(),
 'DecisionTreeRegressor': DecisionTreeRegressor(),
 'SVR': SVR(),
 'RandomForestRegressor': RandomForestRegressor(),
 'XGBRegressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 'GradientBoostingRegressor':

In [34]:
list(models.values())

[LinearRegression(),
 Lasso(),
 Ridge(),
 DecisionTreeRegressor(),
 SVR(),
 RandomForestRegressor(),
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 GradientBoostingRegressor(),
 <catboost.core.CatBoostRegressor at 0x20d442591f0>]

In [38]:
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_scaled,y_train_scaled.flatten())

    # Make predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(x_test_scaled)

    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train_scaled, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test_scaled, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

    

LinearRegression
Model performance for Training set
- Root Mean Squared Error: 0.2295
- Mean Absolute Error: 0.0938
- R2 Score: 0.3972
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2301
- Mean Absolute Error: 0.0950
- R2 Score: 0.4000


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.2825
- Mean Absolute Error: 0.1556
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2869
- Mean Absolute Error: 0.1584
- R2 Score: -0.0001


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.2295
- Mean Absolute Error: 0.0938
- R2 Score: 0.3972
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2301
- Mean Absolute Error: 0.0950
- R2 Score: 0.4000


DecisionTreeRegressor
Model performance for Training set
- Root Mean Squared Error: 0.0064
- Mean Absolute Error: 0.0013
- R2 Score: 0.9913
-----------------------