In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split ,RandomizedSearchCV   #Data preprocessing
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor , AdaBoostRegressor , BaggingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


In [2]:
df = pd.read_csv('data/zomato_cleaned.csv')
df

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost2plates,type
0,Jalsa,Yes,Yes,4.100000,487.0,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,Buffet
1,Spice Elephant,Yes,No,4.100000,487.0,Banashankari,Casual Dining,others,800.0,Buffet
2,San Churro Cafe,Yes,No,3.800000,487.0,Banashankari,others,others,800.0,Buffet
3,Addhuri Udupi Bhojana,No,No,3.700000,88.0,Banashankari,Quick Bites,"South Indian, North Indian",300.0,Buffet
4,Grand Village,No,No,3.800000,166.0,Basavanagudi,Casual Dining,others,600.0,Buffet
...,...,...,...,...,...,...,...,...,...,...
51037,Best Brews - Four Points by Sheraton Bengaluru...,No,No,3.600000,27.0,Whitefield,others,Continental,1500.0,Pubs and bars
51038,Vinod Bar And Restaurant,No,No,3.700142,0.0,Whitefield,others,Finger Food,600.0,Pubs and bars
51039,Plunge - Sheraton Grand Bengaluru Whitefield H...,No,No,3.700142,0.0,Whitefield,others,Finger Food,2000.0,Pubs and bars
51040,Chime - Sheraton Grand Bengaluru Whitefield Ho...,No,Yes,4.300000,236.0,others,others,Finger Food,2500.0,Pubs and bars


In [3]:
df.columns

Index(['name', 'online_order', 'book_table', 'rate', 'votes', 'location',
       'rest_type', 'cuisines', 'cost2plates', 'type'],
      dtype='object')

In [4]:
df = df.drop(columns=['name'] , axis=1)

In [5]:
df.head()

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost2plates,type
0,Yes,Yes,4.1,487.0,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,Buffet
1,Yes,No,4.1,487.0,Banashankari,Casual Dining,others,800.0,Buffet
2,Yes,No,3.8,487.0,Banashankari,others,others,800.0,Buffet
3,No,No,3.7,88.0,Banashankari,Quick Bites,"South Indian, North Indian",300.0,Buffet
4,No,No,3.8,166.0,Basavanagudi,Casual Dining,others,600.0,Buffet


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rate,51042.0,3.701424,0.395197,1.8,3.5,3.700142,3.9,4.9
votes,51042.0,131.785961,172.607422,0.0,7.0,41.0,199.0,487.0
cost2plates,51042.0,555.782983,439.293333,40.0,300.0,400.0,700.0,6000.0


### Building Pipeline

In [14]:
#seprating the num and cat columns
num_features = ['votes', 'cost2plates']
cat_features = ['online_order', 'book_table', 'rest_type', 'type', 'location' , 'cuisines']

num_pipeline = Pipeline(steps=[
                ("imputing", SimpleImputer(strategy="mean")),
                ("scaling", StandardScaler())
            ])

            # Categorical pipeline: Imputation and Encoding
cat_pipeline = Pipeline(steps=[
                ("imputing", SimpleImputer(strategy="most_frequent")),
                ("encoding", OneHotEncoder(handle_unknown='ignore'))
            ])

            # Combine pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
                ("num_features", num_pipeline, num_features),
                ("cat_features", cat_pipeline, cat_features)
            ])

In [15]:
preprocessor

In [16]:
#splitting the data
X = df.drop(columns=['rate'] , axis=1)
y = df['rate']

X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=22)

In [17]:
preprocessor.fit(X_train)

In [19]:
# Transform training and testing data
X_train_scaled = preprocessor.transform(X_train).toarray()
y_train_scaled = np.array(y_train)
x_test_scaled = preprocessor.transform(X_test).toarray()
y_test_scaled = np.array(y_test)


In [20]:
X_train_scaled,y_train_scaled

(array([[-0.68150417, -0.46886069,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.76277895, -0.81130676,  1.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.60147638,  1.01507228,  1.        , ...,  0.        ,
          0.        ,  1.        ],
        ...,
        [ 2.06442252,  1.92826181,  1.        , ...,  0.        ,
          0.        ,  1.        ],
        [ 2.06442252,  0.33018014,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 1.68126996,  1.01507228,  1.        , ...,  0.        ,
          0.        ,  1.        ]]),
 array([3.6       , 3.70014186, 3.9       , ..., 4.2       , 4.1       ,
        4.        ]))

#### Create an Evaluate Function to give all metrics after model Training

In [22]:
def evaluate_model(true,predicted):
    mse = mean_squared_error(true,predicted)
    mae = mean_absolute_error(true,predicted)
    r_squared = r2_score(true,predicted)

    return mse , mae , r_squared

In [49]:
models = {
    'ExtraTreeRegressor':ExtraTreeRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'SVR':SVR(),
    'RandomForestRegressor':RandomForestRegressor(),
    'XGBRegressor':XGBRegressor(),
    'GradientBoostingRegressor':GradientBoostingRegressor(),
    'CatBoostRegressor':CatBoostRegressor(verbose=False)
}

In [50]:
models

{'ExtraTreeRegressor': ExtraTreeRegressor(),
 'DecisionTreeRegressor': DecisionTreeRegressor(),
 'SVR': SVR(),
 'RandomForestRegressor': RandomForestRegressor(),
 'XGBRegressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 'GradientBoostingRegressor': GradientBoostingRegressor(),
 'Ca

In [51]:
list(models.values())

[ExtraTreeRegressor(),
 DecisionTreeRegressor(),
 SVR(),
 RandomForestRegressor(),
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 GradientBoostingRegressor(),
 <catboost.core.CatBoostRegressor at 0x20d5aa57280>]

### Evaluating Performances on Train and Test Data

In [54]:
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_scaled,y_train_scaled.flatten())

    # Make predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(x_test_scaled)

    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train_scaled, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test_scaled, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

    

ExtraTreeRegressor
Model performance for Training set
- Mean Squared Error: 0.0064
- Mean Absolute Error: 0.0013
- R2 Score: 0.9913
----------------------------------
Model performance for Test set
- Mean Squared Error: 0.0366
- Mean Absolute Error: 0.0178
- R2 Score: 0.8875


DecisionTreeRegressor
Model performance for Training set
- Mean Squared Error: 0.0064
- Mean Absolute Error: 0.0013
- R2 Score: 0.9913
----------------------------------
Model performance for Test set
- Mean Squared Error: 0.0341
- Mean Absolute Error: 0.0160
- R2 Score: 0.8987




In [None]:
#evaluating Random Forest

rf = RandomForestRegressor()
rf.fit(X_train_scaled,y_train_scaled)

y_pred_trainrf = rf.predict(X_train_scaled)
y_pred_testrf = rf.predict(x_test_scaled)

print('Model performance for Training set')
print("- Mean Squared Error: ",mean_squared_error(y_train_scaled,y_pred_trainrf))
print("- Mean Absolute Error: " , mean_absolute_error(y_train_scaled,y_pred_trainrf))
print("- R2 Score: ",r2_score(y_train_scaled,y_pred_trainrf))

print('=======================================================================')
    
print('Model performance for Test set')
print("- Mean Squared Error: ",mean_squared_error(y_test_scaled,y_pred_testrf))
print("- Mean Absolute Error: " , mean_absolute_error(y_test_scaled,y_pred_testrf))
print("- R2 Score: ",r2_score(y_test_scaled,y_pred_testrf))

    

## Tuning Random Forest

In [46]:
rfr = RandomForestRegressor()

# Define the parameter grid
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)],  # Number of trees in the forest
    'max_features': ['sqrt', 'log2' , None],  # Number of features to consider at every split
    'max_depth': [int(x) for x in np.linspace(10, 100,10)],    # Maximum number of levels in tree
    'min_samples_split': [2, 5, 10, 14],            # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4, 6, 8],              # Minimum number of samples required at each leaf node   

}             


rs = RandomizedSearchCV(estimator=rfr,
                        param_distributions=param_grid,
                        scoring='r2',
                        cv=5,
                        n_jobs=-1)


rs.fit(X_train_scaled,y_train_scaled)

print('BEST SCORE : ',rs.best_score_)
print('BEST PARAMS : ',rs.best_params_)
print('cv results : \n',rs.cv_results_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

In [52]:
df

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost2plates,type
0,Yes,Yes,4.100000,487.0,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,Buffet
1,Yes,No,4.100000,487.0,Banashankari,Casual Dining,others,800.0,Buffet
2,Yes,No,3.800000,487.0,Banashankari,others,others,800.0,Buffet
3,No,No,3.700000,88.0,Banashankari,Quick Bites,"South Indian, North Indian",300.0,Buffet
4,No,No,3.800000,166.0,Basavanagudi,Casual Dining,others,600.0,Buffet
...,...,...,...,...,...,...,...,...,...
51037,No,No,3.600000,27.0,Whitefield,others,Continental,1500.0,Pubs and bars
51038,No,No,3.700142,0.0,Whitefield,others,Finger Food,600.0,Pubs and bars
51039,No,No,3.700142,0.0,Whitefield,others,Finger Food,2000.0,Pubs and bars
51040,No,Yes,4.300000,236.0,others,others,Finger Food,2500.0,Pubs and bars


### Previous models logs




- [ 2024-06-13 19:28:57,139 ] 23 root - INFO - Defined the preprocessor path. 
- [ 2024-06-13 19:29:00,523 ] 30 root - INFO - Entered the data ingestion method or component 
- [ 2024-06-13 19:29:00,552 ] 34 root - INFO - Read the dataset as dataframe 
- [ 2024-06-13 19:29:00,584 ] 41 root - INFO - Raw data saved 
- [ 2024-06-13 19:29:00,584 ] 44 root - INFO - Train test split initiated 
- [ 2024-06-13 19:29:00,615 ] 50 root - INFO - Ingestion of the data is completed 
- [ 2024-06-13 19:29:00,662 ] 74 root - INFO - Split data into features and target. 
- [ 2024-06-13 19:29:00,662 ] 35 root - INFO - Defined the numeric and categorical features. 
- [ 2024-06-13 19:29:00,662 ] 48 root - INFO - Defined the numeric and categorical pipelines. 
- [ 2024-06-13 19:29:00,693 ] 55 root - INFO - Defined the complete preprocessor. 
- [ 2024-06-13 19:29:00,693 ] 78 root - INFO - Fetched the preprocessor successfully. 
- [ 2024-06-13 19:29:00,740 ] 29 root - INFO - Saving ColumnTransformer(transformers=[('num_features',
                                 Pipeline(steps=[('imputing', SimpleImputer()),
                                                 ('scaling',
                                                  StandardScaler())]),
                                 ['votes', 'cost']),
                                ('cat_features',
                                 Pipeline(steps=[('imputing',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoding',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['online_order', 'book_table', 'rest_type',
                                  'type', 'city'])]) to artifacts\Preprocessor.pkl 
- [ 2024-06-13 19:29:00,752 ] 83 root - INFO - Saved the preprocessor object successfully. 
- [ 2024-06-13 19:29:00,791 ] 91 root - INFO - Data transformation complete: x_train_array shape: (3999, 40), y_train_array shape: (3999,), x_test_array shape: (1000, 40), y_test_array shape: (1000,) 
- [ 2024-06-13 19:29:00,792 ] 44 root - INFO - Defined the training and testing arrays for x and y 
- [ 2024-06-13 19:29:00,793 ] 55 root - INFO - Training Random Forest 
- [ 2024-06-13 19:33:12,486 ] 74 root - INFO - Random Forest - Train R2: 0.9630558308414389, Test R2: 0.7707794042833693 
- [ 2024-06-13 19:33:12,486 ] 55 root - INFO - Training Decision Tree 
- [ 2024-06-13 19:33:14,443 ] 74 root - INFO - Decision Tree - Train R2: 0.7859895643447465, Test R2: 0.6671835253209638 
- [ 2024-06-13 19:33:14,443 ] 55 root - INFO - Training Gradient Boosting 
- [ 2024-06-13 19:35:25,339 ] 74 root - INFO - Gradient Boosting - Train R2: 0.9299423258488857, Test R2: 0.7698732141638363 
- [ 2024-06-13 19:35:25,339 ] 55 root - INFO - Training Linear Regression 
- [ 2024-06-13 19:35:25,536 ] 74 root - INFO - Linear Regression - Train R2: 0.07847824799929326, Test R2: 0.03187729313470222 
- [ 2024-06-13 19:35:25,537 ] 55 root - INFO - Training SVR 
- [ 2024-06-13 19:44:56,043 ] 74 root - INFO - SVR - Train R2: 0.09428128542209391, Test R2: 0.020579323128126492 
- [ 2024-06-13 19:44:56,043 ] 55 root - INFO - Training XGBRegressor 
- [ 2024-06-13 19:45:27,156 ] 74 root - INFO - XGBRegressor - Train R2: 0.9202970584531581, Test R2: 0.7709133034835028 
- [ 2024-06-13 19:45:27,156 ] 55 root - INFO - Training CatBoost Regressor 
- [ 2024-06-13 19:47:21,918 ] 74 root - INFO - CatBoost Regressor - Train R2: 0.8839589532117992, Test R2: 0.7562099085025589 
- [ 2024-06-13 19:47:21,918 ] 55 root - INFO - Training AdaBoost Regressor 
- [ 2024-06-13 19:47:43,004 ] 74 root - INFO - AdaBoost Regressor - Train R2: 0.6830793008201845, Test R2: 0.642389114629312 
- [ 2024-06-13 19:47:43,005 ] 55 root - INFO - Training Extra Trees Regressor 
- [ 2024-06-13 19:51:05,758 ] 74 root - INFO - Extra Trees Regressor - Train R2: 0.9280186402762407, Test R2: 0.6672502069877334 
- [ 2024-06-13 19:51:05,758 ] 55 root - INFO - Training Bagging Regressor 
- [ 2024-06-13 19:51:58,208 ] 74 root - INFO - Bagging Regressor - Train R2: 0.9464261163548479, Test R2: 0.757871209590422 
- [ 2024-06-13 19:51:58,208 ] 140 root - INFO - Best model found: XGBRegressor with score: 0.7709133034835028 
- [ 2024-06-13 19:51:58,224 ] 29 root - INFO - Saving XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=8, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=200, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...) to artifacts\model.pkl 
