In [1]:
# Importing the required libraries
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
# the plot will be visible in jupyter only insted of separate window
import seaborn as sns

In [2]:
# Reading the zomato.csv dataset
df=pd.read_csv('EDA_FE.csv')

In [3]:
df.head()

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),listed_in(type)
0,Yes,Yes,4.1,775,Banashankari,Casual Dining,others,800,Buffet
1,Yes,No,4.1,787,Banashankari,Casual Dining,others,800,Buffet
2,Yes,No,3.8,918,Banashankari,others,others,800,Buffet
3,No,No,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300,Buffet
4,No,No,3.8,166,Basavanagudi,Casual Dining,others,600,Buffet


In [4]:
df.columns

Index(['online_order', 'book_table', 'rate', 'votes', 'location', 'rest_type',
       'cuisines', 'approx_cost(for two people)', 'listed_in(type)'],
      dtype='object')

# pipeline

In [5]:
X = df.drop(['rate'],axis=1)
y = df['rate']

In [6]:
# Handling Feature Scaling
from sklearn.preprocessing import StandardScaler 
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [7]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = ['online_order', 'book_table', 'location', 'rest_type', 'cuisines', 'listed_in(type)', 'approx_cost(for two people)']
numerical_cols = ['votes']

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

In [10]:
# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OneHotEncoder(sparse_output=False, handle_unknown='ignore', categories='auto')),
    ('scaler',StandardScaler())
    ]
)

In [11]:
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols)
])

# model

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor 

In [13]:
from sklearn.metrics import r2_score

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=10)

In [15]:
preprocessor.fit_transform(X_train)

array([[-0.17964288, -0.83418776,  0.83418776, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       [ 5.41229686,  1.19877088, -1.19877088, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       [-0.35315165,  1.19877088, -1.19877088, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       ...,
       [-0.162292  , -0.83418776,  0.83418776, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       [-0.31225315,  1.19877088, -1.19877088, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       [-0.32092859, -0.83418776,  0.83418776, ..., -0.05919713,
        -0.1186469 , -0.03524944]])

In [16]:
preprocessor.transform(X_test)

array([[ 0.40161149,  1.19877088, -1.19877088, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       [ 0.35327691,  1.19877088, -1.19877088, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       [-0.3134925 , -0.83418776,  0.83418776, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       ...,
       [-0.3184499 , -0.83418776,  0.83418776, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       [-0.35315165,  1.19877088, -1.19877088, ..., -0.05919713,
        -0.1186469 , -0.03524944],
       [-0.35315165,  1.19877088, -1.19877088, ..., -0.05919713,
        -0.1186469 , -0.03524944]])

In [17]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [18]:
def evaluate_regression_model(true, predicted):
    score = r2_score(true, predicted)
    return score

In [19]:
models={
    'LinearRegression':LinearRegression(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'KNN': KNeighborsRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'AdaBoosting': AdaBoostRegressor(),
    'XGBoost': XGBRegressor()
    }

In [20]:
trained_model_list=[]
model_list = []
score_list = []

In [21]:
list(models)

['LinearRegression',
 'DecisionTreeRegressor',
 'RandomForestRegressor',
 'KNN',
 'GradientBoosting',
 'AdaBoosting',
 'XGBoost']

In [22]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    print(model)

LinearRegression()
DecisionTreeRegressor()
RandomForestRegressor()
KNeighborsRegressor()
GradientBoostingRegressor()
AdaBoostRegressor()
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [23]:
models.keys()

dict_keys(['LinearRegression', 'DecisionTreeRegressor', 'RandomForestRegressor', 'KNN', 'GradientBoosting', 'AdaBoosting', 'XGBoost'])

In [24]:
models.values()

dict_values([LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(), KNeighborsRegressor(), GradientBoostingRegressor(), AdaBoostRegressor(), XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)])

In [25]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make Predictions
    y_test_pred = model.predict(X_test)
    y_train_pred= model.predict(X_train)

    # This is a validation (test) score
    train_model_score = evaluate_regression_model(y_train, y_train_pred)
    test_model_score = evaluate_regression_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("Train_Score:", train_model_score)
    print("Test_Score:", test_model_score)
    
    score_list.append(test_model_score)

    print('=' * 35)
    print('\n')

LinearRegression
Model Training Performance
Train_Score: 0.3343219939253478
Test_Score: -8.483873844205379e+21


DecisionTreeRegressor
Model Training Performance
Train_Score: 0.9950878797755638
Test_Score: 0.8524725216867695


RandomForestRegressor
Model Training Performance
Train_Score: 0.9807730243230233
Test_Score: 0.8943999539376455


KNN
Model Training Performance
Train_Score: 0.7455691926088803
Test_Score: 0.5663925771382041


GradientBoosting
Model Training Performance
Train_Score: 0.5193575640309995
Test_Score: 0.5033940588642531


AdaBoosting
Model Training Performance
Train_Score: 0.17203519095428077
Test_Score: 0.1754731579609482


XGBoost
Model Training Performance
Train_Score: 0.729603214583166
Test_Score: 0.682806162321334


