In [3]:
# Importing the required libraries
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
# render plots inline in the notebook instead of in a separate window
import seaborn as sns

In [4]:
# Load the prepared dataset from EDA.csv and show the frame as the cell output
df = pd.read_csv(filepath_or_buffer='EDA.csv')
df

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),listed_in(type)
0,Yes,Yes,4.100000,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.000000,Buffet
1,Yes,No,4.100000,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.000000,Buffet
2,Yes,No,3.800000,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.000000,Buffet
3,No,No,3.700000,88,Banashankari,Quick Bites,"South Indian, North Indian",300.000000,Buffet
4,No,No,3.800000,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.000000,Buffet
...,...,...,...,...,...,...,...,...,...
51712,No,No,3.600000,27,Whitefield,Bar,Continental,416.630112,Pubs and bars
51713,No,No,3.700449,0,Whitefield,Bar,Finger Food,600.000000,Pubs and bars
51714,No,No,3.700449,0,Whitefield,Bar,Finger Food,416.630112,Pubs and bars
51715,No,Yes,4.300000,236,"ITPL Main Road, Whitefield",Bar,Finger Food,416.630112,Pubs and bars


In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# A single encoder is refit for each binary column in turn, so once this cell
# has run `le` holds the fit from the last column processed ('book_table').
le = LabelEncoder()

# Replace the labels of each binary column with integer codes
for binary_col in ('online_order', 'book_table'):
    df[binary_col] = le.fit_transform(df[binary_col])

In [6]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),listed_in(type)
0,1,1,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,Buffet
1,1,0,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,Buffet
2,1,0,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,Buffet
3,0,0,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,Buffet
4,0,0,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,Buffet


In [7]:
# Features are every column except 'rate'; the target is the 'rate' column
X, y = df.drop(columns=['rate']), df['rate']

In [8]:
# Handling Feature Scaling
from sklearn.preprocessing import StandardScaler 
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [20]:
# Columns routed to the categorical pipeline (imputed, then one-hot encoded)
# versus the numerical pipeline (imputed, then standardised).
# 'approx_cost(for two people)' holds continuous float values, so it is treated
# as a numeric feature instead of being expanded into one indicator per price.
categorical_cols = ['online_order', 'book_table', 'location', 'rest_type', 'cuisines', 'listed_in(type)']
numerical_cols = ['votes', 'approx_cost(for two people)']

In [21]:
## Numerical pipeline: fill missing values with the column mean, then standardise
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [22]:
# Categorical pipeline: fill missing values with the most frequent value,
# one-hot encode, then standardise the resulting indicator columns.
# NOTE: the encoding step is registered under the name 'ordinalencoder' although
# the transformer it holds is a OneHotEncoder; the name is kept unchanged because
# it is the key by which the step is addressed.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OneHotEncoder(categories='auto', handle_unknown='ignore', sparse_output=False),
    ),
    ('scaler', StandardScaler()),
])

In [23]:
# Send each column group through its own pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor

In [31]:
# Candidate regressors keyed by display name.
# Estimators that involve randomness get a fixed random_state so that repeated
# runs produce the same fits; LinearRegression and KNeighborsRegressor take no
# such parameter.
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'KNN': KNeighborsRegressor(),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'AdaBoosting': AdaBoostRegressor(random_state=42),
}

In [36]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: linear regression on the numeric columns of `df`.
# 'Unnamed: 0' is the row-number column that read_csv creates for an unnamed
# first CSV column. It is not a restaurant attribute, so it is dropped together
# with the target instead of being used as a predictor. errors='ignore' keeps
# the cell working when that column is absent.
# NOTE: metrics from this cell and the model cells that reuse X_train/X_test
# will differ from runs in which the row-number column was part of X.
X = df.select_dtypes(include=[np.number]).drop(columns=['rate', 'Unnamed: 0'], errors='ignore')
y = df['rate']  # Target variable

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training rows and predict the held-out rows
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Evaluate on the test set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 0.11539203801983743
R^2 Score: 0.26040365091363915


In [48]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# GridSearchCV is used below. Importing it in this cell keeps the notebook
# runnable top to bottom; otherwise the name is only imported in a later cell.
from sklearn.model_selection import GridSearchCV
from math import sqrt


# Random forest with fixed settings and seed; this is the model scored below
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid explored by the cross-validated search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4]
}

# Base estimator for the search
rf = RandomForestRegressor()

# 5-fold grid search scored by negative mean squared error
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Run the search on the training data
grid_search_rf.fit(X_train, y_train)

# NOTE(review): the RMSE printed below comes from `random_forest_model`
# (fixed settings above), not from the result of the grid search.
random_forest_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_forest_model.predict(X_test)

# Root mean squared error on the test set
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 0.2257467253607409


In [46]:
from math import sqrt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV  # hyperparameter search
from sklearn.tree import DecisionTreeRegressor

# Candidate settings explored by the cross-validated search
param_grid = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
)

# 5-fold grid search over an unseeded tree, scored by negative MSE
dt = DecisionTreeRegressor()
grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search_dt.fit(X_train, y_train)

# Separately fit a default tree with a fixed seed; this is the model scored below
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = decision_tree_model.predict(X_test)

# Report the test-set RMSE
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 0.23160929069066402


In [49]:
from math import sqrt

from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting model with fixed settings and seed; this is the model scored below
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                      max_depth=3, random_state=42)
# Hyperparameter grid explored by the cross-validated search
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2]
}

# Base estimator for the search
gbr = GradientBoostingRegressor()

# 5-fold grid search scored by negative mean squared error
grid_search_gbr = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Run the search on the training data
grid_search_gbr.fit(X_train, y_train)

# NOTE(review): the scores printed below come from `gbr_model` (fixed settings
# above), not from the result of the grid search.
gbr_model.fit(X_train, y_train)

# Predict on training and test sets
y_train_pred = gbr_model.predict(X_train)
y_test_pred = gbr_model.predict(X_test)

# Calculate and print the training and test scores (R^2)
train_score = r2_score(y_train, y_train_pred)
test_score = r2_score(y_test, y_test_pred)

print(f'Training Score (R^2): {train_score}')
print(f'Test Score (R^2): {test_score}')

# RMSE as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn releases;
# this form works on every version and matches the other model cells.
train_rmse = sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = sqrt(mean_squared_error(y_test, y_test_pred))

print(f'Training RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}')

Training Score (R^2): 0.45675923251629524
Test Score (R^2): 0.4520164693333042
Training RMSE: 0.2914839982646578
Test RMSE: 0.2923979312917792


In [50]:
from math import sqrt

from sklearn.ensemble import AdaBoostRegressor
# AdaBoost model with fixed settings and seed; this is the model scored below.
# The base estimator is a decision tree regressor with max_depth=3 by default, but you can change it
ada_boost_model = AdaBoostRegressor(n_estimators=50, learning_rate=1.0, random_state=42)

# Hyperparameter grid explored by the cross-validated search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'loss': ['linear', 'square', 'exponential']
}

# Base estimator for the search
abr = AdaBoostRegressor()

# 5-fold grid search scored by negative mean squared error
grid_search_abr = GridSearchCV(estimator=abr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Run the search on the training data
grid_search_abr.fit(X_train, y_train)

# NOTE(review): the scores printed below come from `ada_boost_model` (fixed
# settings above), not from the result of the grid search.
ada_boost_model.fit(X_train, y_train)

# Predict on training and test sets
y_train_pred = ada_boost_model.predict(X_train)
y_test_pred = ada_boost_model.predict(X_test)

# Calculate and print the training and test scores (R^2)
train_score = r2_score(y_train, y_train_pred)
test_score = r2_score(y_test, y_test_pred)

print(f'Training Score (R^2): {train_score}')
print(f'Test Score (R^2): {test_score}')

# RMSE as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn releases;
# this form works on every version and matches the other model cells.
train_rmse = sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = sqrt(mean_squared_error(y_test, y_test_pred))

print(f'Training RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}')

Training Score (R^2): 0.17677548673672372
Test Score (R^2): 0.17276009048084628
Training RMSE: 0.35882086435037847
Test RMSE: 0.3592578626443141


In [51]:
from math import sqrt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

# Scale the features first: both the grid search and the final model below are
# fitted on the scaled matrices, so they must exist before either is used.
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter grid explored by the cross-validated search
param_grid = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Base estimator for the search
knn = KNeighborsRegressor()

# 5-fold grid search scored by negative mean squared error
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Run the search on the scaled training data
grid_search_knn.fit(X_train_scaled, y_train)

# NOTE(review): the scores printed below come from `knn_model` (n_neighbors=5),
# not from the result of the grid search.
knn_model = KNeighborsRegressor(n_neighbors=5)

# Fit the model on the scaled training data
knn_model.fit(X_train_scaled, y_train)

# Predict on training and test sets
y_train_pred = knn_model.predict(X_train_scaled)
y_test_pred = knn_model.predict(X_test_scaled)

# Calculate and print the training and test scores (R^2)
train_score = r2_score(y_train, y_train_pred)
test_score = r2_score(y_test, y_test_pred)

print(f'Training Score (R^2): {train_score}')
print(f'Test Score (R^2): {test_score}')

# RMSE as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn releases;
# this form works on every version and matches the other model cells.
train_rmse = sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = sqrt(mean_squared_error(y_test, y_test_pred))

print(f'Training RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}')

Training Score (R^2): 0.7047909756380105
Test Score (R^2): 0.6089339698612548
Training RMSE: 0.2148737462946882
Test RMSE: 0.247010599045378
