<center><span style="color:#b30000;font-size:35px;"><strong>Modeling Phase</strong></span></center>

<span style="color:#2929a3;font-size:20px;">Import Libraries</span>

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, MultiLabelBinarizer, LabelEncoder
from category_encoders import BinaryEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SequentialFeatureSelector

<span style="color:#2929a3;font-size:20px;">Read Dataset</span>

In [2]:
# Load the pickled DataFrame (path kept exactly as stored, including its spelling).
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle('Data/df_cleanded.pkl')
# Preview the first rows; columns are a two-level MultiIndex of (group, field).
df.head()

Unnamed: 0_level_0,Original_Columns,Original_Columns,Original_Columns,Original_Columns,Original_Columns,Original_Columns,Original_Columns,Original_Columns,Rest_Type,Rest_Type,...,Cuisines,Cuisines,Cuisines,Listed_in_Type,Listed_in_Type,Listed_in_Type,Listed_in_Type,Listed_in_Type,Listed_in_Type,Listed_in_Type
Unnamed: 0_level_1,name,online_order,book_table,rate,votes,location,dish_liked,approx_cost(for two people),Bakery,Bar,...,Vegan,Vietnamese,Wraps,Buffet,Cafes,Delivery,Desserts,Dine-out,Drinks & nightlife,Pubs and bars
0,Jalsa,Yes,Yes,4.1,775.0,Banashankari,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",800.0,0,0,...,0,0,0,1,0,1,0,1,0,0
1,Spice Elephant,Yes,No,4.1,787.0,Banashankari,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",800.0,0,0,...,0,0,0,1,0,1,0,1,0,0
2,San Churro Cafe,Yes,No,3.8,918.0,Banashankari,"Churros, Cannelloni, Minestrone Soup, Hot Choc...",800.0,0,0,...,0,0,0,1,1,1,1,1,0,0
3,Addhuri Udupi Bhojana,No,No,3.7,88.0,Banashankari,Masala Dosa,300.0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,Grand Village,No,No,3.8,166.0,Basavanagudi,"Panipuri, Gol Gappe",600.0,0,0,...,0,0,0,1,0,0,0,1,0,0


<span style="color:#2929a3;font-size:20px;">Check Info</span>

In [3]:
df['Original_Columns'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14118 entries, 0 to 14117
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         14118 non-null  object 
 1   online_order                 14118 non-null  object 
 2   book_table                   14118 non-null  object 
 3   rate                         10925 non-null  float64
 4   votes                        14118 non-null  float64
 5   location                     14118 non-null  object 
 6   dish_liked                   5435 non-null   object 
 7   approx_cost(for two people)  14061 non-null  float64
dtypes: float64(3), object(5)
memory usage: 882.5+ KB


<span style="color:#2929a3;font-size:20px;">Drop rows that have NaN values in the "rate" column</span>

In [4]:
# Keep only rows that have a rating; the target is derived from this column.
df = df.dropna(subset=[('Original_Columns', 'rate')])

In [5]:
df.shape

(10925, 147)

<span style="color:#2929a3;font-size:20px;">Create Target Column from Rate Column</span>

In [6]:
## Create create_target function
def create_target(rate, threshold=3.75):
    """Convert a numeric rating into a binary class label.

    Parameters
    ----------
    rate : float
        The rating value to classify.
    threshold : float, default 3.75
        Cut-off; ratings strictly greater than this are labelled positive.

    Returns
    -------
    int
        1 when ``rate > threshold``, otherwise 0. A NaN rating compares
        as False and therefore yields 0.
    """
    return 1 if rate > threshold else 0

In [7]:
## Derive the binary Target column from the rate column
rate_values = df[('Original_Columns', 'rate')]
df['Target'] = rate_values.map(create_target)

In [8]:
df['Target'].value_counts(normalize=True)

Target
0    0.590206
1    0.409794
Name: proportion, dtype: float64

<span style="color:#2929a3;font-size:20px;">Drop Unnecessary Columns</span>

In [9]:
# Fields under 'Original_Columns' to remove before modeling.
# 'rate' is the source of the Target column, so it must not remain as a feature.
cols_to_drop = ['name','rate','votes','dish_liked']

In [10]:
# Remove all listed 'Original_Columns' fields in a single call.
df = df.drop(columns=[('Original_Columns', field) for field in cols_to_drop])

In [11]:
## Check Duplications
df.duplicated().sum()

1562

In [12]:
## Drop duplicate rows
df = df.drop_duplicates()

In [13]:
## Remove every row that still contains a missing value
df = df.dropna()

In [14]:
## Renumber rows 0..n-1 after the removals above
df = df.reset_index(drop=True)

In [15]:
## Check Shape
df.shape

(9328, 144)

<span style="color:#2929a3;font-size:20px;">Check Missing Values</span>

In [16]:
df['Original_Columns'].isnull().sum()

online_order                   0
book_table                     0
location                       0
approx_cost(for two people)    0
dtype: int64

<span style="color:#2929a3;font-size:20px;">Drop "Rest_Type" columns that have a frequency of less than 300</span>

In [17]:
# Column-wise sum of every 'Rest_Type' column, largest first.
# Presumably each column is a 0/1 indicator, making the sum a row count per type — TODO confirm.
rest_type_series = df['Rest_Type'].sum().sort_values(ascending=False)

In [18]:
# Labels of the 'Rest_Type' columns whose total is below 300.
Rest_Type_less_300 = [label for label, total in rest_type_series.items() if total < 300]

In [19]:
# Drop all rare 'Rest_Type' columns in one call.
df = df.drop(columns=[('Rest_Type', label) for label in Rest_Type_less_300])

<span style="color:#2929a3;font-size:20px;">Drop "Cuisines" columns that have a frequency of less than 300</span>

In [20]:
# Column-wise sum of every 'Cuisines' column, largest first.
# Presumably each column is a 0/1 indicator, making the sum a row count per cuisine — TODO confirm.
cuisines_series = df['Cuisines'].sum().sort_values(ascending=False)

In [21]:
# Labels of the 'Cuisines' columns whose total is below 300.
cuisines_less_300 = [label for label, total in cuisines_series.items() if total < 300]

In [22]:
# Drop all rare 'Cuisines' columns in one call.
df = df.drop(columns=[('Cuisines', label) for label in cuisines_less_300])

In [23]:
df.shape

(9328, 40)

In [24]:
## Check Duplicates
df.duplicated().sum()

135

In [25]:
## Drop rows that became duplicates once the rare columns were removed
df = df.drop_duplicates()

In [26]:
## Renumber rows 0..n-1 again
df = df.reset_index(drop=True)

In [27]:
df.shape

(9193, 40)

<span style="color:#2929a3;font-size:20px;">Split Data into X and Y</span>

In [28]:
# Label column first, then the feature matrix (everything except the label).
Y = df['Target']
X = df.drop(columns='Target')

In [29]:
# Collapse each (group, field) column label into a single 'group_field' string
X.columns = X.columns.map('_'.join)

In [32]:
X.head(3)

Unnamed: 0,Original_Columns_online_order,Original_Columns_book_table,Original_Columns_location,Original_Columns_approx_cost(for two people),Rest_Type_Bakery,Rest_Type_Bar,Rest_Type_Beverage Shop,Rest_Type_Cafe,Rest_Type_Casual Dining,Rest_Type_Delivery,...,Cuisines_Seafood,Cuisines_South Indian,Cuisines_Street Food,Listed_in_Type_Buffet,Listed_in_Type_Cafes,Listed_in_Type_Delivery,Listed_in_Type_Desserts,Listed_in_Type_Dine-out,Listed_in_Type_Drinks & nightlife,Listed_in_Type_Pubs and bars
0,Yes,Yes,Banashankari,800.0,0,0,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
1,Yes,No,Banashankari,800.0,0,0,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
2,Yes,No,Banashankari,800.0,0,0,0,1,1,0,...,0,0,0,1,1,1,1,1,0,0


<span style="color:#2929a3;font-size:20px;">Create Preprocessor Column Transformer</span>

In [514]:
# Columns that get binary-encoded.
encoded_cols = [
    'Original_Columns_online_order',
    'Original_Columns_book_table',
    'Original_Columns_location',
]
# Column that gets robust-scaled.
scaled_cols = ['Original_Columns_approx_cost(for two people)']

# Every column not listed above is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('Encoder', BinaryEncoder(), encoded_cols),
        ('Scaler', RobustScaler(), scaled_cols),
    ],
    remainder='passthrough',
)

<span style="color:#2929a3;font-size:20px;">Model Selection Process</span>

In [515]:
# Fixed seed so the randomized estimators produce the same scores on every
# Restart & Run All (they were previously unseeded).
RANDOM_STATE = 42

# (display name, estimator) pairs; each pair is used directly as a Pipeline step.
Models = [
    ('Logistic Regression', LogisticRegression(max_iter=10000)),
    ('Knn', KNeighborsClassifier()),
    # The Naive Bayes variants (MultinomialNB, GaussianNB, BernoulliNB) are currently disabled.
    ('SVM', SVC()),
    ('Decision Tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('Random Forest', RandomForestClassifier(random_state=RANDOM_STATE)),
    (
        'Voting Classifier',
        VotingClassifier(
            estimators=[
                ('Logistic Regression', LogisticRegression(max_iter=10000)),
                ('Knn', KNeighborsClassifier()),
                ('Decision Tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
                ('Random Forest', RandomForestClassifier(random_state=RANDOM_STATE)),
            ]
        ),
    ),
    ('XGBoost', XGBClassifier(random_state=RANDOM_STATE)),
]

In [516]:
# Score each candidate with 5-fold cross-validation inside the full
# preprocessing pipeline and report mean train/test accuracy.
for model in Models:
    model_name = model[0]
    # A SequentialFeatureSelector step was tried between these two steps and is currently disabled.
    steps = [('preprocessor', preprocessor), model]
    pipeline = Pipeline(steps=steps)
    res = cross_validate(
        pipeline,
        X,
        Y,
        cv=5,
        scoring='accuracy',
        return_train_score=True,
        n_jobs=-1,
    )
    train_accuracy = res['train_score'].mean()
    test_accuracy = res['test_score'].mean()
    print(f"Train Accuracy of {model_name} is ", train_accuracy)
    print(f"Test Accuracy of {model_name} is ", test_accuracy)
    print("*" * 70)

Train Accuracy of Logistic Regression is  0.7098063198817794
Test Accuracy of Logistic Regression is  0.6954179809838933
**********************************************************************
Train Accuracy of Knn is  0.7807026979957169
Test Accuracy of Knn is  0.6705103603995405
**********************************************************************
Train Accuracy of SVM is  0.741107396428864
Test Accuracy of SVM is  0.7077106413394705
**********************************************************************
Train Accuracy of Decision Tree is  0.9714728648347244
Test Accuracy of Decision Tree is  0.6112281891386068
**********************************************************************
Train Accuracy of Random Forest is  0.9714728648347244
Test Accuracy of Random Forest is  0.6810619979041929
**********************************************************************
Train Accuracy of Voting Classifier is  0.8700913999179495
Test Accuracy of Voting Classifier is  0.6942241046223139
************

<span style="color:#2929a3;font-size:20px;">Create Random Forest Pipeline</span>

In [517]:
# Preprocessing + Random Forest pipeline used as the estimator for hyperparameter tuning.
# The step name 'Model' is what the 'Model__*' keys of the parameter grid refer to.
# random_state is fixed so tuning results are reproducible across runs.
steps = [
    ('preprocessor', preprocessor),
    ('Model', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps=steps)

<span style="color:#2929a3;font-size:20px;">Hyperparameter Tuning</span>

In [391]:
from sklearn.model_selection import GridSearchCV

In [415]:
# Search space for the pipeline step named 'Model' (keys use the 'Model__' prefix).
param = [
    {
        'Model__criterion': ['gini', 'entropy'],
        'Model__max_depth': [16, 17, 18],
        'Model__min_samples_split': [9, 10, 11, 12],
        'Model__n_estimators': [85, 90, 95],
    }
]

In [416]:
# Accuracy-scored 5-fold grid search over `param`, keeping train scores and using all CPU cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

In [421]:
# Run the search on the full feature matrix and labels (5-fold CV per candidate, as configured on `grid`).
grid.fit(X,Y)

In [422]:
grid.best_params_

{'Model__criterion': 'gini',
 'Model__max_depth': 18,
 'Model__min_samples_split': 11,
 'Model__n_estimators': 90}

In [423]:
grid.cv_results_['mean_train_score'][grid.best_index_]

0.8094203684431509

In [424]:
grid.cv_results_['mean_test_score'][grid.best_index_]

0.7164137438085821

<span style="color:#2929a3;font-size:20px;">Create Final Pipeline</span>

In [518]:
# Final pipeline: same preprocessing plus a Random Forest with hard-coded hyperparameters.
# random_state is fixed so the persisted model can be regenerated exactly.
steps = [
    ('preprocessor', preprocessor),
    (
        'Model',
        RandomForestClassifier(
            n_estimators=90,
            criterion='gini',
            max_depth=18,
            min_samples_split=11,
            random_state=42,
        ),
    ),
]
pipeline = Pipeline(steps=steps)
# Fit on all available rows.
pipeline.fit(X, Y)

<span style="color:#2929a3;font-size:20px;">Dump Model and Inputs</span>

In [519]:
# Metadata saved next to the model (presumably read by whatever loads Model.pkl — verify against the consumer).
inputs_dict = {
    # Flattened names of the original, non-indicator feature columns. Selected by
    # prefix rather than by position, so a change in column order cannot silently alter the list.
    'columns_names': [col for col in X.columns if col.startswith('Original_Columns_')],
    # Labels of the columns kept in each indicator group.
    'Rest_Type_Cols': df['Rest_Type'].columns.tolist(),
    'Cuisines_Cols': df['Cuisines'].columns.tolist(),
    'Listed_in_Type': df['Listed_in_Type'].columns.tolist(),
    # Distinct location values present in the data, in order of first appearance.
    'Location': df[('Original_Columns', 'location')].unique().tolist(),
}

In [520]:
import joblib
# Persist the input metadata first, then the fitted pipeline.
joblib.dump(value=inputs_dict, filename='Data/inputs_dict.pkl')
joblib.dump(value=pipeline, filename='Data/Model.pkl')

['Data/Model.pkl']