# Decision Tree Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [2]:
food_data = pd.read_csv('indian_food.csv')

In [3]:
food_data.head()

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,45,25,sweet,dessert,West Bengal,East
1,Boondi,"Gram flour, ghee, sugar",vegetarian,80,30,sweet,dessert,Rajasthan,West
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,15,30,sweet,dessert,Rajasthan,West
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,15,40,sweet,dessert,West Bengal,East


In [4]:
food_data = food_data.set_index('name')

## clear NaN values

In [5]:
food_data.isnull().sum().sum()

1

In [6]:
food_data = food_data.fillna('-1')

In [7]:
food_data.isnull().values.any()

False

In [8]:
food_data.shape

(255, 8)

## encode data

In [9]:
encoder = LabelEncoder()

In [10]:
encoded_data = food_data.copy()

In [11]:
encoded_data.iloc[:,4:8] = food_data.iloc[:,4:8].apply(encoder.fit_transform)

In [12]:
encoded_data.iloc[:,1] = encoder.fit_transform(food_data.iloc[:,1])

In [13]:
ingredients_df = food_data['ingredients'].str.split(',', expand=True)

In [14]:
ingredients_df = ingredients_df.fillna('-1')

In [15]:
ingredients_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Balu shahi,Maida flour,yogurt,oil,sugar,-1,-1,-1,-1,-1,-1
Boondi,Gram flour,ghee,sugar,-1,-1,-1,-1,-1,-1,-1
Gajar ka halwa,Carrots,milk,sugar,ghee,cashews,raisins,-1,-1,-1,-1
Ghevar,Flour,ghee,kewra,milk,clarified butter,sugar,almonds,pistachio,saffron,green cardamom
Gulab jamun,Milk powder,plain flour,baking powder,ghee,milk,sugar,water,rose water,-1,-1
...,...,...,...,...,...,...,...,...,...,...
Til Pitha,Glutinous rice,black sesame seeds,gur,-1,-1,-1,-1,-1,-1,-1
Bebinca,Coconut milk,egg yolks,clarified butter,all purpose flour,-1,-1,-1,-1,-1,-1
Shufta,Cottage cheese,dry dates,dried rose petals,pistachio,badam,-1,-1,-1,-1,-1
Mawa Bati,Milk powder,dry fruits,arrowroot powder,all purpose flour,-1,-1,-1,-1,-1,-1


In [16]:
ingredients = list(np.unique(ingredients_df.values.ravel()))

In [17]:
for value , i in zip(ingredients,range(len(ingredients))):
    ingredients_df = ingredients_df.replace(value,int(i))

In [18]:
for i in range(len(ingredients_df.columns)):
    encoded_data['ingredient_' + str(i+1)] = ingredients_df[i]

In [19]:
encoded_data = encoded_data.drop(['ingredients'], axis=1)

In [20]:
encoded_data.head()

Unnamed: 0_level_0,diet,prep_time,cook_time,flavor_profile,course,state,region,ingredient_1,ingredient_2,ingredient_3,ingredient_4,ingredient_5,ingredient_6,ingredient_7,ingredient_8,ingredient_9,ingredient_10
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Balu shahi,1,45,25,4,0,24,2,374,294,190,253,295,295,295,295,295,295
Boondi,1,80,30,4,0,18,6,355,121,253,295,295,295,295,295,295,295
Gajar ka halwa,1,15,60,4,0,17,3,322,174,253,121,48,212,295,295,295,295
Ghevar,1,15,30,4,0,18,6,350,121,151,174,65,253,5,201,230,133
Gulab jamun,1,15,40,4,0,24,2,378,202,16,121,174,253,275,228,295,295


## rotate course column to end of table

In [21]:
output = encoded_data['course']

In [22]:
encoded_data = encoded_data.drop(['course'], axis='columns')

In [23]:
encoded_data['course'] = output

In [24]:
encoded_data.head()

Unnamed: 0_level_0,diet,prep_time,cook_time,flavor_profile,state,region,ingredient_1,ingredient_2,ingredient_3,ingredient_4,ingredient_5,ingredient_6,ingredient_7,ingredient_8,ingredient_9,ingredient_10,course
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Balu shahi,1,45,25,4,24,2,374,294,190,253,295,295,295,295,295,295,0
Boondi,1,80,30,4,18,6,355,121,253,295,295,295,295,295,295,295,0
Gajar ka halwa,1,15,60,4,17,3,322,174,253,121,48,212,295,295,295,295,0
Ghevar,1,15,30,4,18,6,350,121,151,174,65,253,5,201,230,133,0
Gulab jamun,1,15,40,4,24,2,378,202,16,121,174,253,275,228,295,295,0


In [25]:
encoded_data = encoded_data.replace(-1, 1)

## prepare modeling

In [26]:
data = encoded_data.values

In [27]:
X, y = data[:, :-1].astype('int'), data[:, -1].astype('int')

In [28]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       0, 2, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 3, 2, 1, 1, 1, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 0, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2, 2,
       1, 1, 2, 2, 1, 2, 0, 1, 0, 2, 2, 0, 1, 1, 1, 2, 2, 2, 1, 1, 1, 0,
       2, 2, 1, 1, 2, 1, 1, 2, 0, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0])

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.80, random_state=1, stratify=y)

## DecistionTree model

In [30]:
treeclf = DecisionTreeClassifier(random_state=0)

In [31]:
treeclf = treeclf.fit(X_train, y_train)

In [32]:
yhat = treeclf.predict(X_test)

In [33]:
acc = accuracy_score(y_test, yhat)

In [34]:
acc

0.803921568627451

## Bernoulli Naive Bayes Model

In [35]:
bnb = BernoulliNB()

In [36]:
bnb = bnb.fit(X_train, y_train)

In [37]:
y_pred = bnb.predict(X_test)

In [38]:
acc = accuracy_score(y_test, y_pred)

In [39]:
acc

0.47058823529411764

## Multinomail Naive Bayes Model

In [40]:
mnb = MultinomialNB()

In [41]:
mnb = mnb.fit(X_train, y_train)

In [42]:
y_pred = mnb.predict(X_test)

In [43]:
acc = accuracy_score(y_test, y_pred)

In [44]:
acc

0.7058823529411765