<a href="https://colab.research.google.com/github/Js2604/Recipe-Classifier/blob/main/Recipe_Classifier_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recipe Classifier

Goal: Use a recipe's ingredient list to predict which national cuisine it belongs to.

Import packages


In [None]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.utils.np_utils import to_categorical
from keras.layers import Dense

In [None]:
!pip install jellyfish
import jellyfish

Read the training data from `train.json`, which contains a list of 33000 recipes, each with its cuisine and ingredient list.

In [None]:
from sklearn.neural_network import MLPClassifier
# Load the training recipes into a DataFrame; later cells read its
# 'ingredients' and 'cuisine' columns.
train = pd.read_json("train.json")

Preprocess the data by counting how often each ingredient appears, so that misspelt or uncommon ingredients can be removed to reduce training time and overfitting.

In [None]:
import collections

# Count how many times each ingredient name occurs across all training recipes.
# Counter is a dict subclass, so code that reads `s.items()` or `s[name]` keeps
# working, and keys stay in first-seen order (which keeps later tie-breaking in
# a stable sort unchanged).
s = collections.Counter()
for row in train['ingredients']:
    s.update(row)


Keep the 5,000 most frequent ingredients and view the ten most common. They look reasonable, so it looks like preprocessing has succeeded!

In [None]:
# Rank the (ingredient, count) pairs once, most frequent first. sorted() is
# stable, so ingredients with equal counts keep their original relative order.
pop = sorted(s.items(), key=lambda item: item[1], reverse=True)

# Vocabulary used for the feature vectors: names of the 5000 most frequent
# ingredients, in rank order.
od = [name for name, _ in pop[:5000]]

pop[:10]

In [None]:
import matplotlib.pyplot as plt

# Larger font so the labels stay readable on the 20x20 inch figure.
plt.rcParams.update({'font.size': 22})

# Split the ranked (ingredient, count) pairs into names and counts, then keep
# only the leading entries for the chart.
all_names, all_counts = zip(*pop)
top_n = 20
names = list(all_names[:top_n])
counts = list(all_counts[:top_n])
positions = list(range(len(names)))

fig, ax = plt.subplots(figsize=(20, 20))
ax.bar(positions, counts, color='pink')
ax.set_xlabel("Ingredient")
ax.set_ylabel("Count")
ax.set_title("Frequency of ingredients across training data")
# One tick per bar, labelled with the ingredient name and rotated to avoid overlap.
ax.set_xticks(positions)
ax.set_xticklabels(names, rotation=-90)

plt.show()

Group similar ingredients together using a simple substring comparison to reduce the number of possible ingredients. Examples of groupings are printed.

Format: grouping | original ingredient

In [None]:
# Build one 0/1 feature row per training recipe, with one column per vocabulary
# ingredient.
X = []
# NOTE: `s` is rebound here from the count mapping to the ordered vocabulary
# list; later cells use `len(s)` and `s.index(...)` on this list.
s = od

# Column lookup by exact name (first occurrence wins, matching list.index).
index_of = {}
for col, name in enumerate(s):
    index_of.setdefault(name, col)

for row in train['ingredients']:
    temp = [0] * len(s)
    for ingredient in row:
        col = index_of.get(ingredient)
        if col is not None:
            # Exact vocabulary hit.
            temp[col] = 1
            continue
        # Fallback grouping: take the first (most frequent) vocabulary entry
        # that occurs in the ingredient preceded by a space, provided the
        # ingredient does not also contain it with a space on both sides.
        for col, cur_ing in enumerate(s):
            if (" " + cur_ing) in ingredient and (" " + cur_ing + " ") not in ingredient:
                print(cur_ing, "|", ingredient)
                temp[col] = 1
                break
    X.append(temp)


In [None]:
# Distinct cuisine labels in order of first appearance; a label's position in
# this list is used as its class id.
cuisines = list(dict.fromkeys(train['cuisine']))

In [None]:
# Integer class id for every recipe: the position of its cuisine in `cuisines`.
y = [cuisines.index(cuisine) for cuisine in train['cuisine']]

# Hold-out split: the first z rows are used for training, the rest for testing.
z = 30000
x_test = X[z:]
x_train = X[:z]
y_test = y[z:]
y_train = y[:z]

# One-hot encode with an explicit class count so both splits always get the
# same number of columns, even if one split never contains the highest class id.
num_classes = len(cuisines)
y_train_one = np.array(to_categorical(y_train, num_classes=num_classes))
y_test_one = np.array(to_categorical(y_test, num_classes=num_classes))

# Each row is already a flat 0/1 list, so stacking the rows yields the 2-D
# array directly (transposing a 1-D row has no effect).
x_train_one = np.array(x_train)
x_test_one = np.array(x_test)

Training neural network with cleaned data

In [None]:
from keras.utils import to_categorical
from keras import regularizers
from keras.layers import LeakyReLU
from keras.layers import Dropout

# Size the softmax layer from the one-hot training labels so the output width
# always matches the targets passed to fit().
n_classes = y_train_one.shape[1]

model = Sequential()

# Hidden block 1: the only layer that needs the input width (one column per
# vocabulary ingredient).
model.add(Dense(units=100, input_shape=(len(s), )))
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(rate=0.5))

# Hidden block 2: its input shape is inferred from the previous layer, so no
# input_shape is given here.
model.add(Dense(units=100))
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(rate=0.5))

# Output: one softmax unit per class.
model.add(Dense(units=n_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='nadam',
              metrics=['accuracy'])

model.fit(x_train_one, y_train_one, epochs=15, batch_size=2200, shuffle=True)
model

The model achieved an accuracy of 0.777.

In [None]:
# Score the model on the held-out arrays, then show the metric names that the
# printed numbers correspond to.
test_scores = model.evaluate(x_test_one, y_test_one)
print(test_scores)
model.metrics_names

In [None]:
def predict(ingredients):
    """Predict the cuisine for one recipe.

    Builds a 0/1 vector over the global vocabulary list `s` with the same
    rules used for the training rows: an exact name match sets that column;
    otherwise the first vocabulary entry that occurs in the ingredient
    preceded by a space (and that the ingredient does not also contain with a
    space on both sides) sets its column. The vector is passed to the global
    `model`, and the highest-scoring output index is looked up in the global
    `cuisines` list.

    Args:
        ingredients: iterable of ingredient name strings.

    Returns:
        The entry of `cuisines` with the highest output score (the first such
        entry on ties).
    """
    features = [0] * len(s)
    for ingredient in ingredients:
        if ingredient in s:
            features[s.index(ingredient)] = 1
            continue
        # enumerate gives the column directly instead of re-scanning with .index
        for col, cur_ing in enumerate(s):
            if (" " + cur_ing) in ingredient and (" " + cur_ing + " ") not in ingredient:
                features[col] = 1
                break
    # The model expects a batch, so wrap the single row in an outer list.
    pred = model.predict_on_batch(np.array([features]))
    return cuisines[int(np.argmax(pred[0]))]

Test the prediction for a recipe with the ingredients ['rice', 'beef', 'soy sauce', 'broccoli', 'salt']. The model predicts that this is Chinese cuisine.

In [None]:
# Run the classifier on a small hand-written ingredient list.
ing = ['rice', 'beef', 'soy sauce', 'broccoli', 'salt']
print("ingredients:", ing)
predicted_cuisine = predict(ing)
print("prediction:", predicted_cuisine)

Taking a look at predictions on the testing set.

In [None]:
# Load test.json into a DataFrame; a later cell reads its 'id' and
# 'ingredients' columns to produce one prediction per row.
test_tbl = pd.read_json("test.json")
test_tbl

In [None]:
# Pair every test recipe id (as a string) with the cuisine predicted from its
# ingredient list.
write = [
    [str(recipe_id), predict(recipe_ingredients)]
    for recipe_id, recipe_ingredients in zip(test_tbl['id'], test_tbl['ingredients'])
]
  


  

In [None]:
# Show how many predictions were made and preview the first few
# [id, predicted cuisine] pairs rather than printing the whole list, which has
# one entry per test recipe.
print(len(write), "predictions; first 10:")
print(write[:10])

### Potential improvements

*   Additional ingredient simplification to remove uncommon ingredients (e.g. "Kraft Extra Fancy American Cheese" -> "American Cheese")
*   Reduce amount of wasted data by adopting different forms of data cleaning
*   Further parameter tuning

