In [42]:
# Para manipulacion de datos
import ast
import numpy as np
import pandas as pd

# Para preparar datos
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Para la construccion del modelo
from keras.layers import Dense, Embedding, Flatten
from keras.models import Sequential

# Para la evaluacion del modelo
import matplotlib.pyplot as plt

# Para guardar el modelo, tokenizer y label encoder
import pickle


In [43]:
recipes = pd.read_csv("data/RAW_recipes.csv")
recipes['ingredients'] = recipes['ingredients'].apply(ast.literal_eval)

In [44]:
ingredients = ['apple', 'banana', 'orange', 'tomato', 'carrot', 'bread', 'cheese', 'mango', 'broccoli', 'grape', 'lemmon', 'pineapple']

# Filtrando las recetas que contengan al menos 1 ingredieente de la lista o mas
recipes = recipes[recipes['ingredients'].apply(lambda x: any([k in x for k in ingredients]))]
len(recipes)

18122

In [45]:
# Filtrando receptas que contengan 20 ingredientes o menos
max_len = 20
recipes = recipes[recipes['ingredients'].apply(lambda x: all(len(i) <= max_len for i in x))]
len(recipes)

11191

In [46]:
recipes.to_csv('filtered_recipes.csv', index=False)

In [47]:
# Representación binaria de los ingredientes
mlb = MultiLabelBinarizer()
ingredients_presence = mlb.fit_transform(recipes['ingredients'])

# Tokenizacion de los ingredientes
tokenizer = Tokenizer()
tokenizer.fit_on_texts(recipes['ingredients'])
sequences = tokenizer.texts_to_sequences(recipes['ingredients'])
padded_sequences = pad_sequences(sequences)

In [48]:
# Encoding del target (ID de la receta)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(recipes['id'])
one_hot_labels = to_categorical(encoded_labels)

In [49]:
# Hiperparametros del modelo
vocab_size = len(tokenizer.word_index)+1
embedding_dim = 100
max_length = padded_sequences.shape[1]
recipes_q = recipes['id'].nunique()

# Construccion del modelo
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Flatten())
model.add(Dense(120, activation='relu'))
model.add(Dense(recipes_q, activation='softmax'))

# Compilacion del modelo
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Entrenamiento del modelo
history = model.fit(padded_sequences, one_hot_labels, epochs=15, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [50]:
model.save('recipe_model.h5')

  saving_api.save_model(


In [51]:
with open('tokenizer.pickle', 'wb') as handle: pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('label_encoder.pickle', 'wb') as handle: pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('mlb.pickle', 'wb') as handle: pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [52]:
try_ingredients = ['bread', 'cheese']
try_ingredients_sequence = tokenizer.texts_to_sequences([try_ingredients])
try_ingredients_padded = pad_sequences(try_ingredients_sequence, maxlen=padded_sequences.shape[1])

predictions = model.predict(try_ingredients_padded)
predicted_recipe_id = label_encoder.inverse_transform([np.argmax(predictions)])

# Top 3 de recetas mas probables
top_3 = predictions.argsort()[0][-3:][::-1]
for i in top_3:
    recipe_id = label_encoder.inverse_transform([i])[0]
    recipe_name = recipes[recipes['id'] == recipe_id]['name'].values[0]
    probability = predictions[0][i]
    print(f'[{recipe_id}] {recipe_name}: {probability*100:.2f}%')

[531520] cheeze bread: 81.37%
[9518] 30 second sandwich: 11.94%
[25981] chorizo dip: 1.57%


In [53]:
# Construccion del modelo
model = Sequential()
model.add(Dense(128, input_dim=ingredients_presence.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(recipes_q, activation='softmax'))

# Compilacion del modelo
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Entrenamiento del modelo
history = model.fit(ingredients_presence, one_hot_labels, epochs=15, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [54]:
model.save('recipe_model_v2.h5')

  saving_api.save_model(


In [55]:
input_binary = mlb.transform([try_ingredients])

# Predicciones
predictions = model.predict(np.array(input_binary))[0]

# Top 3 de recetas mas probables
top_3 = predictions.argsort()[-3:][::-1]
for i in top_3:
    recipe_id = label_encoder.inverse_transform([i])[0]
    recipe_name = recipes[recipes['id'] == recipe_id]['name'].values[0]
    probability = predictions[i]
    print(f'[{recipe_id}] {recipe_name}: {probability*100:.2f}%')




[531520] cheeze bread: 75.99%
[9518] 30 second sandwich: 9.77%
[86467] mirowave bacon cheese sandwich: 4.10%
