In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import glob
import re
import pickle
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity


# Dataset

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
!unzip /content/drive/MyDrive/recipes_raw.zip -d /content/recipes

Archive:  /content/drive/MyDrive/recipes_raw.zip
  inflating: /content/recipes/recipes_raw_nosource_ar.json  
  inflating: /content/recipes/recipes_raw_nosource_epi.json  
  inflating: /content/recipes/recipes_raw_nosource_fn.json  
  inflating: /content/recipes/LICENSE  


In [7]:
path = '/content/recipes/*.json'
recipes = []

for file in glob.glob(path):
    with open(file, 'r') as f:
        data = json.load(f)

        for recipe_id, recipe in data.items():
            recipes.append({
                'title': recipe.get('title', ''),
                'ingredients': recipe.get('ingredients', []),
                'instructions': recipe.get('instructions', '')
            })


Agora que o dataset esta carregado podemos verificar quantas receitas ele tem, e exibiremos um exemplo de uma receita

In [8]:
print(len(recipes))
print('-------------------')
print(recipes[0])

125164
-------------------
{'title': 'Christmas Eggnog ', 'ingredients': ['12 egg whites', '12 egg yolks', '1 1/2 cups sugar', '3/4 cup rye whiskey', '12 egg whites', '3/4 cup brandy', '1/2 cup rum', '1 to 2 cups heavy cream, lightly whipped', 'Garnish: ground nutmeg'], 'instructions': 'Beat the egg whites until stiff, gradually adding in 3/4 cup sugar. Set aside. Beat the egg yolks until they are thick and pale and add the other 3/4 cup sugar and stir in rye whiskey. Blend well. Fold the egg white mixture into the yolk mixture and add the brandy and the rum. Beat the mixture well. To serve, fold the lightly whipped heavy cream into the eggnog. (If a thinner mixture is desired, add the heavy cream unwhipped.) Sprinkle the top of the eggnog with the nutmeg to taste.\nBeat the egg whites until stiff, gradually adding in 3/4 cup sugar. Set aside. Beat the egg yolks until they are thick and pale and add the other 3/4 cup sugar and stir in rye whiskey. Blend well. Fold the egg white mixture

# EDA


Faremos uma pequena analise explarotaria dos dados para ter conhecimento total doque estamos fazendo

In [9]:
# Contagem de ingredientes
ingredient_counts = [len(r['ingredients']) for r in recipes]

print("Média:", np.mean(ingredient_counts))
print("Mediana:", np.median(ingredient_counts))
print("Máximo:", max(ingredient_counts))
print("Mínimo:", min(ingredient_counts))


Média: 10.521795404429389
Mediana: 10.0
Máximo: 82
Mínimo: 0


Com isso responde-se, quantos ingredientes costuma ter uma receita?, Existe outliers??, bom aqui pode se ver que média aprox 10 e mediana 10 quer dizer que os dados são em sua maior parte simétricos, e um deles possui 82 oque nós indica que existe pelo menos 1 outlier

In [10]:
from collections import Counter

all_ingredients = []
for r in recipes:
    all_ingredients.extend(r['ingredients'])

counter = Counter(all_ingredients)
counter.most_common(20)

[('ADVERTISEMENT', 39519),
 ('Salt and freshly ground black pepper', 5218),
 ('Kosher salt and freshly ground black pepper', 4887),
 ('Kosher salt', 4845),
 ('1/2 teaspoon salt', 4246),
 ('1/2 teaspoon salt ADVERTISEMENT', 3455),
 ('1 teaspoon salt', 3372),
 ('2 tablespoons olive oil', 3353),
 ('Salt and pepper', 3203),
 ('Salt', 3039),
 ('Freshly ground black pepper', 3026),
 ('1 teaspoon vanilla extract ADVERTISEMENT', 2998),
 ('1 teaspoon salt ADVERTISEMENT', 2969),
 ('1/4 teaspoon salt', 2889),
 ('salt and pepper to taste ADVERTISEMENT', 2457),
 ('2 eggs ADVERTISEMENT', 2221),
 ('1 tablespoon olive oil', 2169),
 ('1 cup sugar', 2158),
 ('1 teaspoon vanilla extract', 2119),
 ('1/2 cup sugar', 2022)]

E aqui temos os 20 ingredientes mais comuns, note que existem muito ruido no dataset, pois ADVERTISEMENT não é um ingrediente mas uma indicação padrao em muitas receitas.

# Pre-processamento

Nesta seção iremos tratar da limpeza de ruido do texto utilizando regex.

In [11]:
# Limpeza para os ingredientes
unidades = [
    'cup', 'cups', 'tablespoon', 'tablespoons',
    'teaspoon', 'teaspoons', 'ounce', 'ounces',
    'pound', 'pounds', 'package', 'packages',
    'can', 'cans'
]
def clean_ingredients(text):
  text = text.lower()
  text = re.sub(r'ADVERTISEMENT','',text)
  text = re.sub(r'\d+\/?\d*','',text)
  text = re.sub(r'\([^)]*\)', '', text)
  text = re.sub(r'[^a-z\s]', '', text)

  for unidade in unidades:
    text = re.sub(r'\b' + unidade + r'\b', '', text)

  text = re.sub(r'\s+', ' ', text).strip()
  return text

for r in recipes:
    r['clean_ingredients'] = [
        clean_ingredients(i)
        for i in r['ingredients'] if clean_ingredients(i) != ''
    ]



In [12]:
print(recipes[0]['ingredients'])
print('-------------------')
print(recipes[0]['clean_ingredients'])


['12 egg whites', '12 egg yolks', '1 1/2 cups sugar', '3/4 cup rye whiskey', '12 egg whites', '3/4 cup brandy', '1/2 cup rum', '1 to 2 cups heavy cream, lightly whipped', 'Garnish: ground nutmeg']
-------------------
['egg whites', 'egg yolks', 'sugar', 'rye whiskey', 'egg whites', 'brandy', 'rum', 'to heavy cream lightly whipped', 'garnish ground nutmeg']


In [13]:
# Limpeza para os titulos
def clean_title(text):
   text = text.lower()
   text = re.sub(r'[^a-z\s]', '', text)
   text = re.sub(r'\s+', ' ', text).strip()
   return text


In [14]:
for r in recipes:
  if r['title'] is not None:
    r['clean_title'] = clean_title(r['title'])
  else:
    r['clean_title'] = ''

In [15]:
print(recipes[0]['title'])
print('-------------------')
print(recipes[0]['clean_title'])

Christmas Eggnog 
-------------------
christmas eggnog


Agora criaremos a junção desses dois que dara o significado culinario para a receita e por meio disso o modelo ira decidir qual receita entregar.

In [16]:
def recipe_to_text(recipe):
    ingredients_text = ' '.join(recipe['clean_ingredients'])
    return f"{recipe['clean_title']} {ingredients_text}"
for r in recipes:
    r['final_text'] = recipe_to_text(r)

In [17]:
recipes[0]['final_text']
# Tudo junto pronto para o embedding.

'christmas eggnog egg whites egg yolks sugar rye whiskey egg whites brandy rum to heavy cream lightly whipped garnish ground nutmeg'

In [18]:
tokenized_corpus = [r['final_text'].split() for r in recipes]
print(tokenized_corpus[0])
# Note que agora cada palavra esta separada

['christmas', 'eggnog', 'egg', 'whites', 'egg', 'yolks', 'sugar', 'rye', 'whiskey', 'egg', 'whites', 'brandy', 'rum', 'to', 'heavy', 'cream', 'lightly', 'whipped', 'garnish', 'ground', 'nutmeg']


# Treinando a rede

Iremos treinar a rede utilizando word2vec para realizar o embedding das palavras

In [19]:
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,        # Skip-gram
    epochs=10
)



In [75]:
artifacts = {
    "w2v_model": w2v_model,
    "recipe_vectors": recipe_vectors,
    "recipes": valid_recipes,
    "ingredient_vocab": ingredient_vocab
}

with open("recipe_chatbot.pkl", "wb") as f:
    pickle.dump(artifacts, f)

print("Modelo salvo com sucesso!")

Modelo salvo com sucesso!


In [20]:
w2v_model.wv.most_similar('chicken')

[('breast', 0.7108600735664368),
 ('breasts', 0.6962664723396301),
 ('thighs', 0.6664709448814392),
 ('turkey', 0.6664022207260132),
 ('skinless', 0.6573923230171204),
 ('tsos', 0.6404609084129333),
 ('pheasant', 0.6372656226158142),
 ('legthigh', 0.6275071501731873),
 ('broilerfryer', 0.6218454837799072),
 ('thigh', 0.6077554225921631)]

In [21]:
w2v_model.wv.similarity('butter', 'oil')

np.float32(0.39467165)

In [22]:
def clean_ingredients(ingredients):
    return [
        ing.lower()
        for ing in ingredients
        if ing and ing != "ADVERTISEMENT"
    ]

In [23]:
def recipe_embedding(ingredients, model):
    vectors = []

    for ing in ingredients:
        if ing in model.wv:
            vectors.append(model.wv[ing])

    if len(vectors) == 0:
        return None

    return np.mean(vectors, axis=0)

recipe_vectors = []
valid_recipes = []


for recipe in recipes:
    clean_ing = clean_ingredients(recipe['ingredients'])
    emb = recipe_embedding(clean_ing, w2v_model)

    if emb is not None:
        recipe_vectors.append(emb)
        valid_recipes.append(recipe)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_recipes(query_ingredients, model, recipe_vectors, recipes, top_k=5):
    query_emb = recipe_embedding(query_ingredients, model)
    sims = cosine_similarity([query_emb], recipe_vectors)[0]

    top_idx = sims.argsort()[-top_k:][::-1]

    return [(recipes[i]['title'], sims[i]) for i in top_idx]


In [25]:
query = ['chicken', 'butter']
find_similar_recipes(query, w2v_model, recipe_vectors, valid_recipes)
# Receitas semanticamentes proximas.

[('Jillo', np.float32(0.8267207)),
 ('Grilled Jerk Rubbed Chicken with Habanero-Mint Glaze',
  np.float32(0.8058742)),
 ('Beignets', np.float32(0.7410874)),
 ('Spiced Popcorn', np.float32(0.7410874)),
 ('Grape Leaves', np.float32(0.7410874))]

In [30]:
# exemplo do uso dos embeddings: rei - homem + mulher = rainha
w2v_model.wv.most_similar(positive=['butter', 'chicken'], negative=['oil'])

[('pheasant', 0.49691373109817505),
 ('turkey', 0.48952773213386536),
 ('quickcook', 0.46499010920524597),
 ('mode', 0.4532237946987152),
 ('ultra', 0.4528697729110718),
 ('kiev', 0.450727641582489),
 ('velvety', 0.4471573829650879),
 ('stick', 0.44534507393836975),
 ('pierogis', 0.437674880027771),
 ('semihomemaker', 0.43374156951904297)]

# Limpeza e extração de input do usuario:

In [32]:
def tokenize_ingredient(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    return tokens

ingredient_vocab = set()
for r in recipes:
    for ing in r['clean_ingredients']:
        tokens = tokenize_ingredient(ing)
        ingredient_vocab.update(tokens)

In [34]:
print(len(ingredient_vocab))
print('-----------')
print(ingredient_vocab)

16109
-----------
{'leche', 'necks', 'vanillascented', 'garnishdecorating', 'odd', 'balcones', 'minuti', 'alae', 'kingui', 'dillor', 'peperonata', 'ctes', 'maitak', 'fridge', 'annes', 'ruler', 'cilanto', 'wellstocked', 'jennieo', 'latticetop', 'classico', 'pizzettas', 'ajowan', 'choricero', 'crawler', 'saint', 'aarons', 'psylliumhusk', 'tornup', 'heinz', 'surprising', 'basis', 'tritip', 'snap', 'deepfat', 'pompelmo', 'sammy', 'unaged', 'sprays', 'changs', 'hollowed', 'kept', 'achioteinfused', 'mescal', 'knocking', 'chateau', 'braggs', 'escabechestyle', 'colouring', 'muscadavo', 'bird', 'cupcapacity', 'softpeak', 'truffles', 'orangeflower', 'seashell', 'breadwhatever', 'maseca', 'energ', 'topneck', 'olakes', 'classiccut', 'sheepss', 'chuns', 'grate', 'robb', 'uneeda', 'primeaged', 'trimmed', 'demibaguette', 'smithfieldfarmland', 'sprite', 'calabacitas', 'mcclures', 'discretion', 'potatos', 'classicmac', 'californiastyle', 'destroys', 'piadine', 'schupf', 'info', 'farmer', 'fern', 'getti

In [66]:
def clean_user_text(text):
   text = text.lower()
   text = re.sub(r'[^a-z\s]', '', text)
   text = re.sub(r'\s+', ' ', text).strip()
   return text

user_text = "I have chicken, butter and onion and biscuit."
user_tokens = clean_user_text(user_text).split()
print(user_tokens)
print('---------------------------')
user_tokens = list(set(token for token in user_tokens if token in ingredient_vocab))
print(user_tokens)

stopwords = {
    'i', 'have', 'do', 'what', 'can', 'a', 'the', 'and', 'or', 'to', 'of', 'is'
}
print('---------------------------\nPós uso de stopwords')
user_tokens = [token for token in user_tokens if token not in stopwords]
print(user_tokens)

['i', 'have', 'chicken', 'butter', 'and', 'onion', 'and', 'biscuit']
---------------------------
['and', 'biscuit', 'chicken', 'i', 'onion', 'butter', 'have']
---------------------------
Pós uso de stopwords
['biscuit', 'chicken', 'onion', 'butter']


# Output

In [67]:
# Agora vamos pegar a médias dos vetores(ou ingredientes) que o usuario deu, ou seja queremos a media dos embeddings
vectors = [w2v_model.wv[token]for token in user_tokens if token in w2v_model.wv]

if len(vectors) == 0:
    vector = None
else:
    vector = np.mean(vectors, axis=0)
print(vector)
print(vector.shape)


[-0.32509762  0.13521124  0.15702364  0.22520071  0.05874763 -0.3052799
  0.0268219   0.03967972  0.05385944 -0.23968194 -0.4730968  -0.44220355
 -0.07820446  0.4347715   0.12284133  0.05218104  0.27909702 -0.26075593
  0.20144014 -0.05927563  0.07831711  0.19595566  0.03503709 -0.19832137
  0.2434333  -0.12030867 -0.11857487  0.01165035 -0.41393486 -0.00792591
  0.18404163 -0.03995736  0.29173857 -0.06314841  0.00643753  0.04845779
 -0.03956046 -0.09884411 -0.34667033 -0.35628104  0.22686757  0.15167394
 -0.00892673  0.1147078   0.1374408  -0.09772637 -0.23453149 -0.20514727
  0.11360607  0.05627402  0.17162627 -0.38549215  0.11796363 -0.0005977
 -0.24756068  0.07669584 -0.00119231  0.306648   -0.06500533 -0.11324479
  0.24175858 -0.05182974  0.1484992   0.00345504 -0.09855148  0.10850571
  0.02684177 -0.3894252   0.1074837   0.1888678  -0.3699496   0.21786603
  0.28240678  0.11994316  0.2736752   0.02770984  0.04980705 -0.13465823
 -0.03798044 -0.17798124 -0.17277804 -0.00736912 -0.0

In [68]:
def find_similar_recipes(query_vector, recipe_vectors, recipes, top_k=5):
    similarities = []

    for i, rv in enumerate(recipe_vectors):
        sim = cosine_similarity(
            query_vector.reshape(1, -1),
            rv.reshape(1, -1)
        )[0][0]

        similarities.append((recipes[i]['title'], sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_k]
results = find_similar_recipes(
    vector,
    recipe_vectors,
    valid_recipes
)

for r in results:
    print(r)


('Jillo', np.float32(0.7681515))
('Grilled Jerk Rubbed Chicken with Habanero-Mint Glaze', np.float32(0.7449807))
('Improvising a Meal with Phillippe Chin', np.float32(0.6890353))
('Omelets', np.float32(0.6781118))
('New England Lobster Roll', np.float32(0.67576885))


Note que os resultados não fazem sentido a interpretação, porém computacionalmente o modelo enxerga que essas receitas estão mais proximas semanticamente oque nos falta aplicar alguma regra que de fato nos leve a receitas que são mais proximas e possiveis com os ingredientes que temos

In [70]:
def ingredient_match_score(user_ings, recipe_ings):
    recipe_tokens = set()

    for ing in recipe_ings:
        tokens = tokenize_ingredient(ing)
        recipe_tokens.update(tokens)

    if len(recipe_tokens) == 0:
        return 0.0

    match = sum(1 for ing in user_ings if ing in recipe_tokens)
    return match / len(user_ings)

def find_best_recipes(query_vector, recipe_vectors, recipes, user_ings, top_k=5):
    scored = []

    for i, rv in enumerate(recipe_vectors):
        sem_sim = cosine_similarity(
            query_vector.reshape(1, -1),
            rv.reshape(1, -1)
        )[0][0]

        ing_match = ingredient_match_score(
            user_ings,
            recipes[i]['clean_ingredients']
        )

        final_score = 0.4 * sem_sim + 0.6 * ing_match
        scored.append((recipes[i]['title'], final_score, sem_sim, ing_match))

    scored.sort(key=lambda x: x[1], reverse=True)
    return scored[:top_k]

results = find_best_recipes(
    vector,
    recipe_vectors,
    valid_recipes,
    user_tokens
)

for r in results:
    print(r)


('Roast Meat Loaf or "Hedgehog"', np.float32(0.7062216), np.float32(0.6405541), 0.75)
('What a Face! Open Faced Hot Turkey Sammys with Sausage Stuffing and Gravy, Smashed Potatoes with Bacon, Warm Apple Cranberry Sauce', np.float32(0.7062216), np.float32(0.6405541), 0.75)
('Easy Arancini ', np.float32(0.7008606), np.float32(0.6271516), 0.75)
("Mike's Polish Smothered Chicken", np.float32(0.7008606), np.float32(0.6271516), 0.75)
("Mama's Black-Eyed Pea Casserole", np.float32(0.7008606), np.float32(0.6271516), 0.75)


In [72]:
best_title, final_score, sem_sim, ing_match = results[0]

best_recipe = next(
    r for r in valid_recipes if r['title'] == best_title
)
def format_chatbot_response(recipe):
    response = f"A melhor receita para você é:\n\n"
    response += f"{recipe['title']}\n\n"

    response += "Ingredientes principais:\n"
    for ing in recipe['ingredients']:
        response += f"- {ing}\n"

    response += "\nModo de preparo:\n"
    response += recipe['instructions']

    return response
print(format_chatbot_response(best_recipe))


A melhor receita para você é:

Roast Meat Loaf or "Hedgehog"

Ingredientes principais:
- 8 ounces mushrooms, sliced
- Butter
- Salt
- Freshly ground black pepper
- Grated nutmeg
- 8 ounces chicken livers
- 1 pound each ground beef, ground pork, ground veal
- 1 pound sausage meat
- 1 large onion, grated
- 3 fat cloves garlic, crushed to a paste
- 10 juniper berries, crushed
- 1 teaspoon ground allspice
- Fresh thyme sprigs
- 1 to 2 eggs
- 8 ounces unsmoked bacon
- Bay leaves
- Branches of fresh rosemary
- Tomato sauce, for serving

Modo de preparo:
Preheat the oven to 450 degrees F.
Saute the mushrooms in butter until the juices run, then season with salt and pepper and nutmeg. Reserve.
Remove the sinews from the livers and slice. In a large bowl combine all the ground meats, sausage, livers, onion, garlic, and juniper berries. Add the allspice and thyme leaves. Season with salt and pepper. Beat the eggs and add to the mixture together with the mushrooms. Mix with your hands thoroughly.