In [1]:
import torch
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle as pkl
from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity


c:\Users\Fastora\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\Fastora\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
from recipe_model import CBOW
from recipe_dataset import RecipeText2DataSet

In [3]:
VOCAB_SIZE = 27534
EMBEDDING_DIM = 50
WINDOW_SIZE = 2

In [4]:
model = CBOW(VOCAB_SIZE, EMBEDDING_DIM, WINDOW_SIZE)
model.load_state_dict(torch.load('Models/model_249', map_location=torch.device('cpu')))
model.eval()

CBOW(
  (embedding): Embedding(27534, 50)
  (linear): Linear(in_features=50, out_features=27534, bias=True)
)

In [5]:
data = RecipeText2DataSet('data/ar_recipes_corpus.txt', window_size=WINDOW_SIZE)

Vocab size: 27534


In [6]:
recipes = pd.read_json('data/recipes_cleaned_1.json')

In [7]:
embedding_matrix = model.embedding.weight.data.numpy()
word2idx = data.word2idx
idx2word = data.idx2word
pkl.dump(word2idx, open('data/word2idx.pkl', 'wb'))
pkl.dump(idx2word, open('data/idx2word.pkl', 'wb'))
pkl.dump(embedding_matrix, open('data/embedding_matrix.pkl', 'wb'))

In [8]:
def print_k_nearest(X, idx, k, idx_to_word):
    dists = np.dot((X - X[idx])**2, np.ones(X.shape[1]))
    ids = np.argsort(dists)[:k]
    scores = [dists[i] for i in ids]
    print('Nearest to {}:'.format(idx_to_word[idx]))
    for i in ids:
        print(idx_to_word[i])
    print('\n')
    return ids, scores

In [9]:
def get_k_nearest(X, idx, k, idx_to_word):
    dists = np.dot((X - X[idx])**2, np.ones(X.shape[1]))
    ids = np.argsort(dists)[:k]
    scores = [dists[i] for i in ids]
    return ids, scores

In [10]:
for i, data in recipes.iterrows():
    ing = []
    for j in data['ingredients']:
        ing.append(j['ingredient'])
    data['ingredients'] = ing


In [11]:
#get unique ingredients
unique_ingredients = set()
for i, data in recipes.iterrows():
    for j in data['ingredients']:
        unique_ingredients.add(j)


In [12]:
def get_recipe_vector(recipe_id, word2idx):
    recipe = recipes.iloc[recipe_id]
    recipe_ings = recipe['ingredients']
    recipe_steps = recipe['steps']
    recipe_tags = recipe['tags']
    recipe_name = str(recipe['name'])
    recipe_cuisine = str(recipe['cuisine'])

    #tokenize each list 
    recipe_ings = [word_tokenize(ing) for ing in recipe_ings]
    recipe_steps = [word_tokenize(step) for step in recipe_steps]
    recipe_tags = [word_tokenize(tag) for tag in recipe_tags]
    recipe_name = word_tokenize(recipe_name)
    recipe_cuisine = word_tokenize(recipe_cuisine)

    #flatten each list
    recipe_ings = [item for sublist in recipe_ings for item in sublist]
    recipe_steps = [item for sublist in recipe_steps for item in sublist]
    recipe_tags = [item for sublist in recipe_tags for item in sublist]

    #get embeddings for each word in each list
    recipe_ings = [embedding_matrix[word2idx[ing]] for ing in recipe_ings if ing in word2idx]
    recipe_steps = [embedding_matrix[word2idx[step]] for step in recipe_steps if step in word2idx]
    recipe_tags = [embedding_matrix[word2idx[tag]] for tag in recipe_tags if tag in word2idx]
    recipe_name = [embedding_matrix[word2idx[name]] for name in recipe_name if name in word2idx]
    recipe_cuisine = [embedding_matrix[word2idx[cuisine]] for cuisine in recipe_cuisine if cuisine in word2idx]

    #average the embeddings for each list if the list has more than 1 word
    recipe_ings = np.mean(recipe_ings, axis=0)
    recipe_steps = np.mean(recipe_steps, axis=0)
    recipe_tags = np.mean(recipe_tags, axis=0)
    recipe_name = np.mean(recipe_name, axis=0)
    recipe_cuisine = np.mean(recipe_cuisine, axis=0)

    #if the list has only 1 word, skip this r
    if type(recipe_ings) == np.float64:
        recipe_ings = np.zeros(EMBEDDING_DIM)
    if type(recipe_steps) == np.float64:
        recipe_steps = np.zeros(EMBEDDING_DIM)
    if type(recipe_tags) == np.float64:
        recipe_tags = np.zeros(EMBEDDING_DIM)
    if type(recipe_name) == np.float64:
        recipe_name = np.zeros(EMBEDDING_DIM)
    if type(recipe_cuisine) == np.float64:
        recipe_cuisine = np.zeros(EMBEDDING_DIM)

    #check if any of the lists are empty
    if len(recipe_ings) == 0:
        recipe_ings = np.zeros(EMBEDDING_DIM)
    if len(recipe_steps) == 0:
        recipe_steps = np.zeros(EMBEDDING_DIM)
    if len(recipe_tags) == 0:
        recipe_tags = np.zeros(EMBEDDING_DIM)
    if len(recipe_name) == 0:
        recipe_name = np.zeros(EMBEDDING_DIM)
    if len(recipe_cuisine) == 0:
        recipe_cuisine = np.zeros(EMBEDDING_DIM)
    #concatenate the embeddings for each list
    recipe_vector = np.concatenate((recipe_ings, recipe_steps, recipe_tags, recipe_name, recipe_cuisine), axis=0)
    return recipe_vector, recipe_id

In [13]:
recipes_matrix = []
for i, data in tqdm(recipes.iterrows(), total=recipes.shape[0]):
    r = get_recipe_vector(i, word2idx)
    recipes_matrix.append(r)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 37%|███▋      | 5835/15911 [00:10<00:19, 522.00it/s]

In [None]:
pkl.dump(recipes_matrix, open('data/recipes_matrix.pkl', 'wb'))

In [None]:
def get_k_nearest_recipes(recipe_id, k, recipes_matrix):
    #recipes matrix is a list of vectors (recipe_vector, recipe_id)
    recipe_vector = recipes_matrix[recipe_id][0]
    #extract ingredients vector from each recipe vector
    recipes_m = [r[0] for r in recipes_matrix]
    recipes_m = np.array(recipes_m)
    dists = np.dot((recipes_m - recipe_vector)**2, np.ones(recipes_m.shape[1]))
    ids = np.argsort(dists)[:k]
    r_id = recipes_matrix[recipe_id][1]
    re_name = recipes.iloc[r_id]['name']
    print('Nearest to {}:'.format(re_name))
    for i in ids:
        r_id = recipes_matrix[i][1]
        re_name = recipes.iloc[r_id]['name']
        print(re_name)
    print('\n')
    return ids

In [None]:
get_k_nearest_recipes(51, 10, np.array(recipes_matrix))

(15911, 250)
(250,)
Nearest to مهلبية البطاطا الحلوة:
مهلبية البطاطا الحلوة
فيديو كيكة جوز الهند الهشة
سلطة البطاطا الحلوة
كرات جوز الهند بالشوكولاتة البيضاء واللوز
مهلبية الشوكولاتة بجوز الهند
مهلبية الشوفان بجوز الهند
سلطة البطاطا الحلوة
شوربة الخضار مع الأرز
مهلبية الشوفان
مهلبية جوز الهند الشهية




  get_k_nearest_recipes(51, 10, np.array(recipes_matrix))


array([   51,  4845,  8389,   467, 11682,  9642,  5716,  5240,  6332,
        4325], dtype=int64)