# Imports

In [1]:
import pickle
from gensim.models import Word2Vec
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import numpy as np
import pandas as pd
from sentence_transformers import util
import torch

from collections import defaultdict
import heapq

# Constants

In [2]:
# All paths are relative, so they resolve against the kernel's working directory.

# Pickle read in the "Load data" section.
RECIPES_WITH_RATINGS = "../samples/recipes_with_ratings.obj"

# Pickles read in the "Load data" section; presumably index <-> recipe lookup
# tables, judging by the names -- confirm against whatever produced them.
INDEX_TO_RECIPE_OBJ = "../samples/index_to_recipe_with_ratings.obj"
RECIPE_TO_INDEX_OBJ = "../samples/recipe_to_index_with_ratings.obj"

# Pickled vector files. Only WORD2VEC_ING_CAT_VECTORS is loaded by the cells
# shown in this notebook; the other two paths are not referenced in them.
WORD2VEC_ING_VECTORS = "../word_embeddings_files/vectors/word2vec_ingredients_vectors.obj"
WORD2VEC_ING_CAT_VECTORS = "../word_embeddings_files/vectors/word2vec_ing_cat_vectors.obj"
WORD2VEC_ING_CAT_KEY_VECTORS = "../word_embeddings_files/vectors/word2vec_ing_cat_key_vectors.obj"

# Tensor file; only used by the commented-out torch.load / torch.save calls.
WORD2VEC_ING_CAT_TENSORS = "../word_embeddings_files/vectors/word2vec_ing_cat_tensors.pt"
# Destination of the pickle written by the final "Save" cell.
TOP_ING_CAT = "../word_embeddings_files/top25_ing_cat.obj"

# Load data

In [3]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle.load can execute arbitrary code: only point this at trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


recipes_with_ratings = _read_pickle(RECIPES_WITH_RATINGS)
index_to_recipe = _read_pickle(INDEX_TO_RECIPE_OBJ)
recipe_to_index = _read_pickle(RECIPE_TO_INDEX_OBJ)

In [4]:
# Mapping of key -> vector; it is iterated with .items() when the sample is built.
with open(WORD2VEC_ING_CAT_VECTORS, mode="rb") as vectors_file:
    word2vec_ing_cat_vectors = pickle.load(vectors_file)

In [5]:
# word2vec_ing_cat_tensors = torch.load(WORD2VEC_ING_CAT_TENSORS, map_location=torch.device('cpu'))

# Create tensors

In [19]:
# Take the first SAMPLE_SIZE vectors (in dictionary order) whose key is also
# present in recipe_to_index, so the cells below can be tried on a small input.
SAMPLE_SIZE = 10

word2vec_ing_cat_vectors_sample = {}
for key, value in word2vec_ing_cat_vectors.items():
    if len(word2vec_ing_cat_vectors_sample) >= SAMPLE_SIZE:
        # Sample is full: stop instead of scanning the rest of the dictionary.
        break
    if key in recipe_to_index:
        word2vec_ing_cat_vectors_sample[key] = value

In [20]:
# Flatten every sampled vector to 1-D, then stack the rows into one float32
# tensor (assumes all flattened vectors have the same length).
vectors = [np.array(vector).ravel() for vector in word2vec_ing_cat_vectors_sample.values()]
word2vec_ing_cat_tensors = torch.tensor(np.array(vectors), dtype=torch.float)

## Save

In [74]:
# torch.save(word2vec_ing_cat_tensors, WORD2VEC_ING_CAT_TENSORS)

# Cosine similarities

In [43]:
def create_top_k_similar_vectors(vectors, items_to_check, top_k=1, verbose=False):
    """Find the most cosine-similar vectors for each requested id.

    Parameters
    ----------
    vectors : dict
        Maps an id to its vector. Every value is converted with
        ``np.array(value).ravel()``, so nested values are flattened; the
        flattened vectors must all have the same length.
    items_to_check : iterable
        Ids (keys of ``vectors``) to find neighbours for. It is consumed only
        once, so one-shot iterables such as generators are supported.
    top_k : int, default 1
        Maximum number of neighbours returned per id.
    verbose : bool, default False
        When true, print a progress line every 1000 processed ids.

    Returns
    -------
    collections.defaultdict
        ``{id: [(neighbour_id, cosine_score), ...]}`` with neighbours ordered
        from most to least similar. The queried id itself is never listed, and
        each list holds at most ``top_k`` entries (fewer when ``vectors`` does
        not contain enough other ids).

    Raises
    ------
    KeyError
        If an entry of ``items_to_check`` is not a key of ``vectors``.
    """
    recipe_ids = list(vectors.keys())
    recipe_id_to_list_pos = {recipe_id: pos for pos, recipe_id in enumerate(recipe_ids)}

    # Materialise once so a generator is not exhausted before results are built,
    # and resolve positions up front so unknown ids fail before any heavy work.
    items_to_check = list(items_to_check)
    query_positions = [recipe_id_to_list_pos[key] for key in items_to_check]

    top = defaultdict(list)
    if not items_to_check:
        return top

    # Convert through a single ndarray: building a tensor from a list of
    # separate arrays is much slower.
    matrix = np.array([np.array(vector).ravel() for vector in vectors.values()])
    tensors = torch.tensor(matrix, dtype=torch.float)

    # Cosine similarity is the dot product of L2-normalised rows; normalising
    # once here avoids repeating it for every query.
    normalized = torch.nn.functional.normalize(tensors, p=2, dim=1)

    # Ask for one extra hit because the query normally matches itself first.
    max_neighbours = max(top_k, 0)
    k = min(max_neighbours + 1, len(recipe_ids))

    for iteration, (key, pos) in enumerate(zip(items_to_check, query_positions)):
        if verbose and iteration % 1000 == 0:
            print("Iteration: ", iteration)

        cos_scores = torch.mv(normalized, normalized[pos])
        scores, positions = torch.topk(cos_scores, k=k)

        neighbours = [
            (recipe_ids[neighbour_pos], score)
            for neighbour_pos, score in zip(positions.tolist(), scores.tolist())
            if recipe_ids[neighbour_pos] != key
        ]
        # With ties or zero vectors the query may be missing from the hits,
        # which would otherwise leave one neighbour too many.
        top[key] = neighbours[:max_neighbours]

    return top

In [44]:
def create_similarity_matrix(vectors, verbose=False):
    """Compute the pairwise cosine-similarity matrix of a dict of vectors.

    Parameters
    ----------
    vectors : dict
        Maps an id to its vector. Every value is converted with
        ``np.array(value).ravel()``, so nested values are flattened; the
        flattened vectors must all have the same length.
    verbose : bool, default False
        When true, print a progress line before each group of 500 rows.

    Returns
    -------
    pos_to_recipe_id : dict
        Row/column position -> id, in the iteration order of ``vectors``.
    recipe_id_to_pos : dict
        Inverse of ``pos_to_recipe_id``.
    similarities : numpy.ndarray
        ``(n, n)`` float64 array with the cosine similarity of every pair.
        It is exactly symmetric and its diagonal is left at 0.
    """
    recipe_ids = list(vectors.keys())
    pos_to_recipe_id = dict(enumerate(recipe_ids))
    recipe_id_to_pos = {recipe_id: pos for pos, recipe_id in enumerate(recipe_ids)}

    size = len(recipe_ids)
    similarities = np.zeros((size, size))
    if size == 0:
        return pos_to_recipe_id, recipe_id_to_pos, similarities

    matrix = np.array([np.array(vector).ravel() for vector in vectors.values()])
    tensors = torch.tensor(matrix, dtype=torch.float)

    # Cosine similarity is the dot product of L2-normalised rows, so whole
    # groups of rows can be compared with one matrix product instead of one
    # Python-level call per pair.
    normalized = torch.nn.functional.normalize(tensors, p=2, dim=1)

    rows_per_step = 500  # also the progress-report interval
    for start in range(0, size, rows_per_step):
        if verbose:
            print(start, " of ", size)
        stop = min(start + rows_per_step, size)
        block = torch.mm(normalized[start:stop], normalized.T)
        similarities[start:stop] = block.cpu().numpy()

    # Keep only the strict upper triangle and mirror it: this zeroes the
    # diagonal and makes the matrix exactly symmetric regardless of rounding.
    upper = np.triu(similarities, k=1)
    similarities = upper + upper.T

    return pos_to_recipe_id, recipe_id_to_pos, similarities

In [51]:
# Up to 25 nearest neighbours of recipe 44 within the sampled vectors.
top = create_top_k_similar_vectors(
    word2vec_ing_cat_vectors_sample,
    items_to_check=[44],
    top_k=25,
)

In [49]:
# Full pairwise similarity matrix for the sample, plus the id <-> position maps.
(
    pos_to_recipe_id,
    recipe_id_to_pos,
    similarities,
) = create_similarity_matrix(word2vec_ing_cat_vectors_sample, verbose=False)

In [52]:
# Show the (recipe_id, cosine_score) pairs found for recipe 44.
top[44]

[(42, 0.30775606632232666),
 (41, 0.26044660806655884),
 (39, 0.0602552555501461),
 (46, -0.08631511777639389),
 (40, -0.2337792068719864),
 (45, -0.28480109572410583),
 (47, -0.3050621449947357),
 (43, -0.3860410749912262),
 (38, -0.4958491921424866)]

## Save

In [77]:
# Pickle `top25_ing_cat` to the TOP_ING_CAT path.
# NOTE(review): `top25_ing_cat` is not assigned in any cell shown in this
# notebook (the visible result variable is `top`), so this cell raises
# NameError after Restart & Run All. Presumably it came from a since-removed
# call to create_top_k_similar_vectors with top_k=25 -- confirm and restore
# that cell before relying on this save.
with open(TOP_ING_CAT, 'wb') as pickle_file:
     pickle.dump(top25_ing_cat, pickle_file)