In [7]:
import pandas as pd
import numpy as np

import pickle

import matplotlib.pyplot as plt
import networkx

from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load the raw tables from the data directory.
edges    = pd.read_csv('../data/edges_191120.csv')
nodes    = pd.read_csv('../data/nodes_191120.csv', index_col=0)
ingr2cat = pd.read_csv('../data/dict_ingr2cate.csv')

# NOTE(security): unpickling can execute arbitrary code, so only load this
# file from a trusted source. The context manager makes sure the file handle
# is closed even if unpickling fails.
with open('../data/FlavorGraph_Node_Embedding.pickle', 'rb') as embedding_file:
    embeddings_dict = pickle.load(embedding_file)

# Built BEFORE the ingredient filter below, so it maps every node id
# (not only ingredients) to its name.
id_to_name = dict(zip(nodes.index, nodes['name']))

# Keep ingredient nodes only. The original index is deliberately kept
# (no reset_index) so rows can still be looked up by node id.
ingredients_idx = nodes[nodes['node_type']=='ingredient'].index.tolist()
nodes           = nodes.loc[ingredients_idx]

# The pickle is indexed with string keys, hence the str() conversion of the ids.
embeddings_dict = {k: embeddings_dict[k] for k in map(str, ingredients_idx)}

In [27]:
# Case-insensitive substring search on the ingredient name; rows whose name
# is missing are treated as non-matching (na=False).
name_matches = nodes['name'].str.contains('ahi', case=False, na=False)
subset = nodes.loc[name_matches]
# Optional: uncomment to keep hub nodes only.
# subset = subset[subset['is_hub'] == 'hub']
subset

Unnamed: 0_level_0,name,id,node_type,is_hub
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
66,ahi,,ingredient,no_hub
67,ahi_tuna_steak,,ingredient,no_hub
2446,fresh_ahi_tuna,,ingredient,no_hub
4031,mahi_mahi,,ingredient,no_hub
4032,mahi_mahi_fillet,,ingredient,no_hub
6322,tahini,,ingredient,no_hub
6323,tahini_paste,,ingredient,no_hub
6324,tahini_sauce,,ingredient,no_hub


In [10]:
def sorted_sim(embedding, embeddings_dict=embeddings_dict):
   '''
   Rank every entry of embeddings_dict by cosine similarity to a query embedding.

   Input:
      numpy.array: embedding
         Query vector; it is reshaped to a single row before comparison.
      dict: embeddings_dict
         Maps node_id -> embedding vector. The default is the module-level
         embeddings_dict object as it exists when this cell is executed, so
         re-run this cell after reloading the embeddings.
   Output:
      list of tuples: [(node_id, similarity_score), ...] sorted by similarity_score in descending order
   '''
   # Collect ids and vectors in one pass so both lists share the same order.
   node_ids, vectors = [], []
   for node_id, vector in embeddings_dict.items():
      node_ids.append(node_id)
      vectors.append(vector)

   query  = embedding.reshape(1, -1)
   scores = cosine_similarity(query, np.array(vectors)).flatten()

   # Stable in-place sort: entries with equal scores keep their dict order.
   ranked = list(zip(node_ids, scores))
   ranked.sort(key=lambda pair: pair[1], reverse=True)
   return ranked

In [31]:
# Primary query ingredient.
food_id_A = 66
name = id_to_name[food_id_A]
print(name)

# Extra ingredients for the optional combined query (see the commented-out
# arithmetic on the `embedding` line). The lookups raise KeyError early if
# an id is unknown.
food_id_B = 6441
name = id_to_name[food_id_B]

food_id_C = 4635
name = id_to_name[food_id_C]

food_id_D = 2200
name = id_to_name[food_id_D]

embedding = embeddings_dict[str(food_id_A)] #+ embeddings_dict[str(food_id_B)] + embeddings_dict[str(food_id_C)] - embeddings_dict[str(food_id_D)]
print('------------------------')

# Ids that make up the query, as strings because embeddings_dict uses string
# keys. Add str(food_id_B) etc. here when enabling the combined query above.
query_ids = {str(food_id_A)}
top_k = 20

sim_tuples = sorted_sim(embedding)

# Drop the query ingredients by id instead of blindly skipping rank 0:
# the query is not guaranteed to be the first hit (ties, or a combined
# embedding that matches no single node exactly).
neighbours = [(sim_id, sim_score) for sim_id, sim_score in sim_tuples
              if sim_id not in query_ids][:top_k]

for sim_id, sim_score in neighbours:
    sim_name = id_to_name[int(sim_id)]
    print(f"{sim_name}: {sim_score:.4f}")

ahi
------------------------
kingfish: 0.9449
crisp_salad_green: 0.9327
wishbone_italian_dressing: 0.9308
vegan_burger: 0.9282
dry_chili_pepper: 0.9264
light_balsamic_vinaigrette_salad_dressing: 0.9258
96%_lean_ground_beef: 0.9257
whole_wheat_toast: 0.9238
western_salad_dressing: 0.9173
tex_mex_cheese: 0.9168
oven_roasted_turkey_breast: 0.9161
green_chutney: 0.9160
tomato: 0.8781
garlic_mayonnaise: 0.8479
perch: 0.8206
soy_cheddar_cheese: 0.8177
whole_wheat_roll: 0.8134
meat_seasoning: 0.8006
low_fat_flour_tortilla: 0.7990
pickle_spear: 0.7939
