In [117]:
import pandas as pd
import numpy as np

import pickle

import matplotlib.pyplot as plt
import networkx

from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [118]:
edges           = pd.read_csv('../data/edges_191120.csv')
nodes           = pd.read_csv('../data/nodes_191120.csv', index_col=0)
ingr2cat        = pd.read_csv('../data/dict_ingr2cate.csv')
embeddings_dict = pickle.load(open('../data/FlavorGraph_Node_Embedding.pickle', 'rb'))

id_to_name = dict(zip(nodes.index, nodes['name']))

ingredients_idx = nodes[nodes['node_type']=='ingredient'].index.tolist()
nodes           = nodes.loc[ingredients_idx]#.reset_index(drop=True)
embeddings_dict = {k: embeddings_dict[k] for k in map(str, ingredients_idx)}

In [119]:
subset = nodes[nodes['name'].str.contains('tomato', case=False, na=False)]
subset[subset['is_hub'] == 'hub']

Unnamed: 0_level_0,name,id,node_type,is_hub
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1125,cherry_tomato,,ingredient,hub
6441,tomato,,ingredient,hub


In [120]:
def sorted_sim(embedding, embeddings_dict=embeddings_dict):
   '''
   Given embedding, calculate all pairwise cosine similarities. 

   Input: 
      numpy.array: embedding
   Output:
      list of tuples: [(node_id, similarity_score), ...] sorted by similarity_score in descending order
   '''
   all_embeddings = np.array(list(embeddings_dict.values()))
   all_ids        = list(embeddings_dict.keys())
   embedding      = embedding.reshape(1, -1)
   sims           = cosine_similarity(embedding, all_embeddings).flatten()
   sim_tuples     = list(zip(all_ids, sims))
   sim_tuples     = sorted(sim_tuples, key=lambda x: x[1], reverse=True)
   return sim_tuples

In [121]:
food_id_A = 1263
name = id_to_name[food_id_A]
print(name)

food_id_B = 6441
name = id_to_name[food_id_B]
print(name)


embedding = embeddings_dict[str(food_id_A)] + embeddings_dict[str(food_id_B)]
print('------------------------')

sim_tuples = sorted_sim(embedding)
for sim_id, sim_score in sim_tuples[1:21]:
    sim_name = id_to_name[int(sim_id)]
    print(f"{sim_name}: {sim_score:.4f}")

chive
tomato
------------------------
chive: 0.7465
tomato: 0.7175
oven_roasted_turkey_breast: 0.6913
green_chutney: 0.6888
western_salad_dressing: 0.6872
dry_chili_pepper: 0.6857
chipotle_chili_sauce: 0.6827
vegan_burger: 0.6814
sevruga_caviar: 0.6809
truffle_butter: 0.6808
whole_wheat_toast: 0.6778
crisp_salad_green: 0.6776
wishbone_italian_dressing: 0.6773
whole_wheat_roll: 0.6734
96%_lean_ground_beef: 0.6700
low_fat_flour_tortilla: 0.6693
light_balsamic_vinaigrette_salad_dressing: 0.6633
ahi: 0.6625
perch: 0.6606
tex_mex_cheese: 0.6602
