In [1]:
import pandas as pd
import ast

In [2]:
from gensim.models import KeyedVectors

In [3]:
# The embedding file is read in the textual (non-binary) "C text" word2vec format.
word_vectors = KeyedVectors.load_word2vec_format(
    '../data/embedding/rdf2vec_vectors_21_july_tags.txt',
    binary=False,
)

In [4]:
# Sanity check of the loaded embeddings: the five nearest neighbours of the
# USDA entity with id 01003, returned as (URI, similarity) pairs.
word_vectors.most_similar('http://idea.rpi.edu/heals/kb/usda#01003', topn=5)

[('http://idea.rpi.edu/heals/kb/usda#42151', 0.7698718905448914),
 ('http://idea.rpi.edu/heals/kb/usda#04585', 0.7616324424743652),
 ('http://idea.rpi.edu/heals/kb/usda#42230', 0.7595455646514893),
 ('http://idea.rpi.edu/heals/kb/usda#04624', 0.7517369389533997),
 ('http://idea.rpi.edu/heals/kb/usda#01088', 0.7443124055862427)]

In [5]:
# Load the scraped ingredient pairs. Later cells treat column 0 as the food and
# column 1 as its substitute; both hold ingredient-name URIs.
# NOTE(review): the file name suggests Food.com review data — confirm provenance.
df = pd.read_json('../data/foodcom_review_data.json')

In [6]:
# Preview the first five scraped pairs.
df.head(5)

Unnamed: 0,0,1
0,http://idea.rpi.edu/heals/kb/ingredientname/mo...,http://idea.rpi.edu/heals/kb/ingredientname/ag...
1,http://idea.rpi.edu/heals/kb/ingredientname/ta...,http://idea.rpi.edu/heals/kb/ingredientname/fe...
2,http://idea.rpi.edu/heals/kb/ingredientname/pl...,http://idea.rpi.edu/heals/kb/ingredientname/mi...
3,http://idea.rpi.edu/heals/kb/ingredientname/ma...,http://idea.rpi.edu/heals/kb/ingredientname/oil
4,http://idea.rpi.edu/heals/kb/ingredientname/ca...,http://idea.rpi.edu/heals/kb/ingredientname/sw...


In [7]:
# Number of scraped (food, substitute) rows.
df.shape[0]

3846

In [8]:
# USDA food table: NDB number, long description and food-group code/description.
df_usda = pd.read_csv('../data/input/food_category.csv')
df_usda.head(5)

Unnamed: 0,NDB_No,Long_Desc,FdGrp_Cd,FdGrp_Desc
0,1001,"Butter, salted",100,Dairy and Egg Products
1,1002,"Butter, whipped, with salt",100,Dairy and Egg Products
2,1003,"Butter oil, anhydrous",100,Dairy and Egg Products
3,1004,"Cheese, blue",100,Dairy and Egg Products
4,1005,"Cheese, brick",100,Dairy and Egg Products


In [9]:
# Left-pad the NDB number with zeros to five characters so it lines up with the
# id part of the usda# URIs (e.g. 1003 -> '01003').
df_usda['NDB_No'] = (
    df_usda['NDB_No']
    .astype(str)
    .str.pad(width=5, side='left', fillchar='0')
)

In [10]:
# Lookup tables derived from the USDA table:
#   food2cat        USDA URI          -> food-group description
#   food_label_map  USDA URI          -> stripped long description
#   food_id_map     long description  -> USDA URI (a repeated description keeps the last URI)
food2cat = {}
food_label_map = {}
food_id_map = {}
usda_columns = zip(df_usda['NDB_No'], df_usda['Long_Desc'], df_usda['FdGrp_Desc'])
for ndb_no, long_desc, group_desc in usda_columns:
    usda_uri = 'http://idea.rpi.edu/heals/kb/usda#' + ndb_no
    description = long_desc.strip()
    food2cat[usda_uri] = group_desc
    food_label_map[usda_uri] = description
    food_id_map[description] = usda_uri

In [11]:
# Space-separated linking file read without a header row. The fourth column 'x'
# is never used below (presumably the N-Triples terminating '.' — confirm).
linking_df = pd.read_csv(
    '../data/usda_linking.nt',
    sep=' ',
    names=['entity1', 'predicate', 'entity2', 'x'],
)

In [12]:
# Two-way lookup between the linked entities: entity1 (used as the ingredient
# name key by later cells) <-> entity2 (used as the USDA id).
mapping_name2id = {}
mapping_id2name = {}
for raw_name, raw_id in zip(linking_df['entity1'], linking_df['entity2']):
    # Remove every '<' and drop the final character (presumably the closing '>').
    name_uri = raw_name.replace("<", "")[:-1]
    id_uri = raw_id.replace("<", "")[:-1]
    mapping_name2id[name_uri] = id_uri
    mapping_id2name[id_uri] = name_uri

In [13]:
# Group the scraped pairs by food: {food id -> set of substitute ids}.
# A pair is kept only when BOTH the food (column 0) and the substitute
# (column 1) have an entry in mapping_name2id; all other rows are skipped.
scraped_subs_dict = dict()

for _, row in df.iterrows():
    food_name, subs_name = row[0], row[1]
    if food_name not in mapping_name2id or subs_name not in mapping_name2id:
        continue
    food_id = mapping_name2id[food_name]
    subs_id = mapping_name2id[subs_name]
    # setdefault creates the set on first sight of a food id.
    scraped_subs_dict.setdefault(food_id, set()).add(subs_id)

In [14]:
# Total number of (food, substitute) pairs collected so far, summed over all foods.
cnt = sum(len(subs_ids) for subs_ids in scraped_subs_dict.values())
print(cnt)

1841


In [15]:
# Tab-separated list of common foods; 'Food id' holds the 'usda#NNNNN' suffix of the entity URI.
subs2_df = pd.read_csv('../data/common_foods.csv', sep='\t')
subs2_df.head(5)

Unnamed: 0.1,Unnamed: 0,Food id,Food,Food Short
0,1,usda#11090,Broccoli,"raw,broccoli"
1,2,usda#01040,Cheese,"swiss,swiss cheese"
2,3,usda#11445,Seaweed,"kelp, raw,kelp"
3,4,usda#15065,Fish,"pollock, Atlantic,pollock"
4,5,usda#09229,Papaya nectar,"canned,papaya"


In [16]:
# For every common food, add its 20 nearest embedding neighbours as candidate
# substitutes. Only neighbours that are known USDA foods (present in
# food_label_map) are kept.
#
# The neighbour URIs are stored directly. Converting them to labels and back
# through food_id_map is lossy: that map is keyed on the description, so two
# USDA entries sharing a description would both resolve to the last URI.
#
# most_similar raises KeyError when food_id is not in the embedding vocabulary.
for _, row in subs2_df.iterrows():
    food_id = 'http://idea.rpi.edu/heals/kb/' + row['Food id']
    ranks = word_vectors.most_similar(food_id, topn=20)
    neighbour_ids = [f for f, _sim in ranks if f in food_label_map]
    # setdefault keeps any substitutes already scraped for this food and still
    # registers the food when no neighbour survives the filter.
    scraped_subs_dict.setdefault(food_id, set()).update(neighbour_ids)

In [17]:
# Flatten {food id -> set of substitute ids} into one row per candidate pair:
# [food id, food label, '' (empty 'Verified' cell), substitute id, substitute label].
#
# Each substitute set is sorted before iteration. Set order over strings varies
# between interpreter sessions (hash randomisation), so without the sort the
# row order would differ on every kernel restart.
# Foods keep their insertion order from scraped_subs_dict.
my_list = [
    [food, food_label_map[food], '', subs, food_label_map[subs]]
    for food, subs_list in scraped_subs_dict.items()
    for subs in sorted(subs_list)
]

In [18]:
# Number of candidate (food, substitution) rows to be verified.
len(my_list)

3344

In [19]:
# Tabular form of the candidate pairs; 'Verified' starts out as an empty string in every row.
subs_df = pd.DataFrame(
    my_list,
    columns=['Food id', 'Food', 'Verified', 'Substitution id', 'Substitution'],
)

In [20]:
# Export the candidate pairs as a comma-separated file. The DataFrame index is
# written too (to_csv default), so the file gets an unnamed first column.
# NOTE(review): 'substitions' in the file name looks like a typo of
# 'substitutions'; left unchanged because other code may read this exact path.
subs_df.to_csv('../data/output/food_substitions_to_be_verified.csv',sep=',')