In [25]:
import numpy as np
import random

In [26]:
import pandas as pd
import ast

In [27]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

In [194]:
# Load the pre-trained entity embeddings (RDF2Vec over USDA / tags / OpenFoodFacts,
# judging by the file name). The file is in the word2vec C text format, hence binary=False.
word_vectors = KeyedVectors.load_word2vec_format('../data/embedding/rdf2vec_usda_tags_openfoodfacts.txt', binary=False)

In [196]:
# Sanity check: the five nearest embedding neighbours of USDA item 01003.
word_vectors.most_similar('http://idea.rpi.edu/heals/kb/usda#01003', topn=5)

[('http://idea.rpi.edu/heals/kb/usda#01073', 0.9275833964347839),
 ('http://idea.rpi.edu/heals/kb/usda#01023', 0.9127548336982727),
 ('http://idea.rpi.edu/heals/kb/usda#01167', 0.9102444648742676),
 ('http://idea.rpi.edu/heals/kb/usda#01089', 0.9072730541229248),
 ('http://idea.rpi.edu/heals/kb/usda#01020', 0.9048135280609131)]

In [197]:
# Food table: one row per food with its NDB number (NDB_No), description (Long_Desc)
# and food group code / name (FdGrp_Cd, FdGrp_Desc).
df= pd.read_csv('../data/input/food_category.csv')
df.head()

Unnamed: 0,NDB_No,Long_Desc,FdGrp_Cd,FdGrp_Desc
0,1001,"Butter, salted",100,Dairy and Egg Products
1,1002,"Butter, whipped, with salt",100,Dairy and Egg Products
2,1003,"Butter oil, anhydrous",100,Dairy and Egg Products
3,1004,"Cheese, blue",100,Dairy and Egg Products
4,1005,"Cheese, brick",100,Dairy and Egg Products


In [198]:
# Left-pad NDB numbers with zeros to five characters (e.g. 1003 -> '01003') so they can be
# appended to the 'http://idea.rpi.edu/heals/kb/usda#' prefix when building URIs below.
df['NDB_No']= df['NDB_No'].astype(str).str.rjust(5,'0')

In [199]:
# Lookup tables derived from the food table:
#   food2cat       : food URI   -> food group description
#   food_label_map : food URI   -> stripped food description
#   food_id_map    : food label -> food URI (a repeated label keeps the last URI seen)
food2cat = {}
food_label_map = {}
food_id_map = {}
for ndb_no, long_desc, group_desc in zip(df['NDB_No'], df['Long_Desc'], df['FdGrp_Desc']):
    uri = 'http://idea.rpi.edu/heals/kb/usda#' + ndb_no
    label = long_desc.strip()
    food2cat[uri] = group_desc
    food_label_map[uri] = label
    food_id_map[label] = uri

In [200]:
def get_simscore_ingrank_onlyss_multisamerank(fromt, asdf):
    """Return the 1-based rank of ``asdf`` among the embedding neighbours of ``fromt``.

    The neighbours are taken from ``word_vectors.most_similar`` over the whole
    vocabulary, in the order that call returns them. Only embedding similarity
    is used; no other score is involved.

    Args:
        fromt: key of the query food in ``word_vectors``.
        asdf: key of the candidate substitute whose rank is wanted.

    Returns:
        The 1-based position of ``asdf`` in the neighbour list, or the
        vocabulary size if ``asdf`` does not appear in it (worst-case rank).
    """
    # gensim >= 4 removed KeyedVectors.vocab in favour of key_to_index;
    # support both so the notebook runs on either version.
    TOPK = len(word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab)
    mostSimilar = word_vectors.most_similar(fromt, topn=TOPK)
    for position, (candidate, _sim) in enumerate(mostSimilar, start=1):
        if candidate == asdf:
            return position
    return TOPK


In [201]:
def get_simscore_ingrank_category_multisamerank(fromt, asdf):
    """Return the 1-based rank of ``asdf`` among same-category neighbours of ``fromt``.

    The neighbours of ``fromt`` are taken from ``word_vectors.most_similar``
    over the whole vocabulary, then restricted to entries whose category in
    ``food2cat`` equals the category of ``asdf``. Entries without a category
    are skipped and do not consume a rank position.

    Args:
        fromt: key of the query food in ``word_vectors``.
        asdf: key of the candidate substitute whose rank is wanted.

    Returns:
        The 1-based position of ``asdf`` within the filtered neighbour list,
        or the vocabulary size if it is never reached (worst-case rank).

    Raises:
        KeyError: if ``asdf`` has no entry in ``food2cat``.
    """
    # gensim >= 4 removed KeyedVectors.vocab in favour of key_to_index;
    # support both so the notebook runs on either version.
    TOPK = len(word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab)
    # Loop-invariant: the category every counted neighbour must share.
    target_cat = food2cat[asdf]
    mostSimilar = word_vectors.most_similar(fromt, topn=TOPK)
    rank = 0
    for e, _sim in mostSimilar:
        if e not in food2cat or food2cat[e] != target_cat:
            continue
        rank += 1
        if e == asdf:
            return rank
    return TOPK

In [202]:
def mrr_map_new(scraped_subs_dict, opt = 1):
    """Print ranking metrics for the ground-truth substitutions and return MAP.

    For every food (processed in sorted key order) the rank of each of its
    ground-truth substitutes is looked up, and the following are reported:

    * mean reciprocal rank, using the best (smallest) rank per food;
    * recall rate at 5 / 10: share of foods whose best rank is <= 5 / <= 10;
    * mean average precision, where the precision for one substitute is
      (number of that food's substitutes ranked at or before it) / its rank.

    Args:
        scraped_subs_dict: mapping from a food key to a collection of its
            ground-truth substitute keys.
        opt: ranking variant. 1 uses
            ``get_simscore_ingrank_onlyss_multisamerank``; 2 uses
            ``get_simscore_ingrank_category_multisamerank``.

    Returns:
        The mean average precision rounded to 3 decimals, or ``0.0`` when
        ``scraped_subs_dict`` is empty.

    Raises:
        ValueError: if ``opt`` is neither 1 nor 2.
    """
    # Validate first: previously an unknown opt left `rank` unbound (or stale).
    if opt not in (1, 2):
        raise ValueError('opt must be 1 or 2, got {!r}'.format(opt))

    n_foods = len(scraped_subs_dict.keys())
    print('number of ings: ',n_foods)
    if n_foods == 0:
        # Nothing to average over; avoid dividing by zero below.
        print('no ingredients to evaluate')
        return 0.0

    if opt == 1:
        rank_of = get_simscore_ingrank_onlyss_multisamerank
    else:
        rank_of = get_simscore_ingrank_category_multisamerank

    rank_scores = []
    ave_p = []
    in_top_5 = 0
    in_top_10 = 0

    for fromt in sorted(scraped_subs_dict.keys()):
        relevant_ranks = [rank_of(fromt, asdf) for asdf in scraped_subs_dict[fromt]]
        # A food without substitutes keeps the huge sentinel rank (reciprocal ~ 0).
        min_rank = min(relevant_ranks, default=9999999999999999)
        rank_scores.append(1.0/min_rank)
        if min_rank <= 5:
            in_top_5 += 1
        if min_rank <= 10:
            in_top_10 += 1

        precisions = []
        for current_rank in relevant_ranks:
            good_docs = sum(1 for r in relevant_ranks if r <= current_rank)
            precisions.append(good_docs/current_rank)
        if len(precisions) == 0:
            precisions = [0]
        ave_p.append(np.mean(precisions))

    mean_rr = np.mean(rank_scores)
    recall_at_5 = in_top_5/n_foods
    recall_at_10 = in_top_10/n_foods
    mean_ap = np.mean(ave_p)

    print('mean reciprocal rank: ', mean_rr)
    print('recallrate at 5: ', recall_at_5)
    print('recallrate at 10: ', recall_at_10)
    print('mean average precision: ', mean_ap)
    print('formatted')
    print(round(mean_ap, 3),' & ',round(mean_rr, 3),' & ',round(recall_at_5, 3),' & ',round(recall_at_10, 3))
    return round(mean_ap, 3)


In [203]:
# Ground-truth substitutions: one row per (food, substitute) pair, each given by URI and label.
subs_df =pd.read_csv('../data/final_substitution.csv')
# Alternative input file (disabled):
#subs_df =pd.read_csv('../data/final_substitution_unsorted.csv')
subs_df.head()

Unnamed: 0,Food id,Food label,Substitution id,Substitution label
0,http://idea.rpi.edu/heals/kb/usda#01001,"Butter, salted",http://idea.rpi.edu/heals/kb/usda#04506,"Oil, sunflower, linoleic"
1,http://idea.rpi.edu/heals/kb/usda#01001,"Butter, salted",http://idea.rpi.edu/heals/kb/usda#04679,"Oil, PAM cooking spray, original"
2,http://idea.rpi.edu/heals/kb/usda#01001,"Butter, salted",http://idea.rpi.edu/heals/kb/usda#04053,"Oil, olive, salad or cooking"
3,http://idea.rpi.edu/heals/kb/usda#01001,"Butter, salted",http://idea.rpi.edu/heals/kb/usda#04128,"Margarine,spread, 35-39% fat"
4,http://idea.rpi.edu/heals/kb/usda#01001,"Butter, salted",http://idea.rpi.edu/heals/kb/usda#04601,"Butter, light, stick"


In [204]:
# Inspect the ground-truth substitutes recorded for a single food (usda#01006).
subs_df[subs_df['Food id']=='http://idea.rpi.edu/heals/kb/usda#01006']

Unnamed: 0,Food id,Food label,Substitution id,Substitution label
20,http://idea.rpi.edu/heals/kb/usda#01006,"Cheese, brie",http://idea.rpi.edu/heals/kb/usda#01019,"Cheese, feta"
21,http://idea.rpi.edu/heals/kb/usda#01006,"Cheese, brie",http://idea.rpi.edu/heals/kb/usda#01004,"Cheese, blue"
22,http://idea.rpi.edu/heals/kb/usda#01006,"Cheese, brie",http://idea.rpi.edu/heals/kb/usda#01022,"Cheese, gouda"
23,http://idea.rpi.edu/heals/kb/usda#01006,"Cheese, brie",http://idea.rpi.edu/heals/kb/usda#01251,"Cheese, Mexican blend"


In [205]:
# Number of (food, substitute) rows in the ground truth.
len(subs_df)

1847

In [206]:
# Every URI that occurs in the ground truth, either as a food or as a substitute.
ground_truth_foods= set(subs_df['Food id'].unique()).union(subs_df['Substitution id'].unique())

In [207]:
# Number of distinct URIs appearing anywhere in the ground truth.
len(ground_truth_foods)

785

In [208]:
# Group the ground truth as food URI -> set of substitute URIs, keeping only pairs in
# which both URIs have an embedding and a known food category. Dropped pairs are printed.
scraped_subs_dict = {}

for food, subs in zip(subs_df['Food id'], subs_df['Substitution id']):
    usable = all(key in word_vectors and key in food2cat for key in (food, subs))
    if not usable:
        print (food, subs, 'not in embeddings')
        continue
    scraped_subs_dict.setdefault(food, set()).add(subs)



http://idea.rpi.edu/heals/kb/usda#01053 http://idea.rpi.edu/heals/kb/usda#01065 not in embeddings
http://idea.rpi.edu/heals/kb/usda#01088 http://idea.rpi.edu/heals/kb/usda#01289 not in embeddings
http://idea.rpi.edu/heals/kb/usda#01088 http://idea.rpi.edu/heals/kb/usda#01065 not in embeddings
http://idea.rpi.edu/heals/kb/usda#09206 http://idea.rpi.edu/heals/kb/usda#09212 not in embeddings
http://idea.rpi.edu/heals/kb/usda#12147 http://idea.rpi.edu/heals/kb/usda#12157 not in embeddings


In [209]:
# Spot check: the grouped substitutes for usda#01006.
scraped_subs_dict['http://idea.rpi.edu/heals/kb/usda#01006']

{'http://idea.rpi.edu/heals/kb/usda#01004',
 'http://idea.rpi.edu/heals/kb/usda#01019',
 'http://idea.rpi.edu/heals/kb/usda#01022',
 'http://idea.rpi.edu/heals/kb/usda#01251'}

In [210]:
# Total number of (food, substitute) pairs that survived the filtering above.
n = sum(len(subs) for subs in scraped_subs_dict.values())

print (n)

1842


In [211]:
# Evaluate with opt=1: ranks taken over all embedding neighbours.
mrr_map_new(scraped_subs_dict,1)

number of ings:  370
mean reciprocal rank:  0.2335069870116246
recallrate at 5:  0.32972972972972975
recallrate at 10:  0.4
mean average precision:  0.13273214969551803
formatted
0.133  &  0.234  &  0.33  &  0.4


0.133

In [212]:
# Evaluate with opt=2: ranks taken only among neighbours in the substitute's food category.
mrr_map_new(scraped_subs_dict,2)

number of ings:  370
mean reciprocal rank:  0.2585016589686655
recallrate at 5:  0.35945945945945945
recallrate at 10:  0.43783783783783786
mean average precision:  0.1541059924779764
formatted
0.154  &  0.259  &  0.359  &  0.438


0.154

In [213]:
# Qualitative check: print each ground-truth substitute of the query food together with
# its position in the full embedding-neighbour list.
# gensim >= 4 removed KeyedVectors.vocab in favour of key_to_index; support both.
TOPK = len(word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab)
fromt= 'http://idea.rpi.edu/heals/kb/usda#04047'
#Oil, coconut
mostSimilar =word_vectors.most_similar(fromt, topn=TOPK)
gt_subs_list = scraped_subs_dict[fromt]
# Manual counter (not enumerate) so that enabling a filter below ranks only the kept candidates.
i=1
for (e, sim) in mostSimilar:
    # Optional filters: same food category as the query / only URIs seen in the ground truth.
    #if e not in food2cat or food2cat[fromt] != food2cat[e]: continue
    #if e not in ground_truth_foods: continue
    if e in gt_subs_list:
        print (food_label_map[e],i)
    i+=1

Oil, canola 2
Oil, corn, industrial and retail, all purpose salad or cooking 4
Oil, sesame, salad or cooking 5
Oil, walnut 7
Oil, peanut, salad or cooking 10
Shortening, vegetable, household, composite 16
Oil, olive, salad or cooking 19
Lard 21
USDA Commodity Food, oil, vegetable, soybean, refined 24
Margarine-like, margarine-butter blend, soybean oil and butter 113
Butter oil, anhydrous 488
Margarine, regular, hard, soybean (hydrogenated) 2617


In [214]:

# Qualitative check: print each ground-truth substitute of the query food together with
# its position in the full embedding-neighbour list.
# gensim >= 4 removed KeyedVectors.vocab in favour of key_to_index; support both.
TOPK = len(word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab)
fromt= 'http://idea.rpi.edu/heals/kb/usda#11109'
#Cabbage, raw
mostSimilar =word_vectors.most_similar(fromt, topn=TOPK)
gt_subs_list = scraped_subs_dict[fromt]
# Manual counter (not enumerate) so that enabling a filter below ranks only the kept candidates.
i=1
for (e, sim) in mostSimilar:
    # Optional filters: same food category as the query / only URIs seen in the ground truth.
    #if e not in food2cat or food2cat[fromt] != food2cat[e]: continue
    #if e not in ground_truth_foods: continue
    if e in gt_subs_list:
        print (food_label_map[e],i)
    i+=1

Sauerkraut, canned, solids and liquids 4
Cauliflower, green, raw 10
Cabbage, red, raw 13
Cabbage, chinese (pak-choi), raw 17
Mustard greens, raw 49
Cress, garden, raw 54
Broccoli, raw 61
Brussels sprouts, raw 96


In [215]:
# Qualitative check: print each ground-truth substitute of the query food together with
# its position in the full embedding-neighbour list.
# gensim >= 4 removed KeyedVectors.vocab in favour of key_to_index; support both.
TOPK = len(word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab)
fromt= 'http://idea.rpi.edu/heals/kb/usda#11601'
#Yam, raw
mostSimilar =word_vectors.most_similar(fromt, topn=TOPK)
gt_subs_list = scraped_subs_dict[fromt]
# Manual counter (not enumerate) so that enabling a filter below ranks only the kept candidates.
i=1
for (e, sim) in mostSimilar:
    # Optional filters: same food category as the query / only URIs seen in the ground truth.
    #if e not in food2cat or food2cat[fromt] != food2cat[e]: continue
    #if e not in ground_truth_foods: continue
    if e in gt_subs_list:
        print (food_label_map[e],i)
    i+=1

Sweet potato, raw, unprepared 77
Taro, raw 82
Burdock root, raw 91
Mountain yam, hawaii, raw 95
Cassava, raw 158
Chicory roots, raw 280
Salsify, (vegetable oyster), raw 433
Lotus root, raw 437
Ginger root, raw 521
Arrowhead, raw 632
Chicory, witloof, raw 636


In [216]:

# Qualitative check: print each ground-truth substitute of the query food together with
# its position in the full embedding-neighbour list.
# TOPK is computed here so the cell does not depend on a value left over from an earlier cell.
# gensim >= 4 removed KeyedVectors.vocab in favour of key_to_index; support both.
TOPK = len(word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab)
fromt= 'http://idea.rpi.edu/heals/kb/usda#07005'
# NOTE(review): this cell used to carry the comment "Yam, raw", which belongs to usda#11601,
# not to this item; see food_label_map[fromt] for the actual label of usda#07005.
mostSimilar =word_vectors.most_similar(fromt, topn=TOPK)
gt_subs_list = scraped_subs_dict[fromt]
# Manual counter (not enumerate) so that enabling a filter below ranks only the kept candidates.
i=1
for (e, sim) in mostSimilar:
    # Optional filters: only URIs seen in the ground truth / same food category as the query.
    #if e not in ground_truth_foods: continue
    #if e not in food2cat or food2cat[fromt] != food2cat[e]: continue
    if e in gt_subs_list:
        print (food_label_map[e],i)
    i+=1

Kielbasa, fully cooked, grilled 100
Bologna, pork 2106
Ham, minced 2447
Smoked link sausage, pork 3406
Mushrooms, shiitake, raw 5703
Meatballs, frozen, Italian style 6009
Beef, grass-fed, ground, raw 7902
Chickpeas (garbanzo beans, bengal gram), mature seeds, raw 8267
