In [1]:
import collections
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm

### read data

In [3]:
# Load every JSON input from ../generated.
# Each file is opened in a `with` block so its handle is closed as soon as the
# content has been parsed (the previous bare `open(...)` calls never closed them).

#read usda mapping (ingredient name -> usda id, see the collision cell below)
with open("../generated/ing_id_mapping.json") as f:
    ing_mapping = json.load(f)

#read usda id description (looked up later with the id converted to a string)
with open("../generated/id_description.json") as f:
    item_describe = json.load(f)

#read cleaned recipes (each source is a list, concatenated into one list below)
with open("../generated/clean_kaggle.json") as f:
    cleaned_kaggle_recipes = json.load(f)
with open("../generated/clean_cookies.json") as f:
    cleaned_cookies_recipes = json.load(f)
with open("../generated/clean_1m.json") as f:
    cleaned_1m_recipes = json.load(f)
cleaned_all_recipes = cleaned_kaggle_recipes+cleaned_cookies_recipes+cleaned_1m_recipes

#read ingredients count (only the 'count' entry of the file is kept)
with open("../generated/ingredients_count.json") as f:
    ingredients_count = json.load(f)['count']


### study mapping

> Ingredients have been mapped to USDA food items. We first identify the ingredients that have been mapped to the same food item and find a representative name for them.

#### spot collisions

In [4]:
# every usda id present in the mapping, one per ingredient name (duplicates kept)
mapped_ids = list(ing_mapping.values())

# group ingredient names by the usda id they were mapped to:
# usda id -> list of ingredient names, in the order they appear in the mapping
collisions = {}

for ingredient in tqdm.tqdm(ing_mapping) :
    usda_id = ing_mapping[ingredient]
    # create the group on first sight of the id, then add the name to it
    collisions.setdefault(usda_id, []).append(ingredient)

100%|████████████████████████████████████████████████████████| 2649/2649 [00:00<00:00, 156244.62it/s]


In [20]:
# display the groups: usda id -> ingredient names mapped to that id
collisions

{2047: ['salt'],
 1145: ['butter'],
 19908: ['sugar', 'turbinado sugar', 'turbinado'],
 11282: ['onion', 'onions', 'raw onion'],
 1123: ['egg', 'raw egg'],
 11981: ['pepper', 'peppers', 'hungarian pepper'],
 14555: ['water'],
 4053: ['olive oil', 'salad oil', 'olive salad'],
 1212: ['milk'],
 2030: ['black pepper'],
 2050: ['vanilla'],
 11695: ['tomato', 'tomatoes'],
 20063: ['flour'],
 19334: ['brown sugar'],
 9152: ['lemon juice'],
 28315: ['cinnamon'],
 11297: ['parsley'],
 18371: ['baking powder'],
 4669: ['vegetable oil'],
 11124: ['carrot', 'raw carrot', 'carrots'],
 11215: ['garlic'],
 18372: ['baking soda'],
 1146: ['parmesan cheese'],
 11216: ['ginger', 'ginger root'],
 2044: ['basil'],
 11362: ['potato', 'raw potato', 'potato skin', 'potatoes'],
 16124: ['soy sauce', 'tamari', 'tamari soy sauce'],
 2049: ['thyme'],
 11165: ['cilantro'],
 9156: ['lemon', 'lemon peel'],
 7935: ['chicken breast'],
 11292: ['green onion', 'green onion top', 'onion top'],
 1179: ['sour cream'],
 2

#### find representative name for each group

In [6]:
# not used in this cell; kept because other cells may still read it
item_number = 43
# a word is "common" when it appears more than `proportion` * (group size) times
proportion = 0.5


def find_representative_name(names, proportion=0.5):
    """Pick one representative name for a group of ingredient names.

    Parameters
    ----------
    names : list of str
        Non-empty list of space-separated ingredient names that were mapped to
        the same usda id.
    proportion : float
        A word is kept as "common" when its total number of occurrences in
        the group is strictly greater than ``len(names) * proportion``.

    Returns
    -------
    str
        * no common word: the first name of the group;
        * one common word: that word;
        * several common words: a name of the group made of exactly these
          words if there is one, otherwise the common words joined in the
          order in which they appear in the names containing all of them.
    """
    tokenized = [name.split(" ") for name in names]
    word_counts = collections.Counter(word for words in tokenized for word in words)

    #find common names (most frequent first, ties in order of first appearance)
    threshold = len(names) * proportion
    common_names = [word for word, count in word_counts.most_common() if count > threshold]

    if len(common_names) == 0 :
        return names[0]

    if len(common_names) == 1 :
        return common_names[0]

    wanted = set(common_names)

    #case 1, there exist an entry with only wanted words
    # (the former test only checked that the entry *contained* the wanted words,
    #  which made case 2 unreachable with any usable entry)
    for words in tokenized :
        if len(words) == len(wanted) and set(words) == wanted :
            return " ".join(words)

    #case 2, no exact match : order the common words by their position in the
    #entries that contain all of them
    position_sums = dict.fromkeys(common_names, 0)
    for words in tokenized :
        if wanted.issubset(words) :
            for word in common_names :
                # position of the first occurrence of the word in this entry
                position_sums[word] += words.index(word)

    # every usable entry contributes once per word, so comparing sums is the
    # same as comparing averages; sorted() is stable, hence ties (including the
    # "no usable entry" situation where all sums are 0) keep the frequency order
    return " ".join(sorted(common_names, key=position_sums.__getitem__))


# usda id -> representative name of the ingredients mapped to this id
representative_keys = {
    usda_id: find_representative_name(names, proportion)
    for usda_id, names in tqdm.tqdm(collisions.items())
}



1863it [00:00, 124521.74it/s]


In [7]:
# display the result: usda id -> representative name
representative_keys

{2047: 'salt',
 1145: 'butter',
 19908: 'turbinado sugar',
 11282: 'onion',
 1123: 'egg',
 11981: 'pepper',
 14555: 'water',
 4053: 'olive oil salad',
 1212: 'milk',
 2030: 'black pepper',
 2050: 'vanilla',
 11695: 'tomato',
 20063: 'flour',
 19334: 'brown sugar',
 9152: 'lemon juice',
 28315: 'cinnamon',
 11297: 'parsley',
 18371: 'baking powder',
 4669: 'vegetable oil',
 11124: 'carrot',
 11215: 'garlic',
 18372: 'baking soda',
 1146: 'parmesan cheese',
 11216: 'ginger',
 2044: 'basil',
 11362: 'potato',
 16124: 'tamari soy sauce',
 2049: 'thyme',
 11165: 'cilantro',
 9156: 'lemon',
 7935: 'chicken breast',
 11292: 'green onion top',
 1179: 'sour cream',
 2027: 'oregano',
 19296: 'honey',
 2014: 'cumin',
 1270: 'cheddar',
 4572: 'nutmeg',
 1017: 'cream cheese',
 11143: 'celery',
 2020: 'garlic powder',
 6194: 'chicken soup',
 4532: 'oil',
 12071: 'almond',
 2028: 'paprika',
 43598: 'mayonnaise',
 1053: 'heavy cream',
 11993: 'mushroom',
 16104: 'bacon',
 20027: 'cornstarch',
 2004: '

> Some USDA food items may have been given the same representative name even though they are different USDA entries. Did it happen?

In [38]:
# con: representative name -> number of usda ids that received this name
con = collections.Counter(representative_keys.values())
# number of representative names shared by at least two usda ids
sum(occurrences >= 2 for occurrences in con.values())

43

In [9]:
# first ten (usda id, representative name) pairs, in dictionary order
list(representative_keys.items())[:10]

[(2047, 'salt'),
 (1145, 'butter'),
 (19908, 'turbinado sugar'),
 (11282, 'onion'),
 (1123, 'egg'),
 (11981, 'pepper'),
 (14555, 'water'),
 (4053, 'olive oil salad'),
 (1212, 'milk'),
 (2030, 'black pepper')]

> It did happen. Should we map them to the same USDA entry? Let's look at the different ingredient sets.

In [11]:
# For the 10 most shared representative names, print every usda food item that
# received this name, followed by the ingredients mapped to that item.
for term, occurrences in con.most_common()[:10] :
    print("Representative term : ", term, " (", occurrences, " times)")
    for key, representative in representative_keys.items() :
        if representative == term :
            # descriptions are looked up with the id converted to a string
            print("\tusda food item : ", item_describe[str(key)])

            for ing in collisions[key] :
                print("\t\tingredient : ", ing)
            print("\n")

    print("\n")
    

Representative term :  chicken  ( 4  times)
	usda food item :  Chicken, meatless
		ingredient :  chicken


	usda food item :  Chicken, skin (drumsticks and thighs), raw
		ingredient :  chicken thigh
		ingredient :  chicken drumstick
		ingredient :  drumstick
		ingredient :  chicken thighs


	usda food item :  Chicken, broiler, rotisserie, BBQ, back meat only
		ingredient :  chicken back
		ingredient :  chicken backs


	usda food item :  Frankfurter, chicken
		ingredient :  chicken frankfurter
		ingredient :  chicken dog




Representative term :  bean  ( 4  times)
	usda food item :  Beans, snap, green, raw
		ingredient :  green bean
		ingredient :  bean
		ingredient :  beans


	usda food item :  Beans, chili, barbecue, ranch style, cooked
		ingredient :  ranch style bean
		ingredient :  barbecue bean


	usda food item :  Noodles, chinese, cellophane or long rice (mung beans), dehydrated
		ingredient :  cellophane noodle
		ingredient :  long bean
		ingredient :  mung bean noodle
		ingre

> We choose to keep the representative strings as they are, because we find them good enough, and save them to disk.

In [12]:
# Save usda id -> representative name. The `with` block closes (and therefore
# flushes) the file once the dump is done; the bare open() never closed it.
with open("../generated/id_repr.json" , 'w') as f:
    json.dump(representative_keys, f)

### rewrite recipes with usda_id (ingredients with no id found are dropped)

In [13]:
# Rewrite every recipe as the list of usda ids of its ingredients.
# Ingredients that are absent from ing_mapping are dropped from the list.
#   usda_any_mapped_recipes : recipes with at least one mapped ingredient
#   usda_all_mapped_recipes : recipes whose ingredients are all mapped
usda_all_mapped_recipes = []
usda_any_mapped_recipes = []
for r in cleaned_all_recipes:
    found_ids = [ing_mapping[i] for i in r if i in ing_mapping]

    # Nothing mapped (this includes empty recipes): keep it out of both lists.
    # Previously an empty recipe satisfied len(found_ids) == len(r) and ended up
    # in the "all mapped" list while being excluded from the "any mapped" one.
    if not found_ids:
        continue

    usda_any_mapped_recipes.append(found_ids)
    if len(found_ids) == len(r):
        usda_all_mapped_recipes.append(found_ids)

# `with` closes each output file once its dump is complete
with open("../generated/all_recipes_all_ids.json", 'w') as f:
    json.dump(usda_all_mapped_recipes, f)
with open("../generated/all_recipes_any_ids.json", 'w') as f:
    json.dump(usda_any_mapped_recipes, f)

In [14]:
# number of recipes with at least one ingredient mapped to a usda id
len(usda_any_mapped_recipes)

963458