In [3]:

def getAllUnits(path='data/units.json'):
  """Return the unit symbols and lower-cased unit names listed in a units JSON file.

  The file must contain a top-level ``"units"`` object that maps each unit
  symbol to an object with at least a ``"name"`` field.

  Args:
    path: Location of the units JSON file. Defaults to ``'data/units.json'``,
      the path this function has always read.

  Returns:
    A list with, for every unit in file order, its symbol followed by its
    lower-cased name, and finally the literal ``"kcal"``.

  Raises:
    OSError: If the file cannot be opened.
    KeyError: If ``"units"`` or a unit's ``"name"`` field is missing.
  """
  import json

  # JSON is UTF-8 by specification; do not depend on the platform's default
  # encoding, which would garble non-ASCII unit symbols on some systems.
  with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  units = []
  for symbol, info in data["units"].items():
    units.append(symbol)
    units.append(info["name"].lower())

  # "kcal" is always accepted as a unit, whether or not the file lists it.
  units.append("kcal")
  return units

In [87]:

import re
import spacy
import pandas as pd
import nltk
from nltk.corpus import words
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

# Load spaCy's 'en_core_web_sm' pipeline once at module level. The functions
# below call `nlp(...)` to tokenise text and read each token's lemma,
# stop-word flag and part-of-speech tag.
nlp = spacy.load('en_core_web_sm')


def my_preprocessing(raw_sentence):
    """Tokenise ``raw_sentence`` with the shared ``nlp`` pipeline and return lemmas.

    Tokens flagged as stop words, and tokens whose part-of-speech tag is
    ``'PUNCT'``, are left out.

    Args:
        raw_sentence: Text to process.

    Returns:
        A list with the lemma of every remaining token, in sentence order.
    """
    return [
        token.lemma_
        for token in nlp(raw_sentence)
        if not (token.is_stop or token.pos_ == 'PUNCT')
    ]

# First character -> dictionary words starting with it, kept in corpus order.
# Filled lazily by _spell_candidates so the corpus is read only once.
_SPELL_CANDIDATES = {}


def _spell_candidates(initial):
    """Return the NLTK dictionary words that start with ``initial``."""
    if not _SPELL_CANDIDATES:
        for entry in words.words():
            if entry:
                _SPELL_CANDIDATES.setdefault(entry[0], []).append(entry)
    return _SPELL_CANDIDATES.get(initial, [])


def spell_correction(word):
    """Return the dictionary word closest to ``word`` by bigram Jaccard distance.

    Only dictionary words sharing the first character of ``word`` are
    considered. When several candidates are equally close, the one that
    appears first in the NLTK word list wins.

    Args:
        word: The token to correct.

    Returns:
        The closest dictionary word, or ``word`` unchanged when it contains a
        digit, is shorter than two characters (it then has no bigrams to
        compare), or no dictionary word shares its first character.
    """
    # Quantities such as "250gram" or "350kcal" must never be "corrected".
    if re.search(r'\d', word):
        return word
    # Without at least one bigram the Jaccard distance is undefined
    # (division by zero against other one-letter words), so leave it alone.
    if len(word) < 2:
        return word

    # Loop-invariant: the input's bigrams are the same for every candidate.
    word_bigrams = set(ngrams(word, 2))
    scored = (
        (jaccard_distance(word_bigrams, set(ngrams(candidate, 2))), candidate)
        for candidate in _spell_candidates(word[0])
    )
    best = min(scored, key=lambda pair: pair[0], default=None)
    if best is None:
        return word
    return best[1]


In [64]:
def findAllUnitAndRemoveSpace(input):
    """Glue each unit word onto the number that precedes it (``"350 cal"`` -> ``"350cal"``).

    The sentence is tokenised with the shared ``nlp`` pipeline and every token
    is replaced by its lemma. A lemma that is a known unit (see
    ``getAllUnits``) and directly follows an all-digit token is merged with
    that token. Every other lemma is kept as its own word, including a unit
    that is not preceded by a number, and a unit that is the very first token.

    Args:
        input: The raw sentence.

    Returns:
        The lemmas joined by single spaces, with number/unit pairs merged.
    """
    # A set gives constant-time membership tests; the list may be long.
    units = set(getAllUnits())
    modifiedInput = []
    for token in nlp(input):
        lemma = token.lemma_
        # `modifiedInput and ...` guards the look-behind when the unit is the
        # first token; a unit without a preceding number falls through to the
        # else branch so it is not lost from the sentence.
        if lemma in units and modifiedInput and modifiedInput[-1].isdigit():
            n = modifiedInput.pop()
            print(n)
            modifiedInput.append(n + lemma)
        else:
            modifiedInput.append(lemma)

    print(modifiedInput)
    return " ".join(modifiedInput)
# Example run: the function prints its debug output, then the rewritten
# sentence it returns is printed here.
print(findAllUnitAndRemoveSpace("i have 250 kg pork want to make a meal with less than 350 cal"))
    

350
['i', 'have', '250', 'kg', 'pork', 'want', 'to', 'make', 'a', 'meal', 'with', 'less', 'than', '350cal']
i have 250 kg pork want to make a meal with less than 350cal


In [55]:
# Inverted index: each preprocessed word of an ingredient name maps to the
# list of nutrition records of every ingredient whose name contains it.
NUTRITION_DICT = {}
nutrition_df = pd.read_csv('./data/nutrition_ingredients_metadata.csv')
for row_label, record in nutrition_df.iterrows():
    ingredient_name = record['ingr']
    name_words = my_preprocessing(ingredient_name)

    for name_word in name_words:
        # A fresh record dict is built for every word, so entries filed under
        # different words are separate objects.
        NUTRITION_DICT.setdefault(name_word, []).append({
            'ingr': ingredient_name,
            'words': name_words,
            'id': record['id'],
            'cal': record['cal/g'],
            'fat': record['fat(g)'],
            'carb': record['carb(g)'],
            'protein': record['protein(g)']
        })

print('size:', len(NUTRITION_DICT))

size: 492


In [56]:
# Exact-name lookup: full ingredient name -> its nutrition record.
# If a name occurs more than once in the CSV, the last row wins.
test_NUTRITIONS_dict = {}
nutrition_df = pd.read_csv('./data/nutrition_ingredients_metadata.csv')
test_NUTRITIONS_dict.update(
    (
        record['ingr'],
        {
            'id': record['id'],
            'cal': record['cal/g'],
            'fat': record['fat(g)'],
            'carb': record['carb(g)'],
            'protein': record['protein(g)']
        },
    )
    for _, record in nutrition_df.iterrows()
)

In [88]:
import json
# Write the exact-name nutrition lookup to disk as JSON.
with open('./data/nutritions_dict.json', 'w') as json_file:
    json.dump(test_NUTRITIONS_dict, json_file)

In [82]:

def findIngredients(input):
    """Find known ingredients in a sentence together with the quantity before them.

    The sentence is preprocessed with ``my_preprocessing``. Adjacent words are
    then merged when together they form a key of ``test_NUTRITIONS_dict``
    (e.g. ``"ground pork"``), or when an all-digit word is followed by a key
    of ``UNIT_EXCHANGE_DICT`` (e.g. ``"250 kg"``). Intermediate word lists and
    every ingredient found are printed.

    Args:
        input: The raw user sentence.

    Returns:
        A list of dicts with keys ``'ingredient'``, ``'original unit'`` and
        ``'in_g'``, one per ingredient that is directly preceded by a number
        with a known unit. Ingredients without such a quantity are only
        printed, not returned.
    """
    tokens = my_preprocessing(input)
    print(tokens)

    # Pass 1: fold multi-word ingredient names and "<number> <unit>" pairs
    # into single entries.
    merged = []
    for token in tokens:
        if merged:
            previous = merged[-1]
            joined = "{0} {1}".format(previous, token)
            is_known_ingredient = joined in test_NUTRITIONS_dict
            if is_known_ingredient or (token in UNIT_EXCHANGE_DICT and previous.isdigit()):
                merged[-1] = joined
                continue
        merged.append(token)
    print(merged)

    # Pass 2: for each ingredient, look at the entry just before it for a
    # leading number followed by a unit.
    found = []
    for position, name in enumerate(merged):
        if name not in test_NUTRITIONS_dict:
            continue

        quantity = None
        if position > 0:
            quantity = re.search(r'^(\d+)(.*)$', merged[position - 1])

        if quantity:
            amount = quantity.group(1).strip()
            unit = quantity.group(2).strip()
            if unit in UNIT_EXCHANGE_DICT:
                # Scale the amount by the unit's factor from UNIT_EXCHANGE_DICT.
                grams = int(amount) * UNIT_EXCHANGE_DICT[unit]
                print('find ingredient: %s -> %s%s (%sg)' % (name, amount, unit, grams))
                found.append({
                    'ingredient': name,
                    'original unit': '{0}{1}'.format(amount, unit),
                    'in_g': grams,
                })
                continue

        print('find ingredient: %s' % name)
    return found

# Example run: the function prints its intermediate word lists; the list it
# returns is shown as the cell's result.
findIngredients("i have 250 kg ground pork want to make a meal with less than 350kcal")
        

['250', 'kg', 'ground', 'pork', 'want', 'meal', '350kcal']
['250 kg', 'ground pork', 'want', 'meal', '350kcal']
find ingredient: ground pork -> 250kg (250000g)


[{'ingredient': 'ground pork', 'original unit': '250kg', 'in_g': 250000}]

In [19]:

def processUserQuery(input):
    """Run the query pipeline on a sentence and print what each stage finds.

    Stages: merge numbers with their units, preprocess into lemmas,
    spell-correct every lemma, list the lemmas present in ``NUTRITION_DICT``
    together with the ingredients they belong to, and finally list every
    word that starts with digits, split into number and trailing text.

    Args:
        input: The raw user sentence.

    Returns:
        None. All results are printed.
    """
    input = findAllUnitAndRemoveSpace(input)
    print('query:', input)
    arr_words = my_preprocessing(input)
    print('my_preprocessing words:', arr_words)

    # Spell-correct in place, reporting only the words that changed.
    for position, original in enumerate(arr_words):
        corrected = spell_correction(original)
        if corrected != original:
            print('spell_correction: [%s] -> [%s]' % (original, corrected))
        arr_words[position] = corrected

    print('words found in database: ')
    matched = []
    for candidate in arr_words:
        if candidate not in NUTRITION_DICT:
            continue
        for entry in NUTRITION_DICT[candidate]:
            print('\tword[%s] ingredient[%s]' % (candidate, entry['ingr']))
        matched.append(candidate)

    print(matched)
    print('digits:')
    for candidate in arr_words:
        quantity = re.search(r'^(\d+)(.*)$', candidate)
        if quantity:
            digits, unit = quantity.groups()
            print(digits)
            print('\t%s %s' % (digits, unit))
    

# Example run of the full pipeline; everything it reports is printed.
processUserQuery("i have 250 grams ground pork want to make a meal with less than 350kcal")

# Scratch check: position of "pork" in the whitespace-split sentence.
"i have 250gram pork want to make a meal with less than 350kcal".split().index("pork")

250
['i', 'have', '250gram', 'ground', 'pork', 'want', 'to', 'make', 'a', 'meal', 'with', 'less', 'than', '350kcal']
query: i have 250gram ground pork want to make a meal with less than 350kcal
my_preprocessing words: ['250gram', 'ground', 'pork', 'want', 'meal', '350kcal']
words found in database: 
	word[ground] ingredient[ground beef]
	word[ground] ingredient[ground pork]
	word[ground] ingredient[ground chicken]
	word[ground] ingredient[ground turkey]
	word[pork] ingredient[ground pork]
	word[pork] ingredient[pork chops]
	word[pork] ingredient[roast pork]
	word[pork] ingredient[pork]
['ground', 'pork']
digits:
250
	250 gram
350
	350 kcal


3

In [1]:
# Conversion factors for the basic measurement units only.
# Presumably each value is the number of grams in one unit, as the
# "# --> unit ->  g" note on UNIT_EXCHANGE_DICT suggests - TODO confirm.
# NOTE(review): 'ml' and 'l' are volumes, so factors of 1 and 1000 look like a
# water-density assumption, and 'heaping' carries the same factor as an ounce
# - confirm both are intended.
# NOTE(review): no visible cell reads this dict; the lookup functions use
# UNIT_EXCHANGE_DICT instead.
WEIGHT_EXCHANGE_DICT = {
    'pound': 453.59237,
    'ml': 1,
    'l': 1000,
    'oz': 28.3495231,
    'ounce': 28.3495231,
    'heaping': 28.3495231,
    'gram': 1,
    'kg': 1000,
    'g': 1,
}


# --> unit ->  g
# Unit name -> multiplier applied to a quantity to express it in grams
# (findIngredients computes int(amount) * UNIT_EXCHANGE_DICT[unit]).
#
# Every key appears exactly once. A Python dict literal silently keeps only
# the LAST value given for a repeated key, so listing a unit twice hides
# which factor is really in effect.
UNIT_EXCHANGE_DICT = {
    # --- basic units ---
    'pound': 453.59237,
    'ml': 1,
    # NOTE(review): this literal used to list 'l' twice, first as 1000 and
    # later as 150. The later entry won, so 150 is the value that has been in
    # effect and it is kept here. WEIGHT_EXCHANGE_DICT uses 1000 for 'l' -
    # confirm which is intended.
    'l': 150,
    'oz': 28.3495231,
    'ounce': 28.3495231,
    'heaping': 28.3495231,
    'gram': 1,
    'kg': 1000,
    'g': 1,
    # --- recipe-style units and plural/abbreviated spellings ---
    'big bunch': 22,
    'big handful': 20,
    'bottle': 50,
    'bunch': 10,
    'can': 20,
    'cans': 20,
    'clove': 5,
    'cloves': 5,
    'cup': 20,
    'cups': 20,
    'grams': 1,
    'handful': 10,
    'jar': 15,
    'jars': 15,
    'large bunch': 22,
    'large clove': 2,
    'large cloves': 2,
    'large handful': 20,
    'large jar': 10,
    'large lb': 453.59237,
    'large pinch': 0.7,
    'large sheet': 0.8,
    'large slices': 2,
    'large sprigs': 1,
    'large stalk': 6,
    'lb': 453.59237,
    'lbs': 453.59237,
    'medium clove': 10,
    'medium cloves': 10,
    'ounces': 28.3495231,
    'oz.': 28.3495231,
    'package': 20,
    'packages': 20,
    'pinch': 0.5,
    'pounds': 453.59237,
    'sheet': 0.65,
    'slices': 3,
    'small bunch': 8,
    'small can': 10,
    'small clove': 3,
    'small handful': 8,
    'small jar': 10,
    'small pinch': 0.3,
    'small sprigs': 5,
    'small stalk': 20,
    'sprigs': 8,
    'stalk': 30,
    'stalks': 30,
    'stick': 12,
    'sticks': 12,
    'tablespoon': 4.3,
    'tablespoons': 4.3,
    'tbsp': 4.3,
    'teaspoon': 4.2,
    'teaspoons': 4.2,
}

In [27]:
from quantities import units

# unit_symbols = [u.symbol for _, u in units.__dict__.items()]
# print(unit_symbols)

# Dump every object stored in the `quantities.units` module namespace.
# This includes module metadata (name, loader, spec, file paths, ...) as well
# as the unit definitions themselves, so the output is long.
for member in vars(units).values():
    print(member)

quantities.units


quantities.units
<_frozen_importlib_external.SourceFileLoader object at 0x7fb1c8cb3a50>
ModuleSpec(name='quantities.units', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7fb1c8cb3a50>, origin='/Users/roychiu/opt/anaconda3/envs/py3.7/lib/python3.7/site-packages/quantities/units/__init__.py', submodule_search_locations=['/Users/roychiu/opt/anaconda3/envs/py3.7/lib/python3.7/site-packages/quantities/units'])
['/Users/roychiu/opt/anaconda3/envs/py3.7/lib/python3.7/site-packages/quantities/units']
/Users/roychiu/opt/anaconda3/envs/py3.7/lib/python3.7/site-packages/quantities/units/__init__.py
/Users/roychiu/opt/anaconda3/envs/py3.7/lib/python3.7/site-packages/quantities/units/__pycache__/__init__.cpython-37.pyc
All Rights Reserved.

Copyright (c) 2000 BeOpen.com.
All Rights Reserved.

Copyright (c) 1995-2001 Corporation for National Research Initiatives.
All Rights Reserved.

Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.
All Rights R

1
{'name': 'One', 'description': 'Unit non-dimensional quantity', 'category': 'unitless', 'scale': '1', 'dimensions': {}}
%
{'name': 'Percent', 'description': 'Non dimensional percent (100 % == 1)', 'category': 'unitless', 'scale': '1e-2', 'dimensions': {}}
ppm
{'name': 'Parts per Million', 'description': 'Unitless quantity defined as 1 / 10^6', 'category': 'unitless', 'scale': '1e-6', 'dimensions': {}}
ppb
{'name': 'Parts per Billion', 'description': 'Unitless quantity defined as 1 / 10^9', 'category': 'unitless', 'scale': '1e-9', 'dimensions': {}}
ppt
{'name': 'Parts per Trillion', 'description': 'Unitless quantity defined as 1 / 10^12', 'category': 'unitless', 'scale': '1e-12', 'dimensions': {}}
g
{'name': 'Gram', 'description': 'unit of mass defined as 1e-3 kg', 'category': 'mass', 'scale': '1e-3', 'dimensions': {'mass': 1}}
Da
{'name': 'Dalton', 'description': 'Sometimes called an atomic mass unit, defined as 1/12 of the mass of an unbound neutral atom of carbon-12 in its nuclear 