# Seperating ingredients into unit, quantity and item

1. unpack list into items
2. for each item:
3.     check taste profile csv for existence, also use NER entries. For which ingredient to use: find shortest
4.     if exists: split by " ", 1st is quantity, 2nd is unit, concat rest into item(long), taste profile title for item(short)
5.     put all these into new lists and then DF for new CSV


~30k recipes do not have the same number of ingredients and NER (extracted short ingredients). Decision: drop them.
We only want recipes with multiple valid ingredients.
Valid ingredients have a parsable unit and/or quantity.
Parsable units are units in one of the imperial or metric units defined in the conversion dictionary.
Valid quantities are quantities that can be converted to a number using our custom parsing function

In [1]:
import pandas as pd
import time

In [192]:
recipes = pd.read_csv("NLG Recipe dataset/dataset/first_tenth_recipes.csv")
ingredients = pd.read_csv("1683806651775-Taste,_Fat_and_Texture_Da/original/taste_profiles_V5.csv")

In [195]:
ingredients.loc[73, "Key_ingredient"] += "  " #manipulating results to deprioritize peanut butter compared to butter

In [182]:
recipes.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [183]:
ingredients.head()

Unnamed: 0.1,Unnamed: 0,Product_description,Key_ingredient,Food_group,m_sweet,m_sour,m_bitter,m_umami,m_salt,m_fat
0,0,Potatoes mashed prep w semi-sk milk+marg,mashed potato,Potatoes,7.0,3.0,1.0,15.0,44.0,36.0
1,1,Rosti prepared without fat,rosti,Potatoes,9.0,3.0,1.0,15.0,35.0,29.0
2,2,Chips pre-fried unprepared,chips,Potatoes,9.0,3.0,0.0,7.0,15.0,42.0
3,3,Potatoes boiled with skin av,potato,Potatoes,6.0,2.0,1.0,8.0,6.0,9.0
4,4,Potato waffels/balls frozen unprepared,potato ball,Potatoes,7.0,3.0,1.0,13.0,40.0,45.0


In [184]:
def strToLst(item):
    "Takes a string that encodes a list. Converts it into a proper list"
    return item[2:-2].split("\", \"")

In [185]:
#making a mask to remove recipes with different ingredients vs NER:
mask = [ len(strToLst(recipes["ingredients"][i])) == len(strToLst(recipes["NER"][i])) for i in range(len(recipes.values))]
recipes2 = recipes[mask]

In [186]:
recipes2.reset_index(inplace=True, drop=True)
recipes2.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."
4,5,Cheeseburger Potato Soup,"[""6 baking potatoes"", ""1 lb. of extra lean gro...","[""Wash potatoes; prick several times with a fo...",www.cookbooks.com/Recipe-Details.aspx?id=20115,Gathered,"[""baking potatoes"", ""extra lean ground beef"", ..."


In [187]:
#confirm no more duplicates
val = 0
for i in range(len(recipes2.values)):
    if len(strToLst(recipes2["ingredients"][i])) != len(strToLst(recipes2["NER"][i])):
        val += 1
print(val)

0


In [188]:
#lets try handling item 1:
item = strToLst(recipes["ingredients"][0])
NER = strToLst(recipes["NER"][0])

print(item, NER)

['1 c. firmly packed brown sugar', '1/2 c. evaporated milk', '1/2 tsp. vanilla', '1/2 c. broken nuts (pecans)', '2 Tbsp. butter or margarine', '3 1/2 c. bite size shredded rice biscuits'] ['brown sugar', 'milk', 'vanilla', 'nuts', 'butter', 'bite size shredded rice biscuits']


In [252]:
#dictionary mapping diverse units to standardized ones (so hopefully US customary or metric ones with all the same spelling)
#used to have only actual units in the units
unit_conversion_dict = {
    " " : " ",
    "bag" : "bags",
    "bar" : "bar",
    "bowl" : "bowls",
    "bowls" : "bowls",
    "bunch" : "bunches",
    "bunches" : 'bunches',
    "bundle" : "bunches", #only used for some spinach
    "bushel" : "bushel",
    "bxs" : "boxes",
    "c" : "cups",
    "c." : "cups",
    "c," : "cups",
    "c.c" : "cups",
    "c/" : "cups",
    "cup," : "cups",
    "cuppa" : "cups",
    "cups." : "cups",
    "cups" : "cups",
    "dash" : "dashes",
    "dashes" : "dashes",
    "dl" : "dl",
    "dot" : "dots",
    "dots" : "dots",
    "doz." : "dozen",
    "fl." : "fluid ounces",  #logical extrapolation
    "g" : "grams",
    "g." : "grams",
    "gal" : "gallons",
    "gal." : "gallons",
    "gal.)" : "gallons",
    "gallon" : "gallons",
    "gm." : "grams",
    "gr" : "grams",
    "gr." : "grams",
    "grams" : "grams",
    "head" : "heads",
    "heads" : "heads",
    "in." : "inches",
    "inch" : "inches",
    "inches" : "inches",
    "jar" : "jars",
    "jars" : "jars",
    "k." : "kilograms",
    "keg" : "jars", #there are kegs of ketchup... basically jars right?
    "kg" : "kilograms",
    "kg." : "kilograms",
    "kilo" : "kilograms",
    "kilograms" : "kilograms",
    "l.??" : "liters",
    "liters" : "liters",
    "lb" : "lbs",
    "lb." : "lbs",
    "lb)" : "lbs",
    "lb," : "lbs",
    "lb.)" : "lbs",
    "lb.)," : 'lbs',
    "lb.," : "lbs",
    "lbs" : "lbs",
    "lbs." : "lbs",
    "lbs.)" : "lbs",
    
    "liter)" : "liter",
    "liter" : "liter",
    "ml" : "ml",
    "ml." : "ml", #milliliter
    "ounce" : "ounces",
    "ounce)" : "ounces",
    "ounces" : "ounces",
    "oz" : "ounces",
    "oz)" : "ounces",
    "oz." : "ounces",
    "oz.)" : "ounces",
    "oz.)(" : "ounces",
    "oz.)." : "ounces",
    "oz.)/tcan" : "ounces",
    "no" : "nr",
    "no." : "nr",
    "package" : "packages",
    "packages" : "packages",
    "pcs." : "pieces",
    "pint" : "pints",
    "pkg" : "packages",
    "pkg." : "packages",
    "pkg.)" : "packages",
    "pkgs" : "packages",
    "pkgs." : "packages",
    "pound" : "lbs",
    "pounds" : "lbs",
    "pound)" : "lbs",
    "qt" : "quarts",
    "qt." : "quarts",
    "qt.)" : "quarts",
    "qts" : "quarts",
    "qts." : "quarts",
    "spoon" : "tablespoons", 
    "stick" : "gill", #default butter stick is 8 tablespoons which is 1 gill according to some us guy i asked
    "stick)" : "gill",
    "stick," : "gill",
    "stick?" : "gill",
    "sticks" : "gill",
    "stick)" : "gill",
    "t" : "teaspoons",
    "t." : "teaspoons",
    "tablespoon" : "tablespoons",
    "tablespoons" : "tablespoons",
    "tbls" : "tablespoons",
    "tbls." : "tablespoons",
    "tblsp." : "tablespoons",
    "tbs" : "tablespoons",
    "tbsp" : "tablespoons",
    "tbsp." : "tablespoons",
    "tbsp.)" : "tablespoons",
    "tbsp.," : "tablespoons",
    "tbsp.?" : "tablespoons",
    "tbsps." : "tablespoons",
    "teaspoon" : "teaspoons",
    "teaspoons" : "teaspoons",
    "tsp" : "teaspoons",
    "tsp." : "teaspoons",
    "tsp.)" : "teaspoons",
    "tsp.white" : "teaspoons",
    "tsps." : "teaspoons",
    "tub" : "packages",
}

In [253]:
def qtyToFloat(qty : str) -> float:
    """
    Converts a recipe quantity into a normal float.
    Should handle: 
    integers: '1'
    fractions: '1/2'
    ranges: '3-5'
    """
    
    if qty == "" or qty == " ":
        return 0.01
    
    nr = 0.0
    
    for i in qty.split(" "):
        #skip items with brackets (these are typically pan measurements or new recipe quantities for more/less people)
        if "(" in i or ")" in i:
            continue
        #skip items that contain letters (typically units or pan sizes)
        skip = False
        for char in i:
            if char not in "1234567890/- ":
                skip = True
        if skip:
            continue
        try:
            #integer or good float handling
            nr += abs(float(i))
        except:
            #ranges: pick smaller value
            if "-" in i:
                j = i.split("-")
                nr += qtyToFloat(j[0])
            #fractions
            elif "/" in i:
                j = i.split("/")
                nr += qtyToFloat(j[0])/qtyToFloat(j[1])
    return(nr)
            
    

In [254]:
start = time.time()
#now do this for everything, with the caveat I only want to save things that have at least 2 associated records
records_list = []
for i in range(len(recipes.values)):
    NER = strToLst(recipes["NER"][i])
    items = strToLst(recipes["ingredients"][i])
    records = []
    for j in range(len(NER)):
        val = NER[j]
        #find candidate ingredients
        lst = []
        for val2 in ingredients["Key_ingredient"]:
            if val in val2.split(" "):
                #print("hello", val, val2)
                lst.append(val2)
        #choose final ingredient:
        if lst:
            minlen = 99999
            for val2 in lst:
                if len(val2) < minlen:
                    minlen = len(val2)
                    shortest = val2
            #print(shortest, val)
            #generate new listing: recipe, quantity, unit, ingredientNER, ingredientIngredients
            item_split = items[j].split(" ")
            qty = ""
            unit = ""
            rest = ""
            no_qty=False
            for a in item_split:
                found = False
                for k in a:
                    if k in "123456789":
                        found = True
                        break
                if found and not no_qty:
                    #assume anything containing a number is part of the quantity
                    qty += f" {a}"
                elif unit == "" and qty != "":
                    #assume right after that is a 1 word unit
                    #unless the next word has too much overlap with the NER ingredient (eg. "onion," and "onion")
                    if not a in val and not val in a:
                        unit = a
                    else:
                        unit = " "
                        rest += f" {a}"
                    no_qty = True
                else:
                    #and anything else is rest
                    rest += f" {a}"
            #don't append records with no quantity nor unit
            if unit != " " and qty != "":
                #convert unit to standardized and quantity to a float
                new_qty = qtyToFloat(qty)
                try:
                    new_unit = unit_conversion_dict[unit.lower()]
                    record = [i, new_qty, new_unit, val, shortest, rest]
                    records.append(record)
                except:
                    pass
    #only keep recipes with at least 2 ingredients
    if len(records) >= 2:
        for record in records:
            records_list.append(record)
print(time.time()-start)

186.97996258735657


In [259]:
new_df = pd.DataFrame(records_list)

In [260]:
len(new_df[0].unique()) #110k recipes left #down to 85k now

76379

In [249]:
new_df.columns = ["recipeID", "quantity", "unit", "ingredient_NER", "ingredient_Key_ingredient", "rest of ingredient"]
new_df.head()

Unnamed: 0,recipeID,quantity,unit,ingredient_NER,ingredient_Key_ingredient,rest of ingredient
0,0,0.5,cups,milk,milk (low fat),evaporated milk
1,0,0.5,teaspoons,vanilla,vanilla pudding,vanilla
2,0,2.0,tablespoons,butter,salted butter,butter or margarine
3,2,0.333333,cups,butter,salted butter,"butter, cubed"
4,2,0.25,teaspoons,pepper,red bell pepper,pepper


In [221]:
#new_df[1000:1050]

In [251]:
#recipes2.to_csv("NLG Recipe dataset/dataset/cleaned_recipes.csv", index=False)
#new_df.to_csv("NLG Recipe dataset/dataset/recipe_ingredientsv3.csv", index=False)

In [223]:
# new_df["unit"] = new_df["unit"].str.lower()


29


In [255]:
# for i in sorted(new_df["unit"].unique()):
#     print(i)
print(len(new_df["unit"].unique()))

In [257]:
# for i in new_df["quantity"].unique():
#     print(i)
print(len(new_df["quantity"].unique()))

122


11.0

Unnamed: 0,recipeID,quantity,unit,ingredient_NER,ingredient_Key_ingredient,rest of ingredient
7598,5740,1,tub,pineapple,pineapple,Cool Whip
85786,64601,1/2,tub,milk,milk (low fat),Cool Whip
118370,89129,1,tub,cream,salad cream,whip cream
153669,115617,1,tub,milk,milk (low fat),Cool Whip
161750,121684,1,tub,chocolate,hot chocolate,Cool Whip
179989,135450,1,tub,pineapple,pineapple,Cool Whip
264411,198985,1,tub,pineapple,pineapple,Cool Whip
274403,206379,1,tub,pineapple,pineapple,Cool Whip


Unnamed: 0                                                132907
title                                                Pumpkin Pie
ingredients    ["1 qt. (1 pt.) pumpkin", "8 Tbsp. (4 Tbsp.) f...
directions     ["Put 8 cups milk on stove; heat to boiling po...
link             www.cookbooks.com/Recipe-Details.aspx?id=720734
source                                                  Gathered
NER            ["pumpkin", "flour", "sugar", "salt", "allspic...
Name: 132907, dtype: object