Using NLTK to process and lemmatize ingredients

In [None]:
import nltk

nltk.download()

In [1]:
import nltk

nltk.download('punkt')  
nltk.download('averaged_perceptron_tagger')  
nltk.download('wordnet')  
nltk.download('omw-1.4')  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [23]:
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
import ast
import pandas as pd
import html

lemmatizer = WordNetLemmatizer()

def convert_pos_tag_to_wordnet(tag):
    tag = tag.upper()
    if tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('J'):
        return wordnet.ADJ
    return None 

def lemmatize_word(word, tag):
    wordnet_tag = convert_pos_tag_to_wordnet(tag)
    if wordnet_tag:
        return lemmatizer.lemmatize(word, wordnet_tag)
    return word  

def clean_ingredient_text(ingredient):
    units = [
        "cup", "cups", "teaspoon", "tsp", "tablespoon", "tbsp", "oz", "ounce", "ounces",
        "gram", "grams", "kg", "ml", "liter", "liters", "lbs", "pound", "pounds", "pack",
        "pcs", "pieces", "slices", "dash", "pinch", "lb", "can"
    ]
    
    unwanted_phrases = [
        "taste", "canned", "package", "or", "drain", "rinse", "with", "is", "best",
        "to", "cooked", "chopped", "cook", "chop", "not", "evaporate", "cut", "into", "tablespoons",
        "teaspoons", "but", "I", "prefer", "bite", "sized", "cubes", "and", "cans", "at", "room",
        "temperature", "pkge", "in", "half", "thinly", "diced", "dice", "slice", "sliced",
        "strip", "strips", "large", "small", "uncooked", "old", "fashion"
    ]

    # Remove unwanted words and collapse multiple spaces
    ingredient = re.sub(r'\b\d+(\.\d+)?\b', '', ingredient)  
    ingredient = re.sub(r'\b(?:' + '|'.join(units) + r')\b', '', ingredient, flags=re.IGNORECASE)  
    ingredient = re.sub(r'\b(?:' + '|'.join(unwanted_phrases) + r')\b', '', ingredient, flags=re.IGNORECASE) 
    ingredient = re.sub(r'[^\w\s]', '', ingredient)
    ingredient = re.sub(r'\s+', ' ', ingredient).strip()

    return ingredient

def process_ingredient(ingredient):
    cleaned_ingredient = clean_ingredient_text(ingredient)
    tokens = word_tokenize(cleaned_ingredient)
    tagged_tokens = pos_tag(tokens)
    lemmatized_tokens = [lemmatize_word(word, tag) for word, tag in tagged_tokens]
    
    return " ".join(lemmatized_tokens)

def process_ingredient_list(ingredient_list):
    ingredients = [ingredient.strip() for ingredient in ingredient_list.split(',')]
    processed_ingredients = [process_ingredient(ingredient) for ingredient in ingredients]
    
    return ', '.join(processed_ingredients)


In [18]:
converters = {
    column: html.unescape 
    for column in ("name", "description")
} | {
    column: lambda value: tuple(map(html.unescape, ast.literal_eval(value)))
    for column in ("ingredients", "ingredients_raw_str", "search_terms", "steps")
} | {
    "tags": lambda value: frozenset(map(html.unescape, ast.literal_eval(value)))
}

df = pd.read_csv("Data/recipes_food_com_revised.csv", converters = converters)
df.head(10)


Unnamed: 0,Name,Description,IngredientsExtracted,IngredientsRaw,serving_size,Servings,Instructions,Tags,SearchTerms,TotalTime,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent
0,Grilled Garlic Cheese Grits,"We love grits, this is another good way to ser...","('water', 'grits', 'salt', 'cheddar cheese', '...","('4 cups water', '1 cup uncooked old ...",1 (155 g),8,"('I a sauce pan, bring water to a boil; slowly...","frozenset({'low-in-something', 'stove-top', 'd...","('low-calorie', 'side', 'diabetic', 'low-carb'...",65,144.8,6.7,3.3,14.9,382.7,15.7,0.9,0.2,5.0
1,Simple Shrimp and Andouille Jambalaya,"Simple, easy and very tasty for when you are i...","('onion', 'red bell pepper', 'garlic cloves', ...","('1 medium onion, chopped coarse ', '1 ...",1 (366 g),4,"('In a food processor, pulse the onion, red pe...","frozenset({'meat', 'pork-sausage', 'shellfish'...","('dinner', 'shrimp')",45,756.5,28.9,9.0,191.8,2094.9,82.3,3.9,9.7,39.2
2,black-and-white bean salad,,"('white beans', 'canned black beans', 'tomatoe...","('1 cup canned white beans, rinsed and dra...",1 (807 g),1,"('In a large bowl, combine beans, tomato, onio...","frozenset({'dietary', 'north-american', 'numbe...","('side', 'dinner', 'salad', 'vegan', 'vegetari...",5,159.0,1.6,0.3,0.0,318.3,28.3,8.3,2.1,9.1
3,Crock Pot Italian Zucchini,This is a good recipe for weight watchers. It ...,"('zucchini', 'yellow squash', 'diced tomatoes'...","('2 zucchini, sliced ', '2 small ye...",1 (244 g),4,('Put all ingredients in the crock pot and coo...,"frozenset({'dietary', 'weeknight', 'crock-pot-...","('side', 'vegetarian', 'italian')",370,47.1,0.4,0.1,0.0,19.1,10.4,2.7,4.6,2.6
4,Beef Stew With Dried Cherries,This is a fabulous stew that came from one of ...,"('beef stew meat', 'flour', 'salt', 'allspice'...","('3 lbs beef stew meat', '3 tablespoons...",1 (358 g),8,"('Preheat oven to 350°F.', ""Cut beef into 1 in...","frozenset({'meat', 'course', 'beef', 'preparat...","('dinner',)",165,691.1,45.8,17.7,177.1,503.6,14.4,1.6,6.8,48.2
5,Hot Sweet Almond Brittle,This is one of our standard holiday gift recip...,"('slivered almonds', 'cider vinegar', 'sugar',...","('12 ounces slivered almonds', '1/4 cup ...",1 (832 g),1,('Preheat oven to 375°F Place almonds in sing...,"frozenset({'low-in-something', 'dietary', 'can...","('dessert',)",80,2051.9,96.9,7.3,0.0,2668.3,280.2,24.6,243.1,41.9
6,Retro Chicken & Chips Casserole,"From Cooking Light Magazine, 11/05. This is co...","('chicken breasts', 'green onion', 'red bell p...","('4 cups roasted chopped chicken breasts',...",1 (85 g),6,"('In large bowl, combine chicken, green onion,...","frozenset({'meat', '30-minutes-or-less', 'one-...","('dinner', 'chicken', 'casserole')",25,515.5,39.0,15.4,49.0,649.3,30.6,2.4,4.0,13.3
7,Asparagus Omelette Wraps,"These wraps make a lovely breakfast, light lun...","('eggs', 'milk', 'fresh sage', 'fresh thyme', ...","('8 eggs', '1/2 cup milk', '1 tabl...",1 (499 g),4,"('Beat the eggs in a bowl. Add the milk, sage,...","frozenset({'dietary', 'free-of-something', '60...","('breakfast', 'gluten-free')",40,305.4,18.2,5.0,376.3,207.8,17.8,7.5,5.1,22.4
8,Potato-Crab Chowder,Soup for the soul!,"('butter', 'onion', 'garlic', 'potatoes', 'flo...","('2 tablespoons butter', '1 medium o...",1 (362 g),6,('Saute onion& garlic in melted butter in larg...,"frozenset({'low-in-something', 'stove-top', 'd...","('low-calorie', 'healthy', 'low-sodium', 'low-...",45,274.2,8.2,4.6,50.3,446.4,36.8,3.1,3.8,16.0
9,Sweet and Simple Sloppy Joes,Easy and kid-friendly recipe that I always hav...,"('lean ground beef', 'ketchup', 'heinz chili s...","('1 lb lean ground beef', '1/2 cup ke...",1 (103 g),4,('Brown ground beef with onion powder in a fr...,"frozenset({'low-in-something', 'lunch', 'dieta...","('lunch', 'low-carb')",30,243.7,11.5,4.6,73.7,537.7,10.6,0.7,8.8,23.5


In [46]:
df['NLP_Ingredients'] = df['IngredientsExtracted'].apply(process_ingredient_list)
df1 = df[['Name','NLP_Ingredients', 'IngredientsExtracted']]
df1.head(10)

Unnamed: 0,Name,NLP_Ingredients,IngredientsExtracted
0,Grilled Garlic Cheese Grits,"water, grit, salt, cheddar cheese, garlic, oli...","('water', 'grits', 'salt', 'cheddar cheese', '..."
1,Simple Shrimp and Andouille Jambalaya,"onion, red bell pepper, garlic clove, shrimp, ...","('onion', 'red bell pepper', 'garlic cloves', ..."
2,black-and-white bean salad,"white bean, black bean, tomato, onion, celery,...","('white beans', 'canned black beans', 'tomatoe..."
3,Crock Pot Italian Zucchini,"zucchini, yellow squash, tomato, onion, garlic...","('zucchini', 'yellow squash', 'diced tomatoes'..."
4,Beef Stew With Dried Cherries,"beef stew meat, flour, salt, allspice, cinnamo...","('beef stew meat', 'flour', 'salt', 'allspice'..."
5,Hot Sweet Almond Brittle,"sliver almond, cider vinegar, sugar, sugar, sa...","('slivered almonds', 'cider vinegar', 'sugar',..."
6,Retro Chicken & Chips Casserole,"chicken breast, green onion, red bell pepper, ...","('chicken breasts', 'green onion', 'red bell p..."
7,Asparagus Omelette Wraps,"egg, milk, fresh sage, fresh thyme, garlic clo...","('eggs', 'milk', 'fresh sage', 'fresh thyme', ..."
8,Potato-Crab Chowder,"butter, onion, garlic, potato, flour, milk, bl...","('butter', 'onion', 'garlic', 'potatoes', 'flo..."
9,Sweet and Simple Sloppy Joes,"lean ground beef, ketchup, heinz chili sauce, ...","('lean ground beef', 'ketchup', 'heinz chili s..."


In [53]:
def remove_duplicates(df, column_name):
    def process_ingredient_string(ingredient):
        try:
            # Attempt to parse the ingredient as a tuple or list
            parsed_ingredient = ast.literal_eval(ingredient)
            if isinstance(parsed_ingredient, (tuple, list)):
                # Remove duplicates and return as a sorted list
                return tuple(sorted(set(parsed_ingredient)))
        except (ValueError, SyntaxError):
            # Handle as a plain string if parsing fails
            return tuple(sorted(set([item.strip() for item in ingredient.split(',') if item.strip()])))
        
    df[column_name] = df[column_name].apply(process_ingredient_string)
    return df

In [54]:
df1 = remove_duplicates(df, 'NLP_Ingredients')
df1 = df1[['Name','NLP_Ingredients', 'IngredientsExtracted']]
df1.head(10)

Unnamed: 0,Name,NLP_Ingredients,IngredientsExtracted
0,Grilled Garlic Cheese Grits,"(cheddar cheese, garlic, grit, olive oil, salt...","('water', 'grits', 'salt', 'cheddar cheese', '..."
1,Simple Shrimp and Andouille Jambalaya,"(andouille sausage, bay leaf, clam juice, fres...","('onion', 'red bell pepper', 'garlic cloves', ..."
2,black-and-white bean salad,"(black bean, black pepper, celery, italian par...","('white beans', 'canned black beans', 'tomatoe..."
3,Crock Pot Italian Zucchini,"(garlic, green bell pepper, italian seasoning,...","('zucchini', 'yellow squash', 'diced tomatoes'..."
4,Beef Stew With Dried Cherries,"(allspice, beef stew meat, beef stock, black p...","('beef stew meat', 'flour', 'salt', 'allspice'..."
5,Hot Sweet Almond Brittle,"(cayenne pepper, cider vinegar, ground coriand...","('slivered almonds', 'cider vinegar', 'sugar',..."
6,Retro Chicken & Chips Casserole,"(cheddar cheese, chicken breast, dijon mustard...","('chicken breasts', 'green onion', 'red bell p..."
7,Asparagus Omelette Wraps,"(asparagus, egg, extra virgin olive oil, fresh...","('eggs', 'milk', 'fresh sage', 'fresh thyme', ..."
8,Potato-Crab Chowder,"(black pepper, butter, cayenne pepper, cream c...","('butter', 'onion', 'garlic', 'potatoes', 'flo..."
9,Sweet and Simple Sloppy Joes,"(heinz chili sauce, ketchup, lean ground beef,...","('lean ground beef', 'ketchup', 'heinz chili s..."


In [None]:
def remove_empty(ingredient):
    try:
        # Attempt to parse the ingredient as a tuple or list
        parsed_ingredient = ast.literal_eval(ingredient)
        if isinstance(parsed_ingredient, (tuple, list)):
            # Remove empty values and return as a sorted tuple
            return tuple(sorted(item for item in parsed_ingredient if item))
    except (ValueError, SyntaxError):
        # Handle as a plain string if parsing fails
        return tuple(sorted(item.strip() for item in ingredient.split(',') if item.strip()))

In [None]:
df2[NLP_Ingredients] = df2[NLP_Ingredients].apply(remove_empty)
df2.head(10)

TypeError: 'float' object is not iterable

In [14]:

df.to_csv('NLTK.csv')

In [3]:
list = "eggs, egg, chopped onions, evaporated milk"
print(process_ingredient_list(list))

egg, egg, onion, evaporate milk
