In [2]:
import nltk

nltk.download()


KeyboardInterrupt: 

In [1]:
import nltk

nltk.download('punkt')  
nltk.download('averaged_perceptron_tagger')  
nltk.download('wordnet')  
nltk.download('omw-1.4')  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

lemmatizer = WordNetLemmatizer()

def convert_pos_tag_to_wordnet(tag):
    tag = tag.upper()
    if tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('J'):
        return wordnet.ADJ
    return None 

def lemmatize_word(word, tag):
    wordnet_tag = convert_pos_tag_to_wordnet(tag)
    if wordnet_tag:
        return lemmatizer.lemmatize(word, wordnet_tag)
    return word  

def clean_ingredient_text(ingredient):
    units = [
        "cup", "cups", "teaspoon", "tsp", "tablespoon", "tbsp", "oz", "ounce", "ounces",
        "gram", "grams", "kg", "ml", "liter", "liters", "lbs", "pound", "pounds", "pack",
        "pcs", "pieces", "slices", "dash", "pinch", "lb", "can"
    ]
    
    unwanted_phrases = [
        "taste", "canned", "is best", "package", "or", "drain", "rinse", "with", "is", "best",
        "to", "cooked", "chopped", "cook", "chop", "not", "evaporate", "cut", "into", "tablespoons",
        "teaspoons", "but", "I", "prefer", "bite", "sized", "cubes", "and", "cans", "at", "room",
        "temperature", "pkge", "in", "half", "thinly", "diced", "dice", "slice", "sliced",
        "strip", "strips", "large", "small"
    ]

    # Remove unwanted words and collapse multiple spaces
    ingredient = re.sub(r'\b\d+(\.\d+)?\b', '', ingredient)  
    ingredient = re.sub(r'\b(?:' + '|'.join(units) + r')\b', '', ingredient, flags=re.IGNORECASE)  
    ingredient = re.sub(r'\b(?:' + '|'.join(unwanted_phrases) + r')\b', '', ingredient, flags=re.IGNORECASE) 
    ingredient = re.sub(r'[^\w\s]', '', ingredient)
    ingredient = re.sub(r'\s+', ' ', ingredient).strip()

    return ingredient

def process_ingredient(ingredient):
    cleaned_ingredient = clean_ingredient_text(ingredient)
    tokens = word_tokenize(cleaned_ingredient)
    tagged_tokens = pos_tag(tokens)
    lemmatized_tokens = [lemmatize_word(word, tag) for word, tag in tagged_tokens]
    
    return " ".join(lemmatized_tokens)

def process_ingredient_list(ingredient_list):
    ingredients = [ingredient.strip() for ingredient in ingredient_list.split(',')]
    processed_ingredients = [process_ingredient(ingredient) for ingredient in ingredients]
    
    return ', '.join(processed_ingredients)


In [None]:
import pandas as pd

# Read documentation!, use ast literal eval
# Try catch with literal eval, converters
df = pd.read_csv('Data/recipes_food_com_cleaned.csv')
df['NLP_Ingredients'] = df['IngredientsRaw'].apply(process_ingredient_list)
df = df[['ID', 'Name','NLP_Ingredients', 'Cleaned_Ingredients']]
df.head(10)


Unnamed: 0,ID,Name,NLP_Ingredients,Cleaned_Ingredients
0,71247,Cherry Streusel Cobbler,"cherry pie filling, egg, sweeten condensed mil...","['condensed milk', 'margarine', 'self-rising f..."
1,76133,Reuben and Swiss Casserole Bake,"corn beef, , thousand island dress, sauerkraut...","['corned beef', 'sauerkraut', 'swiss cheese', ..."
2,503816,Yam-Pecan Recipe,"unsalted butter, , sugar, vegetable oil, egg, ...","['salt', 'sugar', 'all - purpose flour', 'vege..."
3,418749,Tropical Orange Layer Cake,"orange cake mix, instant vanilla pudding, oran...","['orange gelatin', 'instant vanilla pudding', ..."
4,392934,"Safe to Eat Raw Chocolate Chip Oreo Cookie ""do...","butter, , brown sugar, granulate sugar, milk, ...","['salt', 'granulated sugar', 'vanilla', 'choco..."
5,532245,Chicken and Petite Carrots,"chicken breast, cutlet, bag of petite carrot, ...","['seasoning', 'margarine', 'carrots', 'chicken..."
6,489452,Teriyaki Pork Chops,"bottle teriyaki sauce, pork chop","['pork chops', 'teriyaki sauce']"
7,126368,Bobbie's Pie Crust,"flour, sugar, salt, milk, oil","['sugar', 'milk', 'salt', 'flour']"
8,306467,Quick Bolognese Sauce,"light olive oil, yellow onion, , celery rib, ,...","['crushed tomatoes', 'ground chuck', 'dry red ..."
9,318331,Granny's Butter Rolls,"biscuit mix, water, granulate sugar, butter, s...","['granulated sugar', 'butter', 'biscuit mix', ..."


In [13]:
df['NLP_Ingredients'] = df['NLP_Ingredients'].str.split(',').apply(
    lambda x: ', '.join([ingredient.strip() for ingredient in x if ingredient.strip()])
)
df.head(10)

Unnamed: 0,ID,Name,NLP_Ingredients,Cleaned_Ingredients
0,71247,Cherry Streusel Cobbler,"cherry pie filling, egg, sweeten condensed mil...","['condensed milk', 'margarine', 'self-rising f..."
1,76133,Reuben and Swiss Casserole Bake,"corn beef, thousand island dress, sauerkraut, ...","['corned beef', 'sauerkraut', 'swiss cheese', ..."
2,503816,Yam-Pecan Recipe,"unsalted butter, sugar, vegetable oil, egg, li...","['salt', 'sugar', 'all - purpose flour', 'vege..."
3,418749,Tropical Orange Layer Cake,"orange cake mix, instant vanilla pudding, oran...","['orange gelatin', 'instant vanilla pudding', ..."
4,392934,"Safe to Eat Raw Chocolate Chip Oreo Cookie ""do...","butter, brown sugar, granulate sugar, milk, va...","['salt', 'granulated sugar', 'vanilla', 'choco..."
5,532245,Chicken and Petite Carrots,"chicken breast, cutlet, bag of petite carrot, ...","['seasoning', 'margarine', 'carrots', 'chicken..."
6,489452,Teriyaki Pork Chops,"bottle teriyaki sauce, pork chop","['pork chops', 'teriyaki sauce']"
7,126368,Bobbie's Pie Crust,"flour, sugar, salt, milk, oil","['sugar', 'milk', 'salt', 'flour']"
8,306467,Quick Bolognese Sauce,"light olive oil, yellow onion, celery rib, car...","['crushed tomatoes', 'ground chuck', 'dry red ..."
9,318331,Granny's Butter Rolls,"biscuit mix, water, granulate sugar, butter, s...","['granulated sugar', 'butter', 'biscuit mix', ..."


In [14]:

df.to_csv('NLTK.csv')

In [3]:
list = "eggs, egg, chopped onions, evaporated milk"
print(process_ingredient_list(list))

egg, egg, onion, evaporate milk
