## Recipe Recommendation 

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the recipes dataset; the relative path is resolved against the
# notebook's current working directory.
recipes = pd.read_csv("recipes_ingredients.csv")

In [8]:
# Quick look at the frame: dimensions, column names, and the first three rows.
print(recipes.shape)
print(recipes.columns)
print(recipes.head(3))

(500471, 9)
Index(['id', 'name', 'description', 'ingredients', 'ingredients_raw', 'steps',
       'servings', 'serving_size', 'tags'],
      dtype='object')
       id                             name  \
0   71247          Cherry Streusel Cobbler   
1   76133  Reuben and Swiss Casserole Bake   
2  503816                 Yam-Pecan Recipe   

                                         description  \
0  I haven't made this in years, so I'm just gues...   
1  I think this is even better than a reuben sand...   
2  A lady I work with heard me taking about ZWT a...   

                                         ingredients  \
0  ["cherry pie filling", "condensed milk", "melt...   
1  ["corned beef chopped", "sauerkraut cold water...   
2  ["unsalted butter", "vegetable oil", "all - pu...   

                                     ingredients_raw  \
0  ["2 (21   ounce) cans   cherry pie filling","2...   
1  ["1/2-1   lb    corned beef, cooked and choppe...   
2  ["3/4  cup    unsalted butter, at roo

In [26]:
# Check how a single `ingredients_raw` cell is stored; the recorded output
# shows a plain string, which is why it gets parsed into a list afterwards.
print(type(recipes['ingredients_raw'].iloc[0]))

<class 'str'>


In [39]:
# Convert the ingredients string (a JSON-style list of quoted items) to a Python list

import json
import re


def parse_ingredients(s):
    """Parse a serialized ingredient list into a list of strings.

    The text is first decoded as JSON, which handles escaped quotes (``\\"``)
    and other escape sequences inside an item correctly. When the text is not
    a JSON list made up solely of strings, the function falls back to
    collecting every double-quoted run with a regex.

    Parameters
    ----------
    s : Any
        Raw cell value. Anything that is not a ``str`` (for example ``None``
        or a float NaN) is treated as "no ingredients".

    Returns
    -------
    list of str
        The parsed ingredient strings, or an empty list for non-string input.
    """
    if not isinstance(s, str):
        return []

    try:
        decoded = json.loads(s)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        decoded = None

    # Trust the JSON result only when it is exactly the expected shape; any
    # other shape (scalar, dict, nested list, mixed types) uses the regex path.
    if isinstance(decoded, list) and all(isinstance(item, str) for item in decoded):
        return decoded

    # Fallback: extract everything between pairs of double quotes.
    return re.findall(r'"(.*?)"', s)

# Parse every raw ingredients string and store the result in a new column.
recipes['ingredients_list'] = recipes['ingredients_raw'].apply(parse_ingredients)

# Sanity check: show the Python type of the first parsed value.
print(type(recipes['ingredients_list'].iloc[0]))


<class 'list'>


In [None]:
import re
from functools import lru_cache

# Measurement / packaging words. They are stripped only when they directly
# follow a quantity (e.g. "2 cups", "1/4 cup", "1.5 lbs").
units = [
    "cup","cups","tablespoon","tablespoons","tbsp","teaspoon","tsp","teaspoons",
    "g","kg","mg","oz","ounce","ounces","ml","l","liter","pinch","dash","pound","lb","lbs",
    "clove","cloves","slice","slices","package","can","envelope","bunches","sprig"
]

# Preparation, size and texture words removed wherever they occur as whole
# words/phrases. Duplicate entries are harmless: they are de-duplicated when
# the removal pattern is built.
descriptors = [
    "chopped","sliced","minced","crumbled","shredded","coarsely ground",
    "thick-cut","thinly sliced","diced","peeled","crushed","ground","finely",
    "small","large","extra","uncooked","condensed","your choice of","your choice",
    "semisweet","bittersweet","chunky","fresh","dried","whole",
    "melted","softened","whipped","for topping","for serving","of your choice",
    "into","inch","pieces","flavored","grated","may","approximately","seeded","juice",
    "shell","penne","fusilli","rotini","elbow","drained","in water",
    "thinly","thickly","finely","coarsely","roughly","smooth","chunky","creamy",
    "head","leaf","leaves","clove","cloves","bunch","stalk","stalks","sprig","sprigs",
    "freshly","lightly","heavily","slightly","well","very","quite","really",
    "julienned","grated","shredded","chopped","diced","sliced","minced"
]

# Generic adjectives removed as whole words.
adjectives = [
    "white","cold","hot","raw","unsalted","light","dark","fresh","extra","virgin"
]

# Canonical names, applied by substring replacement AFTER the word lists above
# have been stripped and commas removed.
# NOTE(review): keys containing a word that is stripped earlier (e.g. "light",
# "fresh", "chopped", "white") or a comma can no longer match at that point.
normalize_map = {
    "brown sugar":"sugar",
    "light brown sugar":"sugar", 
    "instant rice":"rice",
    "long-grain white rice":"white rice",
    "all-purpose flour":"flour",
    "graham cracker crumbs":"graham cracker",
    "extra virgin olive oil":"olive oil",
    "virgin olive oil":"olive oil",
    "chopped tomatoes":"tomatoes",
    "diced tomatoes":"tomatoes",
    "fresh basil":"basil",
    "tomatoes, with juice":"tomatoes",
    "black pepper":"pepper",
    "ground pepper":"pepper",
    "white pepper":"pepper",
    "red pepper":"pepper",
    "green pepper":"bell pepper"
}

# Preparation / serving phrases removed as whole words or phrases.
prep_words = ["in water", "drained", "cooked", "softened", "whipped", "to taste", "for serving", 
              "for garnish", "as needed", "if desired", "optional"]

# Cuisine/origin words removed as whole words.
origin_descriptors = ["italian", "mexican", "chinese", "french", "spanish", "greek", "american", 
                     "asian", "european", "mediterranean"]

# Filler words removed after the ingredient has been cut at the first connector.
_STOPWORDS = ("pieces", "inch", "small", "medium", "large", "may", "approximately")

# One number: integer or decimal, optionally followed by a fraction
# ("2", "1.5", "1/4", "2 1/2").
_NUMBER = r"\d+(?:\.\d+)?(?:\s*\d*/\d+)?"
# A quantity: one number, or a range of two numbers ("1/2-1", "2 to 3").
_QUANTITY = rf"{_NUMBER}(?:\s*(?:[-\u2013]|to)\s*{_NUMBER})?"

_PARENTHETICAL_RE = re.compile(r"\(.*?\)")
_STANDALONE_QUANTITY_RE = re.compile(rf"\b{_QUANTITY}\b")
_WHITESPACE_RE = re.compile(r"\s+")
_CONNECTOR_RE = re.compile(r"\bor\b|\bwith\b|\band\b")
_NEVER_MATCHES_RE = re.compile(r"(?!)")


def _alternation(phrases):
    """Return a regex alternation of the escaped phrases, longest first.

    Longest-first ordering makes "of your choice" win over "your choice", so a
    shorter phrase cannot leave a fragment of a longer one behind.
    """
    ordered = sorted(set(phrases), key=lambda phrase: (-len(phrase), phrase))
    return "|".join(re.escape(phrase) for phrase in ordered)


@lru_cache(maxsize=8)
def _phrase_pattern(phrases):
    """Compile (and cache) a whole-word pattern matching any of `phrases` (a tuple)."""
    if not phrases:
        return _NEVER_MATCHES_RE
    return re.compile(rf"\b(?:{_alternation(phrases)})\b")


@lru_cache(maxsize=8)
def _unit_pattern(unit_names):
    """Compile (and cache) a pattern matching a quantity followed by a unit."""
    if not unit_names:
        return _NEVER_MATCHES_RE
    return re.compile(rf"\b{_QUANTITY}\s*(?:{_alternation(unit_names)})\b")


def refine_ingredient_list(ingredients):
    """Reduce raw ingredient lines to short, normalized ingredient names.

    For every string in `ingredients` the function lower-cases it, drops
    parenthesised text, quantities (integers, decimals, fractions and ranges)
    with their units, descriptor / adjective / prep / origin words, and commas;
    applies `normalize_map`; keeps only the text before the first "or", "with"
    or "and"; and finally removes filler words and punctuation-only leftovers.

    The module-level word lists are read at call time, so edits to them take
    effect on the next call; compiled patterns are cached per list content.

    Parameters
    ----------
    ingredients : iterable or None
        Raw ingredient lines. Non-string items are skipped; ``None`` is
        treated as an empty list.

    Returns
    -------
    list of str
        Cleaned names longer than one character, de-duplicated in
        first-seen order.
    """
    if ingredients is None:
        return []

    # Build the patterns once per call instead of once per ingredient and word.
    unit_re = _unit_pattern(tuple(units))
    removable_re = _phrase_pattern(
        tuple(descriptors) + tuple(adjectives) + tuple(prep_words) + tuple(origin_descriptors)
    )
    stopword_re = _phrase_pattern(_STOPWORDS)

    cleaned = []
    for ingredient in ingredients:
        if not isinstance(ingredient, str):
            continue
        ing = ingredient.lower().strip()

        # Parenthesised notes such as "(6 1/2 ounce)" go first so that the
        # outer quantity ends up adjacent to its unit ("1 (...) can" -> "1 can").
        ing = _PARENTHETICAL_RE.sub("", ing)

        # Quantity + unit ("8 ounces", "2 1/2 teaspoons", "1.5 cups", "1/2-1 lb"),
        # then any quantity left without a recognised unit.
        ing = unit_re.sub("", ing)
        ing = _STANDALONE_QUANTITY_RE.sub("", ing)

        # Collapse whitespace so multi-word phrases ("in water") match even
        # when the raw text uses irregular spacing.
        ing = _WHITESPACE_RE.sub(" ", ing)

        # Whole-word removal only: "cooked" must not turn "precooked" into "pre".
        ing = removable_re.sub("", ing)

        ing = ing.replace(",", "")
        ing = _WHITESPACE_RE.sub(" ", ing).strip()

        # Deliberately substring-based so plurals are normalized as well
        # ("green peppers" -> "bell peppers").
        for phrase, replacement in normalize_map.items():
            ing = ing.replace(phrase, replacement)

        # Keep only the part before the first connecting word.
        ing = _CONNECTOR_RE.split(ing)[0].strip()

        ing = stopword_re.sub("", ing)

        # Final cleanup: normalize spacing and drop tokens with no letters or
        # digits (stray "-" or "." left behind by the removals above).
        ing = " ".join(
            token for token in ing.split() if any(ch.isalnum() for ch in token)
        )

        if len(ing) > 1:  # Only keep ingredients with more than 1 char
            cleaned.append(ing)

    # Remove duplicates while preserving order
    return list(dict.fromkeys(cleaned))

Original beef ingredients:
  0: '2   lbs    ground beef'
  1: '2   lbs    Italian sausage, ground '
  2: '  saltine crackers'
  3: '3       eggs'
  4: '1       onion, chopped '
  5: '2   tablespoons    garlic, minced '
  ...

Cleaned for TF-IDF:
  0: 'beef'
  1: 'sausage'
  2: 'saltine crackers'
  3: 'eggs'
  4: 'onion'
  5: 'garlic'
  6: 'parsley'
  7: 'salt'
  8: 'pepper'
  9: 'bell pepper'
  10: 'marinara sauce'

Result: ['beef', 'sausage', 'saltine crackers', 'eggs', 'onion', 'garlic', 'parsley', 'salt', 'pepper', 'bell pepper', 'marinara sauce']

PREVIOUS TEST CASES:

Test case 1: ['8   ounces    shell pasta', '1 (6 1/2  ounce) can   tuna in water, drained ', '2 1/2  teaspoons   chopped walnuts', '1/4  cup    pesto sauce']
Cleaned: ['pasta', 'tuna', 'walnuts', 'pesto sauce']

Test case 2: ['2 cups all-purpose flour', '1 lb ground beef', '3 large eggs']
Cleaned: ['flour', 'beef', 'eggs']

Test case 3: ['1/2 cup olive oil', '2 tablespoons fresh basil', '1 can diced tomatoes']
Cleane