# Method 1: Regular expressions (RE)

In [None]:
import pandas as pd

# Read the cleaned recipe/nutrient table from the Colab filesystem
recipe_df = pd.read_csv('/content/70000_recipes_nutrients_cleaned_final.csv')

# Preview the recipe name and raw ingredient text of the first rows
preview_columns = ['recipe_name', 'ingredients']
print(recipe_df[preview_columns].head())


                                 recipe_name  \
0                              stalker pasta   
1                vegan wild mushroom lasagna   
2  rwop finalist: tantalizing tilapia recipe   
3             blue cheese portobello burgers   
4       pan-grilled portobello mushroom caps   

                                         ingredients  
0  3 tbsp. olive oil, 2 oz. pancetta or regular b...  
1  9 sheets of oven-ready, no-boil lasagna, 1 1/2...  
2  2 tsp blackened seasoning, 1 tsp lemon pepper ...  
3  3 tablespoons extra-virgin olive oil, divided,...  
4  4 x portobello mushroom caps, the dry stem tri...  


In [None]:
import re

# Leading amount (fraction, decimal or integer), optionally followed by a unit
# or size word. The trailing period of an abbreviation ("tbsp.", "oz.") is
# consumed together with the unit so it does not survive as a stray "." in the
# cleaned name. When no unit follows, the amount must end at a word boundary.
_QUANTITY_RE = re.compile(
    r'\b(?:\d+/\d+|\d+\.\d+|\d+)'
    r'(?:\s?(?:tbsp|tsp|cups|cup|oz|g|kg|lb|pounds|pound|clove|slices|slice'
    r'|tablespoons|teaspoons|pinch|package|large|small|medium|ml|l|grams)\b\.?'
    r'|\b)',
    re.IGNORECASE,
)

# Preparation words and filler phrases that are not part of the ingredient name.
_DESCRIPTOR_RE = re.compile(
    r'\b(?:chopped|minced|diced|fresh|thinly sliced|washed|trimmed|sliced'
    r'|peeled|grated|finely chopped|finely sliced|ground|drained|divided'
    r'|optional|such as|like|or|plus|for frying|to taste|cut|beaten|rinsed)\b',
    re.IGNORECASE,
)


def simple_ingredient_extractor(ingredient_text):
    """Extract bare ingredient names from a comma-separated ingredient string.

    Each comma-separated item has its quantity/unit prefix and known
    descriptor words removed, whitespace collapsed, and is lowercased.
    Items that end up empty are dropped.

    Args:
        ingredient_text: Raw ingredient text, e.g.
            ``"3 tbsp. olive oil, 2 cloves garlic, minced"``.

    Returns:
        A list of cleaned, lowercased ingredient strings in input order.
        Returns an empty list when the input is not a string (for example
        ``None`` or a float NaN standing in for a missing value).
    """
    # Non-string values have no `.split`; treat them as "no ingredients"
    # instead of raising.
    if not isinstance(ingredient_text, str):
        return []

    cleaned_ingredients = []
    for item in ingredient_text.split(','):
        item = _QUANTITY_RE.sub('', item)
        item = _DESCRIPTOR_RE.sub('', item)

        # Removing words leaves gaps; collapse them and trim the edges.
        item = re.sub(r'\s+', ' ', item).strip()

        if item:
            cleaned_ingredients.append(item.lower())

    return cleaned_ingredients

In [None]:
# Pick one recipe by position to demonstrate the regex extractor
sample_recipe = recipe_df.iloc[60000]
raw_ingredients = sample_recipe['ingredients']

print("Original ingredients:")
print(raw_ingredients)

extracted_ingredients = simple_ingredient_extractor(raw_ingredients)

print("\nExtracted ingredients:")
print(extracted_ingredients)

# Show the nutrient columns of the same row alongside the extraction
print("\nNutrient values for this recipe:")
nutrient_cols = ['calories', 'protein', 'fat', 'carbohydrates']
nutrient_values = sample_recipe[nutrient_cols]
print(nutrient_values)

Original ingredients:
carton cottage cheese, frozen whipped topping such as cool whip thawed, orange-flavored gelatin such as jell-o, crushed pineapple, mandarin oranges drained, miniature marshmallows

Extracted ingredients:
['carton cottage cheese', 'frozen whipped topping cool whip thawed', 'orange-flavored gelatin jell-o', 'crushed pineapple', 'mandarin oranges', 'miniature marshmallows']

Nutrient values for this recipe:
calories         25.0
protein          37.0
fat              30.0
carbohydrates    22.0
Name: 60000, dtype: object


# Method 2: Named Entity Recognition (NER)

In [None]:
import pandas as pd
from transformers import pipeline
import re

# Load recipe dataset
recipe_df = pd.read_csv('/content/70000_recipes_nutrients_cleaned_final.csv')

# Initialize the Hugging Face NER pipeline with a general-purpose NER model.
# `aggregation_strategy="simple"` replaces the deprecated `grouped_entities=True`
# and keeps the same behavior: adjacent tokens that share a label are merged
# into a single entity dict.
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

# Leading amount optionally followed by a unit/size word; an abbreviation's
# trailing period ("tbsp.") is removed together with the unit.
_NER_QUANTITY_RE = re.compile(
    r'\b(?:\d+/\d+|\d+\.\d+|\d+)'
    r'(?:\s?(?:tbsp|tsp|cups|cup|oz|g|kg|lb|pounds|pound|clove|slices|slice'
    r'|tablespoons|teaspoons|pinch|package|large|small|medium|ml|l|grams)\b\.?'
    r'|\b)',
    re.IGNORECASE,
)

# Preparation words and filler phrases that are not part of the ingredient name.
_NER_DESCRIPTOR_RE = re.compile(
    r'\b(?:chopped|minced|diced|fresh|thinly sliced|washed|trimmed|sliced'
    r'|peeled|grated|finely chopped|finely sliced|ground|drained|divided'
    r'|optional|such as|like|or|plus|for frying|to taste|cut|beaten|rinsed)\b',
    re.IGNORECASE,
)


def hf_ner_ingredient_extractor(text):
    """Extract candidate ingredient names from text with the NER pipeline.

    Runs the module-level ``ner_pipeline`` on ``text``, takes the ``word`` of
    every returned entity regardless of its label, re-attaches sub-word
    continuation pieces (``##...``), strips quantities and descriptor words,
    and removes duplicates.

    NOTE: the pipeline only returns spans the model tags as entities; the
    model in use exposes general labels (ORG, PER, LOC, MISC) rather than a
    food label, so recipe text can legitimately produce an empty list.

    Args:
        text: Raw ingredient text for one recipe.

    Returns:
        A list of unique, lowercased entity strings in order of first
        appearance. Returns an empty list for non-string or blank input
        without calling the model.
    """
    # Missing values (None / NaN) and blank strings cannot contain entities.
    if not isinstance(text, str) or not text.strip():
        return []

    ner_results = ner_pipeline(text)

    # Re-attach "##" continuation pieces to the word they continue. A piece is
    # glued onto the previous entity only when it starts exactly where that
    # entity ended (or when the pipeline reports no character offsets);
    # otherwise the "##" marker is dropped and the piece stands on its own.
    merged_entities = []
    previous_end = None
    for entity in ner_results:
        word = entity['word']
        start = entity.get('start')
        if word.startswith("##"):
            piece = word[2:]
            adjacent = start is None or previous_end is None or start == previous_end
            if merged_entities and adjacent:
                merged_entities[-1] += piece
            else:
                merged_entities.append(piece)
        else:
            merged_entities.append(word)
        previous_end = entity.get('end')

    # Clean each entity with the same rules as the regex extractor and
    # deduplicate while keeping first-seen order so results are reproducible.
    final_ingredients = []
    seen = set()
    for item in merged_entities:
        item = item.lower()
        item = _NER_QUANTITY_RE.sub('', item)
        item = _NER_DESCRIPTOR_RE.sub('', item)
        item = re.sub(r'\s+', ' ', item).strip()
        # Single characters are leftovers of cleaning, not ingredients.
        if len(item) > 1 and item not in seen:
            seen.add(item)
            final_ingredients.append(item)

    return final_ingredients

# Demo 1: run the NER extractor on a single recipe and show input vs. output
sample_text = recipe_df.loc[60000, 'ingredients']
print("Original ingredients text:\n", sample_text)
print("\nExtracted ingredients using HF NER:")
sample_entities = hf_ner_ingredient_extractor(sample_text)
print(sample_entities)

# Demo 2: the full dataset would be slow, so only the first 5 rows are processed;
# the remaining rows of the new column are left unfilled by index alignment
first_rows = recipe_df['ingredients'].head(5)
recipe_df['hf_extracted_ingredients'] = first_rows.apply(hf_ner_ingredient_extractor)

comparison_columns = ['ingredients', 'hf_extracted_ingredients']
print(recipe_df[comparison_columns].head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


Original ingredients text:
 carton cottage cheese, frozen whipped topping such as cool whip thawed, orange-flavored gelatin such as jell-o, crushed pineapple, mandarin oranges drained, miniature marshmallows

Extracted ingredients using HF NER:
[]
                                         ingredients hf_extracted_ingredients
0  3 tbsp. olive oil, 2 oz. pancetta or regular b...                       []
1  9 sheets of oven-ready, no-boil lasagna, 1 1/2...                       []
2  2 tsp blackened seasoning, 1 tsp lemon pepper ...                       []
3  3 tablespoons extra-virgin olive oil, divided,...                       []
4  4 x portobello mushroom caps, the dry stem tri...                       []


**RE**

The first section uses regular expressions (RE) to extract ingredient names from recipe text. The function simple_ingredient_extractor() takes a comma-separated ingredient string, splits it on the commas, and removes quantities (such as "1/2 cup", "2 tbsp", or "100g") and descriptors (such as "chopped", "minced", or "fresh") from each item. This helps isolate the core ingredient names, making them more suitable for matching or analysis. It also collapses extra whitespace and lowercases the text to maintain uniformity. This rule-based approach is fast and simple, but it only recognizes the units and descriptors listed in its patterns, so it misses context-specific nuances: words outside those lists (for example "carton" or "thawed" in the sample output above) remain in the result.

**NER**

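The second section leverages a Hugging Face Named Entity Recognition (NER) model, specifically dslim/bert-base-NER, to extract ingredients using deep learning. It tokenizes the input text and classifies each token as either part of a named entity or not. After extracting the entity words, it joins broken subword tokens (like "##ing"), cleans the results using a regex-based approach similar to the RE method, and removes duplicates. This method aims to identify ingredient names based on language context, potentially capturing more complex or subtle ingredient names than rule-based extraction. In the run shown above, however, every result is an empty list: dslim/bert-base-NER is a general-purpose model whose labels are person, organization, location, and miscellaneous, so it does not tag food items as entities. Extracting ingredients this way would require a model trained or fine-tuned with a food/ingredient label.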

