In [1]:
!pip install langchain-openai langchain-community

In [12]:
from opensearchpy import OpenSearch
from decouple import config
import pandas as pd
import pickle
from tqdm import tqdm
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage
import re
from typing import List, Optional

# Load environment variables from the .env file before any configuration is read
# (the OpenAI key is later looked up with os.getenv — presumably it lives in .env; confirm).
load_dotenv()

True

## Get all recipe ingredients

In [None]:
# Build the OpenSearch client. The endpoint comes from OPENSEARCH_URL (falling back to a
# local node); authentication and TLS are switched off.
opensearch_settings = {
    "hosts": [config('OPENSEARCH_URL', 'http://localhost:9200')],
    "http_auth": None,
    "use_ssl": False,
    "verify_certs": False,
    "ssl_show_warn": False,
}
client = OpenSearch(**opensearch_settings)

In [3]:
# Fetch the `ingredients` column of the `recipes` index through the SQL endpoint
# and load the returned rows into a DataFrame.
query = """
SELECT ingredients
FROM recipes
"""

sql_request = {'query': query}
res = client.sql.query(body=sql_request)

# Column labels are taken from the schema section of the response.
column_names = [column["name"] for column in res["schema"]]
df = pd.DataFrame(res["datarows"], columns=column_names)

In [4]:
# Print the frame's (rows, columns) shape, then show its first rows as the cell output.
print(f"shape: {df.shape}")
df.head()

shape: (10000, 1)


Unnamed: 0,ingredients
0,"[1 cup all-purpose flour, 1/4 cup packed brown..."
1,"[1/2 (17.5 ounce) package frozen puff pastry, ..."
2,"[1 (8 ounce) package cream cheese, softened, 1..."
3,"[1 cup all-purpose flour, 1/2 cup packed brown..."
4,"[8 apples, 1 cup all-purpose flour, 1 cup whit..."


In [5]:
# Inspect one recipe: show the ingredient lines stored in the row labelled 0
sample_row = df.loc[0]
list(sample_row['ingredients'])

['1 cup all-purpose flour',
 '1/4 cup packed brown sugar',
 '1/2 cup butter',
 '1/2 cup chopped walnuts',
 '2 egg whites',
 '1 cup white sugar',
 '2 cups fresh strawberries',
 '1 cup heavy cream',
 '1 teaspoon lemon juice']

## Use GenAI to extract the specific ingredient names

In [13]:
# Read the OpenAI API key from the process environment (None when the variable is unset)
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [12]:
# Chat model used for the ingredient extraction calls: temperature 0 and a
# 20-token cap on each reply.
llm_settings = {
    "model": "gpt-4.1-nano",
    "temperature": 0,
    "max_tokens": 20,
    "api_key": OPENAI_API_KEY,
}
llm = ChatOpenAI(**llm_settings)

In [41]:
# Smoke test: send a single placeholder message and display the text of the reply.
probe_messages = [HumanMessage(content='prompt')]
llm.invoke(probe_messages).content

'Hello! How can I assist you today?'

In [32]:
def extract_ingredient(ingredient_text: str) -> Optional[str]:
    """
    Extract the bare ingredient name from a recipe ingredient line using the chat model.

    Parameters:
        ingredient_text (str): An ingredient line that may contain a quantity and a unit
            in addition to the ingredient itself (e.g. "1 cup sugar").

    Returns:
        Optional[str]: The ingredient name (e.g. "sugar"), with surrounding whitespace and
        one pair of wrapping quotes removed. None is returned when the input is blank,
        when the model call raises, or when the model replies with no text.
    """
    # Nothing can be extracted from a missing or blank line, so skip the API round-trip.
    if ingredient_text is None or not str(ingredient_text).strip():
        return None

    system_msg = '''You are a help assistant with the role of extracting ingredient name from a food recipe.
You will be provided with food ingredient and amount, you need to return the ingredient name.
Your response must always be nothing but the food name, do not add any explanation or anything else to your response!
Examples:
Input: "1 cup all-purpose flour", Expected output: "all-purpose flour"
Input: "2 egg whites", Expected output: "egg white"
Input: "1 cup milk 3%", Expected output: "milk 3%"'''

    human_msg = f'''Extract the ingredient name from: "{ingredient_text}"'''

    try:
        res = llm.invoke([SystemMessage(content=system_msg), HumanMessage(content=human_msg)])
    except Exception as e:
        # Best-effort: report which line failed and signal the failure with None so a
        # long batch run is not aborted by a single bad call.
        print(f"Error extracting ingredient from {ingredient_text!r}: {e}")
        return None

    name = res.content
    if not isinstance(name, str):
        # Non-text payloads are handed back untouched.
        return name

    name = name.strip()
    # The prompt's examples show quoted outputs, so the model may echo the quotes. Remove
    # only a matching pair that wraps the whole answer; a lone trailing quote (e.g. an
    # inch mark) is left alone.
    if len(name) >= 2 and name[0] == name[-1] and name[0] in "\"'":
        name = name[1:-1].strip()

    return name or None

# Spot-check the extractor on a few representative ingredient lines and print
# each input next to what the model returned.
test_cases = [
    "1 cup all-purpose flour",
    "2 egg whites", 
    "1 cup white sugar",
    "1 teaspoon lemon juice",
    "1 cup milk 3%"
]

# map() is lazy, so each line is still extracted immediately before it is printed.
for ingredient, result in zip(test_cases, map(extract_ingredient, test_cases)):
    print(f"'{ingredient}' → {result}")

'1 cup all-purpose flour' → all-purpose flour
'2 egg whites' → egg white
'1 cup white sugar' → white sugar
'1 teaspoon lemon juice' → lemon juice
'1 cup milk 3%' → milk 3%


Extract all unique ingredients from a DataFrame:

In [38]:
# Number of recipe rows processed between intermediate checkpoints, and where they are written.
CHECKPOINT_EVERY = 20
CHECKPOINT_DIR = "data"


def _save_checkpoint(ingredients, rows_done, previous_path=None):
    """Pickle `ingredients` to a file named after `rows_done` and drop the previous checkpoint.

    Returns the path of the file just written so it can be passed back in as
    `previous_path` on the next save.
    """
    # Create the target directory up front so the first save cannot fail on a missing folder.
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    file_path = os.path.join(CHECKPOINT_DIR, f"unique_ingredients_iter_{rows_done}.pkl")
    with open(file_path, "wb") as file:
        pickle.dump(ingredients, file)

    # Remove the older file only after the new one has been written.
    if previous_path and previous_path != file_path and os.path.exists(previous_path):
        os.remove(previous_path)
    return file_path


# Track and deduplicate extracted ingredients
old_file_path = None
unique_ingredients = set()   # extracted ingredient names
seen_ingredients = set()     # raw ingredient lines already sent to the model
rows_done = 0

# Count rows with enumerate() rather than the DataFrame index (which need not be 0..n-1),
# and avoid naming the counter `iter`, which would shadow the builtin for the rest of the
# notebook. Passing `total` lets tqdm show a real progress bar and an ETA.
for rows_done, row_ingredients in enumerate(
    tqdm(df["ingredients"], total=len(df), desc="Extracting ingredients"), start=1
):
    # A row without an ingredient list contributes nothing.
    if row_ingredients is not None:
        for ingredient_text in row_ingredients:
            if ingredient_text in seen_ingredients:
                continue
            ingredient_name = extract_ingredient(ingredient_text)
            if ingredient_name is None:
                # Extraction failed: keep None out of the results and leave the line
                # unmarked so a later occurrence of the same text is retried.
                continue
            unique_ingredients.add(ingredient_name)
            seen_ingredients.add(ingredient_text)

    # Periodically save intermediate results
    if rows_done % CHECKPOINT_EVERY == 0:
        old_file_path = _save_checkpoint(unique_ingredients, rows_done, old_file_path)

# Persist the tail as well when the row count is not a multiple of CHECKPOINT_EVERY.
if rows_done % CHECKPOINT_EVERY != 0:
    old_file_path = _save_checkpoint(unique_ingredients, rows_done, old_file_path)

10000it [2:31:39,  1.10it/s]


In [39]:
# Output the final count of unique ingredients
len(unique_ingredients)

3969