In [1]:
from opensearchpy import OpenSearch
from decouple import config
import pandas as pd
import pickle
import os
from tqdm import tqdm


## Get the ingredients of all recipes:

In [None]:
# Connect to OpenSearch. The URL is read from the OPENSEARCH_URL setting (via decouple)
# and falls back to a node on localhost:9200.
# NOTE(review): no authentication and TLS/certificate checks are disabled here, which
# looks intended for a local development cluster only — confirm before pointing elsewhere.
client = OpenSearch(
    hosts=[config('OPENSEARCH_URL', 'http://localhost:9200')],
    http_auth=None,
    use_ssl=False,
    verify_certs=False,
    ssl_show_warn=False,
)

In [3]:
# Fetch the `ingredients` column of the `recipes` index through the SQL plugin and
# load the response into a DataFrame, using the response schema for the column names.
# NOTE(review): the recorded run returned exactly 10000 rows; that may be a server-side
# result-size cap rather than the real number of recipes — confirm.
query = """
SELECT ingredients
FROM recipes
"""

res = client.sql.query(body={'query': query})
column_names = [c["name"] for c in res["schema"]]
df = pd.DataFrame(res["datarows"], columns=column_names)

In [4]:
# Report the frame's dimensions, then preview its first rows.
print(f"shape: {df.shape}")
df.head()

shape: (10000, 1)


Unnamed: 0,ingredients
0,"[1 cup all-purpose flour, 1/4 cup packed brown..."
1,"[1/2 (17.5 ounce) package frozen puff pastry, ..."
2,"[1 (8 ounce) package cream cheese, softened, 1..."
3,"[1 cup all-purpose flour, 1/2 cup packed brown..."
4,"[8 apples, 1 cup all-purpose flour, 1 cup whit..."


In [5]:
# Show the complete ingredient list of the recipe at row label 0.
list(df.loc[0]['ingredients'])

['1 cup all-purpose flour',
 '1/4 cup packed brown sugar',
 '1/2 cup butter',
 '1/2 cup chopped walnuts',
 '2 egg whites',
 '1 cup white sugar',
 '2 cups fresh strawberries',
 '1 cup heavy cream',
 '1 teaspoon lemon juice']

## Use GenAI to extract the specific ingredient names

In [9]:
!pip install langchain-openai langchain-community

Collecting langchain-openai
  Downloading langchain_openai-0.3.27-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.66 (from langchain-openai)
  Downloading langchain_core-0.3.66-py3-none-any.whl.metadata (5.8 kB)
Collecting openai<2.0.0,>=1.86.0 (from langchain-openai)
  Downloading openai-1.93.0-py3-none-any.whl.metadata (29 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting langchain<1.0.0,>=0.3.26 (from langchain-community)
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain-community)
  Downloading aiohttp-3.12.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting tenacity!=8.4.0,<10,>=8.1.0 (from langchain-community)
  Downloading ten

In [10]:
# import openai
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage

import re
from typing import List, Optional

In [11]:
# Read the OpenAI key from the environment / .env file through decouple (same mechanism
# as OPENSEARCH_URL) instead of embedding it in the notebook. `config` raises if the
# setting is missing, so a misconfigured environment fails here rather than at call time.
# SECURITY: a literal key used to be stored on this line; treat it as compromised and
# revoke/rotate it.
OPENAI_API_KEY = config('OPENAI_API_KEY')

In [12]:
# Chat model shared by the cells below (extract_ingredient calls `llm.invoke`).
llm = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0,  # presumably chosen to keep replies as repeatable as possible
    max_tokens=20,  # NOTE(review): caps reply length; a long ingredient name could be cut off
    api_key=OPENAI_API_KEY
)

In [41]:
# Send a throwaway message (the literal text 'prompt') and show the reply text;
# presumably a quick check that the model is reachable — not used by later cells.
llm.invoke([HumanMessage(content='prompt')]).content

'Hello! How can I assist you today?'

In [32]:
# System instructions sent with every extraction request. The wording is kept verbatim
# so the model's behaviour is unchanged; it contains no placeholders, so it is a plain
# (non-f) string built once instead of on every call.
_EXTRACTION_SYSTEM_PROMPT = '''You are a help assistant with the role of extracting ingredient name from a food recipe.
You will be provided with food ingredient and amount, you need to return the ingredient name.
Your response must always be nothing but the food name, do not add any explanation or anything else to your response!
Examples:
Input: "1 cup all-purpose flour", Expected output: "all-purpose flour"
Input: "2 egg whites", Expected output: "egg white"
Input: "1 cup milk 3%", Expected output: "milk 3%"'''


def extract_ingredient(ingredient_text):
    """Extract the ingredient name from one recipe ingredient line using the chat model.

    Parameters
    ----------
    ingredient_text : str
        A single ingredient line including its amount, e.g. ``"1 cup all-purpose flour"``.

    Returns
    -------
    str or None
        The model's reply with surrounding whitespace removed, or ``None`` when the
        request failed (the error is printed). Callers must handle ``None``.
    """
    human_msg = f'''Extract the ingredient name from: "{ingredient_text}"'''

    try:
        res = llm.invoke([
            SystemMessage(_EXTRACTION_SYSTEM_PROMPT),
            HumanMessage(content=human_msg),
        ])
    except Exception as e:
        # Deliberately best-effort: one failed request must not abort a long batch run.
        print(f"Error: {e}")
        return None

    content = res.content
    # Strip stray whitespace/newlines so equal names deduplicate; leave any non-string
    # content untouched.
    return content.strip() if isinstance(content, str) else content

# Sanity check: run the extractor on a few hand-picked ingredient lines and print each
# input next to the name returned, so the output can be eyeballed.
test_cases = [
    "1 cup all-purpose flour",
    "2 egg whites", 
    "1 cup white sugar",
    "1 teaspoon lemon juice",
    "1 cup milk 3%"
]

for ingredient in test_cases:
    result = extract_ingredient(ingredient)
    print(f"'{ingredient}' → {result}")

'1 cup all-purpose flour' → all-purpose flour
'2 egg whites' → egg white
'1 cup white sugar' → white sugar
'1 teaspoon lemon juice' → lemon juice
'1 cup milk 3%' → milk 3%


Load the ingredients extracted so far:

In [34]:
# file_path = f"unique_ingredients_iter_2300.pkl"
# with open(file_path, "rb") as f:
#     prev_unique_ingredients = pickle.load(f)

Extract all unique ingredients from a DataFrame:

In [38]:
# Walk every recipe, send each not-yet-seen ingredient line to `extract_ingredient`, and
# collect the distinct names returned. Progress is pickled periodically so a long run
# can be resumed after an interruption.
CHECKPOINT_EVERY = 20  # number of recipes between checkpoints

old_file_path = None
unique_ingredients = set()  # start empty; seed with a previously saved set to resume
seen_ingredients = set()


def _save_checkpoint(row_num, previous_path):
    """Pickle both sets, delete the superseded snapshot, and return the new snapshot path."""
    file_path = f"unique_ingredients_iter_{row_num}.pkl"
    with open(file_path, "wb") as file:
        pickle.dump(unique_ingredients, file)
    # Remove the old snapshot only after the new one has been written.
    if previous_path and previous_path != file_path:
        os.remove(previous_path)
    with open("seen_ingredients_text.pkl", "wb") as file:
        pickle.dump(seen_ingredients, file)
    return file_path


row_num = 0
# enumerate() gives a 1-based row counter that does not depend on the frame's index
# labels; `total` lets tqdm show a real progress bar and ETA.
for row_num, ingredient_texts in enumerate(tqdm(df['ingredients'], total=len(df)), start=1):
    for ingredient_text in ingredient_texts:
        if ingredient_text in seen_ingredients:
            continue
        ingredient_name = extract_ingredient(ingredient_text)
        if ingredient_name is None:
            # Extraction failed: don't record a bogus None name, and leave the text
            # unseen so a later occurrence (or a re-run) retries it.
            continue
        unique_ingredients.add(ingredient_name)
        seen_ingredients.add(ingredient_text)
    # Save results every CHECKPOINT_EVERY rows.
    if row_num % CHECKPOINT_EVERY == 0:
        old_file_path = _save_checkpoint(row_num, old_file_path)

# Persist the trailing partial batch when the row count is not a multiple of
# CHECKPOINT_EVERY (nothing to do for an empty frame).
if row_num % CHECKPOINT_EVERY != 0:
    old_file_path = _save_checkpoint(row_num, old_file_path)

10000it [2:31:39,  1.10it/s]


In [39]:
# Number of distinct ingredient names collected by the extraction loop.
len(unique_ingredients)

3969

In [55]:
import json

# Read ingredients_carbs_vegan.json (UTF-8) from the working directory and parse it.
with open('ingredients_carbs_vegan.json', mode='r', encoding='utf8') as json_file:
    prev_extracted_ingredients_carbs_vegan = json.loads(json_file.read())

In [56]:
# Number of top-level entries loaded from ingredients_carbs_vegan.json.
len(prev_extracted_ingredients_carbs_vegan)

2610