In [2]:
import pandas as pd
import ast
import json
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored
import os
from dotenv import load_dotenv
import re

# Load environment variables from .env file
load_dotenv()
GPT_MODEL = "gpt-3.5-turbo-0613"
openai.api_key = os.getenv("OPENAI_API_KEY")

## Extract ingredients from the recipe

In [3]:
# Helper functions
def json_gpt(input: str):
    completion = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=[
            {"role": "system", "content": "Output only valid JSON"},
            {"role": "user", "content": input},
        ],
        temperature=0.5,
    )

    text = completion.choices[0].message.content
    parsed = json.loads(text)

    return parsed


def embeddings(input: list[str]) -> list[list[str]]:
    response = openai.Embedding.create(model="text-embedding-ada-002", input=input)
    return [data.embedding for data in response.data]

In [4]:
recipe = """
3 heaping tablespoons of peanut butter
3 tablespoons of sambal sauce
3 tablespoons of soy sauce
I tablespoons of honey
1 tablespoons of sesame oil
The juice of 1 lime
1 garlic, grated
1 small knob of ginger, grated
 cup of water
1 cup of mushrooms
1/2 cup of cabbage leaves
2 servings of soba noodles
Chopped scallion
Chopped cilantro
""" 

# ChatGPT to help get similar items
QUERIES_INPUT = f"""
Get all the ingredients in the recipe, e.g. flour, egg, milk,... This is the recipe: {recipe}
Remove any basic ingredient like water.
Change plural ingredients to singular. For example, "noodles" to "noodle".
Simplify the ingredients. For example, "cabbage leaves to cabbage".
Format: {{"Products": ["product_1", "product_2",...]}}
"""

similar_products = json_gpt(QUERIES_INPUT)["Products"]
print(similar_products)

['peanut butter', 'sambal sauce', 'soy sauce', 'honey', 'sesame oil', 'lime juice', 'garlic', 'ginger', 'mushroom', 'cabbage', 'soba noodle', 'scallion', 'cilantro']


## Find the right general category

In [5]:
general_categories = ["bakery", "dairy-eggs-fridge", "drinks", "freezer", "fruit-veg", "health-wellnes health-foods", "lunch-box", "pantry", "poultry-meat-seafood"]


In [6]:
category_dict = {
    "bakery": ["bakery", "bread", "pastries"],
    "dairy-eggs-fridge": ["dairy-eggs-fridge", "milk", "cheese", "yogurt", "cream", "dips", "ready to eat meals", "international food", "vegan"],
    "drinks": ["drinks", "juices", "soda", "water", "tea", "coffee", "energy drinks"],
    "freezer": ["freezer", "frozen meals", "ice cream", "frozen vegetables", "frozen fruit"],
    "fruit-veg": ["fruit-veg", "fruits", "vegetables", "salads", "organic"],
    "health-wellness health-foods": ["health-wellness health-foods", "vitamins", "superfoods", "protein bars", "health foods", "dried fruit, nuts, seeds"],
    "lunch-box": ["lunch-box","sandwiches", "snack packs", "fruit cups"],
    "pantry": ["pantry", "canned goods", "breakfast and spreads", "herbs and spices", "condiments", "canned food", "pasta, rice, grains", "cooking sauces and recipe bases", "oil and vinegar", "international foods"],
    "poultry-meat-seafood": ["poultry-meat-seafood", "poultry", "meat", "seafood"]
}

In [7]:
category_list = [item for sublist in category_dict.values() for item in sublist]

In [8]:
# ChatGPT to help get similar items
QUERIES_INPUT = f"""
Group all the items into the right categories. 
These are the categories: {category_list}.
These are the items: {similar_products}.
Format: {{"category": ["item_1", "item_2",...]}}
"""

similar_products = json_gpt(QUERIES_INPUT)
print(similar_products)

{'pantry': ['peanut butter', 'sambal sauce', 'soy sauce', 'honey', 'sesame oil', 'lime juice', 'garlic', 'ginger'], 'vegetables': ['mushroom', 'cabbage', 'scallion', 'cilantro'], 'pasta, rice, grains': ['soba noodle']}


In [30]:
categorized_items = {}

# Iterate over the items
for key, value in similar_products.items():
    print(key)
    for category, keywords in category_dict.items():
        # Check if any keyword in the category is present in the item
        if key in keywords:
            categorized_items[category] = categorized_items.get(category, []) + value
            break

print(categorized_items)

pantry
vegetables
pasta, rice, grains
{'pantry': ['peanut butter', 'sambal sauce', 'soy sauce', 'honey', 'sesame oil', 'lime juice', 'garlic', 'ginger', ['peanut butter', 'sambal sauce', 'soy sauce', 'honey', 'sesame oil', 'lime juice', 'garlic', 'ginger', [...]], 'soba noodle'], 'fruit-veg': ['mushroom', 'cabbage', 'scallion', 'cilantro']}


In [21]:
similar_products

{'pantry': ['peanut butter',
  'sambal sauce',
  'soy sauce',
  'honey',
  'sesame oil',
  'lime juice',
  'garlic',
  'ginger',
  [...]],
 'vegetables': ['mushroom', 'cabbage', 'scallion', 'cilantro'],
 'pasta, rice, grains': ['soba noodle']}

## Find the product

In [463]:
df = pd.read_excel("Data\Woolies Extracted\Woolies pantry 1 info.xlsx")
df2 = pd.read_excel("Data\Woolies Extracted\Woolies pantry 2 info.xlsx")
df = pd.concat([df, df2], ignore_index=True)

In [464]:

# QUERIES_INPUT = f"""
# You have access to a list of Sap Categories: {df["Sap Category Name"].unique()}
# You also have access to a list of Sap Sub Categories: {df["Sap Sub Category Name"].unique()}
# Return all of the relevant categories within the list for the following item: {product}
# ONLY return the categories within the list provided.
# Format: {{"Sap Category": ["category_1", "category_2",...], "Sap Sub Category": ["category_1", "category_2",...]}}
# """

# res_sap = json_gpt(QUERIES_INPUT)["Sap Category"]
# # res_sub_sap = json_gpt(QUERIES_INPUT)["Sap Sub Category"]
# # res_sap, res_sub_sap
# json_gpt(QUERIES_INPUT)

In [465]:
bad_list = [
    "Artificial flavor",
    "Artificial flavour",
    "Natural flavor",
    "Natural flavour",
    "Aspartame",
    "BHT",
    "Calcium disodium EDTA",
    "Caramel color",
    "Carrageenan",
    "Corn starch",
    "Corn syrup",
    "Dextrose",
    "Dough conditioners",
    "Enriched flour",
    "Bleached flour",
    "Food color",
    "Maltodextrin",
    "Monoglycerides",
    "Monosodium glutamate",
    "Diglyceride",
    "Natural flavor",
    "Natural flavors",
    "Polysorbate",
    "Potassium sorbate",
    "Sodium erythorbate",
    "Sodium nitrate",
    "Sodium nitrite",
    "Sodium phosphate",
    "Soy protein isolate",
    "Splenda",
    "Sugar",
    "Syrup",
    "Skim milk",
    "Low fat",
    "Reduced fat",
    "Xylitol",
]


In [466]:
product = "soba"
product_split = product.split()

# ChatGPT to help get similar items
QUERIES_INPUT = f"""
Give me similar products related to this prompt but is not it: {product}
ONLY if the items are similar, otherwise don't.
Example: if the prompt is milk then similar products would be: cheese, butter, yoghurt, etc.
Include variations of the product name, e.g. yogurt and yoghurt
Format: {{"Products": ["product_1", "product_2",...]}}
"""

similar_products = json_gpt(QUERIES_INPUT)["Products"]
print(similar_products)

# Filter out rows that do not contain the product name
selected_rows = df.copy()  # Create a copy of the original dataframe
for keyword in product_split:
    selected_rows = selected_rows[selected_rows['Product Name'].str.contains(fr'\b{re.escape(keyword)}\b', case=False)]
for item in similar_products:
    selected_rows = selected_rows[~selected_rows['Product Name'].str.contains(fr'\b{re.escape(item)}\b', case=False)]
# Filter out rows with no ingredients
selected_rows = selected_rows[~selected_rows['Ingredients'].isna()]
print(len(selected_rows))

['udon', 'ramen', 'vermicelli', 'spaghetti', 'linguine', 'fettuccine', 'lasagna', 'macaroni']
3


In [467]:
# Get the 'Product Name' and 'Ingredients' columns as Series
product_names = selected_rows['Product Name']
ingredients_series = selected_rows['Ingredients']
cup_prices = selected_rows['Cup Price']
price = selected_rows['Price']

clean_products_df = pd.DataFrame(columns=['Product Name', 'Ingredients', 'Cup Price', 'Price'])
all_clean = []

# Iterate over each ingredient string along with its corresponding product name and cup price
for product_name, ingredients, cup_price, price in zip(product_names, ingredients_series, cup_prices, price):
    # Split the string at commas that are not between parentheses
    ingredients_list = re.split(r',\s*(?![^()]*\))', ingredients)
    clean = True
    
    # Iterate over each ingredient in the list
    for ingredient in ingredients_list:
        # Check if the ingredient is in the bad_list
        for bad_item in bad_list:
            # Normalize bad_list item to lowercase and split it into individual words
            bad_item_lower = bad_item.lower()
            bad_words = re.findall(r'\b\w+\b', bad_item_lower)
            
            # Check if all the words from bad_list are present in the ingredient
            all_words_present = all(word in ingredient.lower() for word in bad_words)
            
            if all_words_present:
                clean = False

    # Ingredients shouldn't be more than a certain amount
    gum = 0
    oil = 0
    emulsifier = 0
    # Count the occurrences of specific ingredients
    gum = sum(ingredient.lower().count("gum") for ingredient in ingredients_list)
    oil = sum(ingredient.lower().count("oil") for ingredient in ingredients_list)
    emulsifier = sum(ingredient.lower().count("emulsifier") for ingredient in ingredients_list)

    if gum > 2 or oil > 2 or emulsifier > 2:
        print("Too many")
        clean = False

    # If the product is clean, add it to the list
    if clean:
        clean_products_df = pd.concat([clean_products_df, pd.DataFrame({
            'Product Name': [product_name],
            'Ingredients': [ingredients],
            'Cup Price': [cup_price],
            "Price": [price] 
        })])
        
print(all_clean)

[]


In [468]:
clean_products_df_sorted = clean_products_df.sort_values(by='Cup Price')
clean_products_df_sorted

Unnamed: 0,Product Name,Ingredients,Cup Price,Price
0,Obento Japanese Noodles Soba,"Wheat Flour (49%), Water, Buckwheat Flour (21 ...",0.86,1.55
0,Mr Chen's Soba Noodles,"Buckwheat Flour 49%, Wheat Flour 38% Water, Salt",0.93,2.5
0,Hakubaku Organic Japanese Soba Buckwheat Noodl...,"Organic Wheat Flour (69%), Organic Buckwheat F...",1.48,4.0


In [432]:
# clean_products_df_sorted.to_excel('clean_products.xlsx', index=False)