In [144]:
import pandas as pd
import ast
import json
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored
import os
from dotenv import load_dotenv
import re
import inflect

# Load environment variables from .env file (keeps the API key out of the notebook)
load_dotenv()
# Chat model used by every json_gpt() call
GPT_MODEL = "gpt-3.5-turbo"
# os.getenv returns None when OPENAI_API_KEY is not set; no check is made here
openai.api_key = os.getenv("OPENAI_API_KEY")
# Shared inflect engine, used later to singularise ingredient names
p = inflect.engine()

## Extract ingredients from the recipe

In [199]:
# Helper functions
def json_gpt(input: str):
    """Send one user prompt to the chat model and return its reply parsed as JSON.

    The system message asks the model for JSON only. The reply text is handed
    straight to ``json.loads``, so a reply that is not valid JSON raises
    ``json.JSONDecodeError``.
    """
    chat_messages = [
        {"role": "system", "content": "Output only valid JSON"},
        {"role": "user", "content": input},
    ]
    response = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=chat_messages,
        temperature=0.5,
    )
    return json.loads(response.choices[0].message.content)


def embeddings(input: list[str]) -> list[list[float]]:
    """Embed each string in `input` with text-embedding-ada-002.

    Returns one embedding vector per entry of the API response, in response
    order. The vectors are numeric (floats), not strings.
    """
    response = openai.Embedding.create(model="text-embedding-ada-002", input=input)
    return [data.embedding for data in response.data]

In [214]:
# recipe = """
# 3 heaping tablespoons of peanut butter
# 3 tablespoons of sambal sauce
# 3 tablespoons of soy sauce
# 1 tablespoon of honey
# 1 tablespoons of sesame oil
# The juice of 1 lime
# 1 garlic, grated
# 1 small knob of ginger, grated
#  cup of water
# 1 cup of mushrooms
# 1/2 cup of cabbage leaves
# 2 servings of soba noodles
# Chopped scallion
# Chopped cilantro
# """ 
# recipe = """
# 1 pound of boneless chicken thighs, skin on
# 4 cloves of garlic 
# 1/4 cup of soy sauce 
# Slightly less than 1/4 cup of honey 
# 1 head of broccoli
# """
# recipe = """
# ~3 bunches of scallions
# 3 tbsp (~45g) butter
# 1/4 cup (~54g) olive oil
# 1 clove of garlic
# 1/2 lb (~220g) dry spaghetti
# 1/2 tbsp (~8ml) fish sauce
# Splash of sake or dry white wine
# 1 oz (~30g) parmigiano reggiano
# Salt and pepper to taste
# """

recipe = """
1 pound lean ground meat like beef, turkey, chicken or lamb

3 tablespoons olive oil

1 cup (130 grams) chopped onion

3 garlic cloves, minced (1 tablespoon)

2 tablespoons tomato paste

1/2 teaspoon dried oregano

Pinch crushed red pepper flakes

1 cup water, broth or dry red wine

1 (28-ounce) can crushed tomatoes

Salt and fresh ground black pepper

Handful fresh basil leaves, plus more for serving

12 ounces dried spaghetti or favorite pasta shape

1/2 cup shredded parmesan cheese
"""

# Experiment with prompts
# Change plural ingredients to singular. For example, "noodles" to "noodle".

# Ask ChatGPT to pull the ingredient names out of the recipe text.
# NOTE(review): the quoting in the "cabbage leaves to cabbage" example looks misplaced
# (probably meant "cabbage leaves" to "cabbage") - confirm before changing the prompt.
# NOTE(review): the printed reply below still contains 'water' even though the prompt
# asks for basic ingredients to be removed, so downstream code must not rely on that.
QUERIES_INPUT = f"""
Get all the ingredients in the recipe, e.g. flour, egg, milk,... This is the recipe: {recipe}
Remove any basic ingredient like water.

Simplify the ingredients. For example, "cabbage leaves to cabbage".
Format: {{"Products": ["product_1", "product_2",...]}}
"""

# At this point similar_products is a flat list of ingredient names.
similar_products = json_gpt(QUERIES_INPUT)["Products"]
print(similar_products)

['lean ground meat', 'olive oil', 'chopped onion', 'garlic cloves', 'tomato paste', 'dried oregano', 'crushed red pepper flakes', 'water', 'broth', 'dry red wine', 'crushed tomatoes', 'salt', 'fresh ground black pepper', 'fresh basil leaves', 'dried spaghetti', 'shredded parmesan cheese']


## Find the right general category

In [215]:
# Top-level store categories. Spellings match the keys of category_dict / known_category
# ("health-wellness" was previously misspelled "health-wellnes" here).
general_categories = ["bakery", "dairy-eggs-fridge", "drinks", "freezer", "fruit-veg", "health-wellness health-foods", "lunch-box", "pantry", "poultry-meat-seafood"]

In [216]:
# General store category -> keyword (sub-category) names that belong to it.
# The flattened keywords are offered to ChatGPT as grouping labels, and the cell that
# builds categorized_items maps each label back to its general category by membership.
# Every list also contains the general category's own name, so a group that is already
# keyed by a general category (as known_product is) resolves to itself.
category_dict = {
    "bakery": ["bakery", "bread", "pastries"],
    "dairy-eggs-fridge": ["dairy-eggs-fridge", "milk", "cheese", "yogurt", "cream", "dips", "ready meals", "international food", "vegan"],
    "drinks": ["drinks", "juices", "soda", "water", "tea", "coffee", "energy drinks"],
    "freezer": ["freezer", "frozen meals", "ice cream", "frozen vegetables", "frozen fruit"],
    "fruit-veg": ["fruit-veg", "fruits", "vegetables", "salads", "organic", "fresh herbs"],
    "health-wellness health-foods": ["health-wellness", "vitamins", "superfoods", "protein bars", "health-foods", "health foods", "dried fruit, nuts, seeds"],
    "lunch-box": ["lunch-box", "sandwiches", "snack packs", "fruit cups"],
    "pantry": ["pantry", "canned goods", "breakfast and spreads", "spices", "condiments", "pasta, rice, grains", "cooking sauces", "oil and vinegar", "international foods"],
    "poultry-meat-seafood": ["poultry-meat-seafood", "poultry", "meat", "seafood"]
}

In [217]:
# Flatten every keyword list into one list, keeping category_dict's order.
category_list = []
for keywords in category_dict.values():
    category_list.extend(keywords)

In [218]:
# HARD CODE
# Ingredients whose general category is known up front, so ChatGPT is not asked about them.
known_category = {
    "bakery": ["bakery", "bread", "pastries"],
    "dairy-eggs-fridge": ["parmigiano reggiano", "milk", "cheese", "yogurt", "cream", "dips", "butter","egg"],
    "drinks": ["drinks", "juices", "soda", "water", "tea", "coffee", "energy drinks"],
    "freezer": ["freezer", "frozen meals", "ice cream", "frozen vegetables", "frozen fruit"],
    "fruit-veg": ["scallion"],
    "health-wellness health-foods": ["health-wellness", "vitamins", "superfoods", "protein bars", "health-foods", "health foods", "dried fruit, nuts, seeds"],
    "lunch-box": [],
    "pantry": ["fish sauce", "flour", "self-raising flour"],
    "poultry-meat-seafood": ["poultry", "meat", "seafood"]
}

# Split the ingredient list into "already categorised" and "still unknown".
# The previous version called similar_products.remove() while iterating over the same
# list, which makes the iterator skip the element that follows every removed one.
known_product = {}
remaining_products = []
for product in similar_products:
    lowered = product.lower()
    # singular_noun() returns False when the word is not a plural
    singular = p.singular_noun(lowered) or lowered
    matched_category = next(
        (category for category, names in known_category.items() if singular in names),
        None,
    )
    if matched_category is None:
        remaining_products.append(product)
    else:
        known_product.setdefault(matched_category, []).append(product)

# Update in place so the list object keeps its identity, as remove() did.
similar_products[:] = remaining_products

known_product

{'drinks': ['water']}

In [219]:
# Ask ChatGPT to sort the remaining ingredients into the keyword categories
# (category_list is the flattened value lists of category_dict).
# NOTE(review): the "Format" example has no enclosing braces although the reply is
# parsed with json.loads - the recorded run did return an object, but confirm.
QUERIES_INPUT = f"""
Group the items into their respective categories. Use the provided categories and items to create the desired grouping.

Categories: {category_list}
Items: {similar_products}

Only show the categories that have items in them.

Format: 
  "category_1": ["item_1", "item_2", ...],
  "category_2": ["item_1", "item_2", ...],
"""

# similar_products changes shape here: a list of names goes in, and the parsed reply
# (a dict of category -> names in the recorded run) replaces it.
# NOTE(review): the recorded reply omits some of the inputs (e.g. 'lean ground meat',
# 'salt'); nothing in this cell detects items the model dropped.
similar_products = json_gpt(QUERIES_INPUT)
print(similar_products)

{'pasta, rice, grains': ['dried spaghetti'], 'condiments': ['olive oil'], 'spices': ['dried oregano', 'crushed red pepper flakes'], 'vegetables': ['chopped onion', 'garlic cloves'], 'canned goods': ['tomato paste', 'crushed tomatoes'], 'cheese': ['shredded parmesan cheese'], 'fresh herbs': ['fresh basil leaves']}


In [220]:
# Combine the hard-coded matches with ChatGPT's grouping; when a key exists in both,
# the hard-coded items come first.
merged = {}
for key in set(known_product) | set(similar_products):
    merged[key] = known_product.get(key, []) + similar_products.get(key, [])
similar_products = merged
similar_products

{'drinks': ['water'],
 'pasta, rice, grains': ['dried spaghetti'],
 'cheese': ['shredded parmesan cheese'],
 'vegetables': ['chopped onion', 'garlic cloves'],
 'condiments': ['olive oil'],
 'fresh herbs': ['fresh basil leaves'],
 'canned goods': ['tomato paste', 'crushed tomatoes'],
 'spices': ['dried oregano', 'crushed red pepper flakes']}

In [221]:
# Map each group name (a keyword or a general category) onto its general store category.
categorized_items = {}
unmatched_groups = {}
for key, value in similar_products.items():
    # Group names that came from ChatGPT are free-form text, so compare them
    # case-insensitively and ignore surrounding whitespace.
    normalized_key = str(key).strip().lower()
    matched_category = next(
        (
            category
            for category, keywords in category_dict.items()
            if normalized_key in (keyword.lower() for keyword in keywords)
        ),
        None,
    )
    # Tolerate a single item given as a bare value instead of a list.
    items = value if isinstance(value, list) else [value]
    if matched_category is None:
        # Previously these groups vanished without a trace.
        unmatched_groups[key] = items
        continue
    categorized_items[matched_category] = categorized_items.get(matched_category, []) + items
# Filter out ones with empty list
categorized_items = {key: value for key, value in categorized_items.items() if value}
if unmatched_groups:
    print("Skipped groups that match no known category:", unmatched_groups)
print(categorized_items)

{'drinks': ['water'], 'pantry': ['dried spaghetti', 'olive oil', 'tomato paste', 'crushed tomatoes', 'dried oregano', 'crushed red pepper flakes'], 'dairy-eggs-fridge': ['shredded parmesan cheese'], 'fruit-veg': ['chopped onion', 'garlic cloves', 'fresh basil leaves']}


## Find the product

In [222]:
# Bad list: ingredient phrases that disqualify a product.
# A phrase matches when every one of its words occurs in a single ingredient
# (case-insensitive substring test), so e.g. "Sugar" also catches "brown sugar".
# The duplicated "Natural flavor" entry was removed.
bad_list = [
    "Artificial flavor",
    "Artificial flavour",
    "Natural flavor",
    "Natural flavour",
    "Aspartame",
    "BHT",
    "Calcium disodium EDTA",
    "Caramel color",
    "Carrageenan",
    "Corn starch",
    "Corn syrup",
    "Dextrose",
    "Dough conditioners",
    "Enriched flour",
    "Bleached flour",
    "Food color",
    "Maltodextrin",
    "Monoglycerides",
    "Monosodium glutamate",
    "Diglyceride",
    "Natural flavors",
    "Polysorbate",
    "Potassium sorbate",
    "Sodium erythorbate",
    "Sodium nitrate",
    "Sodium nitrite",
    "Sodium phosphate",
    "Soy protein isolate",
    "Splenda",
    "Sugar",
    "Syrup",
    "Skim milk",
    "Low fat",
    "Reduced fat",
    "Xylitol",
]

In [223]:
def _lookalike_names(product):
    """Ask ChatGPT for related-but-different products (e.g. cheese for milk)."""
    # ChatGPT to help get similar items
    QUERIES_INPUT = f"""
    Give me similar products related to this prompt but is not it: {product}
    ONLY if the items are similar, otherwise don't.
    Example: if the prompt is milk then similar products would be: cheese, butter, yoghurt, etc.
    Include variations of the product name, e.g. yogurt and yoghurt
    Format: {{"Products": ["product_1", "product_2",...]}}
    """
    return json_gpt(QUERIES_INPUT)["Products"]


def _ingredients_are_clean(ingredients, bad_word_groups):
    """Return True when an ingredient string passes the bad-list and additive-count checks."""
    # For categories like fruit-veg or poultry-meat-seafood the ingredients cell is empty
    # (not a string); there is nothing to reject in that case.
    if not isinstance(ingredients, str):
        return True

    # Split the string at commas that are not between parentheses
    parts = [part.lower() for part in re.split(r',\s*(?![^()]*\))', ingredients)]

    for part in parts:
        for words in bad_word_groups:
            # A bad item matches when all of its words occur in the same ingredient.
            if words and all(word in part for word in words):
                return False

    # Ingredients shouldn't mention these additives more than twice in total
    for additive in ("gum", "oil", "emulsifier"):
        if sum(part.count(additive) for part in parts) > 2:
            return False
    return True


def find_product(product, df, k, bad_items=None, exclude=None):
    """Find catalogue rows for `product` whose ingredients pass the clean-label checks.

    Parameters
    ----------
    product : str
        Ingredient name; a few recipe wordings are remapped to catalogue wording.
    df : pandas.DataFrame
        Catalogue with at least 'Product Name', 'Ingredients', 'Cup Price' and 'Price'
        columns ('Aisle', 'Department', 'Sap Category Name', 'Sap Sub Category Name'
        are read only for the hard-coded products that need them).
    k : str
        General category of `df`; rows without ingredients are kept only for
        'fruit-veg' and 'poultry-meat-seafood'.
    bad_items : list[str] | None
        Disqualifying ingredient phrases. Defaults to the module-level ``bad_list``.
        The list passed in (or the global) is never modified.
    exclude : list[str] | None
        Look-alike product names whose rows must be dropped. When None, ChatGPT is
        asked for them (one API call).

    Returns
    -------
    pandas.DataFrame
        Columns 'Product Name', 'Ingredients', 'Cup Price', 'Price', sorted by
        'Cup Price' ascending with a fresh 0..n-1 index; empty when nothing matches.
    """
    # Work on a copy: the old code removed "Syrup" from the shared list and only put it
    # back at the very end, so any exception in between left the list corrupted.
    active_bad_items = list(bad_list if bad_items is None else bad_items)

    # HARD CODE FILTERING
    if "scallion" in product:
        product = "spring onion"
    if "spaghetti" in product:
        product = "spaghetti"
    if ("raising" in product and "flour" in product) or "self-raising" in product:
        product = "raising flour"
    if "maple syrup" in product:
        # Syrup is the product itself here, so it must not disqualify it.
        active_bad_items = [item for item in active_bad_items if item != "Syrup"]
    if "pepper flakes" in product:
        product = "chilli flakes"

    if exclude is None:
        exclude = _lookalike_names(product)

    # Keep rows whose name contains every word of the product ...
    selected_rows = df
    for keyword in product.split():
        # na=False: a missing product name simply does not match
        has_keyword = selected_rows['Product Name'].str.contains(
            fr'\b{re.escape(keyword)}\b', case=False, na=False)
        selected_rows = selected_rows[has_keyword]
    # ... and drop rows that are really one of the look-alike products.
    for item in exclude:
        is_lookalike = selected_rows['Product Name'].str.contains(
            fr'\b{re.escape(item)}\b', case=False, na=False)
        selected_rows = selected_rows[~is_lookalike]

    # HARD CODE FILTERING

    # Filter out rows with no ingredients for certain categories only
    # (the second name used to be misspelled 'poultry-meat-seedfood', so meat and
    # seafood rows without ingredients were always discarded).
    if k != 'fruit-veg' and k != 'poultry-meat-seafood':
        selected_rows = selected_rows[selected_rows['Ingredients'].notna()]
    if product == "honey":
        selected_rows = selected_rows[selected_rows["Aisle"].str.lower() == "honey"]
    if product == "ginger":
        selected_rows = selected_rows[selected_rows["Department"].str.lower() != "drink"]
    if product == "butter":
        selected_rows = selected_rows[selected_rows["Sap Category Name"].str.lower() == "dairy - butter & margarine"]
    if "spaghetti" in product:
        selected_rows = selected_rows[selected_rows["Sap Sub Category Name"].str.lower() == "pasta"]

    # Tokenise the bad list once instead of once per ingredient.
    bad_word_groups = [re.findall(r'\b\w+\b', item.lower()) for item in active_bad_items]

    # Collect the clean rows and build the frame in one go (no concat per row).
    clean_records = []
    for product_name, ingredients, cup_price, item_price in zip(
        selected_rows['Product Name'],
        selected_rows['Ingredients'],
        selected_rows['Cup Price'],
        selected_rows['Price'],
    ):
        if _ingredients_are_clean(ingredients, bad_word_groups):
            clean_records.append({
                'Product Name': product_name,
                'Ingredients': ingredients,
                'Cup Price': cup_price,
                'Price': item_price,
            })

    clean_products_df = pd.DataFrame(
        clean_records, columns=['Product Name', 'Ingredients', 'Cup Price', 'Price'])
    return clean_products_df.sort_values(by='Cup Price').reset_index(drop=True)

In [224]:
def convert_plural_singular(word):
    """Flip the grammatical number of `word` using inflect.

    A plural comes back as its singular; anything else comes back pluralised.
    The input is returned unchanged only when inflect yields a falsy value for both.
    """
    engine = inflect.engine()
    singular = engine.singular_noun(word)
    if singular:
        return singular
    plural = engine.plural_noun(word)
    return plural if plural else word

In [225]:
def _load_category_catalogue(category):
    """Read the Woolworths export(s) for one general category into a single DataFrame."""
    # os.path.join instead of a backslash literal: portable, and no invalid "\W" escape.
    data_dir = os.path.join("Data", "Woolies Extracted")
    # Because there are 2 files for pantry items
    if category == "pantry":
        file_names = [
            "Woolies {} 1 info.xlsx".format(category),
            "Woolies {} 2 info.xlsx".format(category),
        ]
    else:
        file_names = ["Woolies {} info.xlsx".format(category)]
    frames = [pd.read_excel(os.path.join(data_dir, file_name)) for file_name in file_names]
    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames, ignore_index=True)


def _alternative_names(product):
    """Ask ChatGPT for other names of `product`; always returns a list of non-empty strings."""
    QUERIES_INPUT = f"""
                Give me the other names of the the product in this prompt: {product}
                If the prompt is a protein, then give me the protein name and the cut indicated. (e.g. boneless chicken thighs to chicken thighs)
                ONLY if the names refer to one specific thing, otherwise don't.
                Example: if the prompt is spring onion then similar products would be: green onions, scallions etc.
                Format: ["alternative_name_1", "alternative_name_2",...]
            """
    reply = json_gpt(QUERIES_INPUT)
    if isinstance(reply, dict):
        # If the reply is a JSON object rather than the requested list, use its values:
        # iterating the dict directly would search for its keys.
        flattened = []
        for value in reply.values():
            flattened.extend(value if isinstance(value, list) else [value])
        reply = flattened
    elif isinstance(reply, str):
        reply = [reply]
    elif not isinstance(reply, list):
        reply = []
    return [name for name in reply if isinstance(name, str) and name.strip()]


def _search_with_fallbacks(product, df, category):
    """Search for `product` as given, then plural/singular-flipped, then under alternative names."""
    matches = find_product(product, df, category)
    if not matches.empty:
        return matches

    # Turn plural to singular and vice versa (ex: chicken thighs to chicken thigh)
    flipped = convert_plural_singular(product)
    matches = find_product(flipped, df, category)
    if not matches.empty:
        return matches

    # Find similar products (ex: Spring onion -> green onion) if not found
    alternatives = _alternative_names(flipped)
    print("Alternative names of the product: ", alternatives)
    for alternative in alternatives:
        for candidate in (alternative, convert_plural_singular(alternative)):
            print("Current alternative product: ", candidate)
            matches = find_product(candidate, df, category)
            if not matches.empty:
                return matches
    return matches


# Ingredients that are never shopped for
SKIPPED_PRODUCTS = {"water"}

picked_rows = []
all_none = []

for k, v in categorized_items.items():
    # Load files
    df = _load_category_catalogue(k)
    # Find product
    for product in v:
        print(product)
        print(k)
        # Skip unnecessary ingredients
        if product in SKIPPED_PRODUCTS:
            continue

        clean_products_df_sorted = _search_with_fallbacks(product, df, k)

        # The cheapest clean match (lowest cup price) is the one that gets bought.
        best_match = clean_products_df_sorted.head(1)
        print(best_match)

        if best_match.empty:
            # Record the recipe's own wording, not the last alternative that was tried.
            all_none.append(product)
        else:
            picked_rows.append(best_match)

# Build the result once instead of concatenating inside the loop.
if picked_rows:
    grocery_list = pd.concat(picked_rows, ignore_index=True)
else:
    grocery_list = pd.DataFrame(columns=['Product Name', 'Ingredients', 'Cup Price', 'Price'])

print(all_none)

water
drinks
                     Product Name                Ingredients Cup Price Price
0  Woolworths Spring Water Sipper  100% Natural Spring Water       1.2   1.2
dried spaghetti
pantry
                        Product Name                        Ingredients  \
0  Macro Organic Wholemeal Spaghetti  Organic Durum Wholewheat Semolina   

  Cup Price Price  
0      0.33  1.65  
olive oil
pantry
           Product Name                          Ingredients Cup Price Price
0  Essentials Olive Oil  Refined Olive Oil, Virgin Olive Oil       0.8   4.0
tomato paste
pantry
              Product Name               Ingredients Cup Price Price
0  Essentials Tomato Paste  Tomato Paste (99%), Salt      0.28   1.4
crushed tomatoes
pantry
                            Product Name  \
0  Ardmona Crushed Vine Ripened Tomatoes   

                                         Ingredients Cup Price Price  
0  Crushed Tomato (63%), Tomato Juice, Tomato Pas...      4.39   1.8  
dried oregano
pantry
              

In [226]:
# Show the product picked for each ingredient (the lowest cup price among the clean matches).
grocery_list

Unnamed: 0,Product Name,Ingredients,Cup Price,Price
0,Woolworths Spring Water Sipper,100% Natural Spring Water,1.2,1.2
1,Macro Organic Wholemeal Spaghetti,Organic Durum Wholewheat Semolina,0.33,1.65
2,Essentials Olive Oil,"Refined Olive Oil, Virgin Olive Oil",0.8,4.0
3,Essentials Tomato Paste,"Tomato Paste (99%), Salt",0.28,1.4
4,Ardmona Crushed Vine Ripened Tomatoes,"Crushed Tomato (63%), Tomato Juice, Tomato Pas...",4.39,1.8
5,Gourmet Garden Oregano Lightly Dried,"Organic Oregano (91%), organic canola oil, sea...",8.0,4.0
6,Macro Onion Spring Organic,,4.9,4.9
7,Woolworths Garlic Cloves,,35.71,2.5


In [227]:
# Sum the shelf price of every picked product.
total_cost = grocery_list["Price"].sum()

# Two decimals: printing the raw float showed artefacts such as 21.450000000000003.
print(f"Total Cost: ${total_cost:.2f}")

Total Cost: $21.450000000000003


In [31]:
# Quick check: run the product search for a single fruit-veg ingredient.
# This cell used to hold a pasted copy of find_product's body whose per-row checks had
# been dedented out of the loop, so only the last matching row was ever evaluated (and
# an empty match raised NameError). Calling the function keeps the two in sync;
# find_product itself maps "scallion" to "spring onion".
df = pd.read_excel(os.path.join("Data", "Woolies Extracted", "Woolies fruit-veg info.xlsx"))
product = "scallions"
k = "fruit-veg"

clean_products_df_sorted = find_product(product, df, k)
clean_products_df_sorted

Unnamed: 0,Product Name,Ingredients,Cup Price,Price
0,Macro Onion Spring Organic,,4.9,4.9


In [5]:
# QUERIES_INPUT = f"""
#                 Give me the other names of the the product in this prompt: {product}
#                 If the prompt is a protein, then ONLY give me the protein name and the cut if indicated.
#                 ONLY if the names refer to one specific thing, otherwise don't.
#                 Example: if the prompt is spring onion then similar products would be: green onions, scallions etc.
#                 Format: ["alternative_name_1", "alternative_name_2",...]
#             """
# similar_products = json_gpt(QUERIES_INPUT)
# similar_products

['chicken thigh']

In [12]:

# QUERIES_INPUT = f"""
# You have access to a list of Sap Categories: {df["Sap Category Name"].unique()}
# You also have access to a list of Sap Sub Categories: {df["Sap Sub Category Name"].unique()}
# Return all of the relevant categories within the list for the following item: {product}
# ONLY return the categories within the list provided.
# Format: {{"Sap Category": ["category_1", "category_2",...], "Sap Sub Category": ["category_1", "category_2",...]}}
# """

# res_sap = json_gpt(QUERIES_INPUT)["Sap Category"]
# # res_sub_sap = json_gpt(QUERIES_INPUT)["Sap Sub Category"]
# # res_sap, res_sub_sap
# json_gpt(QUERIES_INPUT)

## Debug

In [16]:
# os.path.join keeps the path portable and avoids the invalid "\W" escape of the old literal.
df = pd.read_excel(os.path.join("Data", "Woolies Extracted", "Woolies fruit-veg info.xlsx"))

# df2 = pd.read_excel("Data\Woolies Extracted\Woolies pantry 2 info.xlsx")
# df = pd.concat([df, df2], ignore_index=True)

In [18]:
# Debug: trace the product search step by step for a single ingredient.
# json_gpt and bad_list come from the cells above; the byte-identical copies that used
# to be re-declared here were removed so there is a single definition of each.
product = "spring onion"
k = 'fruit-veg'

product_split = product.split()

# ChatGPT to help get similar items and remove them
QUERIES_INPUT = f"""
Give me similar products related to this prompt but is not it: {product}
ONLY if the items are similar, otherwise don't.
Example: if the prompt is milk then similar products would be: cheese, butter, yoghurt, etc.
Include variations of the product name, e.g. yogurt and yoghurt
Format: {{"Products": ["product_1", "product_2",...]}}
"""

similar_products = json_gpt(QUERIES_INPUT)["Products"]
print("Similar products: ", similar_products)

# Filter out rows that do not contain the product name
selected_rows = df.copy()  # Create a copy of the original dataframe
for keyword in product_split:
    selected_rows = selected_rows[selected_rows['Product Name'].str.contains(fr'\b{re.escape(keyword)}\b', case=False, na=False)]
for item in similar_products:
    selected_rows = selected_rows[~selected_rows['Product Name'].str.contains(fr'\b{re.escape(item)}\b', case=False, na=False)]
# Filter out rows with no ingredients for certain categories only
# (category name spelled 'poultry-meat-seafood'; it was 'seedfood' before)
if k != 'fruit-veg' and k != 'poultry-meat-seafood':
    print("here")
    selected_rows = selected_rows[~selected_rows['Ingredients'].isna()]
print(len(selected_rows))

clean_records = []

# Iterate over each ingredient string along with its corresponding product name and prices
for product_name, ingredients, cup_price, item_price in zip(
    selected_rows['Product Name'],
    selected_rows['Ingredients'],
    selected_rows['Cup Price'],
    selected_rows['Price'],
):
    clean = True

    # For categories like fruit-veg or poultry-meat-seafood, the ingredients list is empty -> if isinstance
    # Split the string at commas that are not between parentheses
    if isinstance(ingredients, str):
        ingredients_list = re.split(r',\s*(?![^()]*\))', ingredients)
    else:
        ingredients_list = []

    # Check every ingredient against bad_list. This scan used to be nested under the
    # else-branch above, where ingredients_list is always empty, so it never ran.
    for ingredient in ingredients_list:
        for bad_item in bad_list:
            # Normalize bad_list item to lowercase and split it into individual words
            bad_words = re.findall(r'\b\w+\b', bad_item.lower())

            # Check if all the words from bad_list are present in the ingredient
            if all(word in ingredient.lower() for word in bad_words):
                clean = False

    # Ingredients shouldn't be more than a certain amount
    gum = sum(ingredient.lower().count("gum") for ingredient in ingredients_list)
    oil = sum(ingredient.lower().count("oil") for ingredient in ingredients_list)
    emulsifier = sum(ingredient.lower().count("emulsifier") for ingredient in ingredients_list)

    if gum > 2 or oil > 2 or emulsifier > 2:
        print("Too many")
        clean = False

    # If the product is clean, add it to the list
    if clean:
        clean_records.append({
            'Product Name': product_name,
            'Ingredients': ingredients,
            'Cup Price': cup_price,
            'Price': item_price,
        })

clean_products_df = pd.DataFrame(clean_records, columns=['Product Name', 'Ingredients', 'Cup Price', 'Price'])
clean_products_df_sorted = clean_products_df.sort_values(by='Cup Price')
first_row = clean_products_df_sorted.head(1)

if first_row.empty:
    print("Can't find a product.")
else:
    print("There's a product.")
clean_products_df_sorted.head(1)

Similar products:  ['scallion', 'green onion', 'shallot', 'leek']
1
There's a product.


Unnamed: 0,Product Name,Ingredients,Cup Price,Price
0,Macro Onion Spring Organic,,4.9,4.9


The DataFrame is empty.


True

In [15]:
# clean_products_df_sorted.to_excel('clean_products.xlsx', index=False)

In [49]:
# Scratch check of loop control flow: items equal to 1 are skipped, every other item is
# printed together with its key and the key's full list.
similar_products = {
    'product1': [1, 2, 3],
    'product2': [4, 5, 6],
    'product3': [7, 8, 9]
}

for key, value in similar_products.items():
    print("Key:", key)
    for item in value:
        if item != 1:
            print(key, value)
            print("Value:", item)

Key: product1
product1 [1, 2, 3]
Value: 2
product1 [1, 2, 3]
Value: 3
Key: product2
product2 [4, 5, 6]
Value: 4
product2 [4, 5, 6]
Value: 5
product2 [4, 5, 6]
Value: 6
Key: product3
product3 [7, 8, 9]
Value: 7
product3 [7, 8, 9]
Value: 8
product3 [7, 8, 9]
Value: 9


In [234]:
import nltk
from nltk.stem import WordNetLemmatizer

# nltk.download('wordnet')  # Download the WordNet corpus if not already downloaded

def extract_base_word(text):
    """Keep only the purely alphabetic tokens of `text`, joined by single spaces.

    Despite the name, nothing is stemmed or lemmatized: tokens are kept as they are.
    """
    is_alphabetic = re.compile(r'^[a-zA-Z]+$').match
    alphabetic_tokens = filter(is_alphabetic, nltk.word_tokenize(text))
    return ' '.join(alphabetic_tokens)

# Demo on a multi-word ingredient name
original_text = "boneless chicken breasts"
base_word = extract_base_word(original_text)
print(base_word)

boneless chicken breasts
