<!-- #### Step 1: remove rows where food_desc == ingredient_dec -->

### **Approach 1 (did not work): making API calls to `food.com` to find if a corresponding url can be retrieved for a food item**

Using the name of the food and try to look up on food.com directly kept returning error code 200

In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random

foods_df = pd.read_csv('../processed_data/foods_list.csv.csv')

# Function to search Food.com for a specific food item
def find_recipe_on_food_com(food_desc):
    # Format the search URL
    search_url = f"https://www.food.com/search/{food_desc.replace(' ', '%20')}"
    
    try:
        # Send a GET request to Food.com
        response = requests.get(search_url, headers={'User-Agent': 'Mozilla/5.0'})
        
        # Debugging: Print status code
        print(f"Requesting {search_url}, Status Code: {response.status_code}")

        if response.status_code == 200:
            # Parse the HTML response with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Look for recipe results
            recipe_cards = soup.find_all('div', class_='card__detailsContainer-left')
            if recipe_cards:
                # Extract the first recipe link as an example
                first_recipe = recipe_cards[0]
                recipe_link_tag = first_recipe.find('a', href=True)
                if recipe_link_tag:
                    recipe_url = "https://www.food.com" + recipe_link_tag['href']
                    return True, recipe_url
            return False, None
        else:
            print(f"Failed to retrieve results for {food_desc} (status code: {response.status_code})")
            return False, None
    except Exception as e:
        print(f"Error while searching for {food_desc}: {e}")
        return False, None

# Filter the foods dataframe and collect those that are found on Food.com
filtered_foods = []
matched_foods = []

for _, row in foods_df.iterrows():
    food_desc = row['food_desc']
    found, recipe_url = find_recipe_on_food_com(food_desc)
    if found:
        # Append row and the recipe URL to the list
        row_dict = row.to_dict()
        row_dict['recipe_url'] = recipe_url
        matched_foods.append(row_dict)
        print(f"Found recipe for '{food_desc}': {recipe_url}")
    else:
        filtered_foods.append(row)
    
    # Randomized sleep to avoid rate limiting
    time.sleep(random.uniform(3, 7))

filtered_foods_df = pd.DataFrame(filtered_foods)
matched_foods_df = pd.DataFrame(matched_foods)

filtered_foods_df.to_csv('../processed_data/food.com_filtered_foods_df.csv', index=False)
matched_foods_df.to_csv('../processed_data/food.com_matched_foods_df.csv', index=False)

Requesting https://www.food.com/search/Hard%20candy, Status Code: 200
Requesting https://www.food.com/search/Soft%20drink,%20root%20beer, Status Code: 200
Requesting https://www.food.com/search/Potato%20chips,%20baked,%20flavored, Status Code: 200
Requesting https://www.food.com/search/Ham,%20prepackaged%20or%20deli,%20luncheon%20meat,%20reduced%20sodium, Status Code: 200
Requesting https://www.food.com/search/Bread,%20wheat%20or%20cracked%20wheat, Status Code: 200
Requesting https://www.food.com/search/Mayonnaise-type%20salad%20dressing, Status Code: 200
Requesting https://www.food.com/search/Popcorn,%20air-popped,%20no%20butter%20added, Status Code: 200
Requesting https://www.food.com/search/Fruit%20flavored%20drink,%20powdered,%20reconstituted, Status Code: 200


KeyboardInterrupt: 

### **Approach 2 (worked but not quite satisfactorily): same as approach 1 but looking up on AllRecipes.com**

 also we should not use AllRecipes.com since it doesn't have the same nutrition facts as food.com

In [72]:
# !python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------------ --------------- 7.9/12.8 MB 37.3 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 34.8 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [75]:
import pandas as pd
import time
import re
from allrecipes import AllRecipes
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import spacy

# Load your dataset
file_path = '../processed_data/foods_list.csv'
foods_df = pd.read_csv(file_path)

# Load an open-source LLM (using GPT-Neo here as an example)
model_name = "EleutherAI/gpt-neo-1.3B"  # You can replace this with another open-source model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Load spaCy model for fallback noun extraction
nlp = spacy.load("en_core_web_sm")

# Rule-based simplification function
def rule_based_simplify(food_desc):
    # Convert to lowercase for uniformity
    food_desc = food_desc.lower()

    # # Remove known unnecessary words and phrases
    # food_desc = re.sub(r'\b(no butter added|reduced sodium|low fat|unsweetened|packaged|prepackaged|deli|flavored|optional)\b', '', food_desc, flags=re.IGNORECASE)

    # Keep only the first 2-3 key terms (focus on the core description)
    simplified_name = " ".join(food_desc.split()[:3])

    # Remove conjunctions and unnecessary small words like "or", "and", "with", etc.
    simplified_name = re.sub(r'\b(or|and|with)\b', '', simplified_name)

    # Strip unnecessary commas, trailing or leading whitespaces
    simplified_name = re.sub(r',', '', simplified_name).strip()

    return simplified_name

# Function to extract the main food noun from the food description using a rule-based approach with spaCy, and fallback to LLM if necessary
def extract_main_noun(food_desc):
    # First try extracting with spaCy
    main_noun = extract_main_noun_spacy(food_desc)
    if main_noun:
        return main_noun

    # If spaCy fails, use the LLM
    prompt = f"From the following food description, extract only one word that is the main food noun: '{food_desc}'. Make sure it is the most important word that describes the food."
    try:
        response = generator(prompt, max_new_tokens=5, temperature=0.2, num_return_sequences=1)
        main_noun = response[0]['generated_text'].strip().split()[0]  # Extract the first word of the LLM response

        # Log the extracted main noun
        print(f"Extracted main noun: '{main_noun}' from '{food_desc}'")
        return main_noun

    except Exception as e:
        print(f"Error while extracting main noun for '{food_desc}' using LLM: {e}")
        return ""

# Function to extract the main noun using spaCy
def extract_main_noun_spacy(food_desc):
    doc = nlp(food_desc)
    nouns = [token.text for token in doc if token.pos_ == "NOUN"]
    if nouns:
        main_noun = nouns[0]  # Choose the first noun as the main food item
        print(f"spaCy extracted main noun: '{main_noun}' from '{food_desc}'")
        return main_noun
    else:
        print(f"No noun found in '{food_desc}' using spaCy.")
        return ""

# Function to use allrecipes package to search AllRecipes.com for a specific food item
def find_recipe_on_allrecipes(food_desc):
    # Use combined approach to simplify the food description
    simplified_name = rule_based_simplify(food_desc)
    print(f"Simplified food name for AllRecipes: '{simplified_name}' from '{food_desc}'")

    try:
        # Use AllRecipes package to perform the search
        query_result = AllRecipes.search(simplified_name)

        # If no results found, print a message for debugging
        if not query_result:
            print(f"No recipe found for '{simplified_name}' on AllRecipes.")
            return False, None

        # Get the details of the first returned recipe
        main_recipe_url = query_result[0]['url']

        # Extract the main food noun for validation
        main_noun = extract_main_noun(food_desc)

        # Check if the main noun appears in the URL
        if main_noun and main_noun.lower() in main_recipe_url.lower():
            return True, main_recipe_url
        else:
            print(f"Recipe URL '{main_recipe_url}' does not match the main noun '{main_noun}' from '{food_desc}'.")
            return False, None

    except Exception as e:
        print(f"Error while searching for '{simplified_name}' on AllRecipes: {e}")
        return False, None

filtered_foods = []
matched_foods = []

for _, row in foods_df.iterrows():
    food_desc = row['food_desc']
    found, recipe_url = find_recipe_on_allrecipes(food_desc)

    if found:
        row_dict = row.to_dict()
        row_dict['recipe_url'] = recipe_url
        matched_foods.append(row_dict)
        print(f"Found recipe for '{food_desc}': {recipe_url}")
    else:
        filtered_foods.append(row)

    time.sleep(3)

filtered_foods_df = pd.DataFrame(filtered_foods)
matched_foods_df = pd.DataFrame(matched_foods)

filtered_foods_df.to_csv('../processed_data/filtered_foods_df.csv', index=False)
matched_foods_df.to_csv('../processed_data/matched_foods_df.csv', index=False)

print("Filtered data saved to 'filtered_foods_df.csv'")
print("Matched data saved to 'matched_foods_df.csv'")


Simplified food name for AllRecipes: 'hard candy' from 'Hard candy'
spaCy extracted main noun: 'candy' from 'Hard candy'
Found recipe for 'Hard candy': https://www.allrecipes.com/recipe/35842/hard-candy/
Simplified food name for AllRecipes: 'soft drink root' from 'Soft drink, root beer'
spaCy extracted main noun: 'drink' from 'Soft drink, root beer'
Recipe URL 'https://www.allrecipes.com/recipe/284771/old-fashioned-root-beer-slushy/' does not match the main noun 'drink' from 'Soft drink, root beer'.
Simplified food name for AllRecipes: 'potato chips baked' from 'Potato chips, baked, flavored'
spaCy extracted main noun: 'chips' from 'Potato chips, baked, flavored'
Recipe URL 'https://www.allrecipes.com/recipe/8848/potato-chip-chicken-casserole/' does not match the main noun 'chips' from 'Potato chips, baked, flavored'.
Simplified food name for AllRecipes: 'ham' from 'Ham, prepackaged or deli, luncheon meat, reduced sodium'
spaCy extracted main noun: 'Ham' from 'Ham, prepackaged or deli,

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Roll, white, soft' using spaCy.
Extracted main noun: 'From' from 'Roll, white, soft'
Recipe URL 'https://www.allrecipes.com/recipe/15295/soft-sugar-cookies-iv/' does not match the main noun 'From' from 'Roll, white, soft'.
Simplified food name for AllRecipes: 'fruit juice drink' from 'Fruit juice drink'
spaCy extracted main noun: 'Fruit' from 'Fruit juice drink'
Found recipe for 'Fruit juice drink': https://www.allrecipes.com/recipe/20699/exotic-fruit-drink/
Simplified food name for AllRecipes: 'egg salad made' from 'Egg Salad, made with any type of fat free dressing'
spaCy extracted main noun: 'type' from 'Egg Salad, made with any type of fat free dressing'
Recipe URL 'https://www.allrecipes.com/recipe/103767/jens-heavenly-egg-salad/' does not match the main noun 'type' from 'Egg Salad, made with any type of fat free dressing'.
Simplified food name for AllRecipes: 'grits instant made' from 'Grits, instant, made with water, no added fat'
spaCy extracted main noun: 'Gr

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Butter, tub' using spaCy.
Extracted main noun: 'From' from 'Butter, tub'
Recipe URL 'https://www.allrecipes.com/recipe/246358/johnsonville-brat-hot-tub/' does not match the main noun 'From' from 'Butter, tub'.
Simplified food name for AllRecipes: 'potato french fries' from 'Potato, french fries, fast food'
spaCy extracted main noun: 'fries' from 'Potato, french fries, fast food'
Found recipe for 'Potato, french fries, fast food': https://www.allrecipes.com/recipe/50223/homemade-crispy-seasoned-french-fries/
Simplified food name for AllRecipes: 'chili con carne' from 'Chili con carne, NS as to beans'
spaCy extracted main noun: 'carne' from 'Chili con carne, NS as to beans'
Found recipe for 'Chili con carne, NS as to beans': https://www.allrecipes.com/recipe/89993/award-winning-chili-con-carne/
Simplified food name for AllRecipes: 'soft drink fruit' from 'Soft drink, fruit flavored, caffeine free'
spaCy extracted main noun: 'drink' from 'Soft drink, fruit flavored, caff

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Salsa, red' using spaCy.
Extracted main noun: 'From' from 'Salsa, red'
Recipe URL 'https://www.allrecipes.com/recipe/238420/fiery-red-salsa/' does not match the main noun 'From' from 'Salsa, red'.
Simplified food name for AllRecipes: 'breakfast tart' from 'Breakfast tart'
spaCy extracted main noun: 'Breakfast' from 'Breakfast tart'
Found recipe for 'Breakfast tart': https://www.allrecipes.com/recipe/23378/country-manor-breakfast-tart/
Simplified food name for AllRecipes: 'orange juice 100%' from 'Orange juice, 100%, canned, bottled or in a carton'
spaCy extracted main noun: 'juice' from 'Orange juice, 100%, canned, bottled or in a carton'
Recipe URL 'https://www.allrecipes.com/recipe/264721/orange-cream-pops/' does not match the main noun 'juice' from 'Orange juice, 100%, canned, bottled or in a carton'.
Simplified food name for AllRecipes: 'chicken fillet grilled' from 'Chicken fillet, grilled'
spaCy extracted main noun: 'Chicken' from 'Chicken fillet, grilled'
Recip

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Okra, batter-dipped, fried' using spaCy.
Extracted main noun: 'From' from 'Okra, batter-dipped, fried'
Recipe URL 'https://www.allrecipes.com/recipe/150306/the-best-chicken-fried-steak/' does not match the main noun 'From' from 'Okra, batter-dipped, fried'.
Simplified food name for AllRecipes: 'coffee substitute dry' from 'Coffee substitute, dry powder'
spaCy extracted main noun: 'Coffee' from 'Coffee substitute, dry powder'
Found recipe for 'Coffee substitute, dry powder': https://www.allrecipes.com/recipe/278950/whole-wheat-pumpkin-coffee-cake-muffins/
Simplified food name for AllRecipes: 'lemon juice 100%' from 'Lemon juice, 100%, freshly squeezed'
spaCy extracted main noun: 'juice' from 'Lemon juice, 100%, freshly squeezed'
Recipe URL 'https://www.allrecipes.com/recipe/265432/lemon-cheesecake-bars/' does not match the main noun 'juice' from 'Lemon juice, 100%, freshly squeezed'.
Simplified food name for AllRecipes: 'sugar substitute saccharin' from 'Sugar substitu

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Coffee, brewed' using spaCy.
Extracted main noun: 'From' from 'Coffee, brewed'
Recipe URL 'https://www.allrecipes.com/recipe/233721/cold-brewed-coffee/' does not match the main noun 'From' from 'Coffee, brewed'.
Simplified food name for AllRecipes: 'sugar white granulated' from 'Sugar, white, granulated or lump'
spaCy extracted main noun: 'lump' from 'Sugar, white, granulated or lump'
Recipe URL 'https://www.allrecipes.com/recipe/10402/the-best-rolled-sugar-cookies/' does not match the main noun 'lump' from 'Sugar, white, granulated or lump'.
Simplified food name for AllRecipes: 'coffee creamer powder' from 'Coffee creamer, powder'
spaCy extracted main noun: 'creamer' from 'Coffee creamer, powder'
Found recipe for 'Coffee creamer, powder': https://www.allrecipes.com/recipe/233504/coffee-creamer/
Simplified food name for AllRecipes: 'vegetable chips' from 'Vegetable chips'
spaCy extracted main noun: 'Vegetable' from 'Vegetable chips'
Found recipe for 'Vegetable chips':

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Tequila' using spaCy.
Extracted main noun: 'From' from 'Tequila'
Recipe URL 'https://www.allrecipes.com/recipe/19127/restaurant-style-tequila-lime-chicken/' does not match the main noun 'From' from 'Tequila'.
Simplified food name for AllRecipes: 'tea hot herbal' from 'Tea, hot, herbal'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Tea, hot, herbal' using spaCy.
Extracted main noun: 'From' from 'Tea, hot, herbal'
Recipe URL 'https://www.allrecipes.com/recipe/40401/hot-toddy/' does not match the main noun 'From' from 'Tea, hot, herbal'.
Simplified food name for AllRecipes: 'banana raw' from 'Banana, raw'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Banana, raw' using spaCy.
Extracted main noun: 'From' from 'Banana, raw'
Recipe URL 'https://www.allrecipes.com/recipe/166611/alexs-raw-chocolate-pudding/' does not match the main noun 'From' from 'Banana, raw'.
Simplified food name for AllRecipes: 'infant formula powder' from 'Infant formula, powder, made with water, NFS (Similac Go and Grow)'
No recipe found for 'infant formula powder' on AllRecipes.
Simplified food name for AllRecipes: 'honey' from 'Honey'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Honey' using spaCy.
Extracted main noun: 'From' from 'Honey'
Recipe URL 'https://www.allrecipes.com/recipe/14745/honey-glazed-ham/' does not match the main noun 'From' from 'Honey'.
Simplified food name for AllRecipes: 'fruit juice drink' from 'Fruit juice drink (Capri Sun)'
spaCy extracted main noun: 'Fruit' from 'Fruit juice drink (Capri Sun)'
Found recipe for 'Fruit juice drink (Capri Sun)': https://www.allrecipes.com/recipe/20699/exotic-fruit-drink/
Simplified food name for AllRecipes: 'pork sausage' from 'Pork sausage, reduced sodium'
spaCy extracted main noun: 'Pork' from 'Pork sausage, reduced sodium'
Found recipe for 'Pork sausage, reduced sodium': https://www.allrecipes.com/recipe/22299/pork-sausage-and-cabbage-pitas/
Simplified food name for AllRecipes: 'egg whole boiled' from 'Egg, whole, boiled or poached '


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Egg, whole, boiled or poached ' using spaCy.
Extracted main noun: 'From' from 'Egg, whole, boiled or poached '
Recipe URL 'https://www.allrecipes.com/recipe/213737/kens-perfect-hard-boiled-egg-and-i-mean-perfect/' does not match the main noun 'From' from 'Egg, whole, boiled or poached '.
Simplified food name for AllRecipes: 'rice white cooked' from 'Rice, white, cooked with fat, Puerto Rican style'
spaCy extracted main noun: 'Rice' from 'Rice, white, cooked with fat, Puerto Rican style'
Found recipe for 'Rice, white, cooked with fat, Puerto Rican style': https://www.allrecipes.com/recipe/24059/creamy-rice-pudding/
Simplified food name for AllRecipes: 'bread wheat' from 'Bread, wheat or cracked wheat, toasted'
spaCy extracted main noun: 'wheat' from 'Bread, wheat or cracked wheat, toasted'
Found recipe for 'Bread, wheat or cracked wheat, toasted': https://www.allrecipes.com/recipe/142267/steakhouse-wheat-bread-for-the-bread-machine/
Simplified food name for AllRecipes:

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Jelly' using spaCy.
Extracted main noun: 'From' from 'Jelly'
Recipe URL 'https://www.allrecipes.com/recipe/239824/grape-jelly-meatballs/' does not match the main noun 'From' from 'Jelly'.
Simplified food name for AllRecipes: 'potato hash brown' from 'Potato, hash brown, from fresh, with cheese'
spaCy extracted main noun: 'hash' from 'Potato, hash brown, from fresh, with cheese'
Found recipe for 'Potato, hash brown, from fresh, with cheese': https://www.allrecipes.com/recipe/160132/simply-potatoes-cheesy-hash-browns/
Simplified food name for AllRecipes: 'coffee substitute dry' from 'Coffee substitute, dry powder'
spaCy extracted main noun: 'Coffee' from 'Coffee substitute, dry powder'
Found recipe for 'Coffee substitute, dry powder': https://www.allrecipes.com/recipe/278950/whole-wheat-pumpkin-coffee-cake-muffins/
Simplified food name for AllRecipes: 'double hamburger 2' from 'Double hamburger, 2 medium patties, with condiments, on bun, from fast food / restaurant (Wen

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cereal (General Mills Cocoa Puffs)' using spaCy.
Extracted main noun: 'From' from 'Cereal (General Mills Cocoa Puffs)'
Recipe URL 'https://www.allrecipes.com/recipe/15820/puppy-chow/' does not match the main noun 'From' from 'Cereal (General Mills Cocoa Puffs)'.
Simplified food name for AllRecipes: 'grape juice 100%' from 'Grape juice, 100%'
No recipe found for 'grape juice 100%' on AllRecipes.
Simplified food name for AllRecipes: 'pork  beans' from 'Pork and beans'
spaCy extracted main noun: 'Pork' from 'Pork and beans'
Found recipe for 'Pork and beans': https://www.allrecipes.com/recipe/236731/tender-pork-stew-with-beans/
Simplified food name for AllRecipes: 'orange raw' from 'Orange, raw'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Orange, raw' using spaCy.
Extracted main noun: 'From' from 'Orange, raw'
Recipe URL 'https://www.allrecipes.com/recipe/220406/raw-pad-thai/' does not match the main noun 'From' from 'Orange, raw'.
Simplified food name for AllRecipes: 'bread white' from 'Bread, white'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Bread, white' using spaCy.
Extracted main noun: 'From' from 'Bread, white'
Recipe URL 'https://www.allrecipes.com/recipe/6813/white-bread-for-the-bread-machine/' does not match the main noun 'From' from 'Bread, white'.
Simplified food name for AllRecipes: 'peanut butter' from 'Peanut butter'
spaCy extracted main noun: 'Peanut' from 'Peanut butter'
Found recipe for 'Peanut butter': https://www.allrecipes.com/recipe/10670/peanut-butter-balls-iii/
Simplified food name for AllRecipes: 'fish ns as' from 'Fish, NS as to type, from fast food'
No recipe found for 'fish ns as' on AllRecipes.
Simplified food name for AllRecipes: 'green beans fresh' from 'Green beans, fresh, cooked, fat added, NS as to fat type'
spaCy extracted main noun: 'beans' from 'Green beans, fresh, cooked, fat added, NS as to fat type'
Found recipe for 'Green beans, fresh, cooked, fat added, NS as to fat type': https://www.allrecipes.com/recipe/230103/buttery-garlic-green-beans/
Simplified food name for A

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Apple, raw' using spaCy.
Extracted main noun: 'From' from 'Apple, raw'
Recipe URL 'https://www.allrecipes.com/recipe/8269/raw-apple-cake/' does not match the main noun 'From' from 'Apple, raw'.
Simplified food name for AllRecipes: 'cereal (kellogg's frosted' from 'Cereal (Kellogg's Frosted Flakes)'
No recipe found for 'cereal (kellogg's frosted' on AllRecipes.
Simplified food name for AllRecipes: 'apple juice 100%' from 'Apple juice, 100%'
spaCy extracted main noun: 'juice' from 'Apple juice, 100%'
Recipe URL 'https://www.allrecipes.com/recipe/239259/spinach-salad-with-pomegranate-cranberry-dressing/' does not match the main noun 'juice' from 'Apple juice, 100%'.
Simplified food name for AllRecipes: 'shrimp fried' from 'Shrimp, fried'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Shrimp, fried' using spaCy.
Extracted main noun: 'From' from 'Shrimp, fried'
Recipe URL 'https://www.allrecipes.com/recipe/272834/easy-homemade-shrimp-fried-rice/' does not match the main noun 'From' from 'Shrimp, fried'.
Simplified food name for AllRecipes: 'pizza  pepperoni' from 'Pizza with pepperoni, from frozen, thick crust'
spaCy extracted main noun: 'Pizza' from 'Pizza with pepperoni, from frozen, thick crust'
Found recipe for 'Pizza with pepperoni, from frozen, thick crust': https://www.allrecipes.com/recipe/233222/easy-pepperoni-pizza-muffins/
Simplified food name for AllRecipes: 'soft drink cola' from 'Soft drink, cola, diet'
spaCy extracted main noun: 'drink' from 'Soft drink, cola, diet'
Recipe URL 'https://www.allrecipes.com/recipe/7502/coco-cola-cake-ii/' does not match the main noun 'drink' from 'Soft drink, cola, diet'.
Simplified food name for AllRecipes: 'gelatin dessert dietetic' from 'Gelatin dessert, dietetic, with fruit, sweetened with low calori

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cabbage, green, raw' using spaCy.
Extracted main noun: 'From' from 'Cabbage, green, raw'
Recipe URL 'https://www.allrecipes.com/recipe/281314/lemon-tahini-quinoa-with-raw-chickpeas/' does not match the main noun 'From' from 'Cabbage, green, raw'.
Simplified food name for AllRecipes: 'bread white toasted' from 'Bread, white, toasted'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Bread, white, toasted' using spaCy.
Extracted main noun: 'From' from 'Bread, white, toasted'
Recipe URL 'https://www.allrecipes.com/recipe/6788/amish-white-bread/' does not match the main noun 'From' from 'Bread, white, toasted'.
Simplified food name for AllRecipes: 'butter stick' from 'Butter, stick'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Butter, stick' using spaCy.
Extracted main noun: 'From' from 'Butter, stick'
Recipe URL 'https://www.allrecipes.com/recipe/10049/butter-tarts/' does not match the main noun 'From' from 'Butter, stick'.
Simplified food name for AllRecipes: 'tea hot leaf' from 'Tea, hot, leaf, black'
No recipe found for 'tea hot leaf' on AllRecipes.
Simplified food name for AllRecipes: 'beef ground' from 'Beef, ground'
spaCy extracted main noun: 'Beef' from 'Beef, ground'
Found recipe for 'Beef, ground': https://www.allrecipes.com/recipe/158140/spaghetti-sauce-with-ground-beef/
Simplified food name for AllRecipes: 'lettuce raw' from 'Lettuce, raw'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Lettuce, raw' using spaCy.
Extracted main noun: 'From' from 'Lettuce, raw'
Recipe URL 'https://www.allrecipes.com/recipe/239788/chef-johns-chicken-lettuce-wraps/' does not match the main noun 'From' from 'Lettuce, raw'.
Simplified food name for AllRecipes: 'cheese brie' from 'Cheese, Brie'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cheese, Brie' using spaCy.
Extracted main noun: 'From' from 'Cheese, Brie'
Recipe URL 'https://www.allrecipes.com/recipe/15015/brie-cheese-appetizer/' does not match the main noun 'From' from 'Cheese, Brie'.
Simplified food name for AllRecipes: 'oatmeal beverage' from 'Oatmeal beverage with milk'
spaCy extracted main noun: 'beverage' from 'Oatmeal beverage with milk'
Recipe URL 'https://www.allrecipes.com/recipe/37934/oatmeal-cookie/' does not match the main noun 'beverage' from 'Oatmeal beverage with milk'.
Simplified food name for AllRecipes: 'crackers matzo' from 'Crackers, matzo, reduced sodium'
spaCy extracted main noun: 'Crackers' from 'Crackers, matzo, reduced sodium'
Recipe URL 'https://www.allrecipes.com/recipe/17110/omas-fabulous-matzo-ball-soup/' does not match the main noun 'Crackers' from 'Crackers, matzo, reduced sodium'.
Simplified food name for AllRecipes: 'potato boiled from' from 'Potato, boiled, from fresh, peel not eaten, no added fat'
spaCy extrac

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Plum, raw' using spaCy.
Extracted main noun: 'From' from 'Plum, raw'
Recipe URL 'https://www.allrecipes.com/recipe/96431/fresh-salsa/' does not match the main noun 'From' from 'Plum, raw'.
Simplified food name for AllRecipes: 'fruit juice blend' from 'Fruit juice blend, 100% juice'
spaCy extracted main noun: 'Fruit' from 'Fruit juice blend, 100% juice'
Found recipe for 'Fruit juice blend, 100% juice': https://www.allrecipes.com/recipe/23553/basic-fruit-smoothie/
Simplified food name for AllRecipes: 'egg cheese' from 'Egg, cheese, and sausage on biscuit'
spaCy extracted main noun: 'cheese' from 'Egg, cheese, and sausage on biscuit'
Found recipe for 'Egg, cheese, and sausage on biscuit': https://www.allrecipes.com/recipe/241166/fast-and-fabulous-egg-and-cottage-cheese-casserole/
Simplified food name for AllRecipes: 'orange juice 100%' from 'Orange juice, 100%, NFS'
spaCy extracted main noun: 'juice' from 'Orange juice, 100%, NFS'
Recipe URL 'https://www.allrecipes.com/r

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Bologna' using spaCy.
Extracted main noun: 'From' from 'Bologna'
Recipe URL 'https://www.allrecipes.com/recipe/21062/bologna-salad-sandwich-spread-i/' does not match the main noun 'From' from 'Bologna'.
Simplified food name for AllRecipes: 'bagel' from 'Bagel'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Bagel' using spaCy.
Extracted main noun: 'From' from 'Bagel'
Recipe URL 'https://www.allrecipes.com/recipe/38553/bagel-and-cheese-bake/' does not match the main noun 'From' from 'Bagel'.
Simplified food name for AllRecipes: 'mayonnaise regular' from 'Mayonnaise, regular'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Mayonnaise, regular' using spaCy.
Extracted main noun: 'From' from 'Mayonnaise, regular'
Recipe URL 'https://www.allrecipes.com/recipe/91765/salmon-deviled-eggs-with-homemade-mayonnaise/' does not match the main noun 'From' from 'Mayonnaise, regular'.
Simplified food name for AllRecipes: 'cheese nfs' from 'Cheese, NFS'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cheese, NFS' using spaCy.
Extracted main noun: 'From' from 'Cheese, NFS'
Recipe URL 'https://www.allrecipes.com/recipe/262631/citrus-ricotta-oats/' does not match the main noun 'From' from 'Cheese, NFS'.
Simplified food name for AllRecipes: 'tea hot leaf' from 'Tea, hot, leaf, black'
No recipe found for 'tea hot leaf' on AllRecipes.
Simplified food name for AllRecipes: 'lamb  mutton' from 'Lamb or mutton curry'
spaCy extracted main noun: 'mutton' from 'Lamb or mutton curry'
Recipe URL 'https://www.allrecipes.com/recipe/70424/moroccan-tagine/' does not match the main noun 'mutton' from 'Lamb or mutton curry'.
Simplified food name for AllRecipes: 'rice  beans' from 'Rice with beans'
spaCy extracted main noun: 'Rice' from 'Rice with beans'
Found recipe for 'Rice with beans': https://www.allrecipes.com/recipe/53063/rice-beans-haitian-style/
Simplified food name for AllRecipes: 'egg salad made' from 'Egg Salad, made with any type of fat free dressing'
spaCy extracted main 

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Tomatoes, raw' using spaCy.
Extracted main noun: 'From' from 'Tomatoes, raw'
Recipe URL 'https://www.allrecipes.com/recipe/280771/raw-corn-salad/' does not match the main noun 'From' from 'Tomatoes, raw'.
Simplified food name for AllRecipes: 'snack mix' from 'Snack mix'
spaCy extracted main noun: 'Snack' from 'Snack mix'
Found recipe for 'Snack mix': https://www.allrecipes.com/recipe/9489/white-chocolate-snack-mix/
Simplified food name for AllRecipes: 'grapes raw' from 'Grapes, raw'
spaCy extracted main noun: 'Grapes' from 'Grapes, raw'
Recipe URL 'https://www.allrecipes.com/recipe/88961/raw-veggie-picnic-salad/' does not match the main noun 'Grapes' from 'Grapes, raw'.
Simplified food name for AllRecipes: 'beef sandwich steak' from 'Beef, sandwich steak'
spaCy extracted main noun: 'Beef' from 'Beef, sandwich steak'
Recipe URL 'https://www.allrecipes.com/recipe/80937/griddle-style-philly-steak-sandwiches/' does not match the main noun 'Beef' from 'Beef, sandwich steak

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cheese, NFS' using spaCy.
Extracted main noun: 'From' from 'Cheese, NFS'
Recipe URL 'https://www.allrecipes.com/recipe/262631/citrus-ricotta-oats/' does not match the main noun 'From' from 'Cheese, NFS'.
Simplified food name for AllRecipes: 'nachos  chicken' from 'Nachos with chicken, cheese, and sour cream'
spaCy extracted main noun: 'chicken' from 'Nachos with chicken, cheese, and sour cream'
Found recipe for 'Nachos with chicken, cheese, and sour cream': https://www.allrecipes.com/recipe/74433/chicken-nachos/
Simplified food name for AllRecipes: 'olives black' from 'Olives, black'
spaCy extracted main noun: 'Olives' from 'Olives, black'
Found recipe for 'Olives, black': https://www.allrecipes.com/recipe/29529/tilapia-with-tomatoes-black-olives-and-corn/
Simplified food name for AllRecipes: 'quesadilla chicken' from 'Quesadilla, chicken'
spaCy extracted main noun: 'chicken' from 'Quesadilla, chicken'
Found recipe for 'Quesadilla, chicken': https://www.allrecipes.com

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Zombie' using spaCy.
Extracted main noun: 'From' from 'Zombie'
Recipe URL 'https://www.allrecipes.com/recipe/261198/chef-johns-zombie-meatloaf/' does not match the main noun 'From' from 'Zombie'.
Simplified food name for AllRecipes: 'cream half' from 'Cream, half and half'
spaCy extracted main noun: 'half' from 'Cream, half and half'
Recipe URL 'https://www.allrecipes.com/recipe/214978/ice-cream-base/' does not match the main noun 'half' from 'Cream, half and half'.
Simplified food name for AllRecipes: 'cereal (kellogg's rice' from 'Cereal (Kellogg's Rice Krispies)'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cereal (Kellogg's Rice Krispies)' using spaCy.
Extracted main noun: 'From' from 'Cereal (Kellogg's Rice Krispies)'
Recipe URL 'https://www.allrecipes.com/recipe/241647/rice-krispies-chocolate-peanut-butter-balls/' does not match the main noun 'From' from 'Cereal (Kellogg's Rice Krispies)'.
Simplified food name for AllRecipes: 'orange juice 100%' from 'Orange juice, 100%, with calcium added, canned, bottled or in a carton'
spaCy extracted main noun: 'juice' from 'Orange juice, 100%, with calcium added, canned, bottled or in a carton'
Recipe URL 'https://www.allrecipes.com/recipe/264721/orange-cream-pops/' does not match the main noun 'juice' from 'Orange juice, 100%, with calcium added, canned, bottled or in a carton'.
Simplified food name for AllRecipes: 'chicken breast sauteed' from 'Chicken breast, sauteed, skin not eaten'
spaCy extracted main noun: 'Chicken' from 'Chicken breast, sauteed, skin not eaten'
Found recipe for 'Chicken breast, sauteed, skin not eaten': h

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Watermelon, raw' using spaCy.
Extracted main noun: 'From' from 'Watermelon, raw'
Recipe URL 'https://www.allrecipes.com/recipe/74824/watermelon-summer-salad/' does not match the main noun 'From' from 'Watermelon, raw'.
Simplified food name for AllRecipes: 'graham crackers' from 'Graham crackers'
spaCy extracted main noun: 'crackers' from 'Graham crackers'
Recipe URL 'https://www.allrecipes.com/recipe/12254/graham-cracker-crust-i/' does not match the main noun 'crackers' from 'Graham crackers'.
Simplified food name for AllRecipes: 'cranberry juice 100%' from 'Cranberry juice, 100%, not a blend'
spaCy extracted main noun: 'Cranberry' from 'Cranberry juice, 100%, not a blend'
Found recipe for 'Cranberry juice, 100%, not a blend': https://www.allrecipes.com/recipe/97244/cranberry-parfaits/
Simplified food name for AllRecipes: 'cookie vanilla wafer' from 'Cookie, vanilla wafer'
spaCy extracted main noun: 'vanilla' from 'Cookie, vanilla wafer'
Found recipe for 'Cookie, vani

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cabbage, red, raw' using spaCy.
Extracted main noun: 'From' from 'Cabbage, red, raw'
Recipe URL 'https://www.allrecipes.com/recipe/258893/szybka-surowka-z-czerwonej-kapusty-polish-red-cabbage-slaw/' does not match the main noun 'From' from 'Cabbage, red, raw'.
Simplified food name for AllRecipes: 'thousand island dressing' from 'Thousand Island dressing'
spaCy extracted main noun: 'dressing' from 'Thousand Island dressing'
Found recipe for 'Thousand Island dressing': https://www.allrecipes.com/recipe/246380/quick-thousand-island-dressing/
Simplified food name for AllRecipes: 'crackers woven wheat' from 'Crackers, woven wheat, reduced sodium'
No recipe found for 'crackers woven wheat' on AllRecipes.
Simplified food name for AllRecipes: 'bread white made' from 'Bread, white, made from home recipe or purchased at a bakery'
spaCy extracted main noun: 'home' from 'Bread, white, made from home recipe or purchased at a bakery'
Recipe URL 'https://www.allrecipes.com/recipe/20

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Muffin, plain' using spaCy.
Extracted main noun: 'From' from 'Muffin, plain'
Recipe URL 'https://www.allrecipes.com/recipe/228553/moist-chocolate-muffins/' does not match the main noun 'From' from 'Muffin, plain'.
Simplified food name for AllRecipes: 'snow cone' from 'Snow cone'
spaCy extracted main noun: 'Snow' from 'Snow cone'
Found recipe for 'Snow cone': https://www.allrecipes.com/recipe/73492/snow-cone-syrup-ii/
Simplified food name for AllRecipes: 'oatmeal ns as' from 'Oatmeal, NS as to regular, quick, or instant, no added fat'
No recipe found for 'oatmeal ns as' on AllRecipes.
Simplified food name for AllRecipes: 'milk (1%)' from 'Milk, low fat (1%)'
spaCy extracted main noun: 'Milk' from 'Milk, low fat (1%)'
Found recipe for 'Milk, low fat (1%)': https://www.allrecipes.com/recipe/229035/old-time-kentucky-bacon-milk-gravy-for-biscuits/
Simplified food name for AllRecipes: 'bread multigrain toasted' from 'Bread, multigrain, toasted'
spaCy extracted main noun: 'm

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cereal (Kellogg's Corn Flakes)' using spaCy.
Extracted main noun: 'From' from 'Cereal (Kellogg's Corn Flakes)'
Recipe URL 'https://www.allrecipes.com/recipe/149064/kelloggs-chocolate-scotcheroos/' does not match the main noun 'From' from 'Cereal (Kellogg's Corn Flakes)'.
Simplified food name for AllRecipes: 'strawberries frozen' from 'Strawberries, frozen'
spaCy extracted main noun: 'Strawberries' from 'Strawberries, frozen'
Recipe URL 'https://www.allrecipes.com/recipe/24494/ultimate-frozen-strawberry-margarita/' does not match the main noun 'Strawberries' from 'Strawberries, frozen'.
Simplified food name for AllRecipes: 'breakfast tart lowfat' from 'Breakfast tart, lowfat'
No recipe found for 'breakfast tart lowfat' on AllRecipes.
Simplified food name for AllRecipes: 'enchilada  chicken' from 'Enchilada with chicken, red-chile or enchilada sauce'
spaCy extracted main noun: 'chicken' from 'Enchilada with chicken, red-chile or enchilada sauce'
Found recipe for 'Enchil

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Applesauce, unsweetened' using spaCy.
Extracted main noun: 'From' from 'Applesauce, unsweetened'
Recipe URL 'https://www.allrecipes.com/recipe/51301/sarahs-applesauce/' does not match the main noun 'From' from 'Applesauce, unsweetened'.
Simplified food name for AllRecipes: 'quesadilla just cheese' from 'Quesadilla, just cheese, meatless'
spaCy extracted main noun: 'cheese' from 'Quesadilla, just cheese, meatless'
Recipe URL 'https://www.allrecipes.com/recipe/149679/mashed-potato-quesadilla/' does not match the main noun 'cheese' from 'Quesadilla, just cheese, meatless'.
Simplified food name for AllRecipes: 'milk nfs' from 'Milk, NFS'
spaCy extracted main noun: 'Milk' from 'Milk, NFS'
Recipe URL 'https://www.allrecipes.com/recipe/262631/citrus-ricotta-oats/' does not match the main noun 'Milk' from 'Milk, NFS'.
Simplified food name for AllRecipes: 'egg salad made' from 'Egg Salad, made with any type of fat free dressing'
spaCy extracted main noun: 'type' from 'Egg Sala

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Pancakes, plain' using spaCy.
Extracted main noun: 'From' from 'Pancakes, plain'
Recipe URL 'https://www.allrecipes.com/recipe/20334/banana-pancakes-i/' does not match the main noun 'From' from 'Pancakes, plain'.
Simplified food name for AllRecipes: 'maple  corn' from 'Maple and corn and/or cane pancake syrup blends'
spaCy extracted main noun: 'corn' from 'Maple and corn and/or cane pancake syrup blends'
Found recipe for 'Maple and corn and/or cane pancake syrup blends': https://www.allrecipes.com/recipe/56565/corn-fritters-with-maple-syrup/
Simplified food name for AllRecipes: 'fruit drink diet' from 'Fruit flavored drink, diet'
No recipe found for 'fruit drink diet' on AllRecipes.
Simplified food name for AllRecipes: 'mango raw' from 'Mango, raw'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Mango, raw' using spaCy.
Extracted main noun: 'From' from 'Mango, raw'
Recipe URL 'https://www.allrecipes.com/recipe/283996/air-fryer-butterflied-shrimp-with-pineapple-and-mango-salsa/' does not match the main noun 'From' from 'Mango, raw'.
Simplified food name for AllRecipes: 'bacon cheeseburger 1' from 'Bacon cheeseburger, 1 medium patty, with condiments, on bun, from fast food / restaurant'
spaCy extracted main noun: 'cheeseburger' from 'Bacon cheeseburger, 1 medium patty, with condiments, on bun, from fast food / restaurant'
Found recipe for 'Bacon cheeseburger, 1 medium patty, with condiments, on bun, from fast food / restaurant': https://www.allrecipes.com/recipe/223498/bacon-cheeseburger-meatloaf/
Simplified food name for AllRecipes: 'fondant' from 'Fondant'
spaCy extracted main noun: 'Fondant' from 'Fondant'
Found recipe for 'Fondant': https://www.allrecipes.com/recipe/233295/fondant-potatoes/
Simplified food name for AllRecipes: 'bologna' from 'Bologna'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Bologna' using spaCy.
Extracted main noun: 'From' from 'Bologna'
Recipe URL 'https://www.allrecipes.com/recipe/21062/bologna-salad-sandwich-spread-i/' does not match the main noun 'From' from 'Bologna'.
Simplified food name for AllRecipes: 'lima beans from' from 'Lima beans, from frozen, fat added'
spaCy extracted main noun: 'beans' from 'Lima beans, from frozen, fat added'
Recipe URL 'https://www.allrecipes.com/recipe/234455/bahgali-polo/' does not match the main noun 'beans' from 'Lima beans, from frozen, fat added'.
Simplified food name for AllRecipes: 'potato french fries' from 'Potato, french fries, NS as to fresh or frozen'
spaCy extracted main noun: 'fries' from 'Potato, french fries, NS as to fresh or frozen'
Found recipe for 'Potato, french fries, NS as to fresh or frozen': https://www.allrecipes.com/recipe/50223/homemade-crispy-seasoned-french-fries/
Simplified food name for AllRecipes: 'chocolate milk ready' from 'Chocolate milk, ready to drink, reduced fat

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Muffin, English' using spaCy.
Extracted main noun: 'From' from 'Muffin, English'
Recipe URL 'https://www.allrecipes.com/recipe/86649/fast-english-muffin-pizzas/' does not match the main noun 'From' from 'Muffin, English'.
Simplified food name for AllRecipes: 'butter nfs' from 'Butter, NFS'
No recipe found for 'butter nfs' on AllRecipes.
Simplified food name for AllRecipes: 'beef nfs' from 'Beef, NFS'
No recipe found for 'beef nfs' on AllRecipes.
Simplified food name for AllRecipes: 'summer squash green' from 'Summer squash, green, raw'
spaCy extracted main noun: 'Summer' from 'Summer squash, green, raw'
Found recipe for 'Summer squash, green, raw': https://www.allrecipes.com/recipe/233398/summer-squash-and-sausage-stew/
Simplified food name for AllRecipes: 'jam  jelly' from 'Jam or jelly, sugar free'
spaCy extracted main noun: 'sugar' from 'Jam or jelly, sugar free'
Recipe URL 'https://www.allrecipes.com/recipe/10537/shortbread-cookies-iii/' does not match the main no

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cereal (General Mills Kix)' using spaCy.
Extracted main noun: 'From' from 'Cereal (General Mills Kix)'
Recipe URL 'https://www.allrecipes.com/recipe/15820/puppy-chow/' does not match the main noun 'From' from 'Cereal (General Mills Kix)'.
Simplified food name for AllRecipes: 'french toast sticks' from 'French toast sticks, plain'
spaCy extracted main noun: 'toast' from 'French toast sticks, plain'
Found recipe for 'French toast sticks, plain': https://www.allrecipes.com/recipe/267555/air-fryer-french-toast-sticks/
Simplified food name for AllRecipes: 'maple syrup' from 'Maple syrup'
spaCy extracted main noun: 'syrup' from 'Maple syrup'
Found recipe for 'Maple syrup': https://www.allrecipes.com/recipe/152002/homemade-maple-syrup/
Simplified food name for AllRecipes: 'cheese colby' from 'Cheese, Colby'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cheese, Colby' using spaCy.
Extracted main noun: 'From' from 'Cheese, Colby'
Recipe URL 'https://www.allrecipes.com/recipe/21324/quick-and-easy-pimento-cheese/' does not match the main noun 'From' from 'Cheese, Colby'.
Simplified food name for AllRecipes: 'cheese parmesan dry' from 'Cheese, Parmesan, dry grated'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cheese, Parmesan, dry grated' using spaCy.
Extracted main noun: 'From' from 'Cheese, Parmesan, dry grated'
Recipe URL 'https://www.allrecipes.com/recipe/41116/parmesan-broccoli-balls/' does not match the main noun 'From' from 'Cheese, Parmesan, dry grated'.
Simplified food name for AllRecipes: 'cookie marshmallow pie' from 'Cookie, marshmallow pie, chocolate covered'
spaCy extracted main noun: 'pie' from 'Cookie, marshmallow pie, chocolate covered'
Found recipe for 'Cookie, marshmallow pie, chocolate covered': https://www.allrecipes.com/recipe/17612/creme-de-menthe-grasshopper-pie/
Simplified food name for AllRecipes: 'hot dog beef' from 'Hot dog, beef'
spaCy extracted main noun: 'dog' from 'Hot dog, beef'
Found recipe for 'Hot dog, beef': https://www.allrecipes.com/recipe/81553/hot-dog-pie/
Simplified food name for AllRecipes: 'steak sauce' from 'Steak sauce'
spaCy extracted main noun: 'Steak' from 'Steak sauce'
Found recipe for 'Steak sauce': https://www.allrecipes.

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cucumber, raw' using spaCy.
Extracted main noun: 'From' from 'Cucumber, raw'
Recipe URL 'https://www.allrecipes.com/recipe/162830/easy-greek-yogurt-cucumber-sauce/' does not match the main noun 'From' from 'Cucumber, raw'.
Simplified food name for AllRecipes: 'rolo' from 'Rolo'
spaCy extracted main noun: 'Rolo' from 'Rolo'
Recipe URL 'https://www.allrecipes.com/recipe/96749/pretzel-turtles/' does not match the main noun 'Rolo' from 'Rolo'.
Simplified food name for AllRecipes: 'venison/deer steak cooked' from 'Venison/deer steak, cooked, NS as to cooking method'
spaCy extracted main noun: 'deer' from 'Venison/deer steak, cooked, NS as to cooking method'
Found recipe for 'Venison/deer steak, cooked, NS as to cooking method': https://www.allrecipes.com/recipe/233612/beer-marinated-deerelkmoose-steak/
Simplified food name for AllRecipes: 'roll french' from 'Roll, French or Vienna'
spaCy extracted main noun: 'Roll' from 'Roll, French or Vienna'
Found recipe for 'Roll, Fren

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Pear, raw' using spaCy.
Extracted main noun: 'From' from 'Pear, raw'
Recipe URL 'https://www.allrecipes.com/recipe/278225/raw-kale-salad-with-feta-apples-and-pecans/' does not match the main noun 'From' from 'Pear, raw'.
Simplified food name for AllRecipes: 'cookie chocolate wafer' from 'Cookie, chocolate wafer'
spaCy extracted main noun: 'chocolate' from 'Cookie, chocolate wafer'
Found recipe for 'Cookie, chocolate wafer': https://www.allrecipes.com/recipe/8230/chocolate-cheesecake-ii/
Simplified food name for AllRecipes: 'tortilla flour' from 'Tortilla, flour'
spaCy extracted main noun: 'flour' from 'Tortilla, flour'
Found recipe for 'Tortilla, flour': https://www.allrecipes.com/recipe/157642/homemade-flour-tortillas/
Simplified food name for AllRecipes: 'vodka' from 'Vodka'
spaCy extracted main noun: 'Vodka' from 'Vodka'
Found recipe for 'Vodka': https://www.allrecipes.com/recipe/49165/vodka-sauce/
Simplified food name for AllRecipes: 'wine dessert sweet' from 'Win

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Cereal (General Mills Chex Wheat)' using spaCy.
Extracted main noun: 'From' from 'Cereal (General Mills Chex Wheat)'
Recipe URL 'https://www.allrecipes.com/recipe/15820/puppy-chow/' does not match the main noun 'From' from 'Cereal (General Mills Chex Wheat)'.
Simplified food name for AllRecipes: 'bread rye toasted' from 'Bread, rye, toasted'
spaCy extracted main noun: 'rye' from 'Bread, rye, toasted'
Found recipe for 'Bread, rye, toasted': https://www.allrecipes.com/recipe/257661/swedish-limpa-rye-bread/
Simplified food name for AllRecipes: 'meat loaf ns' from 'Meat loaf, NS as to type of meat'
No recipe found for 'meat loaf ns' on AllRecipes.
Simplified food name for AllRecipes: 'bread rye' from 'Bread, rye'
spaCy extracted main noun: 'rye' from 'Bread, rye'
Found recipe for 'Bread, rye': https://www.allrecipes.com/recipe/230396/real-ny-jewish-rye-bread/
Simplified food name for AllRecipes: 'potato boiled from' from 'Potato, boiled, from fresh, peel eaten, fat added,

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Potato, baked, peel not eaten' using spaCy.
Extracted main noun: 'From' from 'Potato, baked, peel not eaten'
Recipe URL 'https://www.allrecipes.com/recipe/237083/oven-baked-sweet-potato-fries/' does not match the main noun 'From' from 'Potato, baked, peel not eaten'.
Simplified food name for AllRecipes: 'croutons' from 'Croutons'
spaCy extracted main noun: 'Croutons' from 'Croutons'
Found recipe for 'Croutons': https://www.allrecipes.com/recipe/237238/chef-johns-homemade-croutons/
Simplified food name for AllRecipes: 'caesar dressing' from 'Caesar dressing'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Caesar dressing' using spaCy.
Extracted main noun: 'From' from 'Caesar dressing'
Recipe URL 'https://www.allrecipes.com/recipe/34616/batmans-best-caesar-dressing/' does not match the main noun 'From' from 'Caesar dressing'.
Simplified food name for AllRecipes: 'ice cream cake' from 'Ice cream cake'
spaCy extracted main noun: 'Ice' from 'Ice cream cake'
Found recipe for 'Ice cream cake': https://www.allrecipes.com/recipe/20784/ice-cream-cake/
Simplified food name for AllRecipes: 'candy marshmallow' from 'Candy, marshmallow'
spaCy extracted main noun: 'Candy' from 'Candy, marshmallow'
Recipe URL 'https://www.allrecipes.com/recipe/184362/peppermint-marshmallows/' does not match the main noun 'Candy' from 'Candy, marshmallow'.
Simplified food name for AllRecipes: 'pita chips' from 'Pita chips'
spaCy extracted main noun: 'chips' from 'Pita chips'
Found recipe for 'Pita chips': https://www.allrecipes.com/recipe/14827/pita-chips/
Simplified food name for AllRecipes: 'chocola

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Waffle, plain' using spaCy.
Extracted main noun: 'From' from 'Waffle, plain'
Recipe URL 'https://www.allrecipes.com/recipe/86538/cinnamon-belgian-waffles/' does not match the main noun 'From' from 'Waffle, plain'.
Simplified food name for AllRecipes: 'milk lactose free' from 'Milk, lactose free, fat free (skim)'
spaCy extracted main noun: 'Milk' from 'Milk, lactose free, fat free (skim)'
Recipe URL 'https://www.allrecipes.com/recipe/260123/gluten-free-and-lactose-free-pancakes/' does not match the main noun 'Milk' from 'Milk, lactose free, fat free (skim)'.
Simplified food name for AllRecipes: 'cereal (post grape-nuts' from 'Cereal (Post Grape-Nuts Flakes)'
No recipe found for 'cereal (post grape-nuts' on AllRecipes.
Simplified food name for AllRecipes: 'raisins' from 'Raisins'
spaCy extracted main noun: 'Raisins' from 'Raisins'
Found recipe for 'Raisins': https://www.allrecipes.com/recipe/105691/indian-style-rice-with-cashews-raisins-and-turmeric/
Simplified food nam

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Pepper, sweet, green, raw' using spaCy.
Extracted main noun: 'From' from 'Pepper, sweet, green, raw'
Recipe URL 'https://www.allrecipes.com/recipe/234952/uncle-ds-sweet-piccalilli-green-tomato-relish/' does not match the main noun 'From' from 'Pepper, sweet, green, raw'.
Simplified food name for AllRecipes: 'fish tuna canned' from 'Fish, tuna, canned'
spaCy extracted main noun: 'tuna' from 'Fish, tuna, canned'
Found recipe for 'Fish, tuna, canned': https://www.allrecipes.com/recipe/16468/tuna-fish-salad/
Simplified food name for AllRecipes: 'cookie sugar wafer' from 'Cookie, sugar wafer'
spaCy extracted main noun: 'sugar' from 'Cookie, sugar wafer'
Recipe URL 'https://www.allrecipes.com/recipe/228212/vanilla-wafer-cookies-that-are-better-than-storebought/' does not match the main noun 'sugar' from 'Cookie, sugar wafer'.
Simplified food name for AllRecipes: 'pea soup prepared' from 'Pea soup, prepared with milk'
spaCy extracted main noun: 'soup' from 'Pea soup, prepare

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Salsa, red, homemade' using spaCy.
Extracted main noun: 'From' from 'Salsa, red, homemade'
Recipe URL 'https://www.allrecipes.com/recipe/230878/fresh-homemade-salsa/' does not match the main noun 'From' from 'Salsa, red, homemade'.
Simplified food name for AllRecipes: 'hamburger 1 small' from 'Hamburger, 1 small patty, with condiments, on bun, from fast food / restaurant (Wendy's Jr. Hamburger)'
spaCy extracted main noun: 'patty' from 'Hamburger, 1 small patty, with condiments, on bun, from fast food / restaurant (Wendy's Jr. Hamburger)'
Recipe URL 'https://www.allrecipes.com/recipe/233652/homemade-hamburger-buns/' does not match the main noun 'patty' from 'Hamburger, 1 small patty, with condiments, on bun, from fast food / restaurant (Wendy's Jr. Hamburger)'.
Simplified food name for AllRecipes: 'light ice cream' from 'Light ice cream, soft serve, chocolate'
spaCy extracted main noun: 'Light' from 'Light ice cream, soft serve, chocolate'
Recipe URL 'https://www.allre

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Pasta, cooked' using spaCy.
Extracted main noun: 'From' from 'Pasta, cooked'
Recipe URL 'https://www.allrecipes.com/recipe/255823/pasta-fazool-pasta-e-fagioli/' does not match the main noun 'From' from 'Pasta, cooked'.
Simplified food name for AllRecipes: 'infant formula similac' from 'Infant formula, Similac for Spit-Up, powder, made with water'
No recipe found for 'infant formula similac' on AllRecipes.
Simplified food name for AllRecipes: 'baby toddler cereal' from 'Baby Toddler cereal, rice, dry'
No recipe found for 'baby toddler cereal' on AllRecipes.
Simplified food name for AllRecipes: 'baby toddler bananas' from 'Baby Toddler bananas, Stage 1'
spaCy extracted main noun: 'bananas' from 'Baby Toddler bananas, Stage 1'
Recipe URL 'https://www.allrecipes.com/recipe/219330/toddler-muffins/' does not match the main noun 'bananas' from 'Baby Toddler bananas, Stage 1'.
Simplified food name for AllRecipes: 'baby toddler sweet' from 'Baby Toddler sweet potatoes, Stage 1

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Lemon, raw' using spaCy.
Extracted main noun: 'From' from 'Lemon, raw'
Recipe URL 'https://www.allrecipes.com/recipe/166611/alexs-raw-chocolate-pudding/' does not match the main noun 'From' from 'Lemon, raw'.
Simplified food name for AllRecipes: 'cereal (post raisin' from 'Cereal (Post Raisin Bran)'
No recipe found for 'cereal (post raisin' on AllRecipes.
Simplified food name for AllRecipes: 'margarine stick' from 'Margarine, stick'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Margarine, stick' using spaCy.
Extracted main noun: 'From' from 'Margarine, stick'
Recipe URL 'https://www.allrecipes.com/recipe/284216/becel-oatmeal-chocolate-chip-cookies/' does not match the main noun 'From' from 'Margarine, stick'.
Simplified food name for AllRecipes: 'chicken breast grilled' from 'Chicken breast, grilled with sauce, skin not eaten'
spaCy extracted main noun: 'Chicken' from 'Chicken breast, grilled with sauce, skin not eaten'
Found recipe for 'Chicken breast, grilled with sauce, skin not eaten': https://www.allrecipes.com/recipe/83793/best-chicken-marinade/
Simplified food name for AllRecipes: 'cranberries cooked' from 'Cranberries, cooked or canned'
spaCy extracted main noun: 'Cranberries' from 'Cranberries, cooked or canned'
Recipe URL 'https://www.allrecipes.com/recipe/157872/winter-chicken-bake/' does not match the main noun 'Cranberries' from 'Cranberries, cooked or canned'.
Simplified food name for AllRecipes: 'beans string cooked' from 'Bea

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Bread, French or Vienna' using spaCy.
Extracted main noun: 'From' from 'Bread, French or Vienna'
Recipe URL 'https://www.allrecipes.com/recipe/56689/banana-bread-french-toast/' does not match the main noun 'From' from 'Bread, French or Vienna'.
Simplified food name for AllRecipes: 'milk evaporated reduced' from 'Milk, evaporated, reduced fat (2%)'
spaCy extracted main noun: 'Milk' from 'Milk, evaporated, reduced fat (2%)'
Recipe URL 'https://www.allrecipes.com/recipe/12206/pumpkin-pie-ii/' does not match the main noun 'Milk' from 'Milk, evaporated, reduced fat (2%)'.
Simplified food name for AllRecipes: 'radish raw' from 'Radish, raw'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Radish, raw' using spaCy.
Extracted main noun: 'From' from 'Radish, raw'
Recipe URL 'https://www.allrecipes.com/recipe/278848/roasted-carrots-and-radishes/' does not match the main noun 'From' from 'Radish, raw'.
Simplified food name for AllRecipes: 'mayonnaise light' from 'Mayonnaise, light'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Mayonnaise, light' using spaCy.
Extracted main noun: 'From' from 'Mayonnaise, light'
Recipe URL 'https://www.allrecipes.com/recipe/24023/mayonnaise-biscuits/' does not match the main noun 'From' from 'Mayonnaise, light'.
Simplified food name for AllRecipes: 'queso asadero' from 'Queso Asadero'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No noun found in 'Queso Asadero' using spaCy.
Extracted main noun: 'From' from 'Queso Asadero'
Recipe URL 'https://www.allrecipes.com/recipe/237356/honey-lime-chicken-enchiladas/' does not match the main noun 'From' from 'Queso Asadero'.
Simplified food name for AllRecipes: 'grapefruit raw' from 'Grapefruit, raw'
No recipe found for 'grapefruit raw' on AllRecipes.
Simplified food name for AllRecipes: 'enchilada just cheese' from 'Enchilada, just cheese, meatless, no beans, green-chile or enchilada sauce'
spaCy extracted main noun: 'cheese' from 'Enchilada, just cheese, meatless, no beans, green-chile or enchilada sauce'
Recipe URL 'https://www.allrecipes.com/recipe/74188/green-chicken-enchilada/' does not match the main noun 'cheese' from 'Enchilada, just cheese, meatless, no beans, green-chile or enchilada sauce'.
Simplified food name for AllRecipes: 'enchilada sauce green' from 'Enchilada sauce, green'
spaCy extracted main noun: 'sauce' from 'Enchilada sauce, green'
Found recipe for 

KeyboardInterrupt: 

### **Approach 3: using Autogen to automate the process of judging whether a food is recommendable or not**

Helped reduce from 9.6k foods down to 7.5k foods and can be reduced further with manual filters from human reviewers

In [87]:
import os
import pandas as pd
import time
from autogen import ConversableAgent
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

llm_config = {
    "model": "gpt-3.5-turbo",
    "api_key": api_key
}

# Define the food recommendation agent
agent = ConversableAgent(
    name="food_recommendation_agent",
    llm_config=llm_config,
    human_input_mode="NEVER",
)

# Define the criteria for what makes a food item recommendable
# We as the prompters can always add more criteria as needed
recommendable_criteria = """
The food description must represent a complete food item or dish that is suitable for recommendation to users. Avoid descriptions that are:
1. Too generic (e.g., "water", "chewing gum").
2. Single ingredients unless they have culinary value (e.g., "garlic").
3. Non-food items or items that do not constitute a recipe (e.g., "salt", "sugar").
4. Raw vegetables, raw fruits, raw meat
5. Condiments
6. Baby foods

Recommendable items are those that can be used as part of a recipe or a full dish.
"""

file_path = '../processed_data/foods_list.csv'
foods_df = pd.read_csv(file_path)

# use the agent to determine if a food item is recommendable with retry
def is_recommendable(food_desc, retries=3):
    prompt = f"""
    You are a food recommendation agent. Your task is to judge whether a food description should be recommended to users or not.
    Follow the criteria below:
    {recommendable_criteria}

    Food Description: "{food_desc}"
    Should this be recommended? Answer "Yes" or "No" with a brief explanation.
    """
    
    for attempt in range(retries):
        try:
            response = agent.generate_reply(
                messages=[{"content": prompt, "role": "user"}]
            )
            answer = response.lower().strip()
            
            if "yes" in answer:
                return True, answer
            elif "no" in answer:
                return False, answer
            else:
                return False, "Unclear response: " + answer

        except Exception as e:
            print(f"Attempt {attempt + 1} failed for '{food_desc}' with error: {e}")
            time.sleep(2 * (attempt + 1))  

    return False, "Error during evaluation after retries"

# Loop through each row in the dataset and use the agent to classify it
recommendable_foods = []
non_recommendable_foods = []

for _, row in foods_df.iterrows():
    food_desc = row['food_desc']
    
    # Determine if the food is recommendable
    is_recommendable_result, explanation = is_recommendable(food_desc)

    if is_recommendable_result:
        recommendable_foods.append({**row.to_dict(), "explanation": explanation})
        print(f"Recommendable: '{food_desc}' - Explanation: {explanation}")
    else:
        non_recommendable_foods.append({**row.to_dict(), "explanation": explanation})
        print(f"Not Recommendable: '{food_desc}' - Explanation: {explanation}")

    # sleep to avoid rate limiting
    time.sleep(2)

recommendable_foods_df = pd.DataFrame(recommendable_foods)
non_recommendable_foods_df = pd.DataFrame(non_recommendable_foods)

recommendable_foods_df.to_csv('../processed_data/recommendable_foods.csv', index=False)
non_recommendable_foods_df.to_csv('../processed_data/non_recommendable_foods.csv', index=False)

Not Recommendable: 'Hard candy' - Explanation: no, "hard candy" should not be recommended. it falls under the category of generic items and does not represent a complete food item or dish suitable for recommendation to users.
Not Recommendable: 'Soft drink, root beer' - Explanation: no. this food description should not be recommended because "soft drink, root beer" falls under the category of beverages rather than a complete food item or dish suitable for recommendation.
Recommendable: 'Potato chips, baked, flavored' - Explanation: yes, this food description should be recommended. "potato chips, baked, flavored" represents a complete food item that can be enjoyed as a snack or part of a meal. the description includes preparation method (baked) and additional detail (flavored), making it suitable for recommendation to users looking for snack options.
Recommendable: 'Ham, prepackaged or deli, luncheon meat, reduced sodium' - Explanation: yes, this food description should be recommended. 

In [88]:
recommendable_foods_df.to_csv('../processed_data/recommendable_foods_original.csv', index=False)
non_recommendable_foods_df.to_csv('../processed_data/non_recommendable_foods_original.csv', index=False)

In [91]:
import pandas as pd

# Load the recommendable_foods.csv into a DataFrame
recommendable_foods_df = pd.read_csv('../processed_data/recommendable_foods.csv')

# Load the food_tagging.csv into a DataFrame
food_tagging_df = pd.read_csv('../processed_data/food_tagging.csv')

# Perform a left join on 'food_id'
recommendable_foods_tagging_df = pd.merge(
    recommendable_foods_df,
    food_tagging_df,
    how='left',
    on='food_id'
)

# Save the joined DataFrame to a new CSV file
recommendable_foods_tagging_df.to_csv('../processed_data/recommendable_foods_tagging.csv', index=False)

### **Q&A pairs creation:**

##### Step 1: finding top 5,000 pairs of user-food that have the most macro tags in common (we don't care about the high/low level for now)

*Note: here, I initially used the wrong data: I used the whole users list (96k users) from user_tagging but forgot that we actually only need to look at opioid users and recovered users (i.e. excluding non-opioid users)*

This will be fixed in the next step

In [2]:
import pandas as pd

num_qa = 5000

recommendable_foods_tagging = pd.read_csv('../processed_data/recommendable_foods_tagging.csv') 
food_tagging = pd.read_csv('../processed_data/food_tagging.csv')
user_tagging = pd.read_csv('../processed_data/user_tagging.csv')

# List of macro nutrient columns to consider
macro_columns = [
    'macro_carb', 'macro_phosphorus', 'macro_calorie', 'macro_potassium', 'macro_sodium',
    'macro_cholesterol', 'macro_saturated_fat', 'macro_protein', 'macro_sugar', 'macro_fiber',
    'macro_iron', 'macro_folic_acid', 'macro_vitamin_b12', 'macro_calcium', 'micro_vitamin_d',
    'macro_vitamin_c'
]

# Step 0: Filter food_tagging to only include food_ids that exist in recommendable_foods_tagging
food_tagging = food_tagging[food_tagging['food_id'].isin(recommendable_foods_tagging['food_id'])]

# Step 1: Identify Top num_qa Users with Most `macro_` Tags Equal to 1
user_tagging['macro_tag_count'] = user_tagging[macro_columns].sum(axis=1)
top_users = user_tagging.nlargest(num_qa, 'macro_tag_count')

# Step 2: Identify Top num_qa Foods with Most `macro_` Tags Equal to 1 (after filtering food_tagging)
food_tagging['macro_tag_count'] = food_tagging[macro_columns].sum(axis=1)
top_foods = food_tagging.nlargest(num_qa, 'macro_tag_count')

# Step 3: Perform cross join on the top num_qa users and top num_qa foods
user_macro_df = top_users[['SEQN'] + macro_columns]
food_macro_df = top_foods[['food_id'] + macro_columns]

# Cross join users and foods
user_food_cross = user_macro_df.assign(key=1).merge(food_macro_df.assign(key=1), on='key').drop('key', axis=1)

# Multiply corresponding macro columns and sum them up to calculate matching tags
for col in macro_columns:
    user_food_cross[f'{col}_match'] = user_food_cross[f'{col}_x'] * user_food_cross[f'{col}_y']

user_food_cross['matching_nutrients'] = user_food_cross[[f'{col}_match' for col in macro_columns]].sum(axis=1)

# Sort and select top pairs with the most matching macro tags
top_pairs = user_food_cross.sort_values(by='matching_nutrients', ascending=False).head(num_qa)

# Merge with recommendable_foods_tagging to get food descriptions, and filter out rows where 'food_desc' is NaN
top_with_desc = pd.merge(top_pairs, recommendable_foods_tagging[['food_id', 'food_desc']], on='food_id', how='left')
top_with_desc = top_with_desc[top_with_desc['food_desc'].notna()]

selected_columns = top_with_desc[['SEQN', 'food_id']]
selected_columns.to_csv('../processed_data/user_food_pairs.csv', index=False)

        SEQN   food_id                                          food_desc  \
0      34040  91770020                 Dietetic or low calorie hard candy   
1      34040  95201000  Nutritional powder mix (Carnation Instant Brea...   
2      34040  42203200                                     Soy nut butter   
3      34040  95210020   Nutritional powder mix, high protein (Slim Fast)   
4      34040  57332050           Cereal (General Mills Total Raisin Bran)   
...      ...       ...                                                ...   
4995   63287  57212100                Cereal (General Mills Frankenberry)   
4996  114261  21302000                           Beef, neck bones, cooked   
4997  114261  51109110                               Bread, pita, toasted   
4998  114261  25221520  Salami, made from any type of meat, reduced so...   
4999   83019  92309520             Tea, iced, bottled, green, unsweetened   

      matching_nutrients  
0                     13  
1                    

In [7]:
# selected_columns = top_with_desc[['SEQN', 'food_id']]
# selected_columns.to_csv('../processed_data/user_food_pairs.csv', index=False)

In [11]:
import os
import pandas as pd
import time
from autogen import ConversableAgent
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

llm_config = {
    "model": "gpt-3.5-turbo",
    "api_key": api_key
}

# food recommendation agent 
agent = ConversableAgent(
    name="food_recommendation_agent",
    llm_config=llm_config,
    human_input_mode="NEVER",
)

recommendable_foods_df = pd.read_csv('../processed_data/recommendable_foods_tagging.csv')
user_tagging_df = pd.read_csv('../processed_data/user_tagging.csv')
user_food_pairs_df = pd.read_csv('../processed_data/user_food_pairs.csv')

# Create a helper function to get the nutrition tags with value 1 for user or food
def get_tags_with_value_1(row, prefix):
    return [col for col in row.index if row[col] == 1 and col.startswith(prefix)]

# Define scoring weights
high_weight_categories = [
    "low_calorie", "high_calorie", "low_protein", "high_protein", "low_carb", "high_carb",
    "low_sugar", "high_sugar", "low_fiber", "high_fiber", "low_saturated_fat", "high_saturated_fat",
    "low_cholesterol", "high_cholesterol", "low_sodium", "high_sodium"
]
low_weight_categories = [
    "low_calcium", "high_calcium", "low_phosphorus", "high_phosphorus", "low_potassium", "high_potassium",
    "low_iron", "high_iron", "low_folic_acid", "high_folic_acid", "low_vitamin_c", "high_vitamin_c",
    "low_vitamin_d", "high_vitamin_d", "low_vitamin_b12", "high_vitamin_b12"
]

# Define a question-answer template
qa_template = """
You must produce output following this template:

Question: For User <SEQN>, given the health profile: <all nutrition tags of the user, i.e. columns in user_tagging starting with prefix user_ and have value of 1>, please judge if Food <food_id> <food_desc>, <all nutrition tags of the food, i.e. columns in recommendable_foods_tagging starting with high_ or low_ and have value of 1>, is a healthy option to the user, and why?

Answer: <Yes/No>, because the food is <high/low> in <nutrients> (these nutrients match with the user's health profile), but <high/low> in <nutrients> (these nutrients are opposite to the user's health profile).
"""

# Iterate over user-food pairs to generate question-answer pairs
question_answer_pairs = []

for _, pair in user_food_pairs_df.iterrows():
    seqn = pair['SEQN']
    food_id = pair['food_id']
    
    # Get user nutritional tags
    user_row = user_tagging_df[user_tagging_df['SEQN'] == seqn]
    if user_row.empty:
        continue
    user_tags = get_tags_with_value_1(user_row.iloc[0], "user_")

    # Get food nutritional tags
    food_row = recommendable_foods_df[recommendable_foods_df['food_id'] == food_id]
    if food_row.empty:
        continue
    food_tags = get_tags_with_value_1(food_row.iloc[0], ("low_", "high_"))

    # Calculate health score
    health_score = 0
    for tag in user_tags:
        food_tag = tag.replace("user_", "")
        
        if food_tag in high_weight_categories:
            if food_tag in food_tags:
                health_score += 10  # Match in high-weight category
            else:
                health_score -= 10  # Mismatch in high-weight category
        elif food_tag in low_weight_categories:
            if food_tag in food_tags:
                health_score += 1  # Match in low-weight category
            else:
                health_score -= 1  # Mismatch in low-weight category

    # Determine the suitability based on health score
    is_healthy = health_score >= 0

    # Create question text
    user_tags_str = ", ".join(user_tags)
    food_desc = food_row.iloc[0]['food_desc']
    food_tags_str = ", ".join(food_tags)
    
    question = (
        f"For User {seqn}, given the health profile: {user_tags_str}, "
        f"please judge if Food {food_id} ({food_desc}), with nutritional properties: {food_tags_str}, "
        "is a healthy option for the user, and why?"
    )

    # Prepare the complete prompt by adding the QA template
    complete_prompt = qa_template + f"\n\n{question}"

    # Generate answer from the score
    answer = "Yes" if is_healthy else "No"

    # Generate reply from agent (LLM should generate a coherent explanation)
    try:
        response = agent.generate_reply(
            messages=[{"content": complete_prompt, "role": "user"}]
        )
        
        formatted_answer = f"{answer}, because {response.strip()}"
        
        question_answer_pairs.append({
            "SEQN": seqn,
            "food_id": food_id,
            "food_desc": food_desc,
            "question": question,
            "answer": formatted_answer
        })
        print(f"Generated answer for SEQN {seqn} and food_id {food_id}: {formatted_answer}")

    except Exception as e:
        print(f"Error generating answer for SEQN {seqn} and food_id {food_id}: {e}")
        continue

    # avoid exceeding rate limits
    time.sleep(2)

qa_pairs_df = pd.DataFrame(question_answer_pairs)

Generated answer for SEQN 34040 and food_id 91770020: Yes, because Question: For User 34040, given the health profile: user_low_calorie, user_high_potassium, user_low_sodium, user_low_cholesterol, user_low_saturated_fat, user_low_protein, user_high_fiber, user_high_iron, user_high_folate_acid, user_high_vitamin_b12, user_high_calcium, user_high_vitamin_d, user_high_vitamin_c, please judge if Food 91770020 (Dietetic or low calorie hard candy), with nutritional properties: high_calorie, low_protein, high_carb, high_sugar, low_fiber, low_saturated_fat, low_cholesterol, low_sodium, low_calcium, low_phosphorus, low_potassium, low_iron, low_folic_acid, low_vitamin_c, low_vitamin_d, low_vitamin_b12, is a healthy option for the user, and why?

Answer: No, because the food is high in sugar (opposite to user_high_fiber and user_low_sugar) and low in potassium (opposite to user_high_potassium).
Generated answer for SEQN 34040 and food_id 95201000: No, because Question: For User 34040, given the h

KeyboardInterrupt: 

In [None]:
# qa_pairs_df.to_csv('../processed_data/recommendable_foods_question_answers.csv', index=False)
# print("Question-answer pairs saved to '../processed_data/recommendable_foods_question_answers.csv'")

**Using the correct data:** filtering for only active & recovered opioid users by referencing users that are in joined_graph

In [13]:
import torch
import pandas as pd


num_qa = 5000

# Load the joined graph
joined_graph = torch.load('../processed_data/joined_graph.pt')

# Extract the user node IDs from the graph
user_ids_in_graph = joined_graph['user'].node_id.tolist()

# Load the user_tagging.csv file
user_tagging = pd.read_csv('../processed_data/user_tagging.csv')

# Filter user_tagging to include only users that are present in the graph
filtered_user_tagging = user_tagging[user_tagging['SEQN'].isin(user_ids_in_graph)]

# Now proceed with your original code using the filtered_user_tagging DataFrame

# List of macro nutrient columns to consider
macro_columns = [
    'macro_carb', 'macro_phosphorus', 'macro_calorie', 'macro_potassium', 'macro_sodium',
    'macro_cholesterol', 'macro_saturated_fat', 'macro_protein', 'macro_sugar', 'macro_fiber',
    'macro_iron', 'macro_folic_acid', 'macro_vitamin_b12', 'macro_calcium', 'micro_vitamin_d',
    'macro_vitamin_c'
]

# Step 0: Filter food_tagging to only include food_ids that exist in recommendable_foods_tagging
recommendable_foods_tagging = pd.read_csv('../processed_data/recommendable_foods_tagging.csv') 
food_tagging = pd.read_csv('../processed_data/food_tagging.csv')
food_tagging = food_tagging[food_tagging['food_id'].isin(recommendable_foods_tagging['food_id'])]

# Step 1: Identify Top num_qa Users with Most `macro_` Tags Equal to 1
filtered_user_tagging['macro_tag_count'] = filtered_user_tagging[macro_columns].sum(axis=1)
top_users = filtered_user_tagging.nlargest(num_qa, 'macro_tag_count')

# Step 2: Identify Top num_qa Foods with Most `macro_` Tags Equal to 1 (after filtering food_tagging)
food_tagging['macro_tag_count'] = food_tagging[macro_columns].sum(axis=1)
top_foods = food_tagging.nlargest(num_qa, 'macro_tag_count')

# Step 3: Perform cross join on the top num_qa users and top num_qa foods
user_macro_df = top_users[['SEQN'] + macro_columns]
food_macro_df = top_foods[['food_id'] + macro_columns]

# Cross join users and foods
user_food_cross = user_macro_df.assign(key=1).merge(food_macro_df.assign(key=1), on='key').drop('key', axis=1)

# Multiply corresponding macro columns and sum them up to calculate matching tags
for col in macro_columns:
    user_food_cross[f'{col}_match'] = user_food_cross[f'{col}_x'] * user_food_cross[f'{col}_y']

user_food_cross['matching_nutrients'] = user_food_cross[[f'{col}_match' for col in macro_columns]].sum(axis=1)

# Sort and select top pairs with the most matching macro tags
top_pairs = user_food_cross.sort_values(by='matching_nutrients', ascending=False).head(num_qa)

# Merge with recommendable_foods_tagging to get food descriptions, and filter out rows where 'food_desc' is NaN
top_with_desc = pd.merge(top_pairs, recommendable_foods_tagging[['food_id', 'food_desc']], on='food_id', how='left')
top_with_desc = top_with_desc[top_with_desc['food_desc'].notna()]

selected_columns = top_with_desc[['SEQN', 'food_id']]
selected_columns.to_csv('../processed_data/user_food_pairs.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_user_tagging['macro_tag_count'] = filtered_user_tagging[macro_columns].sum(axis=1)


In [14]:
display(top_with_desc[['SEQN', 'food_id', 'food_desc', 'matching_nutrients']])

Unnamed: 0,SEQN,food_id,food_desc,matching_nutrients
0,23816,91770020,Dietetic or low calorie hard candy,12
1,123231,14108060,"Cheese, Parmesan, dry grated, fat free",12
2,123231,91770020,Dietetic or low calorie hard candy,12
3,123231,57332050,Cereal (General Mills Total Raisin Bran),12
4,123231,11830210,"Chocolate beverage powder, light, dry mix, not...",12
...,...,...,...,...
4995,123231,54337100,"Crackers, woven wheat, reduced fat",9
4996,95412,92650210,Cornmeal beverage with chocolate milk,9
4997,47343,57305174,Cereal (Malt-O-Meal Colossal Crunch),9
4998,95656,92191400,"Coffee, instant, pre-sweetened with sugar, not...",9


In [17]:
import os
import pandas as pd
import time
from autogen import ConversableAgent
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

llm_config = {
    "model": "gpt-3.5-turbo",
    "api_key": api_key
}

# food recommendation agent
agent = ConversableAgent(
    name="food_recommendation_agent",
    llm_config=llm_config,
    human_input_mode="NEVER",
)

recommendable_foods_df = pd.read_csv('../processed_data/recommendable_foods_tagging.csv')
user_tagging_df = pd.read_csv('../processed_data/user_tagging.csv')
user_food_pairs_df = pd.read_csv('../processed_data/user_food_pairs.csv')

# Define important and unimportant tags
important_tags = [
    "low_calorie", "high_calorie", "low_protein", "high_protein", "low_carb", "high_carb",
    "low_sugar", "high_sugar", "low_fiber", "high_fiber", "low_saturated_fat", "high_saturated_fat",
    "low_cholesterol", "high_cholesterol", "low_sodium", "high_sodium"
]
unimportant_tags = [
    "low_calcium", "high_calcium", "low_phosphorus", "high_phosphorus", "low_potassium", "high_potassium",
    "low_iron", "high_iron", "low_folic_acid", "high_folic_acid", "low_vitamin_c", "high_vitamin_c",
    "low_vitamin_d", "high_vitamin_d", "low_vitamin_b12", "high_vitamin_b12"
]

# Create a helper function to get the nutrition tags with value 1 for user or food
def get_tags_with_value_1(row, prefix):
    return [col for col in row.index if row[col] == 1 and col.startswith(prefix)]

# Define a question-answer template
qa_template = """
You must produce output following this template:

Question: For User <SEQN>, given the health profile: <all important nutrition tags of the user, i.e. columns in user_tagging starting with prefix user_ and have value of 1 from the list of important tags>, please judge if Food <food_id> <food_desc>, <all important nutrition tags of the food, i.e. columns in recommendable_foods_tagging starting with high_ or low_ and have value of 1 from the list of important tags>, is a healthy option to the user, and why?
Answer: <Yes/No>, because the food is <high/low> in <nutrients> (these nutrients match with the user's health profile), but <high/low> in <nutrients> (these nutrients are opposite to the user's health profile).
"""

# Iterate over user-food pairs to generate question-answer pairs
question_answer_pairs = []

for _, pair in user_food_pairs_df.iterrows():
    seqn = pair['SEQN']
    food_id = pair['food_id']
    
    # Get user nutritional tags
    user_row = user_tagging_df[user_tagging_df['SEQN'] == seqn]
    if user_row.empty:
        continue
    user_tags = get_tags_with_value_1(user_row.iloc[0], "user_")
    # Filter user_tags to include only important tags
    user_tags = [tag for tag in user_tags if tag.replace("user_", "") in important_tags]

    # Get food nutritional tags
    food_row = recommendable_foods_df[recommendable_foods_df['food_id'] == food_id]
    if food_row.empty:
        continue
    food_tags = get_tags_with_value_1(food_row.iloc[0], ("low_", "high_"))
    # Filter food_tags to include only important tags
    food_tags = [tag for tag in food_tags if tag in important_tags]

    # Create question text
    user_tags_str = ", ".join(user_tags)
    food_desc = food_row.iloc[0]['food_desc']
    food_tags_str = ", ".join(food_tags)
    
    question = (
        f"For User {seqn}, given the health profile: {user_tags_str}, "
        f"please judge if Food {food_id} ({food_desc}), with nutritional properties: {food_tags_str}, "
        "is a healthy option for the user, and why?"
    )

    # Prepare the complete prompt by adding the QA template
    complete_prompt = qa_template + f"\n\n{question}"

    # Generate reply from agent
    try:
        response = agent.generate_reply(
            messages=[{"content": complete_prompt, "role": "user"}]
        )
        
        # Extract the question and answer from the response (since it should be formatted as required)
        question_answer_pairs.append({
            "SEQN": seqn,
            "food_id": food_id,
            "food_desc": food_desc,
            "question": question,
            "answer": response.strip()
        })
        print('*' * 100)
        print(f"Generated answer for SEQN {seqn} and food_id {food_id}:\n\n{response}")

    except Exception as e:
        print(f"Error generating answer for SEQN {seqn} and food_id {food_id}: {e}")
        continue

    # avoid exceeding rate limits
    time.sleep(2)

qa_pairs_df = pd.DataFrame(question_answer_pairs)



****************************************************************************************************
Generated answer for SEQN 23816 and food_id 91770020:

Question: For User 23816, given the health profile: user_low_calorie, user_low_sodium, user_low_saturated_fat, user_high_protein, user_low_sugar, user_high_fiber, please judge if Food 91770020 (Dietetic or low calorie hard candy), with nutritional properties: high_calorie, low_protein, high_carb, high_sugar, low_fiber, low_saturated_fat, low_cholesterol, low_sodium, is a healthy option for the user, and why?

Answer: No, because the food is high in calorie (opposite to user_low_calorie), low in protein (opposite to user_high_protein), high in sugar (opposite to user_low_sugar), low in fiber (opposite to user_high_fiber), low in saturated fat (matches user_low_saturated_fat), low in cholesterol (not directly relevant to the user's health profile), and low in sodium (matches user_low_sodium).
******************************************

KeyboardInterrupt: 

In [None]:
qa_pairs_df.to_csv('../processed_data/recommendable_foods_question_answers.csv', index=False)