In [110]:
import pandas as pd
import ast

df = pd.read_csv("recepies.csv", index_col=None)

In [111]:
df = df[["title","ingredients","vegan"]]
df

Unnamed: 0,title,ingredients,vegan
0,"Grandma's Eggless, Butterless, Milkless Cake","['3 cups all-purpose flour', '2 cups white sug...",True
1,Sweet and Spicy Turkey Rub,"['3 tablespoons brown sugar', '2 tablespoons g...",True
2,Marinated Mushrooms with Red Bell Peppers,"['1/2 cup red wine vinegar', '1/3 cup water', ...",True
3,Easy 4-Ingredient Margarita,"['1 cup ice cubes, or as needed', '1/3 cup teq...",True
4,Two Bowl Cake,"['3 cups all-purpose flour', '2 cups white sug...",True
...,...,...,...
95,Gingery Lemonade,"['1 1/2 cups SPLENDA® No Calorie Sweetener, Gr...",True
96,Simple Cajun Seasoning,"['2 1/2 tablespoons salt', '1 tablespoon dried...",True
97,Lemon-Basil Vinaigrette Corn Topper,"['1 tablespoon lemon juice', '2 teaspoons extr...",True
98,Dry Spice Rub for Lamb or Beef,"['1 teaspoon paprika', '1 1/2 teaspoons dried ...",True


In [125]:
import re

NON_VEGAN_INGREDIENTS = {
    'meat': ['beef', 'steak','pork', 'lamb', 'veal', 'chicken', 'turkey', 'duck', 'goose', 'bacon', 'ham', 'sausage', 'pepperoni', 'salami', 'meat', 'poultry', 'game', 'venison', 'rabbit', 'quail', 'pheasant'],
    'dairy': ['milk', 'cream', 'butter', 'cheese', 'yogurt', 'whey', 'casein', 'lactose', 'dairy', 'ghee', 'curd', 'buttermilk', 'sour cream', 'heavy cream', 'half and half'],
    'eggs': ['egg', 'albumen', 'ovalbumin', 'yolk', 'eggs', 'egg white', 'egg yolk'],
    'fish': ['fish', 'salmon', 'tuna', 'cod', 'shrimp', 'prawn', 'crab', 'lobster', 'shellfish', 'anchovy', 'seafood', 'mussel', 'clam', 'oyster', 'squid', 'octopus', 'eel'],
    'honey': ['honey', 'bee pollen', 'royal jelly', 'beeswax'],
    'gelatin': ['gelatin', 'gelatine', 'collagen'],
    'other': ['lard', 'tallow', 'rennet', 'carmine', 'shellac', 'bone', 'marrow', 'broth', 'stock', 'gravy', 'mayonnaise', 'worcestershire sauce']
}

# vegan ingredients that might be confused with non-vegan ones
VEGAN_EXCEPTIONS = {
    'peanut butter', 'cocoa butter', 'eggless', 'butternut', 'butter beans',
    'coconut milk', 'almond milk', 'soy milk', 'oat milk', 'rice milk',
    'cashew milk', 'hemp milk', 'flax milk', 'pea milk',
    'vegan butter', 'margarine', 'vegan cheese', 'nutritional yeast',
    'vegan mayonnaise', 'vegan cream', 'vegan yogurt'
}

MEASUREMENT_WORDS = {
    'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
    'tsp', 'tbsp', 'oz', 'ounce', 'ounces', 'pound', 'pounds',
    'extract', 'baking', 'powder', 'package', 'can', 'jar',
    'pint', 'quart', 'ml', 'l', 'gram', 'grams', 'kg','drops', 'pinch',
    'dash', 'slice', 'slices', 'piece', 'pieces', 'clove', 'cloves',
    'head', 'heads', 'leaf', 'leaves', 'sprig', 'sprigs', 'handful',
    'handfuls', 'bunch', 'bunches', 'block', 'blocks', 'stick', 'sticks',
    'slice', 'slices', 'strip', 'strips', 'cube', 'cubes', 'chunk',
    'chunks', 'pat', 'pats', 'dash', 'dashes', 'scoop', 'scoops',
    'pinch', 'pinches', 'portion', 'portions',
}

PREPARATION_WORDS = {
    'diced', 'shredded', 'chopped', 'melted', 'grated', 'minced',
    'sliced', 'crushed', 'peeled', 'roasted', 'boiled', 'baked', 'steamed',
    'blanched', 'cooked', 'raw', 'fresh', 'frozen', 'dry', 'ground'
}

STOPWORDS = {
    'a', 'an', 'of', 'and', 'or', 'with', 'the', 'to', 'in', 'for'
}


def normalize_ingredient(text: str) -> str:
    """Normalize and clean ingredient text by removing measurements and filler words."""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)  # Remove punctuation/digits
    words = text.split()
    filtered_words = [
        word for word in words
        if word not in MEASUREMENT_WORDS and word not in STOPWORDS and word not in PREPARATION_WORDS
    ]
    return ' '.join(filtered_words)

def is_ingredient_vegan(ingredient: str) -> bool:
    """
    Check if an ingredient is vegan (contains no animal products).
    Returns False if the ingredient is known to be derived from animals.
    """

    norm_ing = normalize_ingredient(ingredient)
    
    # Check vegan exceptions first (these are known vegan ingredients that might be confused)
    for exception in VEGAN_EXCEPTIONS:
        if exception in norm_ing:
            return True
    
    # Check for non-vegan ingredients using word boundaries
    for category, items in NON_VEGAN_INGREDIENTS.items():
        for item in items:
            # Use word boundaries to avoid partial matches
            if re.search(rf'\b{item}\b', norm_ing):
                return False
            
    return True
    

In [126]:
ingridients1 = ['3 cups all-purpose flour', '2 cups white sugar',"2 spoons of peanut butter", '2 pounds of ground beef','6 tablespoons unsweetened cocoa powder', '2 teaspoons baking soda', '2 teaspoons baking powder', '2/3 cup vegetable oil', '2 cups water', '2 tablespoons distilled white vinegar', '2 teaspoons vanilla extract']
for ingredient in ingridients1:   
    print(f"{ingredient}: normalized: {normalize_ingredient(ingredient)},{is_ingredient_vegan(ingredient)}")

ingridients2 = ['1/4 cup butter or margarine, softened', '2 tablespoons ground mustard', '2 tablespoons vinegar', '1/4 teaspoon garlic salt', '4 drops hot pepper sauce']   
for ingredient in ingridients2:   
    print(f"{ingredient}: normalized: {normalize_ingredient(ingredient)},{is_ingredient_vegan(ingredient)}")

3 cups all-purpose flour: normalized: all purpose flour,True
2 cups white sugar: normalized: white sugar,True
2 spoons of peanut butter: normalized: spoons peanut butter,True
2 pounds of ground beef: normalized: beef,False
6 tablespoons unsweetened cocoa powder: normalized: unsweetened cocoa,True
2 teaspoons baking soda: normalized: soda,True
2 teaspoons baking powder: normalized: ,True
2/3 cup vegetable oil: normalized: vegetable oil,True
2 cups water: normalized: water,True
2 tablespoons distilled white vinegar: normalized: distilled white vinegar,True
2 teaspoons vanilla extract: normalized: vanilla,True
1/4 cup butter or margarine, softened: normalized: butter margarine softened,True
2 tablespoons ground mustard: normalized: mustard,True
2 tablespoons vinegar: normalized: vinegar,True
1/4 teaspoon garlic salt: normalized: garlic salt,True
4 drops hot pepper sauce: normalized: hot pepper sauce,True


In [None]:
from typing import List
def fix_ingredient_string(s: str) -> List[str]:
    # Fix the case where ingredients are quoted but not comma-separated:
    # Turn this:  'a''b''c'  →  'a', 'b', 'c'
    s = re.sub(r"'\s*'", "', '", s)

    # Now try to parse it as a Python list
    try:
        import ast
        result = ast.literal_eval(s)
        if isinstance(result, list):
            return [str(item).strip() for item in result]
    except Exception:
        pass

    # Fallback: return a single-item list
    return [s]

def is_vegan(ingredients: List[str]) -> bool:
    return all(map(is_ingredient_vegan, ingredients))

df['ingredients'] = df['ingredients'].apply(lambda x: fix_ingredient_string(x) if isinstance(x, str) else x)

df['vegan_pred'] = df['ingredients'].apply(
            is_vegan)

In [120]:
from sklearn.metrics import classification_report

print("===Vegan===")
print(classification_report(
        df['vegan'], df['vegan_pred']))

df.to_csv("recepies_with_vegan_pred.csv", index=False)
missidentied_ingredients = df[df['vegan_pred'] != df['vegan']]
missidentied_ingredients.to_csv("missidentied_ingredients.csv", index=False)

===Vegan===
              precision    recall  f1-score   support

       False       1.00      0.97      0.98        60
        True       0.95      1.00      0.98        40

    accuracy                           0.98       100
   macro avg       0.98      0.98      0.98       100
weighted avg       0.98      0.98      0.98       100

