In [8]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [4]:
with open('products.json') as f:
    products = json.load(f)

df = pd.DataFrame(products)
print("Raw DataFrame:")
print(df.head())


Raw DataFrame:
         barcode                        name  \
0  0044000004464                Oreo Cookies   
1   078742300512    Jif Creamy Peanut Butter   
2   041196912573     Nutella Hazelnut Spread   
3   030000016042  Lay's Classic Potato Chips   
4   028400076053        Doritos Nacho Cheese   

                                         ingredients          allergens  
0  [Sugar, Palm Oil, Cocoa, Wheat Flour, Soy Leci...       [Wheat, Soy]  
1                     [Roasted Peanuts, Sugar, Salt]             [Nuts]  
2  [Sugar, Palm Oil, Hazelnuts, Cocoa, Milk, Soy ...  [Nuts, Milk, Soy]  
3                    [Potatoes, Vegetable Oil, Salt]                 []  
4  [Corn, Vegetable Oil, Cheese, Milk, Salt, Garl...             [Milk]  


In [None]:
df['combined_features'] = df['name'] + ' ' + df['ingredients'].apply(' '.join)
df['combined_features'] = df['combined_features'].str.lower()

# Convert allergens to consistent format
df['allergens'] = df['allergens'].apply(lambda x: [a.lower().strip() for a in x])

print("\nProcessed DataFrame:")
print(df[['name', 'allergens', 'combined_features']].head())

                         name          allergens  \
0                Oreo Cookies       [wheat, soy]   
1    Jif Creamy Peanut Butter             [nuts]   
2     Nutella Hazelnut Spread  [nuts, milk, soy]   
3  Lay's Classic Potato Chips                 []   
4        Doritos Nacho Cheese             [milk]   

                                   combined_features  
0  oreo cookies sugar palm oil cocoa wheat flour ...  
1  jif creamy peanut butter roasted peanuts sugar...  
2  nutella hazelnut spread sugar palm oil hazelnu...  
3  lay's classic potato chips potatoes vegetable ...  
4  doritos nacho cheese corn vegetable oil cheese...  


In [10]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2
)

tfidf_matrix = vectorizer.fit_transform(df['combined_features'])
print(f"\nTrained TF-IDF model with {len(vectorizer.vocabulary_)} features")


Trained TF-IDF model with 137 features


In [9]:
def get_recommendations(self, scanned_text, user_allergies, top_n=3):
        # Filter safe products using pandas
        user_allergies = [a.lower().strip() for a in user_allergies]
        
        # Create allergen mask
        safe_mask = ~self.df['allergens'].apply(
            lambda x: any(allergy in x for allergy in user_allergies)
        )
        
        safe_df = self.df[safe_mask].copy()
        
        if safe_df.empty:
            return []
        
        # Vectorize input text
        input_vector = self.vectorizer.transform([scanned_text.lower()])
        
        # Calculate similarities
        similarities = cosine_similarity(input_vector, self.tfidf_matrix[safe_df.index])
        safe_df['similarity_score'] = similarities[0]
        
        # Filter and sort
        safe_df = safe_df[safe_df['similarity_score'] > 0]
        safe_df = safe_df.sort_values('similarity_score', ascending=False)
        
        return safe_df.head(top_n).to_dict('records')

# Initialize recommender
recommender = AllergyRecommender(df, vectorizer, tfidf_matrix)

NameError: name 'AllergyRecommender' is not defined

In [None]:

from sklearn.model_selection import train_test_split
import pandas as pd

class ModelEvaluator:
    def __init__(self, df, recommender):
        self.df = df
        self.recommender = recommender
        self.results = []
    
    def generate_test_cases(self, test_size=0.2):
        """Create synthetic test cases from existing data"""
        # Use products with allergens as test cases
        allergic_products = self.df[self.df['allergens'].apply(len) > 0]
        self.test_cases = []
        
        for _, row in allergic_products.iterrows():
            self.test_cases.append({
                'scanned_text': row['name'],
                'user_allergies': row['allergens'],
                'expected_allergen_free': True,
                'expected_similar_product': row['name']  # Ideally should find alternative
            })
            
        # Add some safe product tests
        safe_products = self.df[self.df['allergens'].apply(len) == 0]
        for _, row in safe_products.sample(min(5, len(safe_products))).iterrows():
            self.test_cases.append({
                'scanned_text': row['name'],
                'user_allergies': [],
                'expected_allergen_free': True,
                'expected_similar_product': row['name']
            })
        
        return self.test_cases

    def evaluate(self):
        """Run evaluation metrics"""
        metrics = {
            'precision@1': 0,
            'precision@3': 0,
            'safety_accuracy': 0,
            'coverage': 0
        }
        
        valid_tests = 0
        
        for case in self.test_cases:
            recommendations = self.recommender.get_recommendations(
                case['scanned_text'],
                case['user_allergies'],
                top_n=3
            )
            
            # Safety Check
            safe = all(
                not any(allergy in rec['allergens'] 
                    for allergy in case['user_allergies'])
                for rec in recommendations
            )
            metrics['safety_accuracy'] += safe
            
            # Precision Checks
            relevant = 0
            for rec in recommendations:
                if self._is_relevant(rec, case['scanned_text']):
                    relevant += 1
            
            if recommendations:
                metrics['precision@1'] += (relevant >= 1)
                metrics['precision@3'] += (relevant >= 1)  # At least 1 relevant
                metrics['coverage'] += 1
            
            valid_tests += 1
        
        # Normalize metrics
        for k in metrics:
            metrics[k] = metrics[k] / valid_tests if valid_tests > 0 else 0
            
        return metrics
    
    def _is_relevant(self, recommendation, query):
        """Custom relevance criteria (adjust based on your domain)"""
        query_terms = set(query.lower().split())
        product_terms = set(recommendation['name'].lower().split())
        return len(query_terms & product_terms) > 0

# %% [markdown]
# ## Run Evaluation
# %%
evaluator = ModelEvaluator(df, recommender)
test_cases = evaluator.generate_test_cases()
metrics = evaluator.evaluate()

print("Evaluation Metrics:")
print(f"1. Safety Accuracy: {metrics['safety_accuracy']:.2%}")
print(f"2. Precision@1: {metrics['precision@1']:.2%}")
print(f"3. Precision@3: {metrics['precision@3']:.2%}")
print(f"4. Coverage: {metrics['coverage']:.2%}")

Evaluation Metrics:
1. Safety Accuracy: 100.00%
2. Precision@1: 62.90%
3. Precision@3: 62.90%
4. Coverage: 67.74%


In [119]:
# %% [markdown]
# ## Random Product Test
# %%
import random
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load products
with open('products.json') as f:
    all_products = json.load(f)

# Randomly select 5 test products
test_products = random.sample(all_products, 5)

# Create combined text features (name + ingredients)
product_texts = [
    f"{p['name']} {' '.join(p['ingredients'])}".lower() 
    for p in all_products
]

# Train TF-IDF model
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(product_texts)

# Test each random product
for product in test_products:
    print(f"\n🚨 Testing Product: {product['name']}")
    print(f"   Barcode: {product['barcode']}")
    print(f"   Allergens: {', '.join(product['allergens']) or 'None'}")
    
    # Assume user has these allergies
    user_allergies = [a.lower() for a in product['allergens']]
    
    # Find safe alternatives (exclude current product)
    safe_products = [
        p for p in all_products 
        if not any(a in p['allergens'] for a in user_allergies)
        and p['barcode'] != product['barcode']
    ]
    
    if not safe_products:
        print("   ❌ No safe alternatives found")
        continue
    
    # Get similarity scores
    product_idx = all_products.index(product)
    safe_indices = [all_products.index(p) for p in safe_products]
    similarities = cosine_similarity(tfidf_matrix[product_idx], tfidf_matrix[safe_indices])[0]
    
    # Get top 3 matches
    top_matches = np.argsort(similarities)[::-1][:3]
    
    print("   ✅ Recommended Alternatives:")
    for i, idx in enumerate(top_matches, 1):
        match = safe_products[idx]
        score = similarities[idx]
        print(f"   {i}. {match['name']} (Score: {score:.2f})")
        print(f"      Allergens: {', '.join(match['allergens']) or 'None'}")


🚨 Testing Product: Macadamia Nuts
   Barcode: 0009876543218
   Allergens: Nuts
   ✅ Recommended Alternatives:
   1. Mixed Nuts (Score: 0.27)
      Allergens: Nuts
   2. Hershey's Nuts Chocolate Bar (Score: 0.21)
      Allergens: Milk, Soy, Nuts
   3. Peanut Butter (Score: 0.05)
      Allergens: Nuts

🚨 Testing Product: Kit Kat Chocolate Bar
   Barcode: 050000000412
   Allergens: Milk, Soy
   ✅ Recommended Alternatives:
   1. Hershey's Milk  Chocolate Bar (Score: 0.74)
      Allergens: Milk, Soy
   2. Milk Chocolate Bar (Score: 0.71)
      Allergens: Milk
   3. Hershey's Nuts Chocolate Bar (Score: 0.69)
      Allergens: Milk, Soy, Nuts

🚨 Testing Product: Coconut Cookies
   Barcode: 0054321876543
   Allergens: Eggs
   ✅ Recommended Alternatives:
   1. Juhayna Coconut Yogurt (Score: 0.57)
      Allergens: None
   2. Coconut Almond Granola (Score: 0.53)
      Allergens: Nuts, Wheat
   3. Coconut Secret Aminos (Imported) (Score: 0.51)
      Allergens: None

🚨 Testing Product: Coconut Almo