In [29]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
from pathlib import Path

def score_deals_by_comparison():
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight but effective model
    
    # Get all items across stores
    all_items = []
    for file in Path('flyer_data').glob('*.json'):
        with open(file) as f:
            data = json.load(f)
            for item in data['items']:
                all_items.append({
                    'store': data['store'],
                    'name': item['name'],
                    'brand': item['brand'],
                    'price': float(item['price']),
                    'image': item['cutout_image_url'],
                })
    
    # Encode all item names
    item_names = [f"{item['brand']} {item['name']}" if item['brand'] else item['name'] 
                 for item in all_items]
    embeddings = model.encode(item_names)
    
    # Add embeddings to all_items
    for i, item in enumerate(all_items):
        item['embedding'] = embeddings[i]
    
    # Find similar items and compare prices
    deals = []
    for i, item in enumerate(all_items):
        similarities = cosine_similarity([embeddings[i]], embeddings)[0]
        similar_indices = [j for j in range(len(similarities)) 
                         if similarities[j] > 0.8 and j != i]  # 80% similarity threshold
        
        if similar_indices:
            similar_prices = [all_items[j]['price'] for j in similar_indices]
            avg_price = sum(similar_prices) / len(similar_prices)
            price_diff = avg_price - item['price']
            
            if price_diff > 0:  # It's cheaper than similar items
                deals.append({
                    'item': item,
                    'savings': price_diff,
                    'similar_items': [all_items[j] for j in similar_indices],
                    'similarity_score': similarities[similar_indices].mean()
                })
    
    return sorted(deals, key=lambda x: x['savings'], reverse=True)

deals = score_deals_by_comparison()
# print the top 10 deals
for deal in deals[:10]:
    print(deal)

{'item': {'store': 'Costco ', 'name': 'Round Brilliant 2.00 ctw VS2 Clarity, I Color Diamond Platinum Band', 'brand': None, 'price': 1499.97, 'image': 'https://f.wishabi.net/page_items/354099400/1729260171/extra_large.jpg', 'embedding': array([ 1.42899202e-02,  3.49805765e-02, -4.23535965e-02,  2.32675360e-04,
       -6.25097677e-02, -4.10071500e-02,  1.00660808e-01,  8.84054899e-02,
       -6.25300631e-02, -2.75153853e-02, -6.72453418e-02, -3.13200578e-02,
       -1.93514563e-02, -8.50076750e-02,  7.87793286e-03,  6.08386807e-02,
        7.06749707e-02, -2.08859034e-02,  1.88123423e-03, -6.55619577e-02,
       -1.06091537e-02, -5.92677146e-02, -5.31646945e-02,  6.50141481e-03,
       -7.80382901e-02,  7.14555606e-02, -3.70951220e-02,  2.96578165e-02,
        5.42396083e-02, -8.75476971e-02, -2.77099162e-02,  9.39310268e-02,
       -9.92873684e-03, -9.87314060e-03, -7.78964832e-02, -1.10300846e-01,
        2.32353737e-03,  5.18931672e-02, -2.91835759e-02, -3.86472140e-03,
        7.415

In [31]:
def pretty_print_deal(deal):
    print(f'Name: {deal["item"]["name"]}\nBrand: {deal["item"]["brand"]}\nPrice: ${deal["item"]["price"]:.2f}\nSavings: ${deal["savings"]:.2f}\n Image cutout: {deal["item"]["image"]}\n Similar item cutouts: {[x["image"] for x in deal["similar_items"]]}\n')

# search the deals by name
# search_term = input('Enter a search term: ')
# for deal in deals:
#     if search_term.lower() in deal['item']['name'].lower():
#         pretty_print_deal(deal)

search_term = 'fruit'
fruit_deals = []
model = SentenceTransformer('all-MiniLM-L6-v2')
search_embedding = model.encode([search_term])[0]  # Get the first (and only) embedding
for deal in deals:
    similarity = cosine_similarity([search_embedding], [deal['item']['embedding']])[0][0]
    if similarity > 0.4:
        pretty_print_deal(deal)
        fruit_deals.append(deal)

print(f'Found {len(fruit_deals)} deals for {search_term}')


Name: Welch's® 3-pk. Fruit Snacks
Brand: Welch's
Price: $1.25
Savings: $4.75
 Image cutout: https://f.wishabi.net/page_items/351837303/1727658841/extra_large.jpg
 Similar item cutouts: ['https://f.wishabi.net/page_items/354129153/1729278715/extra_large.jpg', 'https://f.wishabi.net/page_items/354099482/1729257729/extra_large.jpg']

Name: Strawberries
Brand: None
Price: $2.98
Savings: $3.02
 Image cutout: https://f.wishabi.net/page_items/353711218/1729056674/extra_large.jpg
 Similar item cutouts: ['https://f.wishabi.net/page_items/353326754/1728547996/extra_large.jpg']

Name: Honeycrisp Apples
Brand: None
Price: $0.98
Savings: $2.76
 Image cutout: https://f.wishabi.net/page_items/353711205/1729056672/extra_large.jpg
 Similar item cutouts: ['https://f.wishabi.net/page_items/353584688/1728881949/extra_large.jpg', 'https://f.wishabi.net/page_items/353326535/1728547992/extra_large.jpg']

Name: Red, Green or Black Seedless Grapes
Brand: None
Price: $0.88
Savings: $2.10
 Image cutout: https://

In [42]:
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv()

client = OpenAI()

def get_price_per_unit(item):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Calculate the price per unit for this item: {item['name']} priced at ${item['price']:.2f}. Respond with only the price per unit in the format $X.XX/unit, where unit is one of: lb, oz, g, kg, gal, qt, pt, fl oz, ea. If it doesn't make sense to calculate a price per unit, respond with 'not applicable'. Use the most appropriate unit based on the image and item description."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": item['image'],
                        },
                    },
                ],
            }
        ],
        max_tokens=30,
    )
    return response.choices[0].message.content.strip()

# Add price per unit to fruit_deals
for deal in fruit_deals:
    deal['price_per_unit'] = get_price_per_unit(deal['item'])

# Update pretty_print_deal function to include price per unit
def pretty_print_deal(deal):
    print(f"Name: {deal['item']['name']}")
    print(f"Brand: {deal['item']['brand']}")
    print(f"Price: ${deal['item']['price']:.2f}")
    print(f"Savings: ${deal['savings']:.2f}")
    print(f"Price per unit: {deal.get('price_per_unit', 'N/A')}")
    print(f"Image cutout: {deal['item']['image']}")
    print(f"Similar item cutouts: {[x['image'] for x in deal['similar_items']]}\n")

# Print updated fruit deals
for deal in fruit_deals:
    pretty_print_deal(deal)



Name: Welch's® 3-pk. Fruit Snacks
Brand: Welch's
Price: $1.25
Savings: $4.75
Price per unit: $0.42/ea
Image cutout: https://f.wishabi.net/page_items/351837303/1727658841/extra_large.jpg
Similar item cutouts: ['https://f.wishabi.net/page_items/354129153/1729278715/extra_large.jpg', 'https://f.wishabi.net/page_items/354099482/1729257729/extra_large.jpg']

Price per unit: $0.42/ea

Name: Strawberries
Brand: None
Price: $2.98
Savings: $3.02
Price per unit: $2.98/lb
Image cutout: https://f.wishabi.net/page_items/353711218/1729056674/extra_large.jpg
Similar item cutouts: ['https://f.wishabi.net/page_items/353326754/1728547996/extra_large.jpg']

Price per unit: $2.98/lb

Name: Honeycrisp Apples
Brand: None
Price: $0.98
Savings: $2.76
Price per unit: $0.98/lb
Image cutout: https://f.wishabi.net/page_items/353711205/1729056672/extra_large.jpg
Similar item cutouts: ['https://f.wishabi.net/page_items/353584688/1728881949/extra_large.jpg', 'https://f.wishabi.net/page_items/353326535/1728547992/ext

In [58]:
# Cell 1: Create embeddings and group edible items by similarity

from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import json
from pathlib import Path
import numpy as np

def load_items():
    all_items = []
    for file in Path('flyer_data').glob('*.json'):
        with open(file) as f:
            data = json.load(f)
            for item in data['items']:
                all_items.append({
                    'store': data['store'],
                    'name': item['name'],
                    'brand': item['brand'],
                    'price': float(item['price']),
                    'image': item['cutout_image_url'],
                })
    return all_items

# List of common grocery categories
grocery_categories = [
    "fruit", "vegetable", "meat", "fish", "dairy", "bread", "cereal", "snack", 
    "beverage", "condiment", "spice", "baking", "pasta", "rice", "canned", 
    "frozen", "dessert", "cheese", "egg", "milk", "yogurt", "juice", "coffee", 
    "tea", "soup", "sauce", "oil", "nut", "bean", "grain", "herb"
]

def is_likely_edible(item, model, category_embeddings):
    item_embedding = model.encode(item['name'])
    similarities = cosine_similarity([item_embedding], category_embeddings)[0]
    return np.max(similarities) > 0.3  # Adjust this threshold as needed

def filter_edible_items(items):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    category_embeddings = model.encode(grocery_categories)
    return [item for item in items if is_likely_edible(item, model, category_embeddings)]

def create_embeddings(items):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    item_texts = [item['name'] for item in items]
    return model.encode(item_texts)

def group_items(items, embeddings):
    clustering = DBSCAN(eps=0.3, min_samples=2, metric='cosine').fit(embeddings)
    groups = {}
    for i, label in enumerate(clustering.labels_):
        if label != -1:  # -1 is the label for noise points
            if label not in groups:
                groups[label] = []
            groups[label].append(items[i])
    return list(groups.values())

all_items = load_items()
edible_items = filter_edible_items(all_items)
embeddings = create_embeddings(edible_items)
item_groups = group_items(edible_items, embeddings)

print(f"Loaded {len(all_items)} items")
print(f"Filtered to {len(edible_items)} likely edible items")
print(f"Created {len(item_groups)} groups of similar edible items")

Loaded 4005 items
Filtered to 1388 likely edible items
Created 203 groups of similar edible items


In [43]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import json
from pathlib import Path
import numpy as np

def load_items():
    all_items = []
    for file in Path('flyer_data').glob('*.json'):
        with open(file) as f:
            data = json.load(f)
            for item in data['items']:
                all_items.append({
                    'store': data['store'],
                    'name': item['name'],
                    'brand': item['brand'],
                    'price': float(item['price']),
                    'image': item['cutout_image_url'],
                })
    return all_items

def create_embeddings(items):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    item_texts = [f"{item['brand']} {item['name']}" if item['brand'] else item['name'] for item in items]
    return model.encode(item_texts)

def group_items(items, embeddings):
    clustering = DBSCAN(eps=0.3, min_samples=2, metric='cosine').fit(embeddings)
    groups = {}
    for i, label in enumerate(clustering.labels_):
        if label != -1:  # -1 is the label for noise points
            if label not in groups:
                groups[label] = []
            groups[label].append(items[i])
    return list(groups.values())

all_items = load_items()
embeddings = create_embeddings(all_items)
item_groups = group_items(all_items, embeddings)

print(f"Created {len(item_groups)} groups of similar items")

Created 577 groups of similar items


In [78]:
# Cell: Debranding, vector-based edible filtering, and improved grouping

import re
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def clean_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def debrand_item(item):
    name = clean_text(item['name'])
    brand = clean_text(item.get('brand', ''))
    
    if brand and brand in name:
        name = name.replace(brand, '').strip()
    
    name = re.sub(r'\d+(\.\d+)?\s*(oz|lb|g|kg|ml|l|pack|ct|count|pk|piece|pc|fl\.?\s*oz)\b', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    
    return name

# List of keywords for edible items
edible_keywords = [
    'food', 'fruit', 'vegetable', 'meat', 'fish', 'dairy', 'milk', 'cheese', 'yogurt',
    'bread', 'cereal', 'snack', 'chip', 'cookie', 'cracker', 'candy', 'chocolate',
    'beverage', 'drink', 'juice', 'soda', 'water', 'coffee', 'tea',
    'condiment', 'sauce', 'spice', 'herb', 'oil', 'vinegar',
    'pasta', 'rice', 'grain', 'bean', 'legume',
    'soup', 'stew', 'broth',
    'frozen', 'fresh', 'canned',
    'breakfast', 'lunch', 'dinner', 'dessert',
    'organic', 'natural', 'processed'
]

# Initialize SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for edible keywords
edible_embeddings = model.encode(edible_keywords)

def is_likely_edible(name, threshold=0.3):
    name_embedding = model.encode([name])
    similarities = cosine_similarity(name_embedding, edible_embeddings)[0]
    return np.max(similarities) > threshold

# Debrand item names and categorize as edible or non-edible
edible_items = []
non_edible_items = []
for item in all_items:
    debranded_name = debrand_item(item)
    item['debranded_name'] = debranded_name
    if is_likely_edible(debranded_name):
        edible_items.append(item)
    else:
        non_edible_items.append(item)

# Create new embeddings based on debranded names of edible items
debranded_embeddings = model.encode([item['debranded_name'] for item in edible_items])

# Group items using the new embeddings
def group_items_improved(items, embeddings):
    clustering = DBSCAN(eps=0.3, min_samples=2, metric='cosine').fit(embeddings)
    groups = {}
    for i, label in enumerate(clustering.labels_):
        if label != -1:  # -1 is the label for noise points
            if label not in groups:
                groups[label] = []
            groups[label].append(items[i])
    return list(groups.values())

improved_item_groups = group_items_improved(edible_items, debranded_embeddings)

print(f"Total items: {len(all_items)}")
print(f"Likely edible items: {len(edible_items)}")
print(f"Non-edible items: {len(non_edible_items)}")
print(f"Created {len(improved_item_groups)} improved groups of similar edible items")

# Print a sample of edible groups
print("\nSample of Edible Item Groups:")
for i, group in enumerate(improved_item_groups):
    print(f"\nGroup {i+1}:")
    for item in group:
        print(f"  Original: {item['name']}")
        print(f"  Brand: {item.get('brand', 'N/A')}")
        print(f"  Debranded: {item['debranded_name']}")
    print(f"  Group size: {len(group)}")



# Print a sample of non-edible items
print("\nSample of Non-Edible Items:")
for item in non_edible_items:
    print(f"  Original: {item['name']}")
    print(f"  Brand: {item.get('brand', 'N/A')}")
    print(f"  Debranded: {item['debranded_name']}")


Total items: 4005
Likely edible items: 1753
Non-edible items: 2252
Created 243 improved groups of similar edible items

Sample of Edible Item Groups:

Group 1:
  Original: YETI® Rambler® 42 Oz. Straw Mug with Lid
  Brand: YETI®
  Debranded: rambler straw mug with lid
  Original: YETI Rambler® 35 Oz. Straw Mug
  Brand: YETI®
  Debranded: rambler straw mug
  Original: Rambler® 35 Oz. Straw Mug,
  Brand: None
  Debranded: rambler straw mug
  Original: YETI® Rambler® 42 Oz. Straw Mug with Lid
  Brand: YETI®
  Debranded: rambler straw mug with lid
  Original: Rambler® 35 Oz. Straw Mug
  Brand: None
  Debranded: rambler straw mug
  Original: YETI® Rambler® 42 Oz. Straw Mug with Lid
  Brand: YETI®
  Debranded: rambler straw mug with lid
  Original: YETI® Rambler® 35 Oz. Straw Mug
  Brand: YETI®
  Debranded: rambler straw mug
  Original: YETI® Rambler® 42 Oz. Straw Mug with Lid
  Brand: Yeti
  Debranded: rambler straw mug with lid
  Original: Harvest Mugs
  Brand: None
  Debranded: harvest mug

In [60]:
# print the groups
for i, group in enumerate(item_groups):
    print(f"Group {i+1}:")
    for item in group:
        print(f"  {item['name']}")


Group 1:
  Lowrance HOOK Reveal 9 TripleShot Fishfinder
  Lowrance HOOK Reveal 5 Fish Finder - 5 SS US Inland
Group 2:
  YETI® Rambler® 42 Oz. Straw Mug with Lid
  YETI Rambler® 35 Oz. Straw Mug
  Rambler® 35 Oz. Straw Mug,
  YETI® Rambler® 42 Oz. Straw Mug with Lid
  Rambler® 35 Oz. Straw Mug
  YETI® Rambler® 42 Oz. Straw Mug with Lid
  YETI® Rambler® 35 Oz. Straw Mug
  YETI® Rambler® 42 Oz. Straw Mug with Lid
  12-oz. coffee mug
Group 3:
  Genesis® SA-E-330™ Gas Grill,
  Genesis® SA-E-330™ Gas Grill
  Weber® Genesis® SA-E-325™ Gas Grill
  Weber® Genesis® SA-E-325™ Gas Grill
Group 4:
  Select Songbird Selections® Wild Bird Food, 5 Lb.**
  Select Songbird Selections® Wild Bird Food, 5 Lb.**
  Ace® 40 Lb. Wild Bird Food Kaytee® 14 Lb. Songbird Blend™ or 16 Lb. Birders' Blend® Wild Bird Food
  Ace® 40 Lb. Wild Bird Food Kaytee® 14 Lb. Songbird Blend™ or 16 Lb. Birders' Blend® Wild Bird Food
  Select Songbird Selections® Wild Bird Food, 5 Lb.**
  Select Songbird Selections® Wild Bird Food

In [51]:
# Cell 2: Search groups by term

from sklearn.metrics.pairwise import cosine_similarity

def search_groups(groups, search_term):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    search_embedding = model.encode([search_term])[0]
    
    matching_groups = []
    for group in groups:
        group_embeddings = model.encode([f"{item['brand']} {item['name']}" if item['brand'] else item['name'] for item in group])
        max_similarity = np.max(cosine_similarity([search_embedding], group_embeddings)[0])
        if max_similarity > 0.4:  # Adjust this threshold as needed
            matching_groups.append(group)
    
    return matching_groups

search_term = 'fruit' #input("Enter a search term: ")
matching_groups = search_groups(item_groups, search_term)
print(f"Found {len(matching_groups)} matching groups for '{search_term}'")

# print the groups
for i, group in enumerate(matching_groups):
    print(f"Group {i+1}:")
    for item in group:
        print(f"  {item['name']}")



Found 12 matching groups for 'fruit'
Group 1:
  Select Halloween Decorative Lighting** 24' C6 LED Halloween Light String
  Select Halloween Decorative Lighting**
  Select Halloween Decorative Lighting**
  Select Halloween Decorative Lighting**
  Select Halloween Decorative Lighting**
  Select Halloween Decorative Lighting**
  Select Halloween Decorative Lighting**
  Select Halloween Decorative Lighting**
  9' LED Pumpkin Light String
  Select Halloween Decorative Lighting**
  Halloween Socks
  Holiday Socks
  Halloween Glow
  Halloween Decor
  Halloween Masks or Headpieces
  Halloween Makeup
  Pumpkin Carving Kit
  Halloween Partyware
  Halloween Crafts
  Halloween Costumes
  Light-Up Masks
  Costume Accessories
  Costume Accessories
  Licensed Socks
  Halloween Dress Up for the Family
  Halloween Candles
  Halloween Housewares
  Halloween Tinsel Icons
  Halloween Doormat
  Halloween Make - Up Kits, Hats, Capes, Masks, Treat Bags or Buckets, Dress Up Kits, Assorted Hairspray, Halloween

In [None]:

# Cell 3: Use GPT-4-vision to compare deals within groups

from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv()

client = OpenAI()

def compare_deals(group):
    items_text = "\n".join([f"Store: {item['store']}, Name: {item['name']}, Brand: {item['brand']}, Price: ${item['price']:.2f}" for item in group])
    images = [{"type": "image_url", "image_url": {"url": item['image']}} for item in group]
    
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Compare these similar items and determine which is the best deal. Consider factors like price, brand, and any visible quality differences. Here are the items:\n\n{items_text}\n\nRespond with the name of the item that's the best deal and a brief explanation why."
                    },
                    *images
                ],
            }
        ],
        max_tokens=150,
    )
    return response.choices[0].message.content.strip()

# Compare deals for each matching group
for i, group in enumerate(matching_groups):
    print(f"\nGroup {i+1}:")
    best_deal = compare_deals(group)
    print(best_deal)