## Install Dependencies

In [None]:
!pip install datasets chromadb bitsandbytes gradio

## Import Libraries

In [2]:
import os
import torch
import re
import torch.nn.functional as F
import gradio as gr
import requests
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
from peft import PeftModel
from huggingface_hub import login

## OpenRouter API Setup (via the OpenAI client)

In [3]:
from google.colab import userdata
from openai import OpenAI

# The OpenAI SDK client is pointed at the OpenRouter endpoint instead of the default one.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

# Both credentials are read from the Colab secrets of this notebook.
api_key, hf_token = userdata.get('api_key'), userdata.get('hf_token')

# Shared chat-completions client used by the agents defined below.
client = OpenAI(api_key=api_key, base_url=OPENROUTER_BASE_URL)

## Define ProductDescriptionAgent Class
Defines an agent that sends a product URL to an LLM and returns the generated product description together with the price extracted from the response.

In [None]:
class ProductDescriptionAgent:
    """
    An agent that takes a product URL and constructs a prompt with a system message and user instructions.
    The prompt instructs the model to read the URL and produce a detailed product description ending with a price
    line in the format "Price is $X". The system message and instructions warn:
      "Be careful with products that are described as '$XXX off' or 'reduced by $XXX' - this isn't the actual price of the product."
    After obtaining the response, the agent extracts the price, removes it from the description, and returns both.

    The chat request is sent through the module-level ``client`` object.
    """

    # Trailing "Price is $X" statement. X may carry thousands separators ("1,299.99"),
    # which are stripped before the value is converted to a float.
    _PRICE_PATTERN = re.compile(r"Price is \$([0-9][0-9,]*(?:\.[0-9]+)?)\s*$")

    def __init__(self, model):
        """Store the prompt texts and the name of the chat model to call."""
        self.system_message = '''You are a product description generator. Your task is to read the product listing at the given URL and produce a detailed description
        of the product, including its key features, and relevant details. IMPORTANT: Be careful with products that are described as
        \"$XXX off\" or \"reduced by $XXX\" - this isn't the actual price of the product.\n
        At the very end of your description, on a new line, append the actual price in the following format:\n
        Price is $X'''

        self.user_message = '''Read the product listing at the provided URL and write a detailed description of the product.
        At the very end of your description, on a new line, append the actual price in the following format:\n
        Price is $X\n'''

        self.model = model

    def process_url(self, url):
        """
        Ask the model to describe the product at ``url`` and split the answer into description and price.

        Args:
            url (str): Product listing URL that is appended to the user prompt.

        Returns:
            tuple[str, float]: The description with the trailing price statement removed, and the price.

        Raises:
            ValueError: If the response does not end with "Price is $X" or X cannot be parsed.
        """
        user_message = self.user_message + f"\nHere is the url {url}"
        messages = [
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": user_message}
        ]

        response = client.chat.completions.create(
            model=self.model,
            messages=messages
        )
        # The content may be None (no text returned); treat that as an empty answer so the
        # format check below raises the documented ValueError instead of an AttributeError.
        result = (response.choices[0].message.content or "").strip()

        # The model's response should end with a line like: "Price is $123.45"
        match = self._PRICE_PATTERN.search(result)
        if not match:
            raise ValueError("The model response did not include a price in the expected format.")

        try:
            price = float(match.group(1).replace(",", ""))
        except ValueError as e:
            raise ValueError("Failed to convert extracted price to float.") from e

        # Cut at the start of the matched price statement so it is removed even when the
        # model put it at the end of the last sentence rather than on its own line.
        description_without_price = result[:match.start()].rstrip()

        return description_without_price, price

In [5]:
# Smoke test: run the description agent against one real product listing.
test_url = "https://www.saturn.de/de/product/_bose-quietcomfort-ultra-wireless-noise-cancelling-over-ear-kopfhorer-bluetooth-weiss-2891468.html?storeId=&utm_source=google&utm_medium=cpc&utm_source=google&utm_medium=cpc&utm_campaign=RT_shopping_na_sp_na_PLA-WKZ-Bose+Paid+AO+April+24+-+M%C3%A4rz+25-PLA0203025&gad_source=1&gclid=Cj0KCQjws-S-BhD2ARIsALssG0amT7K3S2OPxVP_NwHXoW97NrET0FWnvk_Aj6K4HxM176nGU3VromEaAm9pEALw_wcB&gclsrc=aw.ds"

agent = ProductDescriptionAgent(model="google/gemini-2.0-pro-exp-02-05:free")  # Use a model you have access to
description, price = agent.process_url(test_url)

# Header and description go out in one call; sep="\n" reproduces the line break between them.
print("Constructed Description (without price):\n", description, sep="\n")
print("\nExtracted Price:", price)

Constructed Description (without price):

The Bose QuietComfort Ultra Wireless Noise Cancelling Over-Ear Headphones offer a premium listening experience with cutting-edge technology. These headphones feature world-class noise cancellation, allowing you to immerse yourself in your audio and silence distractions. Bose's innovative CustomTune technology automatically adjusts the noise cancellation and sound performance to perfectly fit your ears, ensuring optimal audio quality.

Enjoy a comfortable and stable fit with these over-ear headphones, designed for extended wear. The plush earcups and headband cradle your ears, while the sleek, sophisticated design reflects a high-end aesthetic.

These headphones don't just excel in audio quality; they also provide crystal-clear call quality, even in noisy environments. The advanced microphone system isolates your voice and filters out ambient sounds, ensuring your conversations are clear and uninterrupted.

With Bluetooth connectivity, you can e

## Load Pricer Dataset

In [6]:
from datasets import load_dataset

# Hugging Face dataset that provides the product texts and prices.
DATASET_NAME = "ed-donner/pricer-data"
dataset = load_dataset(DATASET_NAME)

train, test = dataset['train'], dataset['test']

# Keep only the first 20,000 training rows; these feed the vector store built further down.
data = train.select(range(20000))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/416 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/914k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
# Pull the text and price columns out of the selected rows
product_descriptions, product_prices = data['text'], data['price']

## Setup ChromaDB & SentenceTransformer

In [8]:
import chromadb
import os
import shutil
from sentence_transformers import SentenceTransformer
import tqdm

# Define the database path
DB = "products_vectorstore"

# Delete the store if it exists so every run starts from an empty collection
if os.path.exists(DB):
    shutil.rmtree(DB)

# Initialize ChromaDB.
# The handle is deliberately NOT named `client`: that global is the OpenAI/OpenRouter
# client that ProductDescriptionAgent and PricePredictor call, and rebinding it here
# would break every agent call made after this cell has run.
chroma_client = chromadb.PersistentClient(path=DB)
collection = chroma_client.get_or_create_collection("products")

# Load the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for descriptions
embeddings = model.encode(product_descriptions, show_progress_bar=True)

# Store embeddings in ChromaDB; the description and price travel along as metadata
collection.add(
    ids=[str(i) for i in range(len(product_descriptions))],  # String IDs
    embeddings=embeddings.tolist(),
    metadatas=[{"description": desc, "price": price} for desc, price in zip(product_descriptions, product_prices)]
)

# Report the real number of stored items instead of a hard-coded figure
print(f"✅ Vector store successfully created with {len(product_descriptions):,} product embeddings!")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

✅ Vector store successfully created with 20,000 product embeddings!


## RAG: Define Similar Product Retrieval Function
Creates a function to retrieve similar products via a RAG pipeline.

In [9]:
def retrieve_similar_products(query, top_k=5):
    """
    Look up the products in the vector store that are closest to a text query.

    The query is embedded with the global 'model' and the global 'collection' is asked
    for the `top_k` nearest entries.

    Returns:
        list[dict]: One dict per hit with the keys 'description' and 'price'
        (defaulting to "" and 0 when a hit's metadata lacks them).
    """
    # Embed the query as a batch of one and keep its single vector
    query_vector = model.encode([query])[0].tolist()

    hits = collection.query(query_embeddings=[query_vector], n_results=top_k)

    # Only one query was sent, so only the first metadata list is relevant
    matches = []
    for metadata in hits["metadatas"][0]:
        matches.append(
            {
                "description": metadata.get("description", ""),
                "price": metadata.get("price", 0),
            }
        )
    return matches

In [10]:
# Quick check of the retrieval step with a free-text query
query = "High-performance wireless noise-canceling headphones"
similar_products = retrieve_similar_products(query)

print("\n🔍 Top 5 Similar Products:")
for i, product in enumerate(similar_products, start=1):
    # One numbered line per hit: stored description followed by its price
    print(f"{i}. {product['description']} - 💰 ${product['price']}")


🔍 Top 5 Similar Products:
1. How much does this cost to the nearest dollar?

Sony Wireless Noise Canceling Over-Ear Headphones (Black) Extra Bass Portable Bluetooth Speaker Bundle (2 Items)
Sony Wireless Noise Canceling Over-Ear Headphones The headphones rewrite the rules for distraction-free listening. 2 processors control 8 microphones for unprecedented noise cancellation and exceptional call quality. With a newly developed driver, DSEE – Extreme, and Hi-Res audio support the headphones provide awe-inspiring audio quality. Industry-leading Noise Cancellation From airplane noise to people’s voices, our wireless headphones with 8 microphones for noise cancellation keep out more high and mid-frequency sounds than ever. Multi Noise Sensor Technology With 8 total microphones, these headphones are our biggest step forward in noise-canceling technology. Incomparable Noise Processing Specially developed by Sony, the Integrated Processor V

Price is $398.00 - 💰 $398.0
2. How much does this c

## Define PricePredictor Class
Defines a class for predicting prices from product descriptions.

In [13]:
class PricePredictor:
    """
    An agent that takes a product description without price and predicts what the price should be
    by finding similar products and asking an LLM to estimate based on those examples.

    Similar products come from the module-level ``retrieve_similar_products`` function and the
    chat request is sent through the module-level ``client`` object.
    """

    def __init__(self, model):
        """Store the prompt texts and the name of the chat model to call."""
        self.system_message = '''You are a price prediction specialist. Your task is to analyze a product description and similar
        products with known prices, then predict the most likely price for the described product.

        You will be given:
        1. A product description for which you need to predict the price
        2. Information about similar products and their prices

        IMPORTANT INSTRUCTIONS:
        - Consider all the given similar products and their prices as reference points
        - Think about the features, quality, and brand positioning described in the target product
        - Pay attention to any premium features or limitations that might affect pricing
        - Respond ONLY with a numeric price value rounded to the nearest dollar
        - Do not include any explanations, currency symbols, or other text
        - Just return a number representing your price prediction'''

        self.user_message_template = '''How much does this cost to the nearest dollar?

PRODUCT DESCRIPTION:
{description}

SIMILAR PRODUCTS AND THEIR PRICES:
{similar_products}

What would be the most likely price for this product to the nearest dollar?
Remember to respond ONLY with a numeric price value (a number with no dollar sign or text).'''

        self.model = model

    def predict_price(self, description):
        """
        Predict a price for ``description`` using retrieved similar products as context.

        Args:
            description (str): Product description without a price.

        Returns:
            tuple[int, str]: The predicted price rounded to the nearest dollar, and the
            user message that was sent to the model.

        Raises:
            ValueError: If no number can be found in the model's reply.
        """
        # Retrieve similar products
        similar_products = retrieve_similar_products(description)

        # Format the similar products as a numbered list, one product per line
        similar_products_text = "".join(
            f"{i}. {product['description']} - ${product['price']}\n"
            for i, product in enumerate(similar_products, 1)
        )

        # Create the user message with the template
        user_message = self.user_message_template.format(
            description=description,
            similar_products=similar_products_text
        )

        # The trailing assistant message pre-fills the start of the answer
        messages = [
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": "Price is $"}
        ]

        # Call the model
        response = client.chat.completions.create(
            model=self.model,
            messages=messages
        )

        # The content may be None (no text returned); fall through to the ValueError below
        result = (response.choices[0].message.content or "").strip()

        # Take the first number in the reply. The first alternative accepts thousands
        # separators so that "1,299" is read as 1299 rather than as 1.
        match = re.search(r'\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\b', result)
        if match:
            predicted_price = float(match.group(1).replace(",", ""))
            # Round to nearest dollar
            return round(predicted_price), user_message

        raise ValueError(f"Failed to extract a valid price from model response: '{result}'")

In [16]:
# Make sure `client` is the OpenRouter chat client before calling the predictors
client = OpenAI(
    base_url=OPENROUTER_BASE_URL,
    api_key=api_key
)

deepseek_predictor = PricePredictor(model="deepseek/deepseek-chat:free")
gemini_predictor = PricePredictor(model="google/gemini-2.0-pro-exp-02-05:free")

test_description = '''The Bose QuietComfort Ultra Wireless Noise Cancelling Over-Ear Headphones offer a premium listening experience with cutting-edge technology. These headphones, available in a sleek white finish, feature Bose's renowned noise-cancelling capabilities, allowing you to immerse yourself in your audio and block out distractions. The over-ear design provides comfort for extended wear, while the Bluetooth connectivity enables wireless freedom.

Key Features:

*   **World-Class Noise Cancellation:** Bose's industry-leading noise cancellation technology effectively minimizes external sounds, creating a quiet and focused listening environment.
*   **Immersive Audio:** Experience high-fidelity audio with rich bass, clear mids, and crisp highs, delivering a captivating sound experience.
*   **Wireless Freedom:** Bluetooth connectivity allows you to connect wirelessly to your devices, eliminating the hassle of tangled cords.
*   **Comfortable Over-Ear Design:** The plush earcups and adjustable headband are designed for long-lasting comfort, even during extended listening sessions.
* CustomTune technology that auto-adjusts to your liking.
* Calls are clear even when it's windy or noisy

These headphones are perfect for anyone seeking a premium audio experience with exceptional noise cancellation, whether for travel, work, or relaxation.'''

predicted_price, prompt = deepseek_predictor.predict_price(test_description)
# Keep the Gemini result in its own variable: assigning it to `gemini_predictor` would
# replace the predictor object with a number and break any later use or re-run of this cell.
predicted_price_gemini, prompt = gemini_predictor.predict_price(test_description)
print(prompt)
# `price` is the value extracted by the ProductDescriptionAgent test cell earlier in the notebook
print(f"\nActual Price: ${price}")
print(f"\nPredicted Price from DeepSeek: ${predicted_price}")
print(f"\nPredicted Price from Gemini: ${predicted_price_gemini}")

How much does this cost to the nearest dollar?

PRODUCT DESCRIPTION:
The Bose QuietComfort Ultra Wireless Noise Cancelling Over-Ear Headphones offer a premium listening experience with cutting-edge technology. These headphones, available in a sleek white finish, feature Bose's renowned noise-cancelling capabilities, allowing you to immerse yourself in your audio and block out distractions. The over-ear design provides comfort for extended wear, while the Bluetooth connectivity enables wireless freedom.

Key Features:

*   **World-Class Noise Cancellation:** Bose's industry-leading noise cancellation technology effectively minimizes external sounds, creating a quiet and focused listening environment.
*   **Immersive Audio:** Experience high-fidelity audio with rich bass, clear mids, and crisp highs, delivering a captivating sound experience.
*   **Wireless Freedom:** Bluetooth connectivity allows you to connect wirelessly to your devices, eliminating the hassle of tangled cords.
*   **C

## HuggingFace Login

In [17]:
# Authenticate against the Hugging Face Hub with the token stored in the Colab secrets
hf_token = userdata.get('hf_token')
login(token=hf_token, add_to_git_credential=True)

## Define FineTunedPredictor Class
Defines a class for a fine-tuned model-based price prediction agent.

In [30]:
class FineTunedPredictor:
    """
    A simplified price prediction agent that uses a fine-tuned LLM to predict product prices
    based on descriptions only.
    """

    def __init__(self,
                 base_model="meta-llama/Meta-Llama-3.1-8B",
                 hf_user="HazemAshraf",
                 project_name="pricer",
                 run_name="2025-03-17_23.10.34",
                 revision=None,
                 quant_4_bit=True,
                 device="cuda"):
        """
        Load the tokenizer, the quantized base model and the fine-tuned adapter.

        Args:
            base_model (str): Hub id of the base model and tokenizer.
            hf_user (str): Hub user that owns the fine-tuned model.
            project_name (str): First part of the fine-tuned model's repository name.
            run_name (str): Second part of the repository name ("{project_name}-{run_name}").
            revision (str | None): Optional revision of the fine-tuned model to load.
            quant_4_bit (bool): Use the 4-bit quantization config when True, the 8-bit one otherwise.
            device (str): Device used as ``device_map`` and for the input tensors.
        """
        self.device = device
        self.project_run_name = f"{project_name}-{run_name}"
        self.finetuned_model_name = f"{hf_user}/{self.project_run_name}"
        self.revision = revision

        # Setup quantization configuration
        if quant_4_bit:
            self.quant_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_quant_type="nf4"
            )
        else:
            self.quant_config = BitsAndBytesConfig(
                load_in_8bit=True,
                bnb_8bit_compute_dtype=torch.bfloat16
            )

        # Load tokenizer; the EOS token doubles as the padding token
        self.tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"

        # Load base model
        self.base_model = AutoModelForCausalLM.from_pretrained(
            base_model,
            quantization_config=self.quant_config,
            device_map=self.device,
        )
        self.base_model.generation_config.pad_token_id = self.tokenizer.pad_token_id

        # Load fine-tuned model
        if self.revision:
            self.model = PeftModel.from_pretrained(
                self.base_model,
                self.finetuned_model_name,
                revision=self.revision
            )
        else:
            self.model = PeftModel.from_pretrained(
                self.base_model,
                self.finetuned_model_name
            )

    def predict_price(self, description, top_k=3):
        """
        Uses the fine-tuned model to predict the price based on product description.

        The prompt ends with "Price is $"; the ``top_k`` most probable next tokens that decode
        to a positive, finite number are combined into a probability-weighted average.

        Args:
            description (str): Product description without price
            top_k (int): Number of top tokens to consider for weighted average

        Returns:
            int | float: Weighted price rounded to the nearest dollar (int), or 0.0 when
                none of the top tokens is a usable number
            str: The prompt used for prediction
        """
        # Ensure description starts with the question
        if not description.startswith("How much does this cost to the nearest dollar?"):
            prompt = "How much does this cost to the nearest dollar?\n" + description
        else:
            prompt = description

        # Add the price prefix
        prompt += "\nPrice is $"

        # Tokenize the input
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        attention_mask = torch.ones(inputs.shape, device=self.device)

        # Single forward pass; only the logits of the position after the prompt are needed
        with torch.no_grad():
            outputs = self.model(inputs, attention_mask=attention_mask)
            next_token_logits = outputs.logits[:, -1, :].to('cpu')

        # Calculate probabilities and get top tokens
        next_token_probs = F.softmax(next_token_logits, dim=-1)
        top_prob, top_token_id = next_token_probs.topk(top_k)

        # Collect the numeric candidates together with their probabilities
        prices, weights = [], []
        for i in range(top_k):
            predicted_token = self.tokenizer.decode(top_token_id[0][i])
            probability = float(top_prob[0][i].item())
            try:
                result = float(predicted_token)
            except ValueError:
                continue
            # float() also parses "inf" and "nan"; the chained comparison rejects both
            # (and zero/negative values), which would otherwise make round() fail below.
            if 0 < result < float("inf"):
                prices.append(result)
                weights.append(probability)

        # Calculate weighted average price
        if not prices:
            predicted_price = 0.0
        else:
            total = sum(weights)
            weighted_price = sum(price * weight for price, weight in zip(prices, weights)) / total
            predicted_price = round(weighted_price)

        return predicted_price, prompt

In [21]:
# Load the fine-tuned pricer with its default settings and try it on the sample description
predictor = FineTunedPredictor()
predicted_price, prompt = predictor.predict_price(test_description)

print(f"\nPredicted Price: ${predicted_price}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Memory footprint: 5700.6 MB

Predicted Price: $385


## Full System Integration

In [50]:
# Build the four components of the pipeline.
# Local fine-tuned model:
predictor = FineTunedPredictor()
# Retrieval-augmented predictors backed by remote chat models:
deepseek_predictor = PricePredictor(model="deepseek/deepseek-chat:free")
gemini_predictor = PricePredictor(model="google/gemini-2.0-pro-exp-02-05:free")
# URL -> (description, price) agent:
agent = ProductDescriptionAgent(model="google/gemini-2.0-pro-exp-02-05:free")

def process_url(url: str) -> str:
    """
    Run the full pipeline for one product URL and return a text report.

    The global `agent` extracts the description and listed price; the global
    `deepseek_predictor`, `gemini_predictor` and `predictor` each estimate a price from the
    description. The report lists all values, their average, and the discount of the listed
    price relative to that average.

    Args:
        url: Product listing URL.

    Returns:
        The multi-line report, or a one-line error message when one of the agents could not
        produce a usable result (they signal that with ValueError).
    """
    try:
        # Process the product URL to extract description and price
        description, price = agent.process_url(url)

        # Predict prices using the different predictors
        predicted_deepseek, _ = deepseek_predictor.predict_price(description)
        predicted_gemini, _ = gemini_predictor.predict_price(description)
        predicted_finetuned, _ = predictor.predict_price(description)
    except ValueError as err:
        # Show the reason in the output box instead of an unexplained UI error
        return f"Could not analyze this product: {err}"
    finally:
        # Clean up GPU memory whether or not the predictions succeeded
        gc.collect()
        torch.cuda.empty_cache()

    avg_predicted = (predicted_deepseek + predicted_gemini + predicted_finetuned) / 3

    report = [
        f"Actual Price: ${price}",
        f"Predicted Price from DeepSeek: ${predicted_deepseek}",
        f"Predicted Price from Gemini: ${predicted_gemini}",
        f"Predicted Price from FineTuned: ${predicted_finetuned}",
        f"Average Predicted Price: ${avg_predicted:.2f}",
    ]

    if avg_predicted <= 0:
        # The discount is a fraction of the average prediction, so it is undefined here
        report.append("Discount: n/a (no positive price prediction available)")
        report.append("Unable to judge the deal.")
    else:
        discount = ((avg_predicted - price) / avg_predicted) * 100
        report.append(f"Discount: {discount:.2f}%")
        report.append("Good deal!" if discount > 30 else "Not a good deal.")

    return "\n".join(report)

# Create a Gradio Blocks interface: a heading, then one row with two columns —
# URL textbox and button on the left, a 15-line output textbox on the right.
with gr.Blocks() as demo:
    gr.Markdown("## Product Price Analysis")
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="Product URL", placeholder="Enter product URL here...")
            run_button = gr.Button("Analyze Price")
        with gr.Column():
            output_text = gr.Textbox(label="Output", lines=15)

    # Clicking the button calls process_url with the URL textbox's value and
    # writes the returned string into the output textbox.
    run_button.click(fn=process_url, inputs=url_input, outputs=output_text)

# Launch the Gradio app
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d7e4482f8e04545dfe.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


