### Large Language Models (LLMs) for Zero-Shot Product Extraction

In [1]:
import os
from dotenv import load_dotenv
from google import genai
from google.genai import types
import pandas as pd
from datetime import datetime
import sqlite3
import json
from typing import List, Dict, Any, Optional
import re

# Load variables from a local .env file into the process environment so the
# os.environ lookups below (e.g. GEMINI_API_KEY) can find them.
load_dotenv()

True

In [2]:
class SimpleProductExtractor:
    """Minimal product/price extractor backed by a Google Gemini model.

    The class shows the three stages of the pipeline in their simplest form:
    prompt the model (``extract_from_text``), parse its JSON answer
    (``_parse_products``) and persist the result (``save_to_json``).
    """

    def __init__(self, api_key=None):
        """Create the Gemini client.

        Args:
            api_key: Gemini API key. When omitted, the value of the
                ``GEMINI_API_KEY`` environment variable is used.
        """
        self.api_key = api_key or os.environ.get("GEMINI_API_KEY")
        self.client = genai.Client(api_key=self.api_key)
        self.model = 'gemini-2.0-flash-lite-preview-02-05'

    @staticmethod
    def _parse_products(raw):
        """Turn the raw model answer into a list of product dicts.

        The text is first parsed as plain JSON; if that fails, the content of
        a fenced ```json block inside the text is tried instead.

        Args:
            raw: Text returned by the model. May be ``None`` or empty.

        Returns:
            A list of dicts. A single JSON object is wrapped in a list,
            non-dict array items are dropped, and anything unparseable
            yields ``[]``.
        """
        if not raw:
            # Nothing to parse (json.loads(None) would raise TypeError).
            return []

        candidates = [raw]
        fenced = re.search(r'```json\s*(.*?)\s*```', raw, re.DOTALL)
        if fenced:
            candidates.append(fenced.group(1))

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                # The prompt asks for an array, but tolerate a lone object.
                parsed = [parsed]
            if isinstance(parsed, list):
                # Callers index items by key, so keep only JSON objects.
                return [item for item in parsed if isinstance(item, dict)]
        return []

    def extract_from_text(self, text):
        """Extract products and prices from text using the Gemini LLM.

        Args:
            text: Free-form text that may mention products and prices.

        Returns:
            A list of dicts as produced by ``_parse_products``; the prompt
            asks the model for ``name``, ``price`` and ``currency`` keys.
            Returns ``[]`` when the answer cannot be parsed.
        """
        # System prompt defines the extraction task
        system_prompt = """
        Extract all products and their prices from the text.
        Return a JSON array where each item has:
        {"name": "Product Name", "price": 123.45, "currency": "$"}
        """

        # Generate content with the model
        response = self.client.models.generate_content(
            model=self.model,
            config=types.GenerateContentConfig(
                system_instruction=system_prompt,
                max_output_tokens=5000,
                temperature=0.1
            ),
            contents=[text]
        )

        return self._parse_products(response.text)

    def save_to_json(self, products: List[Dict[str, Any]], filename: str = "products.json"):
        """Write the extracted products to a JSON file.

        Args:
            products: Product dicts to serialize.
            filename: Destination path; overwritten if it already exists.

        Returns:
            The ``filename`` that was written.
        """
        # UTF-8 with ensure_ascii=False keeps non-Latin product names
        # (e.g. Bangla) readable in the file instead of \uXXXX escapes.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(products, f, indent=2, ensure_ascii=False)
        return filename


# Example usage
def demo():
    """Run one extraction end to end and print each stage.

    Prints the input sentence, every extracted product, and the name of the
    JSON file the products were saved to.
    """
    # Alternative English sample:
    # "I bought a Samsung Galaxy S23 for $999 and AirPods Pro for $249"
    sample_text = "sold 12 peaces eggs for 200 taka"

    product_extractor = SimpleProductExtractor()
    print(f"INPUT: {sample_text}")

    # Inference step: a single call to the LLM-backed extractor.
    extracted = product_extractor.extract_from_text(sample_text)

    print("\nEXTRACTED DATA:")
    for item in extracted:
        name, currency, price = item['name'], item['currency'], item['price']
        print(f"• {name}: {currency}{price}")

    # Storage step.
    saved_path = product_extractor.save_to_json(extracted)
    print(f"\nSAVED TO: {saved_path}")


# Run the demo only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    demo()

INPUT: sold 12 peaces eggs for 200 taka

EXTRACTED DATA:
• eggs: taka200.0

SAVED TO: products.json


In [2]:
class GeminiProductExtractor:
    """Extract products and prices with Gemini and persist them in SQLite.

    ``process_text`` is the main entry point: it asks the model for a JSON
    array of products, tags each one with the source text and stores the
    rows in a ``products`` table. The instance owns one SQLite connection;
    call ``close()`` (or use the instance as a context manager) when done.
    """

    # SQLite file used when no explicit path is supplied.
    DEFAULT_DB_PATH = 'products_gemini.db'

    def __init__(self, api_key: Optional[str] = None, db_path: Optional[str] = None):
        """Initialize the Gemini client and open the database.

        Args:
            api_key: Gemini API key. When omitted, ``GEMINI_API_KEY`` and then
                ``GOOGLE_API_KEY`` are read from the environment.
            db_path: SQLite database path; defaults to ``DEFAULT_DB_PATH``.

        Raises:
            ValueError: If no API key was passed or found in the environment.
        """
        self.api_key = (
            api_key
            or os.environ.get("GEMINI_API_KEY")
            or os.environ.get("GOOGLE_API_KEY")
        )
        if not self.api_key:
            raise ValueError(
                "Gemini API key is required. Set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
                "environment variable or pass the api_key parameter.")

        self.client = genai.Client(api_key=self.api_key)
        self.model = "gemini-2.0-flash-thinking-exp"

        # Connect to database
        self.setup_database(db_path)

    # Adapter to convert datetime objects to ISO format strings
    def adapt_datetime(dt):
        return dt.isoformat()

    # Converter to parse ISO format strings back to datetime objects
    def convert_datetime(bytestring):
        return datetime.fromisoformat(bytestring.decode())

    # Registration is process-wide and happens once, when the class body runs.
    sqlite3.register_adapter(datetime, adapt_datetime)
    sqlite3.register_converter('DATETIME', convert_datetime)

    # Wrap only after registering the plain functions, so the helpers also
    # work when looked up on an instance (they take no ``self``).
    adapt_datetime = staticmethod(adapt_datetime)
    convert_datetime = staticmethod(convert_datetime)

    def __enter__(self):
        """Return the extractor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Close the database connection when leaving a ``with`` block."""
        self.close()

    def setup_database(self, db_path: Optional[str] = None):
        """Open the SQLite database and create the ``products`` table if needed.

        Args:
            db_path: Database path; defaults to ``DEFAULT_DB_PATH``.
        """
        self.db_path = db_path or self.DEFAULT_DB_PATH
        # PARSE_DECLTYPES makes the DATETIME column go through convert_datetime.
        self.conn = sqlite3.connect(self.db_path, detect_types=sqlite3.PARSE_DECLTYPES)
        self.cursor = self.conn.cursor()

        # Create products table if it doesn't exist
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS products (
            id INTEGER PRIMARY KEY,
            name TEXT,
            price REAL,
            currency TEXT,
            source_text TEXT,
            timestamp DATETIME
        )
        ''')
        self.conn.commit()

    @staticmethod
    def _parse_json_payload(raw: Optional[str]) -> Optional[List[Dict[str, Any]]]:
        """Parse the model answer into a list of product dicts.

        The whole text is tried as JSON first, then the content of a fenced
        ```json block. A single JSON object is wrapped in a list and non-dict
        array items are dropped.

        Returns:
            The parsed list (possibly empty), or ``None`` when no candidate
            could be parsed into a list.
        """
        if not raw:
            return None

        candidates = [raw]
        fenced = re.search(r'```json\s*(.*?)\s*```', raw, re.DOTALL)
        if fenced:
            candidates.append(fenced.group(1))

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                parsed = [parsed]
            if isinstance(parsed, list):
                return [item for item in parsed if isinstance(item, dict)]
        return None

    def extract_products(self, text: str) -> List[Dict[str, Any]]:
        """Extract product names and prices from text using Gemini.

        Args:
            text: Free-form text that may mention products and prices.

        Returns:
            A list of product dicts, each with ``source_text`` set to
            ``text``. Returns ``[]`` (after printing a message) when the API
            call fails or the answer cannot be parsed.
        """
        system_prompt = """
        Extract all products and their prices mentioned in the text.
        Return a JSON array where each item has the following format:
        {
            "name": "Product Name",
            "price": 123.45,
            "currency": "$"
        }

        Rules:
        1. Extract complete product names including brand and model
        2. Convert all prices to numeric values (no currency symbols in the price field)
        3. Identify the currency symbol used ($, €, £, etc.) and include it separately
        4. Return an empty array if no products with prices are detected
        5. Do not make up any information not present in the text
        6. Extract bangla text as well
        """

        # Only the API interaction is guarded here, so parse problems are not
        # misreported as API errors.
        try:
            response = self.client.models.generate_content(
                model=self.model,
                config=types.GenerateContentConfig(
                    system_instruction=system_prompt,
                    max_output_tokens=5000,
                    temperature=0.1
                ),
                contents=[text]
            )
            raw_text = response.text
        except Exception as e:
            print(f"Error calling Gemini API: {e}")
            return []

        products = self._parse_json_payload(raw_text)
        if products is None:
            print(f"Failed to parse Gemini response as JSON: {raw_text}")
            return []

        # Add source text to each product entry
        for product in products:
            product["source_text"] = text
        return products

    def store_products(self, products: List[Dict[str, Any]]):
        """Store extracted products in the database.

        Keys missing from a product dict are stored as NULL. All rows are
        written in one transaction: committed together, or rolled back if an
        insert fails.

        Args:
            products: Product dicts with ``name``, ``price``, ``currency``
                and ``source_text`` keys.
        """
        with self.conn:
            for product in products:
                self.cursor.execute(
                    "INSERT INTO products (name, price, currency, source_text, timestamp) VALUES (?, ?, ?, ?, ?)",
                    (product.get("name"),
                     product.get("price"),
                     product.get("currency"),
                     product.get("source_text"),
                     datetime.now())
                )

    def process_text(self, text: str) -> List[Dict[str, Any]]:
        """Extract products from ``text``, store them, and return them."""
        products = self.extract_products(text)
        if products:
            self.store_products(products)
        return products

    def process_voice(self, audio_file_path: str):
        """
        Placeholder for voice processing functionality
        In a real implementation, this would use a speech-to-text API
        then pass the text to process_text()

        Currently only prints a notice and returns an empty list.
        """
        print(f"Voice processing not implemented in this demo. Would process: {audio_file_path}")
        return []

    def get_stored_products(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Retrieve the most recently stored products.

        Args:
            limit: Maximum number of rows to return.

        Returns:
            Row dicts with ``id``, ``name``, ``price``, ``currency``,
            ``source_text`` and ``timestamp`` keys, newest first.
        """
        # id DESC breaks ties between rows that share a timestamp.
        self.cursor.execute(
            "SELECT id, name, price, currency, source_text, timestamp FROM products "
            "ORDER BY timestamp DESC, id DESC LIMIT ?",
            (limit,)
        )
        columns = ["id", "name", "price", "currency", "source_text", "timestamp"]
        return [dict(zip(columns, row)) for row in self.cursor.fetchall()]

    def close(self):
        """Close database connection"""
        self.conn.close()

In [3]:
# Demo usage
def run_gemini_demo():
    """Run the Gemini extractor on sample texts and list what was stored.

    Prints the products extracted from each sample, then the most recent rows
    in the database. If the extractor cannot be created because the API key
    is missing, a setup hint is printed instead.
    """
    # Initialize the extractor - make sure to set GEMINI_API_KEY in your environment
    # or pass it explicitly: GeminiProductExtractor(api_key="your-api-key")
    # Only construction is guarded: a ValueError raised later is not an
    # API-key problem and should not be reported as one.
    try:
        extractor = GeminiProductExtractor()
    except ValueError as e:
        print(f"Error: {e}")
        print("To run this demo, you need to provide a Google API key for Gemini.")
        print("Set it as an environment variable: export GEMINI_API_KEY=your-key-here")
        return

    try:
        # Sample texts
        sample_texts = [
            "amare 5kg chal den 1000 taka",
        ]

        # Process each sample text
        for text in sample_texts:
            print(f"\nProcessing text: {text}")
            products = extractor.process_text(text)
            print(f"Extracted products: {json.dumps(products, indent=2)}")

        # Show all stored products
        stored_products = extractor.get_stored_products()
        print("\nStored products in database:")
        for product in stored_products:
            print(
                f"ID: {product['id']} | {product['name']} - {product['currency']}{product['price']} | Source: '{product['source_text'][:30]}...'")
    finally:
        # Release the database connection even if processing fails.
        extractor.close()


# Run the Gemini demo only when this code is executed directly, not on import.
if __name__ == "__main__":
    run_gemini_demo()


Processing text: amare 5kg chal den 1000 taka
Extracted products: [
  {
    "name": "5kg chal",
    "price": 1000,
    "currency": "taka",
    "source_text": "amare 5kg chal den 1000 taka"
  }
]

Stored products in database:
ID: 10 | 5kg chal - taka1000.0 | Source: 'amare 5kg chal den 1000 taka...'
ID: 9 | chal - taka1000.0 | Source: 'amare 5kg chal den 1000 taka...'
ID: 8 | dim - taka200.0 | Source: 'ami dim 200 takay bikri korlam...'
ID: 7 | tomato - taka30.0 | Source: 'nowadays, the price of tomato ...'
ID: 6 | eggs - taka200.0 | Source: 'sold 12 peaces eggs for 200 ta...'
ID: 5 | Sony WH-1000XM4 headphones - $299.99 | Source: 'I found an amazing deal on Ama...'
ID: 4 | Adidas Ultra Boost - $180.0 | Source: 'The Nike Air Max costs $120, a...'
ID: 3 | Nike Air Max - $120.0 | Source: 'The Nike Air Max costs $120, a...'
ID: 2 | iPhone 14 - $799.0 | Source: 'I bought a MacBook Pro for $12...'
ID: 1 | MacBook Pro - $1299.0 | Source: 'I bought a MacBook Pro for $12...'


### Natural Language Processing (NLP) with Named Entity Recognition (NER)

In [None]:
import spacy
import pandas as pd
from datetime import datetime
import sqlite3
import json
from typing import List, Dict, Any, Tuple
import os

In [None]:
class ProductNERExtractor:
    """Product and price extractor using spaCy transformer-based NER.

    Entities that look like products are paired with the nearest price entity
    that follows them, and the pairs are stored in a SQLite ``products``
    table. The instance owns one SQLite connection; call ``close()`` when
    done.
    """

    # Entity labels treated as candidate products / prices.
    PRODUCT_LABELS = ("PRODUCT", "ORG", "FAC", "WORK_OF_ART")
    PRICE_LABELS = ("MONEY", "PRICE")
    # A price must start fewer than this many tokens after the product ends.
    MAX_TOKEN_DISTANCE = 10

    def __init__(self, model_name: str = "en_core_web_trf", db_path: str = 'products.db'):
        """Load the spaCy pipeline, add the price rule and open the database.

        Args:
            model_name: Name of the spaCy pipeline to load.
            db_path: SQLite database path.
        """
        print(f"Loading spaCy model: {model_name}...")
        self.nlp = spacy.load(model_name)

        # Configure custom pipeline components
        # Add price pattern matching rules: a number followed by a currency token.
        price_pattern = [{"LIKE_NUM": True}, {"TEXT": {"IN": ["$", "USD", "dollars", "€", "£"]}}]
        # spaCy v3 registers the component under the factory name
        # "entity_ruler"; patterns are supplied via add_patterns().
        ruler = self.nlp.add_pipe("entity_ruler")
        ruler.add_patterns([{"label": "PRICE", "pattern": price_pattern}])

        # Connect to database
        self.setup_database(db_path)

    def setup_database(self, db_path: str = 'products.db'):
        """Open the SQLite database and create the ``products`` table if needed.

        Args:
            db_path: SQLite database path.
        """
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()

        # Create products table if it doesn't exist
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS products (
            id INTEGER PRIMARY KEY,
            name TEXT,
            price REAL,
            currency TEXT,
            confidence REAL,
            source_text TEXT,
            timestamp DATETIME
        )
        ''')
        self.conn.commit()

    @staticmethod
    def _parse_price(price_text: str):
        """Split a price entity's text into ``(value, currency)``.

        Thousands separators (``,``) are removed before conversion. The
        currency is the first of ``$``, ``€``, ``£`` found in the text,
        falling back to ``"USD"``.

        Returns:
            ``(float, str)``, or ``None`` when the text contains no number.
        """
        import re  # local import: this cell's import list does not include re

        match = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', price_text)
        if match is None:
            return None
        numeric_value = float(match.group().replace(",", ""))

        for symbol in ("$", "€", "£"):
            if symbol in price_text:
                return numeric_value, symbol
        return numeric_value, "USD"

    def extract_products(self, text: str) -> List[Dict[str, Any]]:
        """Extract product names and prices from text.

        Each product entity is paired with the closest price entity that
        starts at or after the product's end, provided it is fewer than
        ``MAX_TOKEN_DISTANCE`` tokens away and its text contains a number.

        Args:
            text: Text to analyse.

        Returns:
            A list of dicts with ``name``, ``price``, ``currency``,
            ``confidence`` and ``source_text`` keys.
        """
        doc = self.nlp(text)

        # Extract potential product entities (look for PRODUCT, ORG, and other relevant entity types)
        product_entities = [ent for ent in doc.ents if ent.label_ in self.PRODUCT_LABELS]

        # Extract price entities
        price_entities = [ent for ent in doc.ents if ent.label_ in self.PRICE_LABELS]

        products = []
        for product_ent in product_entities:
            # Span.end is exclusive, so a price starting exactly at
            # product_ent.end is directly adjacent and must be accepted (>=).
            following = [p for p in price_entities if p.start >= product_ent.end]
            if not following:
                continue

            closest_price = min(following, key=lambda p: p.start)
            if closest_price.start - product_ent.end >= self.MAX_TOKEN_DISTANCE:
                continue

            parsed = self._parse_price(closest_price.text)
            if parsed is None:
                # Price entity without digits (e.g. spelled-out amounts): skip.
                continue
            numeric_value, currency = parsed

            # Use a custom `confidence` extension when one is available on the
            # entity; otherwise fall back to a fixed 0.8.
            confidence = getattr(getattr(product_ent, "_", None), "confidence", 0.8)

            products.append({
                "name": product_ent.text,
                "price": numeric_value,
                "currency": currency,
                "confidence": confidence,
                "source_text": text
            })

        return products

    def store_products(self, products: List[Dict[str, Any]]):
        """Store extracted products in the database.

        All rows are written in one transaction: committed together, or
        rolled back if an insert fails.

        Args:
            products: Dicts as returned by ``extract_products``.
        """
        with self.conn:
            for product in products:
                self.cursor.execute(
                    "INSERT INTO products (name, price, currency, confidence, source_text, timestamp) VALUES (?, ?, ?, ?, ?, ?)",
                    (product["name"],
                     product["price"],
                     product["currency"],
                     product["confidence"],
                     product["source_text"],
                     # Store an explicit ISO string instead of relying on
                     # sqlite3's default datetime adapter (deprecated in 3.12).
                     datetime.now().isoformat(sep=" "))
                )

    def process_text(self, text: str) -> List[Dict[str, Any]]:
        """Extract products from ``text``, store them, and return them."""
        products = self.extract_products(text)
        if products:
            self.store_products(products)
        return products

    def get_stored_products(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Retrieve the most recently stored products.

        Args:
            limit: Maximum number of rows to return.

        Returns:
            Row dicts with ``id``, ``name``, ``price``, ``currency``,
            ``confidence``, ``source_text`` and ``timestamp`` keys, newest
            first.
        """
        # id DESC breaks ties between rows that share a timestamp.
        self.cursor.execute(
            "SELECT id, name, price, currency, confidence, source_text, timestamp FROM products "
            "ORDER BY timestamp DESC, id DESC LIMIT ?",
            (limit,)
        )
        columns = ["id", "name", "price", "currency", "confidence", "source_text", "timestamp"]
        return [dict(zip(columns, row)) for row in self.cursor.fetchall()]

    def close(self):
        """Close database connection"""
        self.conn.close()

In [None]:
def run_spacy_demo():
    """Run the spaCy NER extractor on sample texts and list what was stored.

    Prints the products extracted from each sample sentence, then the most
    recent rows in the database. The database connection is closed even if
    processing fails.
    """
    # Initialize the extractor
    extractor = ProductNERExtractor()

    try:
        # Sample texts
        sample_texts = [
            "I bought a MacBook Pro for $1299 and an iPhone 14 for $799 at the Apple Store yesterday.",
            "The Nike Air Max costs $120, and the Adidas Ultra Boost is on sale for $180.",
            "I found an amazing deal on Amazon: Sony WH-1000XM4 headphones for only $299.99!"
        ]

        # Process each sample text
        for text in sample_texts:
            print(f"\nProcessing text: {text}")
            products = extractor.process_text(text)
            print(f"Extracted products: {json.dumps(products, indent=2)}")

        # Show all stored products
        stored_products = extractor.get_stored_products()
        print("\nStored products in database:")
        for product in stored_products:
            print(
                f"ID: {product['id']} | {product['name']} - {product['currency']}{product['price']} | Confidence: {product['confidence']:.2f}")
    finally:
        # Close the connection
        extractor.close()

In [None]:
# Run the spaCy demo only when this code is executed directly, not on import.
if __name__ == "__main__":
    run_spacy_demo()