In [1]:
!pip install selenium webdriver-manager requests

Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Using cached wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m473.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading webdriv

In [4]:
import os
import time
import requests
import urllib.parse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from webdriver_manager.chrome import ChromeDriverManager
import traceback

def print_debug(message, level="INFO"):
    """Print debug message with timestamp and level for better tracking."""
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{level}] {timestamp} - {message}")

def setup_driver():
    """Set up and return a Chrome webdriver with detailed logging."""
    print_debug("Setting up Chrome WebDriver")
    
    chrome_options = Options()
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    
    try:
        print_debug("Installing ChromeDriver")
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        print_debug("Chrome WebDriver setup complete")
        return driver
    except Exception as e:
        print_debug(f"Failed to set up Chrome WebDriver: {str(e)}", "ERROR")
        print_debug(traceback.format_exc(), "ERROR")
        raise

def download_image(url, filename, download_dir):
    """Download an image from URL and save it with the given filename."""
    try:
        print_debug(f"Downloading image from {url[:50]}...")
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        
        filepath = os.path.join(download_dir, filename)
        with open(filepath, 'wb') as f:
            f.write(response.content)
        print_debug(f"Downloaded: {filename}")
        return True
    except Exception as e:
        print_debug(f"Error downloading {url}: {str(e)}", "ERROR")
        return False

def check_page_loaded(driver):
    """Check if the page has loaded properly and get diagnostic information."""
    print_debug("Checking page load status")
    
    # Get current URL
    current_url = driver.current_url
    print_debug(f"Current URL: {current_url}")
    
    # Check page title
    title = driver.title
    print_debug(f"Page title: {title}")
    
    # Take screenshot for debugging
    screenshot_path = os.path.join(os.getcwd(), "asos_debug_screenshot.png")
    try:
        driver.save_screenshot(screenshot_path)
        print_debug(f"Screenshot saved to {screenshot_path}")
    except Exception as e:
        print_debug(f"Failed to save screenshot: {str(e)}", "ERROR")
    
    # Check page source length
    source_length = len(driver.page_source)
    print_debug(f"Page source length: {source_length} characters")
    
    # Check if we're being blocked or redirected
    if "robot" in driver.page_source.lower() or "captcha" in driver.page_source.lower():
        print_debug("Possible bot detection on the page!", "WARNING")
    
    # Try to find common elements to verify page loaded correctly
    try:
        body_text = driver.find_element(By.TAG_NAME, "body").text
        print_debug(f"Body text preview: {body_text[:100]}...")
    except:
        print_debug("Could not extract body text", "WARNING")
    
    return source_length > 1000  # Basic check if we have substantial content

def download_asos_images(category, max_images=100, base_url="https://www.asos.com/us/"):
    """
    Download images from ASOS for a specific category with a maximum limit.
    
    Args:
        category: String representing the clothing category (e.g., 'tops', 'dresses')
        max_images: Maximum number of images to download
        base_url: Base URL for ASOS
    """
    # Create directory for downloaded images
    download_dir = os.path.join(os.getcwd(), f'asos-{category}')
    os.makedirs(download_dir, exist_ok=True)
    
    # Build the category URL
    if category == 'tops':
        category_url = f"{base_url}women/tops/cat/?cid=4169"
    elif category == 'dresses':
        category_url = f"{base_url}women/dresses/cat/?cid=8799"
    elif category == 'shoes':
        category_url = f"{base_url}women/shoes/cat/?cid=4172"
    elif category == 'jackets':
        category_url = f"{base_url}women/jackets-coats/cat/?cid=2641"
    elif category == 'jeans':
        category_url = f"{base_url}women/jeans/cat/?cid=3630"
    elif category == 'skirts':
        category_url = f"{base_url}women/skirts/cat/?cid=2639"
    elif category == 'bags':
        category_url = f"{base_url}women/bags-purses/cat/?cid=8730"
    elif category == 'suits':
        category_url = f"{base_url}women/suits-separates/cat/?cid=13632"
    else:
        # Default to category-based URL pattern
        category_url = f"{base_url}women/{category}/cat/"
    
    driver = None
    try:
        driver = setup_driver()
        
        # Navigate to the target URL
        print_debug(f"Navigating to {category_url}")
        driver.get(category_url)
        
        # Check if page loaded properly
        if not check_page_loaded(driver):
            print_debug("Page did not load properly", "ERROR")
            return
        
        # Wait a bit to let JavaScript initialize
        print_debug("Waiting for initial page load")
        time.sleep(5)
        
        # List of possible selectors for product elements
        product_selectors = [
            "[data-auto-id='productTile']",
            "article",
            ".productTile",
            ".product-card",
            ".product",
            ".styles__ProductCard",
            "[data-testid='product']"
        ]
        
        # Check for product tiles with more detailed error handling
        print_debug("Looking for product tiles")
        
        product_selector = None
        for selector in product_selectors:
            try:
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                print_debug(f"Product tiles found with selector: {selector}")
                product_selector = selector
                break
            except TimeoutException:
                print_debug(f"Selector {selector} not found", "INFO")
        
        if not product_selector:
            print_debug("Could not find any products on the page", "ERROR")
            return
        
        # Accept cookies if the banner appears
        try:
            print_debug("Checking for cookie consent banner")
            cookie_selectors = [
                "#onetrust-accept-btn-handler",
                ".cookie-accept",
                "[data-testid='cookie-accept']",
                "[aria-label='Accept cookies']",
                ".cookie-banner button"
            ]
            
            for selector in cookie_selectors:
                try:
                    cookie_button = WebDriverWait(driver, 3).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                    )
                    cookie_button.click()
                    print_debug(f"Accepted cookies using selector: {selector}")
                    time.sleep(2)  # Wait for banner to disappear
                    break
                except:
                    continue
        except Exception as e:
            print_debug(f"Cookie handling error or no cookie banner: {str(e)}", "INFO")
        
        downloaded_urls = set()  # To avoid duplicates
        product_count = 0
        page_count = 1
        
        while product_count < max_images:
            print_debug(f"Processing page {page_count}")
            # Wait for products to load
            time.sleep(3)  # Give more time for images to load
            
            # Find all product tiles
            try:
                product_tiles = driver.find_elements(By.CSS_SELECTOR, product_selector)
                print_debug(f"Found {len(product_tiles)} products on page {page_count}")
                
                if not product_tiles:
                    print_debug("No product tiles found on this page", "WARNING")
                    break
            except Exception as e:
                print_debug(f"Error finding product tiles: {str(e)}", "ERROR")
                break
            
            # Extract and download images
            for i, tile in enumerate(product_tiles):
                if product_count >= max_images:
                    print_debug(f"Reached maximum image count ({max_images}). Stopping.")
                    return
                
                try:
                    # Try different ways to find the image
                    img_element = None
                    img_url = None
                    
                    # Methods to find images
                    try:
                        img_element = tile.find_element(By.CSS_SELECTOR, "img")
                        img_url = img_element.get_attribute("src")
                    except:
                        try:
                            # Sometimes images are in background CSS
                            background_img = tile.value_of_css_property('background-image')
                            if background_img and background_img != 'none':
                                img_url = background_img.replace('url("', '').replace('")', '')
                        except:
                            # Try looking for data-src attribute which is common for lazy-loaded images
                            try:
                                img_element = tile.find_element(By.CSS_SELECTOR, "[data-src]")
                                img_url = img_element.get_attribute("data-src")
                            except:
                                pass
                    
                    # Skip if no image URL or already downloaded
                    if not img_url or img_url in downloaded_urls or img_url.startswith('data:'):
                        continue
                    
                    # Clean up URL (some src attributes might have query parameters)
                    img_url = img_url.split('?')[0]
                    
                    # Generate a filename from the URL
                    filename = f"{category}_{product_count}_{os.path.basename(urllib.parse.urlparse(img_url).path)}"
                    if not filename.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                        filename += ".jpg"  # Add default extension if none is present
                    
                    # Download the image
                    if download_image(img_url, filename, download_dir):
                        downloaded_urls.add(img_url)
                        product_count += 1
                        print_debug(f"Downloaded {product_count}/{max_images} images")
                
                except Exception as e:
                    print_debug(f"Error processing product: {str(e)}", "ERROR")
            
            # If we haven't reached the maximum, try to load more products
            if product_count < max_images:
                # Try to find and click the "Load more" button
                try:
                    print_debug("Looking for 'Load more' button")
                    # Try multiple selectors for the load more button
                    load_more_selectors = [
                        "[data-auto-id='loadMoreProducts']",
                        ".load-more", 
                        ".loadMore", 
                        "[data-test='load-more']"
                    ]
                    
                    load_more_found = False
                    for selector in load_more_selectors:
                        try:
                            load_more_button = WebDriverWait(driver, 5).until(
                                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                            )
                            
                            # Scroll to the button to make it visible
                            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", load_more_button)
                            time.sleep(2)  # Small pause after scrolling
                            
                            try:
                                load_more_button.click()
                                print_debug("Clicked 'Load more' button")
                                load_more_found = True
                                break
                            except ElementClickInterceptedException:
                                # If the button is intercepted, try using JavaScript to click it
                                driver.execute_script("arguments[0].click();", load_more_button)
                                print_debug("Used JavaScript to click 'Load more' button")
                                load_more_found = True
                                break
                        except:
                            continue
                    
                    if not load_more_found:
                        print_debug("No 'Load more' button found - we may have reached the end", "INFO")
                        break
                    
                    page_count += 1
                    # Wait for new products to load
                    time.sleep(5)
                        
                except Exception as e:
                    print_debug(f"Error with 'Load more' button: {str(e)}", "INFO")
                    print_debug("Completed downloading or encountered an error with pagination.")
                    break
            else:
                break
        
        print_debug(f"Total products downloaded: {product_count}")
    
    except Exception as e:
        print_debug(f"A critical error occurred: {str(e)}", "ERROR")
        print_debug(traceback.format_exc(), "ERROR")
    
    finally:
        if driver:
            print_debug("Closing Chrome WebDriver")
            driver.quit()

def main():
    """
    Main function to download images from ASOS categories.
    Change the parameters below to customize the download:
    - categories: List of clothing categories to download
    - max_images_per_category: Maximum number of images to download per category
    """
    # Configuration - MODIFY THESE VALUES
    categories = ['tops', 'dresses', 'jackets', 'shoes']
    max_images_per_category = 50  # Set the maximum number of images per category
    
    # Process each category
    for category in categories:
        print_debug(f"Starting download for category: {category}")
        download_asos_images(category=category, max_images=max_images_per_category)
        print_debug(f"Completed download for category: {category}")

if __name__ == "__main__":
    main()
    

[INFO] 2025-04-04 21:28:13 - Starting download for category: tops
[INFO] 2025-04-04 21:28:13 - Setting up Chrome WebDriver
[INFO] 2025-04-04 21:28:13 - Installing ChromeDriver
[INFO] 2025-04-04 21:28:15 - Chrome WebDriver setup complete
[INFO] 2025-04-04 21:28:15 - Navigating to https://www.asos.com/us/women/tops/cat/?cid=4169
[INFO] 2025-04-04 21:28:17 - Checking page load status
[INFO] 2025-04-04 21:28:17 - Current URL: https://www.asos.com/us/women/tops/cat/?cid=4169
[INFO] 2025-04-04 21:28:17 - Page title: Shop Women's Tops Online | ASOS
[INFO] 2025-04-04 21:28:18 - Screenshot saved to /Users/lilysu/Documents/git/stylegap/asos_debug_screenshot.png
[INFO] 2025-04-04 21:28:18 - Page source length: 697316 characters
[INFO] 2025-04-04 21:28:18 - Body text preview: Skip to main content
Help & FAQs
WOMEN
MEN
Search
TRENDING
New in
Clothing
Dresses
Shoes
Plus size
A...
[INFO] 2025-04-04 21:28:18 - Waiting for initial page load
[INFO] 2025-04-04 21:28:23 - Looking for product tiles
[INFO] 

In [6]:
!pip install clarifai-grpc

from clarifai_grpc.channel.clarifai_channel import ClarifaiChannel
from clarifai_grpc.grpc.api import service_pb2_grpc, service_pb2, resources_pb2
from clarifai_grpc.grpc.api.status import status_code_pb2




In [None]:
import os
from typing import Dict, List, Optional
import glob
import json
from tqdm import tqdm
import time
import dotenv
import traceback
import base64

def classify_apparel_images(
    directories: List[str],
    max_images_per_dir: Dict[str, int],
    pat: str,
    user_id: str = "clarifai",
    app_id: str = "main",
    model_id: str = "apparel-classification-v2",
    model_version_id: str = "651c5412d53c408fa3b4fe3dcc060be7",
    max_concepts: int = 5,  # New parameter to specify maximum number of concepts to return
    output_file: Optional[str] = "classification_results.json",
) -> Dict:
    """
    Classify apparel images in specified directories using Clarifai API.

    Args:
        directories: List of directory names containing apparel images
        max_images_per_dir: Dictionary mapping directory names to maximum number of images to process
        pat: Clarifai Personal Access Token
        user_id: Clarifai user ID
        app_id: Clarifai app ID
        model_id: Clarifai model ID
        model_version_id: Optional model version ID (defaults to latest if None)
        max_concepts: Maximum number of concepts to return per image
        output_file: Optional file path to save results (None to skip saving)

    Returns:
        Dictionary containing classification results for each image
    """
    print(f"Using model: user_id={user_id}, app_id={app_id}, model_id={model_id}, version={model_version_id}")
    
    # Set up the gRPC client
    channel = ClarifaiChannel.get_grpc_channel()
    stub = service_pb2_grpc.V2Stub(channel)
    metadata = (('authorization', 'Key ' + pat),)
    user_data_object = resources_pb2.UserAppIDSet(user_id=user_id, app_id=app_id)

    # Dictionary to store results
    results = {}

    # Process each directory
    for directory in directories:
        print(f"Processing {directory} directory...")

        # Get image paths in the directory
        image_pattern = os.path.join(directory, "*.jpg")
        image_paths = glob.glob(image_pattern)

        # Determine how many images to process
        max_images = max_images_per_dir.get(directory, 10)  # Default to 10 if not specified
        image_paths = image_paths[:max_images]

        # Process each image
        for image_path in tqdm(image_paths):
            try:
                # Read image file as bytes
                with open(image_path, "rb") as f:
                    image_bytes = f.read()
                
                # Create the request with file bytes instead of URL
                post_model_outputs_response = stub.PostModelOutputs(
                    service_pb2.PostModelOutputsRequest(
                        user_app_id=user_data_object,
                        model_id=model_id,
                        version_id=model_version_id,
                        inputs=[
                            resources_pb2.Input(
                                data=resources_pb2.Data(
                                    image=resources_pb2.Image(
                                        base64=image_bytes
                                    )
                                )
                            )
                        ]
                    ),
                    metadata=metadata
                )

                # Check for errors in the API response
                if post_model_outputs_response.status.code != status_code_pb2.SUCCESS:
                    error_msg = f"API Error: {post_model_outputs_response.status.description}"
                    print(error_msg)
                    results[image_path] = f"Error: {error_msg}"
                    continue

                # Get the output from the response
                output = post_model_outputs_response.outputs[0]
                
                # Extract top concepts
                if len(output.data.concepts) > 0:
                    # Sort concepts by value (confidence score) in descending order
                    concepts = sorted(output.data.concepts, key=lambda c: c.value, reverse=True)
                    
                    # Get up to max_concepts or all available if fewer
                    top_concepts = concepts[:max_concepts]
                    
                    # Format the classifications
                    classifications = [f"{c.name} ({c.value:.2f})" for c in top_concepts]
                    
                    # Join with semicolons for better readability
                    classification_str = "; ".join(classifications)
                else:
                    classification_str = "No concepts found"

                # Store result
                results[image_path] = classification_str
                print(f"Success: {image_path} - {classification_str}")

            except Exception as e:
                # Get detailed error information
                error_details = traceback.format_exc()
                print(f"Error processing {image_path}:")
                print(error_details)
                results[image_path] = f"Error: {str(e)}"

            # Add a delay to avoid rate limiting
            time.sleep(2.0)

    # Save results to file if specified
    if output_file:
        with open(output_file, "w") as f:
            json.dump(results, f, indent=2)
        print(f"Results saved to {output_file}")

    return results

def process_classification_results(results_file: str) -> dict:
    """
    Process classification results to extract only the item names without confidence scores.
    
    Args:
        results_file: Path to the JSON file containing classification results
        
    Returns:
        Dictionary with image paths as keys and lists of item names as values
    """
    # Load the results from the JSON file
    with open(results_file, 'r') as f:
        results = json.load(f)
    
    # Dictionary to store processed results
    processed_results = {}
    
    # Process each image result
    for image_path, classifications in results.items():
        # Skip error entries
        if classifications.startswith("Error") or classifications == "No concepts found":
            processed_results[image_path] = []
            continue
        
        # Split classifications by semicolon
        items_with_scores = classifications.split('; ')
        
        # Extract just the item names (remove the confidence scores)
        items = []
        for item_with_score in items_with_scores:
            # Extract the item name (everything before the opening parenthesis)
            item_name = item_with_score.split(' (')[0]
            items.append(item_name)
        
        # Store in the processed results dictionary
        processed_results[image_path] = items
    
    # Save the processed results to a new JSON file
    output_file = 'clarifai_classification_asos.json'
    with open(output_file, 'w') as f:
        json.dump(processed_results, f, indent=2)
    
    print(f"Processed results saved to {output_file}")
    
    return processed_results


def main():
    # Load environment variables from .env file
    dotenv.load_dotenv()

    # Get PAT from environment variable
    pat = os.environ.get("PAT")

    if not pat:
        print(
            "Error: PAT environment variable not found. Please set PAT in your .env file."
        )
        return None

    print(
        f"Using PAT: {pat[:5]}...{pat[-4:] if len(pat) > 8 else ''} (masked for security)"
    )

    # List of apparel directories
    directories = [
        "asos-tops",
        "asos-shoes",
        "asos-products",
        "asos-jackets",
        "asos-dresses",
    ]

    # Set maximum images to process for each directory
    max_images_per_dir = {
        "asos-tops": 2,
        "asos-shoes": 2,
        "asos-products": 2,
        "asos-jackets": 2,
        "asos-dresses": 2,
    }

    output_file = "clarifai_classification_asos.json"
    
    # Classify images
    results = classify_apparel_images(
        directories=directories,
        max_images_per_dir=max_images_per_dir,
        pat=pat,
        max_concepts=5,  # Get up to 5 classifications per image
        output_file=output_file,
    )

    # Process the results to extract just the item names
    processed_results = process_classification_results(output_file)
    
    return processed_results


# processed_results = process_classification_results("clarifai_classification_asos.json")
# processed_results

results = main()


Using PAT: 54e90...d229 (masked for security)
Using model: user_id=clarifai, app_id=main, model_id=apparel-classification-v2, version=651c5412d53c408fa3b4fe3dcc060be7
Processing asos-tops directory...


  0%|          | 0/2 [00:00<?, ?it/s]

Success: asos-tops/tops_43_.jpg - long-sleeve (0.97); top (0.89); shirt (0.83); v-neck (0.71); 3/4 sleeve (0.62)


 50%|█████     | 1/2 [00:02<00:02,  2.91s/it]

Success: asos-tops/tops_14_.jpg - crewneck (0.96); long-sleeve (0.91); graphic (0.90); coat (0.66); hoodie (0.63)


100%|██████████| 2/2 [00:05<00:00,  2.63s/it]


Processing asos-shoes directory...


  0%|          | 0/2 [00:00<?, ?it/s]

Success: asos-shoes/shoes_27_.jpg - leather (0.98); colorblock (0.97); boots (0.92); pants (0.55); chambray/denim (0.31)


 50%|█████     | 1/2 [00:02<00:02,  2.42s/it]

Success: asos-shoes/shoes_31_.jpg - sleeveless (0.94); leather (0.90); colorblock (0.66); pants (0.62); midi dress (0.60)


100%|██████████| 2/2 [00:04<00:00,  2.39s/it]


Processing asos-products directory...


  0%|          | 0/2 [00:00<?, ?it/s]

Success: asos-products/asos_3_.jpg - floral (0.98); midi dress (0.94); sleeveless (0.94); maxi dress (0.92); v-neck (0.86)


 50%|█████     | 1/2 [00:02<00:02,  2.45s/it]

Success: asos-products/asos_106_.jpg - graphic (1.00); long-sleeve (0.98); crewneck (0.95); sweatshirt (0.84); hoodie (0.68)


100%|██████████| 2/2 [00:04<00:00,  2.42s/it]


Processing asos-jackets directory...


  0%|          | 0/2 [00:00<?, ?it/s]

Success: asos-jackets/jackets_6_.jpg - long-sleeve (0.97); hoodie (0.81); coat (0.69); leather (0.63); colorblock (0.54)


 50%|█████     | 1/2 [00:02<00:02,  2.37s/it]

Success: asos-jackets/jackets_30_.jpg - long-sleeve (0.98); fur (0.92); coat (0.88); hoodie (0.81); jacket (0.75)


100%|██████████| 2/2 [00:04<00:00,  2.35s/it]


Processing asos-dresses directory...


  0%|          | 0/2 [00:00<?, ?it/s]

Success: asos-dresses/dresses_6_.jpg - maxi dress (0.95); midi dress (0.91); chiffon (0.86); crewneck (0.86); v-neck (0.83)


 50%|█████     | 1/2 [00:02<00:02,  2.43s/it]

Success: asos-dresses/dresses_7_.jpg - shoulder bag (0.93); leather (0.88); floral (0.80); maxi dress (0.79); scarf (0.73)


100%|██████████| 2/2 [00:04<00:00,  2.37s/it]

Results saved to clarifai_classification_asos.json
Processed results saved to processed_classifications.json





In [18]:
import os
import json
import base64
import glob
from anthropic import Anthropic
from pathlib import Path
import pandas as pd

def initialize_anthropic_client():
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found in environment variables.")
        return None
    return Anthropic(api_key=api_key)

def analyze_attribute_with_claude(client, image_path, attribute):
    """Analyze a single attribute of a garment using Claude."""
    try:
        with open(image_path, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: Image file not found at path: {image_path}")
        return {"error": "Image file not found."}
    
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": encoded_string
                    }
                },
                {
                    "type": "text",
                    "text": f"""For this garment, focus only on the '{attribute}' aspect. 
                    Return a JSON object with: color, material, occasion, style, season, unique_feature, era, 
                    casual_or_relaxed (boolean), visual_aesthetic, hardware."""
                }
            ]
        }
    ]
    
    try:
        response = client.messages.create(
            model="claude-3-7-sonnet-latest",
            max_tokens=800,
            system="You are a fashion analyst focused on specific garment attributes.",
            messages=messages
        )
        
        response_text = response.content[0].text
        json_start = response_text.find('{')
        json_end = response_text.rfind('}') + 1
        
        if json_start >= 0 and json_end > json_start:
            json_content = response_text[json_start:json_end]
            try:
                return json.loads(json_content)
            except json.JSONDecodeError:
                return {"error": "Failed to parse JSON response"}
        else:
            return {"error": "No JSON in response"}
            
    except Exception as e:
        return {"error": f"Error processing image: {e}"}

def describe_clothing_item_attributes(json_path, base_img_dir=""):
    client = initialize_anthropic_client()
    if not client:
        return {}
    
    try:
        with open(json_path, 'r') as file:
            processed_results = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading JSON: {e}")
        return {}
    
    detailed_descriptions = {}
    
    for image_path, attributes in processed_results.items():
        # Only process images from folders that start with "asos-"
        if not image_path.startswith("asos-"):
            continue
            
        full_image_path = os.path.join(base_img_dir, image_path) if base_img_dir else image_path
        
        # Create a list to store descriptions for each attribute
        attribute_descriptions = []
        
        for attribute in attributes:
            print(f"Processing {image_path} - {attribute}")
            
            # Get detailed analysis from Claude for this specific attribute
            attribute_details = analyze_attribute_with_claude(client, full_image_path, attribute)
            
            # Add the attribute name to the details
            attribute_details["attribute"] = attribute
            attribute_descriptions.append(attribute_details)
        
        # Store the list of attribute descriptions for this image
        detailed_descriptions[image_path] = attribute_descriptions
    
    return detailed_descriptions

def parse_fashion_data(input_text):
    """
    Parse the fashion description data and convert it to structured formats
    """
    # Convert input to dictionary if it's a string
    if isinstance(input_text, str):
        data_dict = json.loads(input_text.replace("'", '"'))
    else:
        data_dict = input_text  # Assume it's already a dictionary
        
    structured_data = []
    for image_path, descriptions in data_dict.items():
        # Check if this is a folder starting with "asos-"
        if image_path.split('/')[0].startswith("asos-"):
            category = image_path.split('/')[0].split('-')[-1]
            entry = {
                'image_path': image_path,
                'category': category,
                'item_id': image_path.split('_')[1] if len(image_path.split('_')) > 1 else "",
                'description': descriptions
            }
            structured_data.append(entry)
    
    return pd.DataFrame(structured_data)

def format_as_json(df):
    """
    Format the dataframe as JSON
    """
    return df.to_json(orient='records', indent=2)

def save_outputs(data_text, output_dir='fashion_data_output'):
    """
    Save the data in various formats
    """
    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
        
    return output_path

def process_asos_folders(base_img_dir=""):
    """
    Find all asos-* folders and process their contents
    """
    # Find all asos-* folders
    asos_folders = glob.glob("asos-*")
    
    # Load classification data
    try:
        with open("clarifai_classification_asos.json", 'r') as file:
            processed_results = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading JSON: {e}")
        return {}
    
    # Initialize Anthropic client
    client = initialize_anthropic_client()
    if not client:
        return {}
    
    # Structure to hold all results
    detailed_descriptions = {}
    
    # Process each image in the classification results
    for image_path, attributes in processed_results.items():
        # Check if this image is from an asos- folder
        folder_name = image_path.split('/')[0]
        if not folder_name.startswith("asos-"):
            continue
            
        full_image_path = os.path.join(base_img_dir, image_path) if base_img_dir else image_path
        
        # Create a list to store descriptions for each attribute
        attribute_descriptions = []
        
        for attribute in attributes:
            print(f"Processing {image_path} - {attribute}")
            
            # Get detailed analysis from Claude for this specific attribute
            attribute_details = analyze_attribute_with_claude(client, full_image_path, attribute)
            
            # Add the attribute name to the details
            attribute_details["attribute"] = attribute
            attribute_descriptions.append(attribute_details)
        
        # Store the list of attribute descriptions for this image
        detailed_descriptions[image_path] = attribute_descriptions
    
    # Save results to a file
    with open("anthropic_asos_descriptions.json", "w") as f:
        json.dump(detailed_descriptions, f, indent=2)
    
    print(f"Analysis complete. Results saved to anthropic_asos_descriptions.json")
    
    return detailed_descriptions

results = process_asos_folders()

# # Save formatted outputs
output_paths = save_outputs(results)
output_paths

Processing asos-tops/tops_43_.jpg - long-sleeve
Processing asos-tops/tops_43_.jpg - top
Processing asos-tops/tops_43_.jpg - shirt
Processing asos-tops/tops_43_.jpg - v-neck
Processing asos-tops/tops_43_.jpg - 3/4 sleeve
Processing asos-tops/tops_14_.jpg - crewneck
Processing asos-tops/tops_14_.jpg - long-sleeve
Processing asos-tops/tops_14_.jpg - graphic
Processing asos-tops/tops_14_.jpg - coat
Processing asos-tops/tops_14_.jpg - hoodie
Processing asos-shoes/shoes_27_.jpg - leather
Processing asos-shoes/shoes_27_.jpg - colorblock
Processing asos-shoes/shoes_27_.jpg - boots
Processing asos-shoes/shoes_27_.jpg - pants
Processing asos-shoes/shoes_27_.jpg - chambray/denim
Processing asos-shoes/shoes_31_.jpg - sleeveless
Processing asos-shoes/shoes_31_.jpg - leather
Processing asos-shoes/shoes_31_.jpg - colorblock
Processing asos-shoes/shoes_31_.jpg - pants
Processing asos-shoes/shoes_31_.jpg - midi dress
Processing asos-products/asos_3_.jpg - floral
Processing asos-products/asos_3_.jpg - m

PosixPath('fashion_data_output')

In [6]:
import os
from typing import Dict, List, Optional
import glob
import json
from tqdm import tqdm
import time
import dotenv
import traceback
import base64

def clarifai_classification(
    pat: str,
    user_id: str = "clarifai",
    app_id: str = "main",
    model_id: str = "apparel-classification-v2",
    model_version_id: str = "651c5412d53c408fa3b4fe3dcc060be7",
    max_concepts: int = 5,
    max_images: int = 10,
) -> Dict:
    """
    Classify apparel images in the user folder using Clarifai API.

    Args:
        pat: Clarifai Personal Access Token
        user_id: Clarifai user ID
        app_id: Clarifai app ID
        model_id: Clarifai model ID
        model_version_id: Optional model version ID (defaults to latest if None)
        max_concepts: Maximum number of concepts to return per image
        max_images: Maximum number of images to process (default: 10)

    Returns:
        Dictionary containing classification results for each image
    """
    print(f"Using model: user_id={user_id}, app_id={app_id}, model_id={model_id}, version={model_version_id}")
    
    # Set up the gRPC client
    channel = ClarifaiChannel.get_grpc_channel()
    stub = service_pb2_grpc.V2Stub(channel)
    metadata = (('authorization', 'Key ' + pat),)
    user_data_object = resources_pb2.UserAppIDSet(user_id=user_id, app_id=app_id)

    # Dictionary to store results
    results = {}

    # Use "user" folder instead of multiple directories
    user_folder = "user"
    print(f"Processing images from {user_folder} folder...")

    # Get image paths in the directory (supporting multiple image formats)
    image_patterns = [
        os.path.join(user_folder, "*.JPG")
    ]
    
    # Collect all image paths
    image_paths = []
    for pattern in image_patterns:
        image_paths.extend(glob.glob(pattern))

    # Limit to max_images
    image_paths = image_paths[:max_images]
    
    print(f"Found {len(image_paths)} images in {user_folder} (processing up to {max_images})")

    # Process each image
    for image_path in tqdm(image_paths):
        try:
            # Read image file as bytes
            with open(image_path, "rb") as f:
                image_bytes = f.read()
            
            # Create the request with file bytes instead of URL
            post_model_outputs_response = stub.PostModelOutputs(
                service_pb2.PostModelOutputsRequest(
                    user_app_id=user_data_object,
                    model_id=model_id,
                    version_id=model_version_id,
                    inputs=[
                        resources_pb2.Input(
                            data=resources_pb2.Data(
                                image=resources_pb2.Image(
                                    base64=image_bytes
                                )
                            )
                        )
                    ]
                ),
                metadata=metadata
            )

            # Check for errors in the API response
            if post_model_outputs_response.status.code != status_code_pb2.SUCCESS:
                error_msg = f"API Error: {post_model_outputs_response.status.description}"
                print(error_msg)
                results[image_path] = f"Error: {error_msg}"
                continue

            # Get the output from the response
            output = post_model_outputs_response.outputs[0]
            
            # Extract top concepts
            if len(output.data.concepts) > 0:
                # Sort concepts by value (confidence score) in descending order
                concepts = sorted(output.data.concepts, key=lambda c: c.value, reverse=True)
                
                # Get up to max_concepts or all available if fewer
                top_concepts = concepts[:max_concepts]
                
                # Format the classifications
                classifications = [f"{c.name} ({c.value:.2f})" for c in top_concepts]
                
                # Join with semicolons for better readability
                classification_str = "; ".join(classifications)
            else:
                classification_str = "No concepts found"

            # Store result
            results[image_path] = classification_str
            print(f"Success: {image_path} - {classification_str}")

        except Exception as e:
            # Get detailed error information
            error_details = traceback.format_exc()
            print(f"Error processing {image_path}:")
            print(error_details)
            results[image_path] = f"Error: {str(e)}"

        # Add a delay to avoid rate limiting
        time.sleep(2.0)

    # Save results with "user_" prefix
    output_file = "user_classification_results.json"
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to {output_file}")

    # Process the results and save with "user_" prefix
    processed_results = process_classification_results(results, "user_processed_classifications.json")
    
    return processed_results

def process_classification_results(results: dict, output_file: str = "user_processed_classifications.json") -> dict:
    """
    Process classification results to extract only the item names without confidence scores.
    
    Args:
        results: Dictionary containing classification results
        output_file: Path to save the processed results
        
    Returns:
        Dictionary with image paths as keys and lists of item names as values
    """
    # Dictionary to store processed results
    processed_results = {}
    
    # Process each image result
    for image_path, classifications in results.items():
        # Skip error entries
        if classifications.startswith("Error") or classifications == "No concepts found":
            processed_results[image_path] = []
            continue
        
        # Split classifications by semicolon
        items_with_scores = classifications.split('; ')
        
        # Extract just the item names (remove the confidence scores)
        items = []
        for item_with_score in items_with_scores:
            # Extract the item name (everything before the opening parenthesis)
            item_name = item_with_score.split(' (')[0]
            items.append(item_name)
        
        # Store in the processed results dictionary
        processed_results[image_path] = items
    
    # Save the processed results to a new JSON file
    with open(output_file, 'w') as f:
        json.dump(processed_results, f, indent=2)
    
    print(f"Processed results saved to {output_file}")
    
    return processed_results

# Load environment variables from .env file
dotenv.load_dotenv()

# Get PAT from environment variable
pat = os.environ.get("PAT")

if not pat:
    print("Error: PAT environment variable not found. Please set PAT in your .env file.")
else:
    print(f"Using PAT: {pat[:5]}...{pat[-4:] if len(pat) > 8 else ''} (masked for security)")
    
    # Run the classification on the user folder
    results = clarifai_classification(pat=pat)

Using PAT: 54e90...d229 (masked for security)
Using model: user_id=clarifai, app_id=main, model_id=apparel-classification-v2, version=651c5412d53c408fa3b4fe3dcc060be7
Processing images from user folder...
Found 8 images in user (processing up to 10)


  0%|          | 0/8 [00:00<?, ?it/s]

Success: user/IMG_7394.JPG - pants (0.90); knit (0.75); long-sleeve (0.74); top (0.54); crewneck (0.51)


 12%|█▎        | 1/8 [00:09<01:05,  9.42s/it]

Success: user/IMG_7344.JPG - pants (0.90); long-sleeve (0.85); knit (0.68); crewneck (0.64); coat (0.58)


 25%|██▌       | 2/8 [00:13<00:37,  6.33s/it]

Success: user/IMG_7268.JPG - pants (0.94); knit (0.67); suit pants (0.66); long-sleeve (0.57); sweatpants (0.54)


 38%|███▊      | 3/8 [00:18<00:27,  5.52s/it]

Success: user/IMG_7446.JPG - long-sleeve (0.88); crewneck (0.74); knit (0.73); pants (0.71); graphic (0.64)


 50%|█████     | 4/8 [00:23<00:22,  5.60s/it]

Success: user/IMG_7453.JPG - long-sleeve (0.93); coat (0.74); pants (0.68); crewneck (0.56); knit (0.50)


 62%|██████▎   | 5/8 [00:29<00:17,  5.75s/it]

Success: user/IMG_7336.JPG - pants (0.90); long-sleeve (0.72); knit (0.69); crewneck (0.67); t-shirt (0.66)


 75%|███████▌  | 6/8 [00:35<00:11,  5.57s/it]

Success: user/IMG_7503.JPG - pants (0.88); crewneck (0.71); long-sleeve (0.68); graphic (0.65); t-shirt (0.61)


 88%|████████▊ | 7/8 [00:39<00:05,  5.09s/it]

Success: user/IMG_7567.JPG - pants (0.86); long-sleeve (0.75); crewneck (0.61); graphic (0.58); knit (0.58)


100%|██████████| 8/8 [00:44<00:00,  5.54s/it]

Results saved to user_classification_results.json
Processed results saved to user_processed_classifications.json





In [9]:
def process_asos_folders(base_img_dir=""):
    """
    Find all asos-* folders and process their contents
    """
    # Find all asos-* folders
    asos_folders = glob.glob("user*")
    
    # Load classification data
    try:
        with open("user_processed_classifications.json", 'r') as file:
            processed_results = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading JSON: {e}")
        return {}
    
    # Initialize Anthropic client
    client = initialize_anthropic_client()
    if not client:
        return {}
    
    # Structure to hold all results
    detailed_descriptions = {}
    
    # Process each image in the classification results
    for image_path, attributes in processed_results.items():
        # Check if this image is from an asos- folder
        folder_name = image_path.split('/')[0]
        if not folder_name.startswith("user"):
            continue
            
        full_image_path = os.path.join(base_img_dir, image_path) if base_img_dir else image_path
        
        # Create a list to store descriptions for each attribute
        attribute_descriptions = []
        
        for attribute in attributes:
            print(f"Processing {image_path} - {attribute}")
            
            # Get detailed analysis from Claude for this specific attribute
            attribute_details = analyze_attribute_with_claude(client, full_image_path, attribute)
            
            # Add the attribute name to the details
            attribute_details["attribute"] = attribute
            attribute_descriptions.append(attribute_details)
        
        # Store the list of attribute descriptions for this image
        detailed_descriptions[image_path] = attribute_descriptions
    
    # Save results to a file
    with open("anthropic_asos_descriptions.json", "w") as f:
        json.dump(detailed_descriptions, f, indent=2)
    
    print(f"Analysis complete. Results saved to anthropic_asos_descriptions.json")
    
    return detailed_descriptions

results = process_asos_folders()

# Save formatted outputs
output_paths = save_outputs(results)
print("Files saved to:")
for format_name, path in output_paths.items():
    print(f"- {format_name}: {path}")

Processing user/IMG_7394.JPG - pants
Processing user/IMG_7394.JPG - knit
Processing user/IMG_7394.JPG - long-sleeve
Processing user/IMG_7394.JPG - top
Processing user/IMG_7394.JPG - crewneck
Processing user/IMG_7344.JPG - pants
Processing user/IMG_7344.JPG - long-sleeve
Processing user/IMG_7344.JPG - knit
Processing user/IMG_7344.JPG - crewneck
Processing user/IMG_7344.JPG - coat
Processing user/IMG_7268.JPG - pants
Processing user/IMG_7268.JPG - knit
Processing user/IMG_7268.JPG - suit pants
Processing user/IMG_7268.JPG - long-sleeve
Processing user/IMG_7268.JPG - sweatpants
Processing user/IMG_7446.JPG - long-sleeve
Processing user/IMG_7446.JPG - crewneck
Processing user/IMG_7446.JPG - knit
Processing user/IMG_7446.JPG - pants
Processing user/IMG_7446.JPG - graphic
Processing user/IMG_7453.JPG - long-sleeve
Processing user/IMG_7453.JPG - coat
Processing user/IMG_7453.JPG - pants
Processing user/IMG_7453.JPG - crewneck
Processing user/IMG_7453.JPG - knit
Processing user/IMG_7336.JPG -

NameError: name 'extract_key_features' is not defined

In [21]:
import voyageai
import pandas as pd
import os
import json
import time
from dotenv import load_dotenv

# Load environment variables (API key)
load_dotenv(override=True)
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")

# Initialize Voyage client with API key
vo = voyageai.Client(api_key=VOYAGE_API_KEY)

def load_attribute_descriptions(json_file_path):
    """Load the attribute descriptions from a JSON file."""
    with open(json_file_path, 'r') as f:
        return json.load(f)

def prepare_text_for_embedding(attribute_list):
    """Convert attribute dictionary list to a single text string for embedding."""
    text_parts = []
    
    for attr_dict in attribute_list:
        attribute = attr_dict.get('attribute', 'unknown')
        text_parts.append(f"Attribute: {attribute}")
        
        # Add all other fields
        for key, value in attr_dict.items():
            if key != 'attribute' and key != 'error':
                text_parts.append(f"{key}: {value}")
        
        text_parts.append("---")  # Separator between attributes
    
    return " ".join(text_parts)

def embed_fashion_descriptions(json_file_path, output_file="embeddings_asos.csv"):
    """Generate embeddings for each fashion item's full attribute description."""
    # Load the attribute descriptions
    descriptions = load_attribute_descriptions(json_file_path)
    
    # Prepare DataFrame to store results
    results = []
    
    # Process each image path
    for image_path, attributes in descriptions.items():
        print(f"Processing: {image_path}")
        
        # Convert the list of attribute dictionaries to a single text
        full_description = prepare_text_for_embedding(attributes)
        
        try:
            # Generate embedding for the entire dictionary as one text
            embedding_result = vo.embed(
                texts=[full_description],
                model="voyage-3-large",
                input_type="document",
                output_dimension=256,
                output_dtype="float"
            )
            
            # Store the result
            results.append({
                'image_path': image_path,
                'embedding': embedding_result.embeddings[0],
                'raw_attributes': attributes  # Store original attributes for reference
            })
            
            print(f"Successfully embedded {image_path}")
            
        except Exception as e:
            print(f"Error embedding {image_path}: {e}")
        
        # Add a small delay to respect rate limits
        time.sleep(3)
    
    # Convert results to DataFrame
    df = pd.DataFrame(results)
    
    # Save to CSV (embeddings will be stored as strings)
    df_to_save = df.copy()
    df_to_save['embedding'] = df_to_save['embedding'].apply(lambda x: ','.join(map(str, x)))
    df_to_save['raw_attributes'] = df_to_save['raw_attributes'].apply(json.dumps)
    df_to_save.to_csv(output_file, index=False)
    
    print(f"Saved embeddings to {output_file}")
    return df

# Function to search similar items using the embeddings
def find_similar_fashion_items(df, query_image_path=None, query_text=None, top_n=5):
    """Find similar fashion items based on embeddings similarity."""
    if query_image_path is not None and query_image_path in df['image_path'].values:
        # Get the embedding for the query image
        query_embedding = df[df['image_path'] == query_image_path]['embedding'].iloc[0]
    elif query_text is not None:
        # Generate embedding for the query text
        try:
            query_result = vo.embed(
                texts=[query_text],
                model="voyage-3-large",
                input_type="document",
                output_dimension=256,
                output_dtype="float"
            )
            query_embedding = query_result.embeddings[0]
        except Exception as e:
            print(f"Error generating embedding for query text: {e}")
            return None
    else:
        print("Either query_image_path or query_text must be provided")
        return None
    
    # Calculate similarity scores
    df['similarity'] = df['embedding'].apply(
        lambda emb: sum(a*b for a, b in zip(emb, query_embedding)) / 
        (sum(a*a for a in emb)**0.5 * sum(b*b for b in query_embedding)**0.5)
    )
    
    # Sort by similarity and return top matches
    return df.sort_values('similarity', ascending=False).head(top_n)

# Example usage in a Jupyter notebook
# Run this in a cell

# Load and embed fashion descriptions
json_file_path = "anthropic_asos_descriptions.json"
embeddings_asos_df = embed_fashion_descriptions(json_file_path)

embeddings_asos_df.head(2)


Processing: asos-tops/tops_43_.jpg
Successfully embedded asos-tops/tops_43_.jpg
Processing: asos-tops/tops_14_.jpg
Successfully embedded asos-tops/tops_14_.jpg
Processing: asos-shoes/shoes_27_.jpg
Successfully embedded asos-shoes/shoes_27_.jpg
Processing: asos-shoes/shoes_31_.jpg
Successfully embedded asos-shoes/shoes_31_.jpg
Processing: asos-products/asos_3_.jpg
Successfully embedded asos-products/asos_3_.jpg
Processing: asos-products/asos_106_.jpg
Successfully embedded asos-products/asos_106_.jpg
Processing: asos-jackets/jackets_6_.jpg
Successfully embedded asos-jackets/jackets_6_.jpg
Processing: asos-jackets/jackets_30_.jpg
Successfully embedded asos-jackets/jackets_30_.jpg
Processing: asos-dresses/dresses_6_.jpg
Successfully embedded asos-dresses/dresses_6_.jpg
Processing: asos-dresses/dresses_7_.jpg
Successfully embedded asos-dresses/dresses_7_.jpg
Saved embeddings to embeddings_asos.csv


Unnamed: 0,image_path,embedding,raw_attributes
0,asos-tops/tops_43_.jpg,"[-0.0986347496509552, 0.11231251060962677, 0.0...","[{'color': 'white', 'material': 'appears to be..."
1,asos-tops/tops_14_.jpg,"[-0.12911318242549896, 0.040842585265636444, 0...","[{'color': 'white', 'material': 'cotton jersey..."


In [22]:
json_file_path = "anthropic_user_descriptions.json"
embeddings_user_df = embed_fashion_descriptions(json_file_path, output_file="embeddings_user.csv")

embeddings_user_df.head(2)

Processing: user/IMG_7394.JPG
Successfully embedded user/IMG_7394.JPG
Processing: user/IMG_7344.JPG
Successfully embedded user/IMG_7344.JPG
Processing: user/IMG_7268.JPG
Successfully embedded user/IMG_7268.JPG
Processing: user/IMG_7446.JPG
Successfully embedded user/IMG_7446.JPG
Processing: user/IMG_7453.JPG
Successfully embedded user/IMG_7453.JPG
Processing: user/IMG_7336.JPG
Successfully embedded user/IMG_7336.JPG
Processing: user/IMG_7503.JPG
Successfully embedded user/IMG_7503.JPG
Processing: user/IMG_7567.JPG
Successfully embedded user/IMG_7567.JPG
Saved embeddings to embeddings_user.csv


Unnamed: 0,image_path,embedding,raw_attributes
0,user/IMG_7394.JPG,"[-0.05037498474121094, 0.041566699743270874, 0...","[{'color': 'black', 'material': 'knit/stretch ..."
1,user/IMG_7344.JPG,"[-0.06044689193367958, 0.05194960534572601, 0....","[{'color': 'black', 'material': 'likely nylon/..."
