In [43]:
import json
import os
import glob 
import sys # Import sys to allow for graceful exit on errors
import regex
from typing import List, Dict, Any, Optional


INPUT_FILE = 'uomo_catalog/blazers.json'
OUTPUT_FILE = 'test.json'

# Load the scraped catalog. The loop below iterates over it and calls
# ``.get`` on every element, so it expects a JSON array of product objects.
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    new_products = json.load(f)

# Number of records for which no product ID could be derived.
# It has to exist before the loop: it was previously incremented without ever
# being defined, so the first record with a missing or unparseable URL raised
# NameError and aborted the cell.
products_skipped_count = 0

for product in new_products:

    # 1. Image link extraction
    images_list = product.get('images', [])

    if isinstance(images_list, list) and len(images_list) >= 2:
        # With two or more images the second-to-last entry is used.
        product['image_link'] = images_list[-2]
    elif isinstance(images_list, list) and len(images_list) == 1:
        product['image_link'] = images_list[0]
    else:
        # Empty list, or 'images' is not a list at all.
        product['image_link'] = None
        print("Warning: No images found for product.")
        print(product)

    # 2. Product ID extraction from the URL
    url: Optional[str] = product.get('url')

    if url:
        # The pattern looks for a literal dot, one or more digits, then a
        # literal '.html'; the digits are captured as the product ID.
        match = regex.search(r'\.(\d+)\.html', url)

        if match:
            product['id'] = match.group(1)
        else:
            # No ID in the URL: warn and mark the record with a None id.
            print(f"Warning: Failed to extract product ID from URL: {url}")
            product['id'] = None
            products_skipped_count += 1
    else:
        print("Warning: Product record is missing 'url' field.")
        product['id'] = None
        products_skipped_count += 1


# --- Write the Combined Data ---
# The output JSON is one array holding every (now enriched) product object.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    # indent=4 gives human-readable, pretty-printed output
    json.dump(new_products, f, indent=4)


print(f"File saved to: {OUTPUT_FILE}")
if products_skipped_count:
    # Only reported when something went wrong, so a clean run prints exactly
    # the same output as before.
    print(f"Warning: {products_skipped_count} product(s) have no extractable ID.")



File saved to: test.json


In [44]:
# Sanity check: report how many product records are held in `new_products`.
# len() replaces the manual counting loop (whose counter name was misspelled).
product_count = len(new_products)

print(f"Total products processed: {product_count}")

Total products processed: 35


In [45]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch, requests
from io import BytesIO

# Load the `patrickjohncyh/fashion-clip` checkpoint and its matching
# processor, then place the model on the CPU in evaluation mode.
device = "cpu"
proc = CLIPProcessor.from_pretrained("patrickjohncyh/fashion-clip")
model = CLIPModel.from_pretrained("patrickjohncyh/fashion-clip").to(device).eval()

def load_img(url, timeout=20):
    """Download the image at ``url`` and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch with ``requests.get``.
        timeout: Timeout in seconds handed to ``requests.get``. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The downloaded image, opened with PIL and converted to ``"RGB"``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Any other exception raised by ``requests.get`` or ``Image.open``
        propagates unchanged.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here; otherwise the body of an error page would be
    # handed to PIL and surface as a misleading image-decoding error.
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")

def clip_embed(images=None, texts=None):
    """Embed images and/or texts with the module-level CLIP model.

    The inputs are run through ``proc`` (padding enabled, truncated to a
    maximum length of 77), moved to ``device`` and passed through ``model``
    with gradients disabled.

    Args:
        images: Image input forwarded to ``proc``, or ``None``.
        texts: Text input forwarded to ``proc``, or ``None``.

    Returns:
        A tuple ``(image_embeddings, text_embeddings)``. Each entry is
        L2-normalised along the last dimension, or ``None`` when the
        corresponding argument was ``None``.

    NOTE(review): the complete model forward pass is always executed, and the
    callers in this notebook always supply both modalities (using a dummy
    image or an empty string). Passing only one modality presumably fails
    inside the model; confirm before relying on the ``None`` branches.
    """
    batch = proc(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        max_length=77,
        truncation=True,
    )
    on_device = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        result = model(**on_device)

    def _unit(vectors):
        # Scale every embedding to unit length along the feature axis.
        return torch.nn.functional.normalize(vectors, dim=-1)

    image_vecs = _unit(result.image_embeds) if images is not None else None
    text_vecs = _unit(result.text_embeds) if texts is not None else None
    return image_vecs, text_vecs

In [46]:
from tqdm import tqdm
# Assume Image and clip_embed are defined/imported elsewhere

# Create a dummy image to satisfy the model's requirements: clip_embed runs
# the full model forward pass, so text-only calls still pass an image.
dummy_image = Image.new('RGB', (224, 224), color='white')

# Fields concatenated (in this order) into the text behind 'detail_embedding'.
DETAIL_FIELDS = ('title', 'category', 'role', 'schema_color', 'pattern')

# Records whose image could not be downloaded or decoded.
image_failures = 0

print("\nGenerating text embeddings for all records...")
for record in tqdm(new_products, desc="Generating Embeddings"):

    # --- Text embeddings ---
    # Both text embeddings are only produced for records that have a
    # non-empty description.
    description = record.get('schema_description')
    if description:
        # 'embedding' holds the embedding of the description itself.
        _, txt_emb = clip_embed(images=[dummy_image], texts=[description])
        record['embedding'] = txt_emb[0].cpu().numpy().tolist()

        # 'detail_embedding' is built from whichever detail fields are
        # present and non-empty, joined with single spaces.
        details_parts = [record[field] for field in DETAIL_FIELDS if record.get(field)]
        details = " ".join(details_parts).strip()

        if details:
            _, txt_emb = clip_embed(images=[dummy_image], texts=[details])
            record['detail_embedding'] = txt_emb[0].cpu().numpy().tolist()

    # --- Image embedding ---
    image_link = record.get('image_link')
    if image_link:
        try:
            image = load_img(image_link)
        except OSError as exc:
            # requests' exceptions and PIL's "cannot identify image" error all
            # derive from OSError. One dead link should not abort the whole
            # run, so the record is left without 'img_embedding' and counted.
            image_failures += 1
            print(f"Warning: could not load image {image_link}: {exc}")
        else:
            img_emb, _ = clip_embed(images=[image], texts=[""])
            record['img_embedding'] = img_emb[0].cpu().numpy().tolist()

print(f"✅ Embeddings generated for {len(new_products)} records")
if image_failures:
    print(f"Warning: {image_failures} record(s) have no image embedding (image could not be loaded).")


Generating text embeddings for all records...


Generating Embeddings: 100%|██████████| 35/35 [00:23<00:00,  1.47it/s]

✅ Embeddings generated for 35 records





In [47]:
from supabase import create_client, Client
from dotenv import load_dotenv
import os
from tqdm import tqdm
# Assuming other necessary imports like 'Image' and 'clip_embed' are still available

# --- Configuration (Your setup) ---
# Credentials are read from the environment (optionally populated from a .env
# file by load_dotenv) rather than being hard-coded in the notebook.
load_dotenv()
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SECRET_KEY = os.environ.get("SUPABASE_SECRET_KEY")
TABLE_NAME = "product_data"

supabase: Client = create_client(SUPABASE_URL, SUPABASE_SECRET_KEY)
print(f"✅ Supabase client initialized for table: {TABLE_NAME}")

# Only these keys are sent to the database; every other key on a record is
# dropped by the filtering step below.
ALLOWED_COLUMNS = [
    "id", "title", "url", "schema_description", "material", "brand",
    "main_category", "role", "schema_color", "category", "audience",
    "price", "embedding", "detail_embedding", "image_link", "img_embedding"
]


print(f"\nStarting upsert operation into '{TABLE_NAME}'...")


# --- FILTERING STEP ---
filtered_products = []

print(f"Filtering {len(new_products)} records against the schema...")
for record in tqdm(new_products, desc="Filtering Records"):

    # Create a new dictionary containing only the allowed keys
    filtered_record = {
        key: value for key, value in record.items() if key in ALLOWED_COLUMNS
    }

    # Ensure the primary key is present AND has a value. The ID-extraction
    # cell stores id = None when it cannot parse the URL, so a bare
    # `"id" in filtered_record` check would let null-keyed rows through.
    if filtered_record.get("id") is not None:
        filtered_products.append(filtered_record)
    else:
        print(f"Skipping record due to missing primary key 'id': {record.get('title', 'Unknown Title')}")

print(f"Filtered down to {len(filtered_products)} valid records for upsert.")

# --- UPSERT STEP ---
if not filtered_products:
    # Nothing survived filtering; do not send an empty payload.
    print("⚠️ No valid records to upsert; skipping database call.")
else:
    try:
        response = supabase.table(TABLE_NAME).upsert(filtered_products).execute()

        if response.data:
            print(f"✅ Upsert successful! Processed {len(response.data)} record(s).")
        else:
            print("⚠️ Upsert returned no data. Check for potential RLS policies or other issues.")

    except Exception as e:
        # Best-effort: report the failure and let the notebook keep running.
        print(f"❌ Upsert failed due to a database error. Error: {e}")

# # Example of calling the function with your list:
# # upload_products_to_supabase(new_products, TABLE_NAME, supabase)

✅ Supabase client initialized for table: product_data

Starting upsert operation into 'product_data'...
Filtering 35 records against the schema...


Filtering Records: 100%|██████████| 35/35 [00:00<00:00, 126770.85it/s]


Filtered down to 35 valid records for upsert.
✅ Upsert successful! Processed 35 record(s).
