In [1]:
import json
import requests
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from io import BytesIO

# --- 1. SETUP & DATA LOADING ---
# NOTE: Replace 'products_initial.json' with the path to the file you upload/mount.

# Load the initial JSON file you created.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): the rest of this cell iterates `products_data` and reads
# 'id' / 'image_url' from each item, so it is presumably a list of dicts — confirm.
json_file_path = 'products_initial.json'
with open(json_file_path, 'r', encoding='utf-8') as f:
    products_data = json.load(f)

print(f"Loaded {len(products_data)} products for embedding.")

# --- 2. LOAD PRE-TRAINED MODEL ---
# MobileNetV2 is a good, fast choice. We use 'include_top=False' to remove the final
# classification layer, leaving us with the feature vector (embedding).

try:
    # Load MobileNetV2 and use 'avg' pooling for the final embedding vector
    base_model = MobileNetV2(weights='imagenet', include_top=False, pooling='avg')
    feature_extractor = Model(inputs=base_model.input, outputs=base_model.output)
    print("MobileNetV2 feature extractor loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure you are connected to a runtime with GPU enabled (Runtime -> Change runtime type).")
    # Re-raise rather than calling exit(): exit() is an interactive-shell helper
    # that shuts down the notebook kernel (and reports success as a script),
    # whereas re-raising halts this cell / "Run All" with the original traceback.
    raise

# --- 3. EMBEDDING GENERATION FUNCTION ---

def get_image_embedding(image_url, model):
    """Download an image and return its embedding as a flat list.

    The image is fetched over HTTP (10 second timeout), resized to 224x224,
    run through MobileNetV2's ``preprocess_input`` and passed to
    ``model.predict`` as a batch of one.

    Args:
        image_url: URL of the image to download. An empty or ``None`` value
            is reported and skipped without issuing a request.
        model: Object exposing ``predict(batch, verbose=0)`` that returns a
            numpy array (e.g. the Keras feature extractor built in this
            notebook).

    Returns:
        The flattened prediction converted to a plain Python list (so it can
        be serialised to JSON), or ``None`` when the URL is missing, the
        server does not answer with HTTP 200, or any exception occurs while
        downloading, decoding or embedding the image. Failures are printed,
        never raised.
    """
    # Guard up front so a record without an image yields a clear message
    # instead of an opaque exception from the HTTP library.
    if not image_url:
        print("Skipping product due to error: missing image URL")
        return None

    try:
        # Download image content
        response = requests.get(image_url, timeout=10)
        if response.status_code != 200:
            # Report using the function's own argument; this function must not
            # depend on a loop variable that happens to exist at module level.
            print(f"Skipping {image_url}: HTTP Error {response.status_code}")
            return None

        # Load image from bytes
        img = image.load_img(BytesIO(response.content), target_size=(224, 224))

        # Pre-process image for the model: add a leading batch axis, then
        # apply the MobileNetV2-specific input scaling.
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = tf.keras.applications.mobilenet_v2.preprocess_input(img_array)

        # Generate the embedding vector
        embedding = model.predict(img_array, verbose=0)

        # Flatten the batch-of-one result into a 1-D list for JSON storage
        # (presumably (1, 1280) -> 1280 for the MobileNetV2 extractor — confirm).
        return embedding.flatten().tolist()

    except Exception as e:
        # Deliberate best-effort: connection timeouts, bad URLs, or corrupted
        # files skip this one image rather than aborting the whole run.
        print(f"Skipping product due to error: {e}")
        return None


# --- 4. RUN EMBEDDING PROCESS ---
# Walk every loaded product, attach an 'embedding' to each one whose image
# could be embedded, and collect those products for the final output file.

embedded_products = []
count = 0  # stays 0 (and defined) when there are no products at all
total_products = len(products_data)

for count, product in enumerate(products_data, start=1):
    # Read fields with .get so a single malformed record (missing 'id' or
    # 'image_url') is skipped like any other failure instead of raising
    # KeyError and aborting the whole run.
    product_id = product.get('id')

    # Generate embedding for the product's image
    embedding_vector = get_image_embedding(product.get('image_url'), feature_extractor)

    if embedding_vector is not None:
        # Add the generated embedding to the dictionary
        # (this mutates the entry held in products_data as well).
        product['embedding'] = embedding_vector
        embedded_products.append(product)
        print(f"Processed {count}/{total_products} | ID: {product_id} | Success")
    else:
        # If embedding failed, the product is skipped
        print(f"Processed {count}/{total_products} | ID: {product_id} | SKIPPED")

    # Optional: Save checkpoint every 100 products to avoid losing progress
    if count % 100 == 0:
        with open('products_checkpoint.json', 'w') as f:
            json.dump(embedded_products, f, indent=2)
        print(f"\n--- CHECKPOINT: Saved {len(embedded_products)} records. ---\n")

print(f"\nFinished processing. Successfully embedded {len(embedded_products)} products.")

# --- 5. SAVE FINAL RESULT ---
final_output_path = 'products_final_with_embeddings.json'
with open(final_output_path, 'w') as f:
    json.dump(embedded_products, f, indent=2)

print(f"\nFINAL OUTPUT SAVED to {final_output_path}")
print("Download this file and place it in your Node.js backend.")

Loaded 1650 products for embedding.


  base_model = MobileNetV2(weights='imagenet', include_top=False, pooling='avg')


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
MobileNetV2 feature extractor loaded successfully.
Processed 1/1650 | ID: 31192 | Success
Processed 2/1650 | ID: 17375 | Success
Processed 3/1650 | ID: 25064 | Success
Processed 4/1650 | ID: 25053 | Success
Processed 5/1650 | ID: 17379 | Success
Processed 6/1650 | ID: 42066 | Success
Processed 7/1650 | ID: 31184 | Success
Processed 8/1650 | ID: 31206 | Success
Processed 9/1650 | ID: 25051 | Success
Processed 10/1650 | ID: 17366 | Success
Processed 11/1650 | ID: 25059 | Success
Processed 12/1650 | ID: 58600 | Success
Processed 13/1650 | ID: 19934 | Success
Processed 14/1650 | ID: 58525 | Success
Processed 15/1650 | ID: 25066 | Success
Processed 16/1650 | ID: 25068 | Success
Processed 17/1650 | ID: 25076 | Success
Processed 18/1650 | ID: 51329 | S