In [1]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm

# Pretrained baseline checkpoint (left commented out; the fine-tuned weights below are used instead)
# model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Fine-tuned model: the CLIP weights and the matching processor are both loaded
# from "clip-finetuned" (presumably a local checkpoint directory — TODO confirm).
model = CLIPModel.from_pretrained("clip-finetuned")
processor = CLIPProcessor.from_pretrained("clip-finetuned")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [10]:
import pandas as pd
# Load the product catalogue; the path is relative to the notebook's working directory.
df=pd.read_csv("amazon_com_ecommerce.csv")

In [11]:
# Drop columns whose values are all NaN.
df=df.dropna(axis=1,how="all")
# errors="ignore" keeps this cell re-runnable: without it the drop raises KeyError
# when the column is already gone (second run of the cell, or the column was
# entirely NaN and therefore removed by the dropna above).
df=df.drop(columns=["Upc Ean Code"], errors="ignore")

# Placeholder text for the descriptive columns so downstream string formatting
# never has to deal with NaN in them.
fill_values = {
    "Category": "Not available",
    "Selling Price": "Not available",
    "Model Number": "Not available",
    "About Product": "Not available",
    "Product Specification": "Not available",
    "Technical Details": "Not available",
    "Shipping Weight": "Not available",
    "Product Dimensions": "Not available",
    "Variants": "Not available"
}
# Reassign instead of inplace=True so the cell has no hidden in-place mutation.
df = df.fillna(value=fill_values)

def build_text_for_embedding(row):
    """Build the text description of one product row.

    The result is "Product Name: <name>" followed, when the row's Category is
    not null, by "Category: <category>", joined with " | ".

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexable by the
            column names 'Product Name' and 'Category'.

    Returns:
        str: the joined description.
    """
    # Only name and category are part of the text. Earlier experiments that
    # also appended Selling Price, Model Number, About Product, Product
    # Specification, Technical Details and an "Is Amazon Seller" flag are
    # disabled.
    pieces = [f"Product Name: {row['Product Name']}"]

    category = row['Category']
    if pd.notnull(category):
        pieces.append(f"Category: {category}")

    return " | ".join(pieces)

# Row-wise apply (axis=1): every product row gets its own description string.
df['combined_text'] = df.apply(build_text_for_embedding, axis=1)

In [12]:
# Create the metadata list used to look up product information after retrieval.
# One dict per DataFrame row, in row order; the search code indexes this list
# with the positions returned by the vector index.
# ("Uniq Id" / product_id is intentionally left out, as before.)
metadata_list = [
    {
        # columns that are not part of combined_text
        "image_url": row["Image"],
        "product_url": row["Product Url"],
        "Variants products link": row["Variants"],
        "Shipping Weight": row["Shipping Weight"],
        "Product Dimensions": row["Product Dimensions"],
        "Product Specification": row["Product Specification"],
        "Technical Details": row["Technical Details"],
        "Is Amazon Seller": row["Is Amazon Seller"],
        "Selling Price": row["Selling Price"],
        "Model Number": row["Model Number"],
        "About Product": row["About Product"],
        # the text description built for this row
        "combined_text": row["combined_text"],
    }
    for _, row in df.iterrows()
]

In [2]:
# Define Embedding Function
def get_text_embedding(text):
    """Embed a single text string with the loaded CLIP model.

    Args:
        text: the string to embed; it is passed to the processor as a
            one-element batch with truncation enabled.

    Returns:
        numpy array produced by squeezing the model's text features.
    """
    encoded = processor(text=[text], return_tensors="pt", truncation=True)
    # Inference only: no gradients are needed for retrieval embeddings.
    with torch.no_grad():
        features = model.get_text_features(**encoded)
    return features.squeeze().numpy()

def get_image_embedding(url):
    """Download an image and embed it with the loaded CLIP model.

    Best-effort: any failure (network error, HTTP error status, undecodable
    image, model error) is reported as a warning and None is returned, so
    callers can fall back to a text-only query.

    Args:
        url: address of the image to fetch (5 second timeout).

    Returns:
        numpy array produced by squeezing the model's image features, or None
        when the image could not be fetched or embedded.
    """
    import warnings

    # Bound locally under an alias: a later cell in this notebook does
    # `from IPython.display import ... Image`, which rebinds the global name
    # `Image`. With the global lookup, `Image.open` then fails and this
    # function silently returned None for every image.
    from PIL import Image as PILImage

    try:
        # The context manager releases the streamed connection even on errors.
        with requests.get(url, stream=True, timeout=5) as response:
            response.raise_for_status()
            # convert() forces the pixel data to be read while the stream is open.
            image = PILImage.open(response.raw).convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():
            return model.get_image_features(**inputs).squeeze().numpy()
    except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
        warnings.warn(f"Could not embed image {url!r}: {exc}", RuntimeWarning)
        return None


from tqdm import tqdm
# used for parsing Image column when there are multiple images
def get_first_image_url(image_url_string):
    """Return the first non-blank URL of a pipe-separated string.

    Args:
        image_url_string: value such as "url1|url2|..."; may be missing (NaN/None).

    Returns:
        The first entry that is non-empty after stripping whitespace, or None
        when the value is missing or contains no such entry.
    """
    if pd.isna(image_url_string):
        return None
    for candidate in str(image_url_string).split("|"):
        candidate = candidate.strip()
        if candidate:
            return candidate
    return None

In [13]:
import numpy as np

# Load the precomputed embeddings back from disk when needed.
# NOTE(review): the search code indexes metadata_list with positions from this
# matrix, so row i presumably corresponds to DataFrame row i — confirm the file
# was generated from the same, identically ordered data.
final_embs = np.load("final_clip_embeddings.npy")

In [14]:
import faiss

# Read the dimensionality from the loaded matrix instead of hard-coding 512,
# so a checkpoint with a different projection size does not need a code change.
d = final_embs.shape[1]
# Exact (brute-force) L2 search. IndexFlatIP would give cosine similarity only
# if both the stored vectors and the queries are L2-normalised first.
index = faiss.IndexFlatL2(d)

# FAISS works on C-contiguous float32 matrices.
index.add(np.ascontiguousarray(final_embs, dtype="float32"))

In [15]:
# Smoke test: query the index with the first stored vector.
# Cast the query the same way the indexed vectors were cast (float32, contiguous).
query_vector = np.ascontiguousarray(final_embs[0].reshape(1, -1), dtype="float32")
D, I = index.search(query_vector, k=3)
# I: positions of the top-k nearest vectors (one row per query)
# D: the corresponding distances

# Metadata of the matched products — ready to send to an LLM or display in a UI.
# FAISS pads with -1 when fewer than k neighbours exist; skip those so that
# metadata_list[-1] is never returned by accident.
top_matches = [metadata_list[i] for i in I[0] if i >= 0]

In [19]:
# LLM have context length limits
def build_prompt(query, top_metadata):
    """Assemble the LLM prompt from the user question and retrieved products.

    Every item in ``top_metadata`` becomes a numbered section listing its
    non-null, non-blank fields as "  - key: value" lines. The whole metadata
    dict is placed into the prompt, so large fields count against the model's
    context limit.

    Args:
        query: the customer's question. May be None or blank (image-only
            search); a placeholder is used then instead of the literal "None".
        top_metadata: iterable of dicts describing the retrieved products.

    Returns:
        str: the prompt, stripped of leading/trailing whitespace.
    """
    sections = []
    for position, item in enumerate(top_metadata, start=1):
        lines = [f"{position}."]
        for key, value in item.items():
            if pd.notnull(value) and str(value).strip() != "":
                lines.append(f"  - {key}: {value}")
        sections.append("\n".join(lines))

    context = "\n\n".join(sections)

    if query is None or (isinstance(query, str) and not query.strip()):
        question = "(no text question was provided; the customer searched by image only)"
    else:
        question = query

    # Fixes relative to the earlier wording: the quoted fallback answer is now
    # closed with a matching quote, and "image" is spelled correctly.
    prompt = f"""
You are a helpful AI assistant for an e-commerce website. Your job is to answer customer questions based on available product details.

User question:
{question}

Here are some product descriptions that may be relevant:
{context}

Provide an informative and accurate response using the product information above. If multiple items are relevant, mention them.
If you don't know the answer, say "I’m not sure based on the available product data."
If the query asks for a picture of a product, please return the corresponding image URL from the context.
"""
    return prompt.strip()

def get_query_embedding(text_query=None, image_query=None, k=2):
    """Embed a text and/or image query, retrieve the nearest products and build the prompt.

    When both modalities are available the query vector is the weighted sum
    0.3 * text + 0.7 * image (the image is emphasised). If an image is given
    but cannot be embedded while text is available, the search falls back to
    the text embedding alone.

    Args:
        text_query: question text, or None/empty to search by image only.
        image_query: image URL, or None/empty to search by text only.
        k: number of neighbours to request from the index.

    Returns:
        tuple: (prompt, retrieved_items) where retrieved_items are the
        metadata dicts of the neighbours that were found.

    Raises:
        ValueError: if neither a text nor an image embedding could be obtained.
    """
    # Skip a modality that was not provided.
    text_emb = get_text_embedding(text_query) if text_query else None
    image_emb = get_image_embedding(image_query) if image_query else None

    if text_emb is not None and image_emb is not None:
        combined = 0.3 * text_emb + 0.7 * image_emb  # emphasise the image
    elif text_emb is not None:
        combined = text_emb
    elif image_emb is not None:
        combined = image_emb
    elif image_query:
        # An image was supplied but could not be embedded; say so explicitly.
        raise ValueError(f"Could not embed the query image: {image_query}")
    else:
        raise ValueError("Must provide at least text or image.")

    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
    query_emb = np.ascontiguousarray(combined.reshape(1, -1), dtype="float32")

    # FAISS search & retrieval. Labels of -1 mean "fewer than k neighbours
    # found"; skipping them avoids silently returning metadata_list[-1].
    _, neighbour_ids = index.search(query_emb, k)
    retrieved_items = [metadata_list[i] for i in neighbour_ids[0] if i >= 0]

    prompt = build_prompt(text_query, retrieved_items)
    return prompt, retrieved_items

In [36]:
# query
# image_path=None
# text_query="What are the features of the Samsung Galaxy S21?"

# text ask for information
# image_path=None
# text_query="Can you show me the price of DB Longboards? Also show me the product picture"
# text_query="Can you show me some Funko Pops? Also show their pictures"
# text_query="Can you show me the price of Funko pop? Also show me the product picture"

# image +text ask for information
# image_path="https://images-na.ssl-images-amazon.com/images/I/31An4Vzy6eL.jpg"
#image_path="https://images-na.ssl-images-amazon.com/images/I/31An4Vzy6eL.jpg"
# text_query="what's this product and its price"
#text_query="can you variant product of this product, I need their product name and some pictures"

# Active query: text only.
# image_path must always be bound before the call below; with every assignment
# commented out, a fresh kernel raised NameError here.
image_path = None
text_query="Can you show me the price of Bandai Spirits Bandai Hobby Mecha Collection Astro Battleship? Also show me the product picture"
# text_query="what's this product what are the similar products? show me some pictures"


prompt, items = get_query_embedding(text_query, image_query=image_path,k=4)

In [37]:
import os

from transformers import pipeline
from huggingface_hub import InferenceClient

# Never hard-code API tokens in a notebook: they end up in version control and
# in shared copies. The token is read from the HF_TOKEN environment variable.
# When it is unset, None is passed and huggingface_hub is expected to fall back
# to a locally saved login.
# NOTE(review): the tokens that were previously written in this cell should be
# treated as leaked and revoked.
client = InferenceClient(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    token=os.environ.get("HF_TOKEN"),
)

# Low temperature keeps the answer close to the retrieved product data;
# generation stops after at most 500 new tokens.
response = client.text_generation(
    prompt=prompt,
    max_new_tokens=500,
    temperature=0.05
)

print(response)



Answer:
I’m not sure based on the available product data. However, if you're looking for a Bandai Spirits Bandai Hobby Mecha Collection Astro Battleship, I couldn't find specific information about it in the provided data. I can share details about some related products:

1. Megahouse Cosmo Fleet Collection: Mobile Suit Gundam Zeta: A.E.U.G. Assault Cruiser Argama Figure
   - Price: $25.99
   - Image URL: https://images-na.ssl-images-amazon.com/images/I/41tUjlb9qZL.jpg|https://images-na.ssl-images-amazon.com/images/I/41-M8%2B-Mr5L.jpg|https://images-na.ssl-images-amazon.com/images/I/319FuT62ZKL.jpg|https://images-na.ssl-images-amazon.com/images/I/41gUW5t63bL.jpg|https://images-na.ssl-images-amazon.com/images/I/41aYa2ZS%2BzL.jpg|https://images-na.ssl-images-amazon.com/images/I/41Fd6EcU6sL.jpg|https://images-na.ssl-images-amazon.com/images/I/41eez5XpgIL.jpg|https://images-na.ssl-images-amazon.com/images/I/41Fd6EcU6sL.jpg|https://images-na.ssl-images-amazon.com/images/G/01/x-locale/commo

In [38]:
import re
from IPython.display import display, Image
from PIL import UnidentifiedImageError

# Extract all .jpg image URLs from the response
def extract_all_jpg_urls(response_text):
    """Return every https URL ending in ".jpg" found in the text, in order.

    The match is case-insensitive, non-greedy (each URL ends at its first
    ".jpg") and never spans whitespace or quote characters. Duplicates are kept.
    """
    jpg_url_pattern = re.compile(r"https:\/\/[^\s\"']+?\.jpg", flags=re.IGNORECASE)
    return jpg_url_pattern.findall(response_text)

# Display all image URLs found
def display_images_from_urls(urls):
    """Render each URL as an image in the notebook output.

    Errors are reported per URL and do not stop the remaining images from
    being displayed.

    Args:
        urls: iterable of image URLs.
    """
    # Imported locally under an alias: the notebook binds the global name
    # `Image` to two different things (PIL's module in the first cell,
    # IPython's display class in a later one), so the global lookup depended
    # on which cell ran last.
    from IPython.display import Image as IPyImage, display

    for url in urls:
        try:
            display(IPyImage(url=url))
        # NOTE(review): with url=..., IPython appears to emit an <img> tag
        # rather than download the file, so this handler is unlikely to fire —
        # confirm before removing it.
        except UnidentifiedImageError:
            print(f"[Unidentified Image] Skipped: {url}")
        except Exception as e:
            print(f"[Error] {url} — {e}")

# Pull the .jpg URLs out of the LLM response, report how many were found
# (duplicates included), and render them inline.
urls = extract_all_jpg_urls(response)
print(f"Extracted {len(urls)} image URLs.")
display_images_from_urls(urls)


Extracted 10 image URLs.
