In [None]:
# Install the third-party packages used by this notebook, then the CLIP package
# directly from its GitHub repository.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones this notebook was developed against.
!pip install torch torchvision transformers pandas openpyxl pillow requests reportlab opencv-python
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git


In [None]:
import pandas as pd
import numpy as np
import torch
import requests
import json
import cv2

from PIL import Image
from io import BytesIO
from typing import List
from pydantic import BaseModel

import clip
from transformers import BlipProcessor, BlipForConditionalGeneration


In [None]:
# Run the models on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [None]:
# Load the BLIP base image-captioning checkpoint. `processor` turns images into
# model inputs and decodes generated tokens; `blip_model` generates the tokens.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)
# The model is only used for inference in this notebook, so put it in eval mode.
blip_model.eval()


In [None]:
# Load the CLIP "ViT-B/32" model on the selected device. `clip.load` returns the
# model together with the preprocessing callable applied to images before encoding.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
# Inference only: switch to eval mode.
clip_model.eval()


In [None]:
# Colab-only step: prompts for a file upload so the product spreadsheet is
# available to the cell that reads it.
from google.colab import files
files.upload()  # upload A1.0_data_product_images.xlsx


In [None]:
# Read the uploaded product sheet and normalise header whitespace so that
# column lookups by name are reliable.
df = pd.read_excel("A1.0_data_product_images.xlsx")
df.columns = df.columns.str.strip()

# Every column whose name starts with "Image" holds an image URL, except the
# "Image Count" column, which is excluded.
image_columns = []
for column_name in df.columns:
    if column_name.startswith("Image") and column_name != "Image Count":
        image_columns.append(column_name)


In [None]:
def load_image_from_url(url, timeout=5):
    """Download an image and return it as an RGB PIL image.

    This is a best-effort loader: any ordinary failure (network error, non-2xx
    status, undecodable payload) yields ``None`` so the caller can skip the image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout (default 5).

    Returns:
        A ``PIL.Image.Image`` converted to "RGB", or ``None`` when the image
        could not be fetched or decoded.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    except Exception:
        # Deliberately `Exception`, not a bare `except:`: KeyboardInterrupt and
        # SystemExit must still propagate so the kernel can be interrupted while
        # a slow download is in progress.
        return None


In [None]:
def analyze_image_with_blip(image: Image.Image) -> str:
    """Generate a caption for ``image`` with the BLIP model.

    The image is prepared by ``processor``, moved to ``device``, and passed to
    ``blip_model.generate`` (at most 30 new tokens) without gradient tracking.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    model_inputs = processor(images=image, return_tensors="pt")
    model_inputs = model_inputs.to(device)

    with torch.no_grad():
        generated = blip_model.generate(max_new_tokens=30, **model_inputs)

    first_sequence = generated[0]
    return processor.decode(first_sequence, skip_special_tokens=True)


In [None]:
def extract_basic_visual_features(image: Image.Image):
    """Compute simple pixel statistics for ``image``.

    The image is converted to a grayscale array (interpreting the input as RGB),
    and three values are derived from it.

    Returns:
        dict with keys:
            "edge_density": mean value of the Canny edge map (thresholds 50/150).
            "brightness":   mean grayscale intensity.
            "aspect_ratio": width / height, or 1.0 when the height is zero.
    """
    grayscale = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    edge_map = cv2.Canny(grayscale, 50, 150)

    height, width = grayscale.shape
    if height > 0:
        ratio = width / height
    else:
        ratio = 1.0

    features = {}
    features["edge_density"] = edge_map.mean()
    features["brightness"] = grayscale.mean()
    features["aspect_ratio"] = ratio
    return features


In [None]:
def clip_similarity(image, prompts):
    """Score ``image`` against each text prompt with CLIP.

    Both the image embedding and the prompt embeddings are scaled to unit length
    before taking their dot products.

    Args:
        image: Image accepted by ``clip_preprocess``.
        prompts: Sequence of text prompts.

    Returns:
        dict mapping each prompt to its similarity with the image (Python float).
    """
    image_batch = clip_preprocess(image).unsqueeze(0).to(device)
    prompt_tokens = clip.tokenize(prompts).to(device)

    with torch.no_grad():
        image_embedding = clip_model.encode_image(image_batch)
        prompt_embeddings = clip_model.encode_text(prompt_tokens)

        # Unit-normalise so the matrix product below is a cosine similarity.
        image_embedding = image_embedding / image_embedding.norm(dim=-1, keepdim=True)
        prompt_embeddings = prompt_embeddings / prompt_embeddings.norm(dim=-1, keepdim=True)

        similarities = (image_embedding @ prompt_embeddings.T).squeeze(0)

    return {
        prompt: similarities[index].item()
        for index, prompt in enumerate(prompts)
    }


In [None]:
def scale_to_range(value, in_min=-0.05, in_max=0.05, out_min=-5.0, out_max=5.0):
    """Clamp ``value`` to an input range and map it linearly onto an output range.

    Args:
        value: Raw number to rescale.
        in_min: Lower bound of the input range; smaller values are clamped to it.
        in_max: Upper bound of the input range; larger values are clamped to it.
        out_min: Output produced for ``in_min`` (default -5.0).
        out_max: Output produced for ``in_max`` (default 5.0).

    Returns:
        The rescaled value rounded to two decimals.

    Raises:
        ValueError: If ``in_max`` is not strictly greater than ``in_min``. An
            empty range would otherwise divide by zero, and an inverted one
            would silently map every input to ``out_min``.
    """
    if in_max <= in_min:
        raise ValueError(
            f"in_max ({in_max}) must be greater than in_min ({in_min})"
        )

    clamped = max(in_min, min(in_max, value))
    fraction = (clamped - in_min) / (in_max - in_min)
    return round(out_min + fraction * (out_max - out_min), 2)


def infer_gender_expression_with_clip(image):
    """Score how feminine vs. masculine the eyewear in ``image`` looks.

    Returns:
        The "feminine" minus "masculine" CLIP similarity, rescaled by
        ``scale_to_range`` (positive leans feminine, negative leans masculine).
    """
    masculine_prompt = "masculine eyeglasses"
    unisex_prompt = "unisex eyeglasses"
    feminine_prompt = "feminine eyeglasses"

    similarity = clip_similarity(
        image, [masculine_prompt, unisex_prompt, feminine_prompt]
    )
    difference = similarity[feminine_prompt] - similarity[masculine_prompt]
    return scale_to_range(difference)


def infer_dominant_color_with_clip(image):
    """Pick the colour prompt that CLIP rates most similar to ``image``.

    Returns:
        The first word of the winning prompt: "black", "brown", "clear" or
        "colored".
    """
    color_prompts = [
        "black eyeglasses",
        "brown eyeglasses",
        "clear eyeglasses",
        "colored eyeglasses",
    ]
    similarity = clip_similarity(image, color_prompts)
    best_prompt = max(similarity, key=similarity.get)
    # Only the leading colour word of the prompt is reported.
    return best_prompt.split()[0]


def infer_visual_weight_with_clip(image):
    """Score how visually heavy the eyewear in ``image`` looks.

    Returns:
        The "bold heavy" minus "thin lightweight" CLIP similarity, rescaled by
        ``scale_to_range`` (positive leans heavy, negative leans light).
    """
    light_prompt = "thin lightweight eyeglasses"
    heavy_prompt = "bold heavy eyeglasses"

    similarity = clip_similarity(image, [light_prompt, heavy_prompt])
    difference = similarity[heavy_prompt] - similarity[light_prompt]
    return scale_to_range(difference)


def infer_embellishment_with_clip(image):
    """Score how ornate the eyewear in ``image`` looks.

    Returns:
        The "ornate decorative" minus "simple minimalist" CLIP similarity,
        rescaled by ``scale_to_range`` (positive leans ornate, negative leans
        minimalist).
    """
    plain_prompt = "simple minimalist eyeglasses"
    ornate_prompt = "ornate decorative eyeglasses"

    similarity = clip_similarity(image, [plain_prompt, ornate_prompt])
    difference = similarity[ornate_prompt] - similarity[plain_prompt]
    return scale_to_range(difference)


In [None]:
def infer_transparency(brightness):
    """Bucket a mean brightness value into a transparency label.

    Returns:
        "transparent" above 180, "semi-transparent" above 130, otherwise
        "opaque".
    """
    # Checked from the highest cutoff down; the first cutoff exceeded wins.
    cutoffs = (
        (180, "transparent"),
        (130, "semi-transparent"),
    )
    for cutoff, label in cutoffs:
        if brightness > cutoff:
            return label
    return "opaque"


In [None]:
def infer_frame_geometry_with_clip(image, aspect_ratio):
    """Classify the frame shape, preferring CLIP and falling back to aspect ratio.

    Args:
        image: Image to classify.
        aspect_ratio: Width / height of the image, used only when CLIP's two
            best shape prompts are too close to call.

    Returns:
        "round", "rectangular", "square" or "cat-eye" when CLIP separates its
        top prompt from the runner-up by more than 0.015; otherwise
        "rectangular" (aspect ratio >= 1.4), "round" (aspect ratio <= 1.15) or
        "ambiguous".
    """
    shape_prompts = [
        "round eyeglasses",
        "rectangular eyeglasses",
        "square eyeglasses",
        "cat-eye eyeglasses",
    ]
    similarity = clip_similarity(image, shape_prompts)

    # Rank prompts from most to least similar.
    ranked = sorted(similarity.items(), key=lambda pair: pair[1], reverse=True)
    best_prompt, best_score = ranked[0]
    _, runner_up_score = ranked[1]

    # A clear margin between the top two prompts means CLIP's answer is used.
    if best_score - runner_up_score > 0.015:
        return best_prompt.split()[0]

    # Otherwise fall back on the image proportions.
    if aspect_ratio >= 1.4:
        return "rectangular"
    if aspect_ratio <= 1.15:
        return "round"

    # Neither signal is decisive.
    return "ambiguous"



In [None]:
class VisualScores(BaseModel):
    """Numeric style scores reported for one product."""

    # In this notebook the next three are per-image CLIP scores (rescaled by
    # scale_to_range) averaged over the product's images.
    gender_expression: float
    visual_weight: float
    embellishment: float
    # caption_to_measurements sets these two to either 0.0 or 1.0.
    unconventionality: float
    formality: float


class VisualAttributes(BaseModel):
    """Categorical and boolean attributes reported for one product."""

    wirecore: bool  # caption_to_measurements sets this from "metal" appearing in the caption
    frame_geometry: str  # shape label such as "round", "rectangular", "square", "cat-eye", "ambiguous"
    transparency: str  # "transparent", "semi-transparent" or "opaque" (see infer_transparency)
    dominant_colors: List[str]  # colour labels; caption_to_measurements supplies a single-item list
    textures: List[str]  # texture labels; caption_to_measurements supplies a single-item list
    suitable_for_kids: bool  # heuristic flag computed in caption_to_measurements


class ProductAnalysis(BaseModel):
    """Complete analysis record for one product: scores, attributes and metadata."""

    scores: VisualScores  # numeric style scores
    attributes: VisualAttributes  # categorical / boolean attributes
    confidence: float  # caption_to_measurements currently always passes 0.85
    notes: str  # free-text remark attached to the analysis


In [None]:
def caption_to_measurements(
    image,
    caption,
    product_id,
    frame_geometry,
    dominant_color,
    texture,
    brightness,
    gender_expression,
    transparency,
    visual_weight,
    embellishment
):
    """Assemble a ``ProductAnalysis`` from already-computed measurements.

    ``image``, ``product_id`` and ``brightness`` are accepted but not used by
    the body.

    Derived values:
        * formality is 1.0 and wirecore is True when the caption contains
          "metal" (case-insensitive); otherwise 0.0 / False.
        * unconventionality is 1.0 for "cat-eye" or "ambiguous" frames, else 0.0.
        * suitable_for_kids requires visual_weight <= 0, embellishment < 0 and
          a frame geometry other than "cat-eye".

    Returns:
        ProductAnalysis with a fixed confidence of 0.85 and a fixed note.
    """
    mentions_metal = "metal" in caption.lower()
    unusual_shape = frame_geometry in ["cat-eye", "ambiguous"]
    kid_friendly = (
        visual_weight <= 0 and embellishment < 0 and frame_geometry != "cat-eye"
    )

    score_block = VisualScores(
        gender_expression=gender_expression,
        visual_weight=visual_weight,
        embellishment=embellishment,
        unconventionality=1.0 if unusual_shape else 0.0,
        formality=1.0 if mentions_metal else 0.0,
    )

    attribute_block = VisualAttributes(
        wirecore=mentions_metal,
        frame_geometry=frame_geometry,
        transparency=transparency,
        dominant_colors=[dominant_color],
        textures=[texture],
        suitable_for_kids=kid_friendly,
    )

    return ProductAnalysis(
        scores=score_block,
        attributes=attribute_block,
        confidence=0.85,
        notes="Derived exclusively from observable visual characteristics.",
    )


In [None]:
import re
from collections import Counter

def most_frequent_caption(captions, top_k=12):
    """Build a representative caption from the most frequent meaningful words.

    Each caption is lower-cased and split into alphabetic tokens; tokens in a
    small stop-word set (articles, prepositions and generic eyewear words) are
    dropped. The ``top_k`` most common remaining words are joined with spaces,
    most frequent first.

    Args:
        captions: List of caption strings.
        top_k: Maximum number of words in the result.

    Returns:
        The joined words; the first caption unchanged when every token was a
        stop word; or "" when ``captions`` is empty.
    """
    stopwords = {
        "a", "the", "and", "of", "with", "on", "in", "is", "are",
        "this", "that", "pair", "glasses", "eyeglasses", "frame", "frames"
    }

    # Nothing to summarise; avoids indexing into an empty list below.
    if not captions:
        return ""

    words = [
        token
        for caption in captions
        for token in re.findall(r"[a-zA-Z]+", caption.lower())
        if token not in stopwords
    ]

    # Every token was filtered out: fall back to the first raw caption.
    if not words:
        return captions[0]

    return " ".join(word for word, _ in Counter(words).most_common(top_k))


In [None]:
results = []
MAX_PRODUCTS = 10  # upper bound on how many spreadsheet rows are processed


def _majority_vote(values):
    """Return the most frequent value in ``values``; ties go to the first one seen.

    ``Counter.most_common`` keeps first-seen order among equal counts, so the
    result is the same on every run. ``max(set(values), key=values.count)``
    breaks ties by set iteration order, which is not stable across sessions.
    """
    return Counter(values).most_common(1)[0][0]


for _, row in df.head(MAX_PRODUCTS).iterrows():
    product_id = row["Product Id"]
    print(f"\nStarting product {product_id}")

    # ---- Load images safely ----
    # Only string cells are treated as URLs; failed downloads come back as None.
    urls = [row[c] for c in image_columns if isinstance(row[c], str)]
    images = [
        loaded
        for loaded in (load_image_from_url(u) for u in urls)
        if loaded is not None
    ]

    if not images:
        print("No valid images, skipping")
        continue

    captions = []
    genders = []
    weights = []
    embellishments = []
    geometries = []
    # Not named `colors`: that global name is later bound to reportlab's
    # `colors` module, and re-running this cell would clobber it.
    dominant_colors = []
    transparencies = []
    textures = []
    brightness_vals = []

    # ---- Per-image processing ----
    for image in images:
        caption = analyze_image_with_blip(image)
        features = extract_basic_visual_features(image)

        captions.append(caption)
        genders.append(infer_gender_expression_with_clip(image))
        weights.append(infer_visual_weight_with_clip(image))
        embellishments.append(infer_embellishment_with_clip(image))
        geometries.append(
            infer_frame_geometry_with_clip(image, features["aspect_ratio"])
        )
        dominant_colors.append(infer_dominant_color_with_clip(image))
        transparencies.append(infer_transparency(features["brightness"]))
        textures.append("smooth" if features["edge_density"] < 30 else "textured")
        brightness_vals.append(features["brightness"])

    # ---- Build one representative caption from all per-image captions ----
    representative_caption = most_frequent_caption(captions)
    print("BLIP Caption (Most Frequent Words):", representative_caption)

    # ---- Aggregate measurements ----
    # Categorical values: majority vote across images. Numeric values: mean.
    analysis = caption_to_measurements(
        image=images[0],
        caption=representative_caption,
        product_id=product_id,
        frame_geometry=_majority_vote(geometries),
        dominant_color=_majority_vote(dominant_colors),
        texture=_majority_vote(textures),
        brightness=sum(brightness_vals) / len(brightness_vals),
        gender_expression=sum(genders) / len(genders),
        transparency=_majority_vote(transparencies),
        visual_weight=sum(weights) / len(weights),
        embellishment=sum(embellishments) / len(embellishments)
    )

    results.append({
        "product_id": product_id,
        "image_count": len(images),
        "visual_description": representative_caption,
        "analysis": analysis.model_dump()
    })

    print(f"Processed {product_id} | images={len(images)}")


In [None]:
# Print every collected product record as indented JSON for inspection.
print("\n========== FINAL JSON OUTPUT ==========\n")
print(json.dumps(results, indent=2))



In [None]:
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
)
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.pagesizes import A4
from reportlab.lib import colors


def generate_stylish_pdf(results, filename="visual_product_report.pdf"):
    """Render the analysis results as a formatted A4 PDF report.

    For every entry in ``results`` the report contains the product id and image
    count, the visual description, a table of scores, and a list of attributes.

    Args:
        results: List of dicts with keys "product_id", "image_count",
            "visual_description" and "analysis" (itself holding "scores" and
            "attributes" dicts).
        filename: Path of the PDF file to write.
    """
    # Paragraph text is parsed as XML-like markup, so data-derived text must be
    # escaped; otherwise a stray "<" or "&" would be read as markup.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(
        filename,
        pagesize=A4,
        rightMargin=36,
        leftMargin=36,
        topMargin=36,
        bottomMargin=36,
    )

    styles = getSampleStyleSheet()
    styles.add(
        ParagraphStyle(
            name="SectionHeader",
            fontSize=13,
            spaceAfter=8,
            textColor=colors.darkblue,
        )
    )

    elements = []

    # ---- Title ----
    elements.append(Paragraph(
        "<b>Visual Product Measurement Report</b>",
        styles["Title"]
    ))
    elements.append(Spacer(1, 20))

    for r in results:
        product_id_text = escape(str(r["product_id"]))
        description_text = escape(str(r["visual_description"]))

        # ---- Product Header ----
        elements.append(Paragraph(
            f"<b>Product ID:</b> {product_id_text}",
            styles["Heading2"]
        ))
        elements.append(Paragraph(
            f"<b>Image Count:</b> {r['image_count']}",
            styles["Normal"]
        ))
        elements.append(Spacer(1, 8))

        # ---- Caption ----
        elements.append(Paragraph("Visual Description", styles["SectionHeader"]))
        elements.append(Paragraph(
            description_text,
            styles["BodyText"]
        ))
        elements.append(Spacer(1, 10))

        # ---- Scores Table ----
        # Table cells here are plain strings rather than Paragraph markup, so
        # they need no escaping.
        elements.append(Paragraph("Visual Measurements", styles["SectionHeader"]))
        scores = r["analysis"]["scores"]
        score_rows = [["Dimension", "Score (-5 → +5)"]] + [
            [k.replace("_", " ").title(), f"{v:.2f}"]
            for k, v in scores.items()
        ]

        score_table = Table(score_rows, colWidths=[240, 120])
        score_table.setStyle(TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), colors.whitesmoke),
            ("GRID", (0, 0), (-1, -1), 0.8, colors.grey),
            ("FONT", (0, 0), (-1, 0), "Helvetica-Bold"),
            ("ALIGN", (1, 1), (-1, -1), "CENTER"),
        ]))

        elements.append(score_table)
        elements.append(Spacer(1, 12))

        # ---- Attributes ----
        elements.append(Paragraph("Observable Attributes", styles["SectionHeader"]))
        attrs = r["analysis"]["attributes"]
        for k, v in attrs.items():
            label = escape(k.replace("_", " ").title())
            value_text = escape(str(v))
            elements.append(
                Paragraph(f"<b>{label}:</b> {value_text}", styles["Normal"])
            )

        elements.append(Spacer(1, 18))

    doc.build(elements)
    print(f"PDF generated: {filename}")


In [None]:
# Build the PDF report from `results` (written under the function's default
# filename), then hand that file to Colab's download helper (Colab-only).
generate_stylish_pdf(results)
from google.colab import files
files.download("visual_product_report.pdf")
