In [26]:
import clip #for model classification
import torch
from PIL import Image
import requests
from pymongo import MongoClient
from io import BytesIO
from tqdm import tqdm#lib to show progress


In [27]:
from dotenv import load_dotenv
import os

In [28]:
# Run python-dotenv first so that DB_URL can be supplied through a .env file,
# then read the MongoDB connection string from the process environment.
# `db_url` is None when the variable is not set.
load_dotenv()
db_url = os.environ.get("DB_URL")

In [29]:
# Open the MongoDB connection and keep handles to the database and to the
# collection that the rest of the notebook reads from and writes to.
client = MongoClient(db_url)
db = client.get_database("Honda_cars")
collection = db.get_collection("listings")


In [30]:
def set_clip_function():
    """Create the shared objects used by the image-classification cells.

    Loads the CLIP ``ViT-B/32`` model on ``"cuda"`` when
    ``torch.cuda.is_available()`` is true (otherwise on ``"cpu"``), tokenizes
    the three text prompts on that device, and prepares a ``requests``
    session plus the headers to send with image requests.

    Returns:
        tuple: ``(LABELS, CATEGORY_MAP, device, model, preprocess,
        text_tokens, session, headers)``, in exactly that order.
    """
    # Text prompts the images are compared against, one per category.
    prompts = [
        "a clear photo of a car exterior",
        "a clear photo of a car interior with seats and dashboard",
        "a clear photo of a car engine bay",
    ]

    # Index of each prompt above -> name of the matching document field.
    field_by_prompt_index = {
        0: "exterior_images",
        1: "interior_images",
        2: "engine_images",
    }

    target_device = "cpu"
    if torch.cuda.is_available():
        target_device = "cuda"

    clip_model, image_transform = clip.load("ViT-B/32", device=target_device)
    prompt_tokens = clip.tokenize(prompts).to(target_device)

    http_session = requests.Session()

    # Browser-style request headers, including a Referer for the listing site.
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.pakwheels.com/used-cars/honda/32",
    }

    return (
        prompts,
        field_by_prompt_index,
        target_device,
        clip_model,
        image_transform,
        prompt_tokens,
        http_session,
        request_headers,
    )

In [31]:
# Build the model, prompt tokens and HTTP helpers once; later cells reuse
# these module-level names.
(
    LABELS,
    CATEGORY_MAP,
    device,
    model,
    preprocess,
    text_tokens,
    session,
    headers,
) = set_clip_function()

In [32]:
def classify_image_url(
    url,
    model,
    preprocess,
    text_tokens,
    labels,
    device,
    headers,
    session,
    confidence_threshold=0.5
):
    """Download one image and zero-shot classify it against ``labels``.

    Args:
        url: Address of the image to download.
        model: Object exposing ``encode_image`` and ``encode_text``.
        preprocess: Callable that turns a PIL image into a tensor for ``model``.
        text_tokens: Tokenized prompts, one per entry of ``labels`` and in the
            same order. They are re-encoded on every call.
        labels: Sequence of prompt strings; the best-scoring entry is returned.
        device: Torch device the image tensor is moved to.
        headers: HTTP headers sent with the download request.
        session: Object with a ``requests``-style ``get`` method.
        confidence_threshold: Minimum softmax probability the best label must
            reach to be accepted.

    Returns:
        tuple: ``(label, confidence)``. ``label`` is ``None`` when the best
        probability is below ``confidence_threshold``. Any exception raised
        while downloading, decoding or scoring is printed and reported as
        ``(None, 0.0)`` so that one bad image does not stop the caller.
    """
    # Scale applied to the cosine similarities before the softmax; this is the
    # value used in the official CLIP zero-shot example.
    logit_scale = 100.0

    try:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        img = Image.open(BytesIO(response.content)).convert("RGB")
        image_tensor = preprocess(img).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image_tensor)
            text_features = model.encode_text(text_tokens)

            # The embeddings are not unit length. Without normalising them the
            # dot product is not a cosine similarity and each prompt's score
            # is skewed by the magnitude of its own embedding.
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

            logits = logit_scale * image_features @ text_features.T
            # Softmax in float32 regardless of the dtype the model computes in.
            similarity = logits.float().softmax(dim=-1)

        probabilities = similarity[0]
        best_idx = probabilities.argmax().item()
        confidence = probabilities[best_idx].item()
        label = labels[best_idx]

        if confidence < confidence_threshold:
            return None, confidence

        return label, confidence

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller go on.
        print(f"[ERROR] Failed on {url}: {e}")
        return None, 0.0


In [33]:
def process_documents(
    collection,
    model,
    preprocess,
    text_tokens,
    labels,
    device,
    headers,
    session
):
    """Classify each document's images and store the URLs grouped by category.

    For every document in ``collection`` that has an ``images`` field, each
    URL is passed to ``classify_image_url``. URLs whose returned label
    contains ``"exterior"``, ``"interior"`` or ``"engine"`` are collected and
    written back to the document as ``exterior_images``, ``interior_images``
    and ``engine_images``. URLs with no accepted label are left out.

    Args:
        collection: MongoDB collection providing ``find`` and ``update_one``.
        model, preprocess, text_tokens, labels, device, headers, session:
            Forwarded unchanged to ``classify_image_url``.

    Returns:
        None. The three list fields of every processed document are
        overwritten, also with empty lists when nothing was classified.
    """
    # Fetch the work list up front, keeping only `_id` and `images`. Iterating
    # a live cursor would hold it open on the server for the whole run while
    # each document waits on downloads and inference, and an idle cursor can
    # be timed out by the server (CursorNotFound). A list also gives tqdm a
    # total and avoids updating the collection while it is being iterated.
    docs = list(collection.find({"images": {"$exists": True}}, {"images": 1}))

    for doc in tqdm(docs):
        # Keyword looked for in the label -> URLs collected for it. Insertion
        # order sets the matching priority: exterior, interior, then engine.
        urls_by_keyword = {"exterior": [], "interior": [], "engine": []}

        # `images` passes the $exists filter even when it is null, so treat a
        # falsy value as "no images" instead of iterating over None.
        for url in doc.get("images") or []:

            label, _ = classify_image_url(
                url=url,
                model=model,
                preprocess=preprocess,
                text_tokens=text_tokens,
                labels=labels,
                device=device,
                headers=headers,
                session=session
            )

            if label is None:
                continue

            # Simple rule mapping: the first keyword found in the label wins.
            for keyword, urls in urls_by_keyword.items():
                if keyword in label:
                    urls.append(url)
                    break

        # Update MongoDB document
        collection.update_one(
            {"_id": doc["_id"]},
            {"$set": {
                "exterior_images": urls_by_keyword["exterior"],
                "interior_images": urls_by_keyword["interior"],
                "engine_images": urls_by_keyword["engine"]
            }}
        )


In [None]:
# Classify the images of every listing and write the grouped URLs back to
# MongoDB. Keyword arguments make the wiring of the eight inputs explicit.
process_documents(
    collection=collection,
    model=model,
    preprocess=preprocess,
    text_tokens=text_tokens,
    labels=LABELS,
    device=device,
    headers=headers,
    session=session,
)


1066it [1:53:23,  5.51s/it]