In [1]:
import clip #for model classification
import torch
from PIL import Image
import requests
from pymongo import MongoClient
from io import BytesIO
from tqdm import tqdm#lib to show progress


In [2]:
from dotenv import load_dotenv
import os

In [3]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Connection string for MongoDB. Fail fast when it is missing: passing None to
# MongoClient falls back to its default host, so the later updates would be
# written to the wrong database without any error.
db_url = os.getenv("DB_URL")
if not db_url:
    raise RuntimeError(
        "DB_URL is not set; define it in the environment or in a .env file"
    )

In [4]:
# Open the MongoDB connection and grab handles to the database and the
# collection this notebook reads from and writes back to.
client = MongoClient(db_url)
db = client.get_database("Honda_cars")
collection = db.get_collection("listings")


In [None]:
def set_clip_function():
    """Build everything needed to classify listing images with CLIP.

    Returns:
        An 8-tuple ``(LABELS, CATEGORY_MAP, device, model, preprocess,
        text_tokens, session, headers)`` where

        * ``LABELS`` is the list of text prompts given to CLIP,
        * ``CATEGORY_MAP`` maps each index in ``LABELS`` to a category name,
        * ``device`` is ``"cuda"`` when available, otherwise ``"cpu"``,
        * ``model`` / ``preprocess`` come from ``clip.load("ViT-B/32")``,
        * ``text_tokens`` are the tokenized prompts moved to ``device``,
        * ``session`` is a ``requests.Session`` for image downloads,
        * ``headers`` are the HTTP headers sent with each download.
    """
    # Prompts and their categories are declared side by side so the index of a
    # prompt and the key of its category can never drift apart.
    prompt_categories = [
        # Core
        ("a clear photo of a car exterior (outside of the car)", "exterior_images"),
        ("a clear photo of a car interior showing seats and dashboard", "interior_images"),
        ("a clear photo of a car engine bay with engine visible", "engine_images"),

        # Interior breakdown
        ("a close-up photo of car dashboard or meter cluster", "dashboard_images"),
        ("a photo of car seats (front or rear seats)", "seat_images"),
        ("a photo of car infotainment system or screen", "infotainment_images"),
        ("a photo of car door panels or interior side", "door_panel_images"),

        # Components
        ("a photo of car trunk or boot space", "trunk_images"),
        ("a photo of car tires or wheels", "tire_images"),

        # Noise / special
        ("a photo of car keys or keychain", "key_images"),
        ("a photo of documents, papers, or registration book", "document_images"),
        ("a close-up photo of car damage, scratches, dents or broken parts", "damage_images"),
        ("a blurry or low quality photo", "blurry_images"),
        ("a random unrelated photo (garbage, ground, wall, people, hand, etc)", "junk_images"),
    ]

    LABELS = [prompt for prompt, _ in prompt_categories]
    CATEGORY_MAP = {
        index: category for index, (_, category) in enumerate(prompt_categories)
    }

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    text_tokens = clip.tokenize(LABELS).to(device)
    session = requests.Session()

    # Browser-like headers sent with every image request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.pakwheels.com/used-cars/honda/32",
    }

    return LABELS, CATEGORY_MAP, device, model, preprocess, text_tokens, session, headers

In [6]:
# Load the CLIP model once and unpack the shared classification resources.
(
    LABELS,
    CATEGORY_MAP,
    device,
    model,
    preprocess,
    text_tokens,
    session,
    headers,
) = set_clip_function()

In [7]:
def classify_image_url(
    url,
    model,
    preprocess,
    text_tokens,
    labels,
    device,
    headers,
    session,
    confidence_threshold=0.5
):
    """Download one image and zero-shot classify it against the text prompts.

    Args:
        url: Address of the image to download.
        model: Object exposing ``encode_image`` and ``encode_text``.
        preprocess: Callable turning a PIL image into a model input tensor.
        text_tokens: Tokenized prompts, in the same order as ``labels``.
        labels: Prompt strings; the best-scoring one is returned.
        device: Device the image tensor is moved to.
        headers: HTTP headers sent with the download request.
        session: Object with a ``requests.Session``-style ``get`` method.
        confidence_threshold: Minimum probability for a label to be accepted.

    Returns:
        ``(label, confidence)`` for the best-scoring prompt,
        ``(None, confidence)`` when the best score is below the threshold, or
        ``(None, 0.0)`` when downloading or classifying raised an exception
        (the error is printed, not re-raised).
    """
    # Temperature applied to cosine similarities before the softmax; this is
    # the factor used in the CLIP zero-shot classification example.
    logit_scale = 100.0

    try:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        img = Image.open(BytesIO(response.content)).convert("RGB")
        image_tensor = preprocess(img).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image_tensor)
            text_features = model.encode_text(text_tokens)

            # Compare directions, not magnitudes: L2-normalise both sides so
            # the dot product is a cosine similarity.
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

            logits = logit_scale * (image_features @ text_features.T)
            # Softmax in float32 so the probabilities do not lose precision
            # when the features are lower precision.
            similarity = logits.float().softmax(dim=-1)

        best_idx = similarity.argmax(dim=-1).item()
        confidence = similarity[0][best_idx].item()
        label = labels[best_idx]

        if confidence < confidence_threshold:
            return None, confidence

        return label, confidence

    except Exception as e:
        # Best effort: one bad image must not abort the whole batch.
        print(f"[ERROR] Failed on {url}: {e}")
        return None, 0.0


In [22]:
def process_documents(
    collection,
    model,
    preprocess,
    text_tokens,
    labels,
    device,
    headers,
    session,
    skip_count=2300
):
    """Classify every listing's images and store the grouped URLs in MongoDB.

    Documents that have an ``images`` field are visited in ascending ``_id``
    order, skipping the first ``skip_count`` of them. Each image URL is passed
    to ``classify_image_url``; URLs whose winning label contains "exterior",
    "interior" or "engine" are collected, and the three lists are written back
    to the document as ``exterior_images``, ``interior_images`` and
    ``engine_images``.

    Args:
        collection: MongoDB collection holding the listings.
        model, preprocess, text_tokens, labels, device, headers, session:
            Forwarded unchanged to ``classify_image_url``.
        skip_count: Number of matching documents to skip before processing.
    """
    # Collect the ids up front so no cursor stays open during the slow
    # per-image work. The explicit sort makes the skip offset deterministic;
    # without it the order documents come back in is not guaranteed.
    print("Fetching document IDs...")
    id_cursor = (
        collection.find({"images": {"$exists": True}}, {"_id": 1})
        .sort("_id", 1)
        .skip(skip_count)
    )
    all_ids = [doc["_id"] for doc in id_cursor]

    print(f"Found {len(all_ids)} documents to process, starting from position {skip_count}")

    # Process each document individually
    for doc_id in tqdm(all_ids, desc="Processing documents"):
        # Only the image list is needed, so do not pull the whole document.
        doc = collection.find_one({"_id": doc_id}, {"images": 1})

        if not doc:
            continue

        exterior, interior, engine = [], [], []

        # `$exists: True` also matches documents whose `images` is null.
        for url in doc.get("images") or []:
            label, conf = classify_image_url(
                url=url,
                model=model,
                preprocess=preprocess,
                text_tokens=text_tokens,
                labels=labels,
                device=device,
                headers=headers,
                session=session
            )
            if label is None:
                continue

            # Routing is by substring of the winning prompt text; any label
            # mentioning none of these three words is dropped.
            if "exterior" in label:
                exterior.append(url)
            elif "interior" in label:
                interior.append(url)
            elif "engine" in label:
                engine.append(url)

        # Update MongoDB document
        collection.update_one(
            {"_id": doc_id},
            {"$set": {
                "exterior_images": exterior,
                "interior_images": interior,
                "engine_images": engine
            }}
        )

In [23]:
# Run the classification pass over the listings collection.
process_documents(
    collection=collection,
    model=model,
    preprocess=preprocess,
    text_tokens=text_tokens,
    labels=LABELS,
    device=device,
    headers=headers,
    session=session,
)


Fetching document IDs...
Found 313 documents to process, starting from position 2300


Processing documents:  80%|████████  | 251/313 [53:22<18:10, 17.58s/it]  

[ERROR] Failed on https://cache3.pakwheels.com/ad_pictures/1341/honda-civic-vti-oriel-2-2000-134103310.webp: HTTPSConnectionPool(host='cache3.pakwheels.com', port=443): Read timed out.


Processing documents: 100%|██████████| 313/313 [1:07:59<00:00, 13.03s/it]
