In [43]:
import pandas as pd
from datasets import load_dataset
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from sklearn.cluster import KMeans
import cv2
import os

In [34]:

# Fetch only the training split of the furniture dataset via `datasets.load_dataset`.
dataset = load_dataset(path="Francesco/furniture-ngpea", split="train")

In [35]:
# Convert the loaded dataset into a pandas DataFrame.
# The name `dataset` is rebound here, so the conversion is guarded: re-running
# this cell after it has already executed would otherwise raise AttributeError
# (a DataFrame has no `to_pandas` method).
if not isinstance(dataset, pd.DataFrame):
    dataset = dataset.to_pandas()

In [36]:
# Show the frame's dimensions as (rows, columns).
dataset.shape

(454, 5)

In [37]:
# List the column names available in the converted frame.
dataset.columns

Index(['image_id', 'image', 'width', 'height', 'objects'], dtype='str')

In [38]:
# Preview the first rows to see what each column holds.
dataset.head()

Unnamed: 0,image_id,image,width,height,objects
0,406,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,640,640,"{'id': [406], 'area': [219402], 'bbox': [[142...."
1,164,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,640,640,"{'id': [164], 'area': [28743], 'bbox': [[268.0..."
2,329,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,640,640,"{'id': [329], 'area': [206784], 'bbox': [[42.0..."
3,379,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,640,640,"{'id': [379], 'area': [230545], 'bbox': [[2.0,..."
4,60,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,640,640,"{'id': [60], 'area': [207142], 'bbox': [[25.0,..."


In [39]:
import torch
import clip
from PIL import Image
import numpy as np
import os

# Load the CLIP model together with its image preprocessing transform.
# The cells below read `device`, `model` and `preprocess`, so this code has to
# run on a fresh kernel; leaving it commented out makes "Restart & Run All"
# fail with NameError. Running it once per kernel session is enough.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
print(f"CLIP loaded on {device}")


In [40]:

# Candidate room categories; each string is used verbatim as a CLIP text prompt,
# and list positions are used to map prediction indices back to a label.
room_labels = [
    "living room", "bedroom", "kitchen", "bathroom",
    "dining room", "home office", "hallway", "study room",
    "balcony", "garage", "gaming room",
]

# Encode the room labels a single time: the label list is fixed, so the text
# embeddings can be reused for every image that is classified afterwards.
text_tokens = clip.tokenize(room_labels).to(device)
with torch.no_grad():
    text_features = model.encode_text(text_tokens)
    # Scale every embedding to unit length along the feature axis.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)


In [46]:
def infer_room_type(image_path, top_k=1):
    """Guess which room an image shows by matching it against ``room_labels``.

    The image is embedded with the CLIP ``model``, normalized, and compared with
    the precomputed ``text_features``; the scaled similarities go through a
    softmax so that every label receives a probability.

    Relies on the notebook-level names ``model``, ``preprocess``, ``device``,
    ``text_features`` and ``room_labels``.

    Args:
        image_path: Location of the image, passed to ``Image.open``.
        top_k: How many of the highest-scoring labels to report (at least 1).

    Returns:
        A tuple ``(results, best_label)``. ``results`` is a list of
        ``(score_percent, label)`` pairs ordered from most to least likely and
        ``best_label`` is the label of its first entry.
        If the image cannot be opened, the string ``"Error: <message>"`` is
        returned instead of the tuple, so callers should check for a ``str``
        before unpacking.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    # Reject unusable values early: 0 would leave `results` empty (IndexError
    # further down) and a negative value would slice from the wrong end.
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    try:
        # The context manager releases the file handle once the pixels have
        # been copied into the converted RGB image.
        with Image.open(image_path) as source:
            img = source.convert("RGB")
    except Exception as e:
        # Kept for backward compatibility: failures are reported as a string.
        return f"Error: {e}"

    img_input = preprocess(img).unsqueeze(0).to(device)

    with torch.no_grad():
        img_features = model.encode_image(img_input)
        img_features /= img_features.norm(dim=-1, keepdim=True)

        # Both sides are unit-length, so the matrix product is the cosine
        # similarity; it is scaled by 100 before the softmax over the labels.
        similarity = 100.0 * img_features @ text_features.T
        probs = similarity.softmax(dim=-1).cpu().numpy()[0]

    # Label indices ordered by descending probability, cut to the best top_k.
    top_idx = np.argsort(probs)[::-1][:top_k]

    print("Top matches:")
    results = []
    for idx in top_idx:
        score = probs[idx] * 100
        label = room_labels[idx]
        print(f"  {score:5.1f}% → {label}")
        results.append((score, label))

    # The first entry is the most likely label.
    best_score, best_label = results[0]
    print(f"\nBest room type: {best_label} ({best_score:.1f}%)")

    return results, best_label

if __name__ == "__main__":
    # Demo run on one sample picture; when the file is absent, print a
    # reminder instead of calling the classifier.
    image = os.path.join("images", "459189625.jpg")
    if not os.path.exists(image):
        print("Add the image and update the path!")
    else:
        infer_room_type(image, top_k=1)

Top matches:
   78.3% → bedroom

Best room type: bedroom (78.3%)
