As you will see below, the dataset is not very consistent, and it contains no measurements from a distance-sensing device such as a LiDAR sensor. I am therefore working only with images: more than 25,000 product photos shared in the review sections of selected products on an e-commerce site. The goal is to answer the question: can the dimensions of a product be estimated solely from pictures taken from different angles?

**My aim is not to achieve a successful result, but to answer the question of how accurately the dimensions can be predicted.**

In [None]:
import torch
import torchvision.transforms as transforms
from torchvision import models
from torch import nn, optim
from PIL import Image
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import os
from PIL import Image
import random
import warnings
import re
import matplotlib.pyplot as plt
warnings.simplefilter("ignore")

In [None]:
# Folder that holds every review photo of the dataset.
images_folder_paths = '/kaggle/input/dimension-photos'
# Bare file names (no directory part); os.listdir returns them in arbitrary order.
images_name=os.listdir(images_folder_paths)

In [None]:
len(images_name)

In [None]:
images_name[:5]

In [None]:
[name for name in images_name if 'English_Home' in name] # On average there are 60 photos of each product

In [None]:
# Prefix every file name with the dataset folder, keeping the order of
# `images_name` so both lists stay index-aligned.
full_photo_paths = [
    os.path.join(images_folder_paths, file_name) for file_name in images_name
]

In [None]:
full_photo_paths[:5]

In [None]:
Image.open(full_photo_paths[100])


In [None]:
df_csv = pd.read_csv('/kaggle/input/dimension-csv/finaldata.csv')
df_csv.head()

In [None]:
# Normalise product names so they match the image file names:
# dashes become underscores; commas, parentheses and the trademark sign are dropped.
# regex=False is passed explicitly: '(' and ')' are not valid regular
# expressions on their own, and the default of `regex` differs between
# pandas versions, so literal matching must not depend on it.
df_csv['product_name'] = df_csv['product_name'].str.replace('-', '_', regex=False)
for unwanted_char in (',', '(', ')', '™'):
    df_csv['product_name'] = df_csv['product_name'].str.replace(unwanted_char, '', regex=False)

In [None]:
df_csv.info()

In [None]:
images_name[:4]

In [None]:
full_photo_paths[:4]

In [None]:
df_csv.tail()

In [None]:
# File names without their extension (only the last ".xxx" is removed),
# index-aligned with `images_name`.
images_only_name = [file_name.rsplit('.', 1)[0] for file_name in images_name]

images_only_name[:4]

In [None]:
# Join the CSV metadata with the image files. An image belongs to a CSV row
# when its extension-less file name equals "<product_name>_<Id>".
keys = ['Id','product_name','height','width','image_path']

# Build the name -> path lookup once instead of scanning the list for every
# CSV row. setdefault keeps the FIRST path for a duplicated name, which is
# what list.index() returned before.
path_by_name = {}
for stem, path in zip(images_only_name, full_photo_paths):
    path_by_name.setdefault(stem, path)

matched_rows = []
for prod_id, prod_name, height, width in zip(
    df_csv['Id'], df_csv['product_name'], df_csv['height'], df_csv['width']
):
    result = f"{prod_name}_{prod_id}"
    matched_path = path_by_name.get(result)
    if matched_path is None:
        # No photo on disk for this CSV row: report it, as before.
        print(result)
    else:
        matched_rows.append([prod_id, prod_name, height, width, matched_path])

# Create the frame in one go; growing it row by row with df.loc is quadratic.
df = pd.DataFrame(matched_rows, columns=keys)

In [None]:
df.tail(4)

In [None]:
# Product name of every matched image: take the file name and cut off the
# trailing "_<Id>.<ext>" part.
# os.path.basename is used instead of a fixed split('/')[4] so the result
# does not depend on how deep the dataset folder is nested.
product_name_list = [
    os.path.basename(photo_path).rsplit('_', 1)[0]
    for photo_path in df['image_path']
]

unique_listt = list(set(product_name_list))
print(len(unique_listt)) #i have 511 different product.

In [None]:
unique_listt[:5] #some examples

In [None]:
def show_some_photos(product_name='.', already_list=None, n_cols=5):
    """Plot a grid with the photos of one product (or of an explicit path list).

    Args:
        product_name: Substring looked up in every entry of the module-level
            ``full_photo_paths``; all matching paths are displayed. It is also
            used in the figure title.
        already_list: Optional list of image paths. When it is non-empty it is
            displayed instead of the ``product_name`` matches.
        n_cols: Maximum number of images per row.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # `None` replaces the former mutable `[]` default; an empty list still
    # falls back to the product-name search.
    if already_list:
        gal_list = list(already_list)
    else:
        gal_list = [name for name in full_photo_paths if product_name in name]

    if not gal_list:
        print(f"No images found for {product_name}")
        return

    images = []
    for path in gal_list:
        # copy() loads the pixels so the file handle can be closed right away.
        with Image.open(path) as handle:
            images.append(handle.copy())

    # Ceil-divide so that every image gets a cell; the old
    # `len(images)//5` row count was 0 (ZeroDivisionError) for fewer than
    # five images and dropped the remainder otherwise.
    n_cols = max(1, min(n_cols, len(images)))
    n_rows = -(-len(images) // n_cols)

    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(15, max(3, len(images) // 1.5)),
        squeeze=False,  # always a 2-D array, even for a 1x1 grid
    )
    flat_axes = axes.ravel()
    for ax in flat_axes:
        ax.axis("off")  # also hides the unused trailing cells
    for ax, img in zip(flat_axes, images):
        ax.imshow(img)
    plt.suptitle(f"{str(len(images))} samples of {product_name} of the dataset", fontsize=16, y=0.9)
    plt.show()

In [None]:
#show_some_photos('Rota_Hediyelik')

In [None]:
#show_some_photos('B101_LED_Işık_Micro_USB_Type_C_Girişli_10.000_mAh_Taşınabilir_Şarj_Cihazı_Powerbank_Gri')

### As you can see, some pictures are irrelevant,
### so I am going to use VGG16 to detect them.

## How does Keras handle resizing?
Keras' image.load_img(img_path, target_size=(224, 224)) uses nearest neighbor interpolation (or another interpolation method) to directly stretch or shrink the image to the given size. This means:

If the image is larger, it will shrink.

If the image is smaller, it will expand.

The aspect ratio will not be preserved (it will distort the image instead of adding black bars).

**So at first I need to convert the images to 224x224 so I'm going to use padding because I want to keep the ratio of photos**

In [None]:
#from collections import Counter
#image_sizes = []
#
## Tüm görüntülerin boyutlarını topla
#for path in full_photo_paths:
#    img = Image.open(path)
#    image_sizes.append(img.size) # (width, height)
#
#Counter(image_sizes)

In [None]:
#Image.open(full_photo_paths[50])

In [None]:
#counter = 0
#for i in range(len(full_photo_paths)):
#    img = Image.open(full_photo_paths[i])
#    if img.size == (140, 311):
#        Image.open(full_photo_paths[i])
#        # I cant use img.show() beacuse of Kaggle
#        plt.imshow(img)
#        plt.axis("off")
#        plt.show()
#        print(f"index: {i}")
#        counter+=1
#        if counter %5==0:
#            break

If all crops are at very different ratios, some images may have large black bands. The network may try to learn whether black (“0,0,0”) is the background or something important.

In [None]:
import cv2
def _pad_to_target(image, target_size):
    """Resize *image* to fit inside *target_size* and pad it to exactly that size.

    The aspect ratio is kept; the border is filled with the mean colour of the
    resized image so that no artificial black bands are introduced.
    """
    target_h, target_w = target_size
    src_h, src_w = image.shape[:2]
    if src_h == 0 or src_w == 0:
        raise ValueError("Cannot resize an empty image.")

    ratio = min(target_h / src_h, target_w / src_w)
    # int() truncates, so a very elongated image could end up with a 0-pixel
    # side, which cv2.resize rejects; keep at least one pixel.
    new_w = max(1, int(src_w * ratio))
    new_h = max(1, int(src_h * ratio))
    resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)

    mean_color = list(map(int, resized_image.mean(axis=(0, 1))))
    delta_w, delta_h = target_w - new_w, target_h - new_h
    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
    left, right = delta_w // 2, delta_w - (delta_w // 2)

    return cv2.copyMakeBorder(
        resized_image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=mean_color
    )


def resize_with_padding(image_paths=None, target_size=(224, 224), image=None):
    """Letterbox images to *target_size* while preserving their aspect ratio.

    Args:
        image_paths: Iterable of file paths read with ``cv2.imread``. When given,
            it takes precedence over *image*.
        target_size: ``(height, width)`` of the output.
        image: A single in-memory image (``np.ndarray`` or anything
            ``np.array`` can convert, e.g. a PIL image). 2-D arrays are expanded
            to three channels. The channel order of the input is kept as is.

    Returns:
        * with *image_paths*: a list of ``{'image': ndarray, 'path': str}`` dicts;
        * with *image*: the padded ``ndarray``;
        * ``None`` when neither argument is given.

    Raises:
        ValueError: if a path cannot be loaded or an image has a zero-sized side.
    """
    # Local import: this function is defined and called in cells that run
    # before the notebook's first `import numpy as np`.
    import numpy as np

    if image_paths is not None:
        padded_images = []
        for img_path in image_paths:
            loaded = cv2.imread(img_path)
            if loaded is None:
                raise ValueError(f"Image at {img_path} could not be loaded.")
            padded_images.append(
                {'image': _pad_to_target(loaded, target_size), 'path': img_path}
            )
        return padded_images

    if image is not None:
        if not isinstance(image, np.ndarray):
            image = np.array(image)
        if image.ndim == 2:  # grayscale: expand to three channels
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        return _pad_to_target(image, target_size)

    return None


In [None]:
type(img['image'])

In [None]:
img = Image.open(full_photo_paths[96])
plt.imshow(img)

In [None]:
 cv2.imread(full_photo_paths[96]).shape[:2]

In [None]:
import matplotlib.pyplot as plt

img = Image.open(full_photo_paths[96])
#img = np.array(img)
padded_img = resize_with_padding(image=img, target_size=(224, 224))

# A PIL image is already in RGB order and resize_with_padding keeps the
# channel order of in-memory inputs, so the array is shown directly;
# a BGR->RGB conversion here would swap red and blue.
plt.imshow(padded_img)
plt.axis('off')
plt.show()

In [None]:
# Same demo through the path-based branch: the result is a list of
# {'image': ndarray, 'path': str} dicts.
padded_images = resize_with_padding(image_paths = [full_photo_paths[96]], target_size=(224, 224))

# Display the resized images using matplotlib
for i, img in enumerate(padded_images):
    plt.subplot(1, len(padded_images), i + 1)
    # The image was read with cv2.imread inside resize_with_padding, so it is
    # converted from BGR to RGB before matplotlib draws it.
    img_rgb = cv2.cvtColor(img['image'], cv2.COLOR_BGR2RGB)
    plt.imshow(img_rgb)
    plt.axis('off')

plt.show()

# Object Detection

## Visual Outlier Detection with VGG16

In [None]:
import torch
from torchvision import transforms
from PIL import Image
import numpy as np

def extract_features(img_path, model, device=None):
    """Return the flattened output of *model* for a single image file.

    Args:
        img_path: Path of the image to load.
        model: A torch module; it is switched to eval mode and called on a
            ``(1, 3, 224, 224)`` normalised tensor.
        device: Device the input tensor is moved to. When omitted, the device
            of the model's first parameter is used (``'cpu'`` if the model has
            no parameters), so a model already moved to the GPU works without
            passing the device again.

    Returns:
        1-D ``numpy`` array with the model output.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else 'cpu'

    # convert() returns a new, fully loaded image, so the file can be closed.
    with Image.open(img_path) as handle:
        img = handle.convert('RGB')

    # Resize the short side to 256, centre-crop to 224x224 and normalise
    # with the mean/std values listed below.
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    img_tensor = preprocess(img).unsqueeze(0).to(device)  # add the batch axis

    model.eval()  # inference mode
    with torch.no_grad():  # no gradients needed for feature extraction
        features = model(img_tensor)

    return features.cpu().numpy().flatten()


In [None]:
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.ensemble import IsolationForest

#model = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
#model.avgpool = nn.AdaptiveAvgPool2d(1)  # Global average pooling layer
#model.classifier = nn.Identity()  # Remove the classification layer (optional)
#
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
#model = model.to(device)
#
#product_name = 'Beyaz_Kupa_Bardak_Winnie_Arkadaşlık_Dostluk_Kardeşli'
#image_files = [name for name in full_photo_paths if product_name in name]
#
## Extract features for each image
#feature_vectors = np.array([extract_features(img, model) for img in image_files])
#
##PCA for dimensionality reduction:
#pca = PCA(n_components=10)  
#reduced_features = pca.fit_transform(feature_vectors)
#
## Cosine similarity hesaplama
#similarity_matrix = cosine_similarity(reduced_features)
#
## Outlier tespiti için Isolation Forest kullanma
#iso_forest = IsolationForest(contamination=0.1)  # Kontaminasyon oranını ayarlayabilirsin (0.1, %10'luk dilim)
#outlier_predictions = iso_forest.fit_predict(reduced_features)
#
## Outlier fotoğrafları bulma
#outlier_indices = np.where(outlier_predictions == -1)[0]  # -1 outlier anlamına gelir
#outlier_images = [image_files[i] for i in outlier_indices]
#
#print("Outlier (kupa içermeyen) resimler:", outlier_images)

In [None]:
#show_some_photos(';',outlier_images) #but it isint work as i expected

If I detect the object with YOLO and crop the image so that only the object remains, the model can no longer learn from the auxiliary objects around it (a hand, a ruler, etc., when present). If I do not crop, however, the model may be trained on the surroundings instead of the object inside the bounding box (the object I actually care about), which would also be an error. What should I do at this stage?

My decision is this: I need to crop, because the backgrounds are very different, I have too many classes (511), and every class has only about 60 photos.

## YOLO
### YOLO (You Only Look Once) is a state-of-the-art object detection algorithm known for its speed and efficiency. It processes an entire image in a single forward pass of the network, predicting both the class and bounding box coordinates for multiple objects simultaneously.

### YOLO models are pre-trained on datasets like COCO and can detect dozens of object classes in real-time. However, for detecting custom classes, the model must be fine-tuned or retrained on labeled data. While not as flexible as CLIP+SAM for zero-shot tasks, YOLO remains a top choice for real-time applications such as surveillance, autonomous driving, and robotics due to its high accuracy and low latency.



In [None]:
#from IPython.display import clear_output
#
#!pip install ultralytics
#clear_output()
#
#from ultralytics import YOLO
#model = YOLO("yolov8x.pt")

In [None]:
def detect_and_plot_from_ndarray_list(image_list, model):
    """Run a detector on padded images, plot the best box and store box features.

    Args:
        image_list: List of ``{'image': ndarray, 'path': str}`` dicts as
            produced by ``resize_with_padding(image_paths=...)``.
        model: Callable detector; it is called with an image path and the first
            element of its result must expose ``.boxes`` (each box with
            ``conf``, ``cls`` and ``xyxy``).

    Side effects:
        For every image with a detection, the row of the module-level ``df``
        whose ``Id`` equals the id parsed from the file name gets the
        bounding-box feature columns written. One figure is shown per image.
    """
    # Accept any extension, not only ".jpg"; the id is the digit run between
    # the last underscore and the extension.
    id_pattern = re.compile(r'_(\d+)\.[^./\\]+$')

    for idx, image in enumerate(image_list):
        if image is None or not isinstance(image['image'], np.ndarray):
            print(f"LList element {idx} is not a valid image, skipping.")
            continue

        # The detector is fed through a temporary file, which is removed
        # again as soon as the prediction is done.
        temp_path = f"temp_padded_{idx}.jpg"
        cv2.imwrite(temp_path, image['image'])
        try:
            results = model(temp_path)[0]
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)

        img = cv2.cvtColor(image['image'], cv2.COLOR_BGR2RGB)

        plt.figure(figsize=(6, 6))
        plt.imshow(img)
        plt.axis('off')
        image_h, image_w = img.shape[:2]

        if results.boxes:
            # Keep only the most confident detection.
            best_box = max(results.boxes, key=lambda b: b.conf[0].item())

            x1, y1, x2, y2 = best_box.xyxy[0].cpu().numpy()
            conf = float(best_box.conf[0].item())

            box_width = x2 - x1
            box_height = y2 - y1
            box_area = box_width * box_height
            box_center_x = x1 + box_width / 2
            box_center_y = y1 + box_height / 2

            bounding_box_ratio = box_width / box_height if box_height != 0 else 0
            box_relative_width = box_width / image_w
            box_relative_height = box_height / image_h
            image_area = image_h * image_w

            print('--------------------------------------------------------')
            id_match = id_pattern.search(image['path'])
            if id_match is None:
                # Previously this crashed with AttributeError on `.group`.
                print(f"⚠️ Could not parse an image id from {image['path']}.")
            else:
                img_id = int(id_match.group(1))
                print(id_match.group(1))
                df.loc[df['Id']== img_id, ['bounding_box_width',
                                           'bounding_box_height',
                                           'bounding_box_ratio',
                                           'box_relative_width',
                                           'box_relative_height',
                                           'confidence',
                                           'image_area',
                                           'box_area',
                                           'box_center_x', 'box_center_y']] = [
                                                                               box_width,
                                                                               box_height,
                                                                               bounding_box_ratio,
                                                                               box_relative_width,
                                                                               box_relative_height,
                                                                               conf,
                                                                               image_area,
                                                                               box_area,
                                                                               box_center_x,
                                                                               box_center_y]

            plt.gca().add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                              edgecolor='lime', facecolor='none', linewidth=2))

        else:
            print(f"⚠️ No box detected in image {idx}.")

        plt.show()


In [None]:
img_path = [name for name in full_photo_paths if 'Cep_Çanta_Aynas' in name][:5]

padded_images = resize_with_padding(img_path, target_size=(224, 224))
#detect_and_plot_from_ndarray_list(padded_images, model)

## CLIP + SAM
### CLIP (Contrastive Language–Image Pretraining) and SAM (Segment Anything Model) can be combined to enable a more flexible and language-guided form of object detection. CLIP maps both images and text into a shared embedding space, allowing users to specify objects using natural language (e.g., "a person with a red hat"). SAM, on the other hand, excels at segmenting objects at the pixel level with high precision.

### When used together, CLIP identifies which parts of the image are semantically similar to the text prompt, while SAM precisely segments those regions. This combination is particularly powerful in zero-shot scenarios where no training on the specific object class is required. It’s ideal for interactive tools, content search, or dataset creation where labeled data is limited.



To use CLIP, I need to rewrite the values of df['product_name'] in English.

In [None]:
# English descriptions used as text prompts, keyed by the original
# `product_name` value they replace.
english_prompts = {
    'glass_sphere': 'glass light emitting sphere',
    'Potted_Cactu_Plush_Toy': 'Toy cactus',
    'Tarak': 'black women comb',
    'torch': 'flashlight in green box',
    'glove2': 'white cloth',
    'parfumm': 'perfume',
    'Genel_Markalar': 'plastic  light emitting sphere',
    'Orkide_Yetiştirme': 'black seed',
    'BEYZANA': 'pencil bag',
    'DEMPOWER': 'white small tool',
    'Midilli': 'pink notebook',
    'Tonny_Black': 'black leather wallet',
    'LET_SCRUB': 'white cloth',
    'woys': 'black handbag',
    'Dekals': 'pink buckle',
    'EMBHOME': 'steel ring iron',
    'Paşabahçe': 'rectangular glass',
}

# Apply the replacements one by one, in the order listed above.
for original_name, english_prompt in english_prompts.items():
    df.loc[df['product_name'] == original_name, ['product_name']] = english_prompt
#df.loc[df['product_name']=='Küçük_El_Feneri_4_Adet', ['product_name']] = 'plastic flashlights'




In [None]:
img_path = [name for name in full_photo_paths if 'Genel_Markalar' in name]

In [None]:
df[df['product_name'] == 'rectangular glass']

In [None]:
#!pip install git+https://github.com/openai/CLIP.git
#!pip install opencv-python matplotlib
#!pip install git+https://github.com/facebookresearch/segment-anything.git
#clear_output()

In [None]:
padded_images = resize_with_padding(img_path, target_size=(224, 224))
type(padded_images[0]['image'])

In [None]:
def extract_id_from_filename(filename):
    """Return the integer id stored after the last underscore of *filename*.

    Example: ``lamba_23.jpg`` -> ``23``. Everything from the first dot that
    follows the last underscore is ignored.
    """
    tail = filename.rpartition("_")[2]
    id_text, _, _ = tail.partition(".")
    return int(id_text)


In [None]:
from segment_anything import sam_model_registry, SamPredictor
#import clip
from tqdm import tqdm


images_path = "/kaggle/input/dimension-photos" 
sam_checkpoint = "/kaggle/input/sam_model/pytorch/default/1/sam_vit_h_4b8939.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"

# CLIP
#clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# SAM
#sam = sam_model_registry["vit_h"](checkpoint=sam_checkpoint).to(device)
#predictor = SamPredictor(sam)


def run_clip_sam(image_path, prompt):
    """Segment the padded image with SAM and return the best mask's box.

    Args:
        image_path: Path of the image file.
        prompt: Text compared with the image through CLIP.

    Returns:
        ``[x_center, y_center, box_w, box_h]`` normalised by the padded image
        size, or ``None`` when SAM yields no usable mask.

    Relies on the module-level ``clip``, ``clip_model``, ``clip_preprocess``,
    ``predictor`` and ``device`` objects being initialised.

    NOTE(review): the CLIP similarity is computed but does not influence the
    segmentation or the returned box; SAM is run without any point/box prompt.
    """
    image = Image.open(image_path).convert("RGB")

    padded_image = resize_with_padding([image_path], target_size=(224, 224))
    image_np = np.array(padded_image[0]['image'])
    image_tensor = clip_preprocess(image).unsqueeze(0).to(device)

    # CLIP: similarity between the prompt and the picture
    with torch.no_grad():
        text = clip.tokenize([prompt]).to(device)
        image_features = clip_model.encode_image(image_tensor)
        text_features = clip_model.encode_text(text)
        similarity = torch.cosine_similarity(image_features, text_features).item()

    # SAM segmentation. The padded image comes from cv2.imread and is
    # therefore BGR; tell the predictor so instead of letting it assume RGB.
    predictor.set_image(image_np, image_format="BGR")
    masks, scores, _ = predictor.predict(box=None, multimask_output=True)

    if masks is None or len(masks) == 0:
        return None

    best_mask = masks[np.argmax(scores)]
    ys, xs = np.where(best_mask)
    if len(xs) == 0 or len(ys) == 0:
        return None
    x_min, x_max, y_min, y_max = xs.min(), xs.max(), ys.min(), ys.max()

    # Normalise to the YOLO format (centre + size, relative to the image)
    h, w = image_np.shape[:2]
    x_center = ((x_min + x_max) / 2) / w
    y_center = ((y_min + y_max) / 2) / h
    box_w = (x_max - x_min) / w
    box_h = (y_max - y_min) / h

    return [x_center, y_center, box_w, box_h]



In [None]:
def draw_box_on_image(image_path, box):
    """Show the padded 224x224 version of an image with *box* drawn on it.

    Args:
        image_path: Path of the image file.
        box: ``[x_center, y_center, box_w, box_h]`` normalised to the padded
            image size.
    """
    padded_image = resize_with_padding([image_path], target_size=(224, 224))
    image = padded_image[0]['image']

    h, w = image.shape[:2]

    x_center, y_center, box_w, box_h = box
    x_min = int((x_center - box_w / 2) * w)
    x_max = int((x_center + box_w / 2) * w)
    y_min = int((y_center - box_h / 2) * h)
    y_max = int((y_center + box_h / 2) * h)

    # Draw the box
    cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

    plt.figure(figsize=(6, 6))
    # The image was loaded by cv2 (BGR); matplotlib expects RGB, so convert
    # before displaying to avoid swapped red/blue channels.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()


In [None]:
images_path

In [None]:
df['product_name'].unique()

In [None]:
#for i, image_file in enumerate(sorted(img_path[:10])):
#    image_id = extract_id_from_filename(image_file)
#    row = df[df["Id"] == image_id].iloc[0]
#    product_name = row["product_name"]
#    image_path_full = os.path.join(images_path, image_file)
#    box = run_clip_sam(image_path_full, prompt=product_name)
#    if box:
#        print(f"Image: {image_file} | Prompt: {product_name}")
#        draw_box_on_image(image_path_full, box)


It put a box around the whole photo in every case.

## GroundingDINO

In [None]:
# Fetch the GroundingDINO sources once and install them together with CLIP.
if not os.path.exists("GroundingDINO"):
    !git clone https://github.com/IDEA-Research/GroundingDINO.git

# Every following command runs from inside the cloned repository.
%cd GroundingDINO
!pip install -e .
!pip install -q groundingdino transformers torchvision matplotlib
# NOTE(review): the working directory is already GroundingDINO here, so this
# relative path presumably does not exist and the command looks redundant
# with `pip install -e .` above — confirm and drop if so.
!pip install -q -e GroundingDINO
!pip install -q git+https://github.com/openai/CLIP.git
from IPython.display import clear_output
# Hide the long installation logs.
clear_output()

In [None]:
import requests
import os

model_url = "https://github.com/IDEA-Research/GroundingDINO/releases/download/0.1.0/groundingdino_swint_ogc.pth"
model_path = "/kaggle/input/groundingdino_model/pytorch/default/1/groundingdino_swint_ogc.pth"

# If the checkpoint is missing and its folder cannot be written to (missing
# folder or read-only input mount), download into the current working
# directory instead. abspath keeps the path valid after a later chdir.
if not os.path.exists(model_path) and not os.access(os.path.dirname(model_path), os.W_OK):
    model_path = os.path.abspath(os.path.basename(model_path))

if not os.path.exists(model_path):
    print("📥 Model downloading with stream...")
    # Write to a side file first so an interrupted download never leaves a
    # truncated checkpoint that the exists() check would accept next time.
    partial_path = model_path + ".part"
    with requests.get(model_url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    os.replace(partial_path, model_path)
    print("✅ Model downloaded.")
else:
    print("✅ Model already available.")


In [None]:
from groundingdino.util.inference import load_model, load_image, predict, annotate

# Load the Model
# The config path is relative, so this cell must run from inside the cloned
# GroundingDINO folder.
config_path = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
model = load_model(config_path, model_path)
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
def groundingdino(image_path):
    """Detect the product in each image with GroundingDINO and plot the best box.

    Args:
        image_path: Iterable of image file paths. The id parsed from each file
            name selects the row of the module-level ``df`` whose
            ``product_name`` is used as the text prompt.

    Images without a metadata row or without any detection are reported and
    skipped.
    """
    # The model does not change between images, so look its device up once.
    device = next(model.parameters()).device

    for img in image_path:
        image, image_tensor = load_image(img)
        image_id  = extract_id_from_filename(img)

        matching_rows = df[df["Id"] == image_id]
        if matching_rows.empty:
            # `.iloc[0]` on an empty selection raised IndexError before.
            print(f"No metadata row for image: {img}")
            continue
        caption = matching_rows.iloc[0]["product_name"] # it will be our prompt

        image_tensor = image_tensor.to(device=device, dtype=torch.float32)

        boxes, logits, phrases = predict(
            model=model,
            image=image_tensor,
            caption=caption,
            box_threshold=0.3,
            text_threshold=0.25,
            device=str(device)
        )

        if logits is None or len(logits) == 0:
            print(f"No object detected in image: {img}")
            continue

        # Keep only the highest-scoring box.
        best_index = int(logits.argmax())
        best_box = boxes[best_index].unsqueeze(0)

        annotated_frame = annotate(
            image_source=image,
            boxes=best_box,
            phrases=[""],
            logits=[logits[best_index]]
        )
        print('logits', logits)
        print('boxes',best_box)

        # annotate() returns a BGR frame (OpenCV convention); convert it so
        # matplotlib, which expects RGB, shows the true colours.
        plt.imshow(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.show()

In [None]:
image_path = [name for name in full_photo_paths if 'Küçük_El_Feneri_4_Adet' in name][:5]
groundingdino(image_path)

In [None]:
df.loc[df['product_name']=='Küçük_El_Feneri_4_Adet', ['product_name']] = 'plastic flashlights'
image_path = [name for name in full_photo_paths if 'Küçük_El_Feneri_4_Adet' in name][:5]
groundingdino(image_path) #im checking if i rename the prompt english

In [None]:
#as you see when i change the prompt it detected better.

### I tried it, and it is the best object detection model for my data, so I am going to create a function for it

In [None]:
# Empty table that lists the bounding-box feature columns.
# Each scalar assignment on the empty frame only creates the column; no rows exist yet.
# NOTE(review): `df_box` is rebuilt in a later cell (with an extra `Image_ID`
# column), so this cell appears to serve only as a preview of the schema.
df_box = pd.DataFrame()
df_box['bounding_box_width'] = 0.0
df_box['bounding_box_height'] = 0.0
df_box['box_relative_width'] = 0.0  # bounding_box_width / image_width
df_box['box_relative_height'] = 0.0  # bounding_box_height / image_height
df_box['confidence'] = 0.0  # Confidence score
df_box['box_area'] = 0.0
df_box['bbox_aspect_ratio'] = 0  # bounding_box_width / bounding_box_height
df_box['bbox_diag'] = 0  # sqrt(bw²+bh²)
df_box['log_box_area'] = 0  # log_box_area
df_box['norm_confidence'] = 0  # Normalized confidence

df_box.head()

In [None]:
image_path = [name for name in full_photo_paths if 'Küçük_El_Feneri_4_Adet' in name][:5]
image_path

In [None]:
# Bounding-box feature table with one zero-filled row per row of `df`.
# The scalar values are broadcast to the length of the `Image_ID` column.
df_box = pd.DataFrame({
    'Image_ID': np.zeros(len(df)),
    'bounding_box_width': 0.0,
    'bounding_box_height': 0.0,
    'box_relative_width': 0.0,   # bounding_box_width / image_width
    'box_relative_height': 0.0,  # bounding_box_height / image_height
    'confidence': 0.0,           # Confidence score
    'box_area': 0.0,
    'bbox_aspect_ratio': 0,      # bounding_box_width / bounding_box_height
    'bbox_diag': 0,              # sqrt(bw²+bh²)
    'log_box_area': 0,           # log_box_area
    'norm_confidence': 0,        # Normalized confidence
})


def groundingdino_last_version(image_path):
    """Crop each image to its best GroundingDINO box and show the padded crop.

    Args:
        image_path: Iterable of image file paths. The id parsed from each file
            name selects the row of the module-level ``df`` whose
            ``product_name`` is used as the text prompt.

    Returns:
        The module-level ``df_box``. It is returned unchanged because the
        feature-recording code below is still disabled.
    """
    #output_crop_dir="cropped_images"
    #os.makedirs(output_crop_dir, exist_ok=True)

    # The model does not change between images, so look its device up once.
    device = next(model.parameters()).device

    for img in tqdm(image_path, desc="Processing Images"):
        print('img',img)
        image, image_tensor = load_image(img)
        image_id  = extract_id_from_filename(img)

        matching_rows = df[df["Id"] == image_id]
        if matching_rows.empty:
            # `.iloc[0]` on an empty selection raised IndexError before.
            print(f"No metadata row for image: {img}")
            continue
        caption = matching_rows.iloc[0]["product_name"] # it will be our prompt

        image_tensor = image_tensor.to(device=device, dtype=torch.float32)

        # model prediction
        boxes, logits, phrases = predict(
            model=model,
            image=image_tensor,
            caption=caption,
            box_threshold=0.3,
            text_threshold=0.25,
            device=str(device)
        )

        if logits is None or len(logits) == 0:
            print(f"No object detected in image: {img}")
            continue

        best_idx = logits.argmax().item()

        # GroundingDINO returns normalised (center_x, center_y, width, height)
        # boxes, not corner coordinates, so convert them before cropping.
        cx_rel, cy_rel, bw_rel, bh_rel = boxes[best_idx].tolist()
        x_min_rel, x_max_rel = cx_rel - bw_rel / 2, cx_rel + bw_rel / 2
        y_min_rel, y_max_rel = cy_rel - bh_rel / 2, cy_rel + bh_rel / 2

       # Disabled feature recording, kept for reference:
       #bbox_width = x_max - x_min
       #bbox_height = y_max - y_min
       #image_width, image_height = image.shape[1], image.shape[0]
#
       #box_area = bbox_width * bbox_height
       #bbox_aspect_ratio = bbox_width / bbox_height if bbox_height != 0 else 0
       #bbox_diag = np.sqrt(bbox_width**2 + bbox_height**2)
       #log_box_area = np.log(box_area) if box_area > 0 else 0
       #norm_confidence = logits[best_index].item()
#
       ##addding the new information to the df_box
       #df_box.loc[len(df_box)] = [
       #    image_id,
       #    bbox_width,
       #    bbox_height,
       #    bbox_width / image_width,
       #    bbox_height / image_height,
       #    logits[best_index].item(),
       #    box_area,
       #    bbox_aspect_ratio,
       #    bbox_diag,
       #    log_box_area,
       #    norm_confidence
       #]
       #

        h, w = image.shape[:2]
        x_min = int(x_min_rel * w)
        y_min = int(y_min_rel * h)
        x_max = int(x_max_rel * w)
        y_max = int(y_max_rel * h)

        # Clamp the box to the image bounds.
        x_min, y_min = max(0, x_min), max(0, y_min)
        x_max, y_max = min(w, x_max), min(h, y_max)
        print(f"Crop coords: x_min={x_min}, y_min={y_min}, x_max={x_max}, y_max={y_max}")
        if x_max <= x_min or y_max <= y_min:
            # A degenerate box would give an empty crop that cannot be resized.
            print(f"Empty crop for image: {img}")
            continue
        crop = image[y_min:y_max, x_min:x_max]

        #reshape
        padded = resize_with_padding(image=crop, target_size=(224, 224))

        #record the path
        #out_path = os.path.join(output_crop_dir, os.path.basename(img))
        #cv2.imwrite(out_path, padded_image)

        # load_image() provides the source image in RGB order, and the crop
        # keeps that order, so it is displayed without a BGR->RGB swap.
        plt.imshow(padded)
        plt.axis('off')
        plt.show()

    #df_box.to_csv("/kaggle/working/df_box.csv", index=False)
    return df_box 
        

In [None]:
image_path = [name for name in full_photo_paths if 'Küçük_El_Feneri_4_Adet' in name][:5]
groundingdino_last_version(image_path)

In [None]:
image_path = [name for name in full_photo_paths if 'Profesyonel_El_Feneri_Usb_Şarjlı_6_Modlu_Mor_Işıklı_Çakarlı_Mıknatıslı_Özel_Kutulu' in name][:10]
groundingdino(image_path)