In [4]:
import os
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, OwlViTForObjectDetection

### Initialize the OWL-ViT model and processor

In [8]:
# The processor and the detection model are loaded from the same pretrained
# checkpoint identifier so their preprocessing and weights match.
checkpoint = "google/owlvit-base-patch32"
processor = AutoProcessor.from_pretrained(checkpoint)
model = OwlViTForObjectDetection.from_pretrained(checkpoint)

### Create results directory if it doesn't exist

In [9]:
# Detections are written to <parent of the current working directory>/results/OwlViT.
results_dir = os.path.join(os.path.dirname(os.getcwd()), "results/OwlViT")
# exist_ok=True makes this cell idempotent without a separate exists() check
# (no window between checking and creating), and it raises if the path is
# occupied by a regular file instead of silently carrying on.
os.makedirs(results_dir, exist_ok=True)

### Define text queries

In [1]:
# Fine-grained label list, kept for reference only: it is disabled, and the
# active `texts` is the shorter, coarser list assigned in a later cell.
# NOTE(review): the list contains repeated / case-variant entries
# ('Food Can' / 'Food can', 'Plastic Film' / 'Plastic film', 'Meal carton' twice);
# de-duplicate before re-enabling it.
# texts = ['Food waste', 'Broken glass', 'Glass bottle', 'Glass jar', 'Glass cup','Aerosol',
#             'Aluminium foil', 'Clear plastic bottle', 'Disposable food container',
#             'Disposable plastic cup', 'Drink can', 'Drink carton', 'Food Can', 'Food can',
#             'Metal bottle cap', 'Metal lid', 'Other plastic', 'Other plastic bottle',
#             'Other plastic container', 'Other plastic cup', 'Plastic Film', 'Plastic bottle cap',
#             'Plastic film', 'Plastic glooves', 'Plastic lid', 'Plastic straw', 'Plastic utensils',
#             'Polypropylene bag', 'Pop tab', 'Scrap metal', 'Single-use carrier bag',
#             'Six pack rings', 'Spread tub', 'Tupperware', 'Aluminium blister pack',
#             'Carded blister pack', 'Cigarette', 'Crisp packet', 'Foam cup', 'Foam food container',
#             'Garbage bag', 'Meal carton', 'Meal carton', 'Other plastic wrapper', 'Paper cup',
#             'Paper straw', 'Pizza box', 'Plastified paper bag', 'Rope', 'Rope & strings', 'Shoe',
#             'Squeezable tube', 'Styrofoam piece', 'Tissues', 'Wrapping paper', "Battery",
#             'Corrugated carton', 'Egg carton', 'Toilet tube', 'Other carton', 'Magazine paper',
#             'Normal paper', 'Paper bag', 'Unlabeled litter']

Since the results obtained with the fine-grained labels above were not good enough, we switch to a smaller set of less specific labels:

In [1]:
# Coarse text queries passed to the processor; one entry per detection class.
# The position of a label in this list is its class index.
texts = [
    "Food waste",
    "glass trash",
    "metals and plastic trash",
    "non recyclable trash",
    "Battery",
    "paper trash",
    "Unrecognized trash",
]

### Run inference over the TACO dataset and save the results to results/OwlViT

In [7]:
def _yolo_line(class_id, box_xyxy, img_w, img_h):
    """Format one detection as a YOLO label line.

    Args:
        class_id: Integer class index (position of the label in `texts`).
        box_xyxy: Sequence (x_min, y_min, x_max, y_max) in absolute pixels.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        The string "<class> <x_center> <y_center> <width> <height>" with the
        four box values normalised by the image size and rounded to 6 decimals.
    """
    x_min, y_min, x_max, y_max = box_xyxy
    # Predicted corners can fall slightly outside the image; clamp them so the
    # normalised values stay inside [0, 1].
    x_min, x_max = (min(max(v, 0.0), img_w) for v in (x_min, x_max))
    y_min, y_max = (min(max(v, 0.0), img_h) for v in (y_min, y_max))
    x_center = (x_min + x_max) / 2.0 / img_w
    y_center = (y_min + y_max) / 2.0 / img_h
    width = (x_max - x_min) / img_w
    height = (y_max - y_min) / img_h
    values = (round(v, 6) for v in (x_center, y_center, width, height))
    return f"{int(class_id)} " + " ".join(str(v) for v in values)


# Inference is skipped whenever the results directory already contains anything.
# NOTE(review): an interrupted run also leaves files behind, so a partial result
# set is treated as "done"; empty the directory to force a full re-run.
if os.listdir(results_dir):
    print("Owl-ViT inference is already done and results are saved.")
else:
    image_folder = os.path.join(os.path.dirname(os.getcwd()), "imgs/TACO/images")
    # Sorted so the processing order is deterministic across runs and platforms.
    for image_name in sorted(os.listdir(image_folder)):
        image_path = os.path.join(image_folder, image_name)
        if not os.path.isfile(image_path):
            # Sub-directories cannot be opened as images; skip them.
            continue

        # Close the file handle promptly and normalise to 3-channel RGB.
        with Image.open(image_path) as img:
            image = img.convert("RGB")
        img_w, img_h = image.size

        # Prepare inputs and run the model without tracking gradients.
        inputs = processor(text=texts, images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)

        # Post-processing expects (height, width) and returns corner-format
        # boxes scaled to that size.
        target_sizes = torch.Tensor([[img_h, img_w]])
        results = processor.post_process_object_detection(
            outputs=outputs, threshold=0.1, target_sizes=target_sizes
        )

        # One TXT file per image, named after the image without its extension.
        # splitext keeps any extra dots in the stem, so distinct images cannot
        # collapse onto the same output file.
        result_file_path = os.path.join(results_dir, f"{os.path.splitext(image_name)[0]}.txt")
        with open(result_file_path, "w") as f:
            for result in results:
                # .tolist() turns tensors into plain Python numbers, so the
                # class id is written as "3" and not "tensor(3)".
                for box, label in zip(result["boxes"].tolist(), result["labels"].tolist()):
                    f.write(_yolo_line(label, box, img_w, img_h) + "\n")
    print("Done inferencing annotations over images using OwlViT")

Owl-ViT inference is already done and results are saved.
