In [21]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection
import torch
from torch import nn
from torch.nn import functional as F
from PIL import Image
import requests

# Local path of the image to classify. The variable keeps the name `url`
# used throughout the notebook, but it is a filesystem path, not a web URL.
url = "./data/images/0.jpg"
image = Image.open(url)

# Load the image processor and pretrained model (for feature extraction)
image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

# Process the image
inputs = image_processor(images=image, return_tensors="pt")

# Derive the head's input width from an actual forward pass. Previously this
# line read `outputs` before anything in the cell had assigned it, so the cell
# only ran when a stale value was left in the kernel by another cell and
# failed with NameError after "Restart Kernel -> Run All".
with torch.no_grad():
    detection_outputs = model(**inputs)
    # NOTE(review): shape[0] is the leading dimension of the logits, which for
    # a single processed image is presumably the batch size (1) rather than a
    # feature width. The value is kept unchanged so that code consuming
    # `feature_dim` behaves as before -- confirm whether the encoder's hidden
    # size was intended instead.
    feature_dim = detection_outputs.logits.shape[0]

# Classification head for the 19-label multilabel classification task
class MultiLabelClassifier(nn.Module):
    """Pooling + linear head that emits one independent probability per class.

    Args:
        feature_dim: Number of input features seen by the linear layer after
            pooling (the channel count of the incoming feature map).
        num_classes: Number of labels to score.
    """

    def __init__(self, feature_dim, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.fc = nn.Linear(feature_dim, num_classes)

    def forward(self, x):
        """Average-pool `x` spatially, project it, and squash with a sigmoid.

        Args:
            x: Feature map whose last two dimensions are pooled away.

        Returns:
            Tensor of per-class probabilities in (0, 1), one row per entry of
            the leading dimension of `x`.
        """
        leading = x.size(0)
        # Collapse the two trailing dimensions to a single averaged value
        pooled = F.adaptive_avg_pool2d(x, output_size=(1, 1))
        flat = pooled.view(leading, -1)
        # Sigmoid (not softmax): labels are scored independently of each other
        scores = self.fc(flat)
        return torch.sigmoid(scores)

# Instantiate the classifier head for 19 classes
num_classes = 19
# Size the head from the encoder's hidden width so it matches the token
# features fed to it in forward_with_classifier below.
classifier = MultiLabelClassifier(model.config.hidden_size, num_classes)

# Freeze the pretrained YOLOS weights; only the new head remains trainable
for param in model.parameters():
    param.requires_grad = False


# Define a function that uses the frozen backbone and the new classifier head
def forward_with_classifier(inputs):
    """Score an image batch with the frozen YOLOS encoder plus the new head.

    Previously this passed `model(...)[0]` -- the detection head's class
    logits -- to the classifier, which pooled them down to a single value per
    image. The encoder's final hidden states are used instead.

    Args:
        inputs: Mapping produced by the image processor; only
            `inputs['pixel_values']` is read.

    Returns:
        Tensor of per-class probabilities, one row per image.
    """
    # Final encoder token embeddings; per the Hugging Face YOLOS docs this is
    # shaped (batch, sequence_length, hidden_size).
    tokens = model(pixel_values=inputs["pixel_values"]).last_hidden_state
    # The head average-pools the two trailing dims of a (B, C, H, W) map, so
    # present the tokens as (batch, hidden, sequence, 1).
    feature_map = tokens.transpose(1, 2).unsqueeze(-1)
    # Pass through the new classification head
    return classifier(feature_map)


# Perform inference using the newly defined function. The head is freshly
# initialised and untrained in this cell, so these predictions only show that
# the pipeline runs end to end.
with torch.no_grad():
    outputs = forward_with_classifier(inputs)
# int64 0/1 mask, same values as torch.where(outputs > 0.5, 1, 0)
predicted_labels = (outputs > 0.5).long()

# Print results (convert tensor to list for display purposes)
print(f"Predicted multilabel classification results: {predicted_labels.squeeze().tolist()}")




Predicted multilabel classification results: [1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]


In [1]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection
import torch
from PIL import Image
import requests

# Local path of the sample image (a filesystem path despite the name `url`)
url = "./data/images/0.jpg"

image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

# Open the image in a context manager so the file handle is released as soon
# as the processor has read the pixels.
with Image.open(url) as image:
    inputs = image_processor(images=image, return_tensors="pt")
    # image.size is reversed here before being handed to post-processing
    target_sizes = torch.tensor([image.size[::-1]])

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
results = image_processor.post_process_object_detection(
    outputs, threshold=0.9, target_sizes=target_sizes
)[0]

# Print the class name of every detection that passed the 0.9 threshold
for label in results["labels"]:
    print(model.config.id2label[label.item()])

  from .autonotebook import tqdm as notebook_tqdm


person
umbrella


In [3]:
# Map "<index>.jpg" -> list of integer class ids detected above the threshold
zero_shot_yolo_labels = {}
directory = "./data/images/{}.jpg"
count = 0
for i in range(40000):
    # The context manager closes the file even when an iteration is
    # interrupted or raises; the trailing image.close() was skipped then.
    with Image.open(directory.format(i)) as image:
        inputs = image_processor(images=image, return_tensors="pt")
        # image.size is reversed here before being handed to post-processing
        target_sizes = torch.tensor([image.size[::-1]])

    # Inference only: skip autograd bookkeeping for every one of the images
    with torch.no_grad():
        outputs = model(**inputs)

    # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
    results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]

    # .tolist() yields plain Python ints, same as calling .item() per element
    zero_shot_yolo_labels["{}.jpg".format(i)] = results["labels"].tolist()

    # Progress marker every 10 processed images
    if count % 10 == 0:
        print(count)
    count += 1
    

0
10


KeyboardInterrupt: 