In [None]:
%pip install open_clip_torch

In [None]:
import torch
from PIL import Image
from tkinter import filedialog
import open_clip

open_clip.list_pretrained()

In [15]:
model, _, preprocess = open_clip.create_model_and_transforms('convnext_base_w', pretrained='laion2b_s13b_b82k_augreg')
model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
tokenizer = open_clip.get_tokenizer('ViT-B-32')

In [32]:
image_path = filedialog.askopenfilename(title="Choose a image", filetypes=[("Image files", "*.png *.jpg *.jpeg")])

image = preprocess(Image.open(image_path)).unsqueeze(0)

location_labels = ["flower", "garden", "flower garden", "beach", "mountain", "city", "forest", "desert", "village"]

locationn_description = tokenizer(["This is located at " + location for location in location_labels])

In [35]:
with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(locationn_description)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    location_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

# Sort the location descriptions by their probabilities in descending order
sorted_probs_and_labels = sorted(zip(location_labels, location_probs[0]), key=lambda x: x[1], reverse=True)

# Print the sorted results
for location, prob in sorted_probs_and_labels:
    print(f"{location}: {100*prob.item():.4f}%")

flower garden: 71.4617%
garden: 16.1532%
flower: 12.2007%
village: 0.1202%
forest: 0.0400%
mountain: 0.0121%
city: 0.0121%
beach: 0.0001%
desert: 0.0000%
