In [1]:
import torch
import clip
from PIL import Image
import matplotlib.pyplot as plt
import openai
import requests
from io import BytesIO
import torchvision
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the model together with its matching image preprocessing callable.
model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
# Preprocess the photo, then add a leading batch axis and move it to `device`.
image = preprocess(Image.open("clip.jpg"))
image = image.unsqueeze(0).to(device)

# Tokenize the candidate captions and place them on the same device.
text = clip.tokenize(
    ["eiffel tower", "pisa tower", "burj khalifa"]
).to(device)

In [None]:
# Show the preprocessed image: drop the batch axis and move channels last,
# which is the (H, W, C) layout matplotlib expects. `image` was moved to
# `device` (possibly CUDA) above, and matplotlib needs host memory, so copy
# the tensor back to the CPU before plotting.
# NOTE(review): `preprocess` presumably normalizes pixel values, so the
# displayed colors may look off / be clipped -- confirm.
plt.imshow(image.squeeze().permute(1, 2, 0).cpu())

In [None]:
# Inference only, so gradient tracking is switched off for the whole block.
with torch.no_grad():
    # Embeddings from each encoder on its own.
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # The joint forward pass returns the per-image and per-text logits.
    logits_per_image, logits_per_text = model(image, text)

    # Softmax over the last axis, then bring the result to the host as a NumPy array.
    image_probs = torch.softmax(logits_per_image, dim=-1)
    probs = image_probs.cpu().numpy()
    del image_probs  # keep the notebook namespace identical to before

print("Label probs:", probs)

In [6]:
# Checkpoint identifier shared by the GPT-2 language model and its tokenizer.
model_name = 'gpt2-large'

# NOTE(review): this rebinds `model`, which earlier cells bound to the CLIP
# model returned by clip.load; re-running those CLIP cells after this one
# would use the GPT-2 model instead. Consider a distinct name.
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [7]:
# Define the function that generates features for a category

def generate_features(category, max_length=100, temperature=0.7):
    """Sample a GPT-2 continuation of a "useful visual features" prompt.

    Relies on the notebook-level ``tokenizer`` and ``model`` globals.

    Args:
        category: Name interpolated into the prompt (e.g. ``'store'``).
        max_length: Length limit forwarded to ``model.generate``. It counts
            the prompt tokens as well as the generated ones, so the answer
            may be cut off mid-sentence. Defaults to the previously
            hard-coded 100.
        temperature: Sampling temperature forwarded to ``model.generate``.
            Defaults to the previously hard-coded 0.7.

    Returns:
        The decoded text of the first returned sequence with special tokens
        skipped: the prompt followed by the sampled continuation.
    """
    # Q/A-style prompt; the trailing "-" opens the first bullet of the answer.
    input_text = f"Q: What are useful features for distinguishing a {category} in a photo without any explanation?\nA: There are several useful visual features to tell there is a {category} in a photo:\n-"

    # Calling the tokenizer (instead of .encode) also returns the attention
    # mask, so generate() does not have to infer it. Moving the tensors to
    # model.device keeps this working if the model is placed on a GPU.
    encoded = tokenizer(input_text, return_tensors='pt').to(model.device)

    # Sample (do_sample=True) rather than decode greedily; GPT-2 has no pad
    # token, so the EOS id is reused for padding.
    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=temperature,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [8]:
# Example usage: sample feature text for a single category and print it.
category = 'store'
features = generate_features(category=category)
print(features)

Q: What are useful features for distinguishing a store in a photo without any explanation?
A: There are several useful visual features to tell there is a store in a photo:
- The store is large
The store is in a well-lit area
The store is in the center of the photo
- The store is in a location that is not the one shown on a map
- The store is in a well-lit area with no trees
- The store is in a
