In [1]:
import os
import glob

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

from transformers import CLIPProcessor, CLIPModel
from utils.data import Transform, ImageTextDataset, collate_fn

In [2]:
device = 0 if torch.cuda.is_available() else "cpu"
device

0

In [6]:
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained('./out/lr1e-06_7/')
processor = CLIPProcessor.from_pretrained(clip_checkpoint)
tokenizer = processor.tokenizer

In [7]:
DATA_ROOT = 'data'
test_loader = DataLoader(
    ImageTextDataset(DATA_ROOT, "test", transform=Transform(224, False)),
    batch_size=1,
    collate_fn=collate_fn
)

In [8]:
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

In [9]:
probs

tensor([[9.9962e-01, 3.7815e-04]], grad_fn=<SoftmaxBackward0>)