In [None]:
# Review-attachment photo URLs used as test inputs by the cells below.
# Every entry lives under the same host/path prefix, so only the tails are listed.
img_list = [
    "https://phinf.pstatic.net/checkout.phinf/" + tail
    for tail in (
        "20220420_107/1650431688263oXIpO_JPEG/review-attachment-76798663-a32c-4720-abad-9ec0781a10c1.jpeg",
        "20220421_6/1650531633862SX5EH_JPEG/review-attachment-3066b80a-e489-468b-8a18-e0593d4b96e1.jpeg",
        "20220419_146/1650349817496rTDSY_JPEG/review-attachment-1c3c4b55-3732-4880-9a3c-f88e98a96a23.jpeg",
        "20220419_275/1650376220797ta2qd_JPEG/review-attachment-b53b80cd-8dee-410d-837b-5a229f2dde93.jpeg",
        "20220419_256/1650374731624Waben_JPEG/review-attachment-3e8585fd-2ca8-45ac-b3f8-f6ec43e64396.jpeg",
        "20220424_80/1650761869485UUE0B_JPEG/review-attachment-98c24e8b-05d5-4eff-a1a8-b43180021f82.jpeg",
    )
]


### Object Detection (DETR sample)

In [8]:
# Sample code: run DETR object detection on a single COCO image.

import requests
from PIL import Image
from transformers import DetrForObjectDetection, DetrFeatureExtractor

# Load the pretrained detector and the preprocessor from the same checkpoint.
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")

# Open the sample picture directly from the streamed HTTP response.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Preprocess into PyTorch tensors ("pt") and run one forward pass.
inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)

# The model predicts bounding boxes and the corresponding COCO classes.
logits, bboxes = outputs.logits, outputs.pred_boxes

In [23]:
# Display the `auxiliary_outputs` field of the most recent forward pass stored in `outputs`.
# NOTE(review): presumably the DETR result from the sample cell; this field may be None
# unless auxiliary losses are enabled — confirm.
outputs.auxiliary_outputs

### Feature Extraction

In [65]:
import requests
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Processor and model come from the same CLIP checkpoint.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Image to score: one of the review photos (the COCO sample is kept for reference).
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
url = img_list[3]
image = Image.open(requests.get(url, stream=True).raw)

# Tokenize the candidate captions and preprocess the image in one call.
inputs = processor(
    text=[
        "a photo of a cat",
        "a photo of a wet box",
        "a photo of a crumpled box",
    ],
    images=image,
    return_tensors="pt",
    padding=True,
)

outputs = model(**inputs)

# Image-text similarity score, one column per candidate caption.
logits_per_image = outputs.logits_per_image
# Softmax over the captions turns the scores into label probabilities.
probs = logits_per_image.softmax(dim=1)

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [66]:
# Raw image-text similarity scores, in prompt order: cat, wet box, crumpled box.
logits_per_image

tensor([[20.7982, 28.6741, 29.2548]], grad_fn=<PermuteBackward>)

In [67]:
# Softmax probabilities for the "wet box" (index 1) and "crumpled box" (index 2) prompts.
probs[0][1], probs[0][2]

(tensor(0.3587, grad_fn=<SelectBackward>),
 tensor(0.6412, grad_fn=<SelectBackward>))

In [68]:
# Show which image URL the scores above were computed for.
url

'https://phinf.pstatic.net/checkout.phinf/20220419_275/1650376220797ta2qd_JPEG/review-attachment-b53b80cd-8dee-410d-837b-5a229f2dde93.jpeg'

### Image Captioning

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_captioner(checkpoint="nlpconnect/vit-gpt2-image-captioning"):
    """Load the captioning model, feature extractor and tokenizer once and cache them.

    Returns:
        A ``(model, feature_extractor, tokenizer, device)`` tuple; the model has
        already been moved to ``device`` (CUDA when available, otherwise CPU).
    """
    from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
    import torch

    model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
    feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return model, feature_extractor, tokenizer, device


def _fetch_rgb_image(url, timeout=30):
    """Download the image at ``url`` and return it as an RGB PIL image."""
    from PIL import Image
    import requests

    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly on 4xx/5xx instead of handing an error page to the image decoder.
        response.raise_for_status()
        # convert() decodes the pixel data now, before the connection is closed.
        return Image.open(response.raw).convert(mode="RGB")


def predict_step(image_paths, max_length=16, num_beams=4):
    """Generate one caption per image URL.

    Args:
        image_paths: A single image URL (str) or an iterable of image URLs.
        max_length: Maximum generated caption length passed to ``model.generate``.
        num_beams: Beam-search width passed to ``model.generate``.

    Returns:
        A list of stripped caption strings, one per input URL and in input order.
        An empty input yields an empty list without loading the model.

    Raises:
        requests.HTTPError: If any image URL answers with an error status.
    """
    # A bare string is one URL; iterating it would walk its characters.
    if isinstance(image_paths, str):
        image_paths = [image_paths]
    urls = list(image_paths)
    if not urls:
        return []

    model, feature_extractor, tokenizer, device = _load_captioner()

    # Every URL is fetched exactly once and every image is kept for the batch.
    images = [_fetch_rgb_image(url) for url in urls]

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

In [77]:
# Baseline check: caption the COCO sample image, passed as a single URL string.
predict_step('http://images.cocodataset.org/val2017/000000039769.jpg')

['a cat laying on top of a couch next to another cat']

In [80]:
# test_00: caption the first review photo (img_list[0]).
predict_step(img_list[0])

Wall time: 0 ns


['a box filled with lots of different types of boxes']

In [78]:
# test_04: caption the fifth review photo (img_list[4]).
predict_step(img_list[4])

['a box that has a piece of paper on it']