In [52]:
import torch
from tqdm import tqdm
import pandas as pd
from PIL import Image
import requests

tqdm.pandas()

from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel, AutoTokenizer
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead, create_reference_model
from trl.core import respond_to_batch

In [53]:
# Dataset loading is currently disabled (the cells below use a single image fetched by URL).
#ds = load_dataset("Multimodal-Fatima/COCO_captions_train", split="train")

In [54]:
# Conditional image captioning: BLIP continues the text prompt given the image.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

url = "http://farm3.staticflickr.com/2519/4377463269_6c0e733b1b_z.jpg"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as an opaque PIL
# decoding failure on an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
# convert("RGB") forces the image to be fully read and guarantees the
# three-channel layout regardless of the source file's mode.
image = Image.open(response.raw).convert("RGB")
text = "A picture of"

inputs = processor(image, text, return_tensors="pt").to(device)

out = blip_model.generate(**inputs)
out1 = processor.decode(out[0], skip_special_tokens=True)
print(out1)



a picture of two kids eating pizza


In [55]:
# Unconditional image captioning: no text prompt, BLIP describes the image freely.
# The inputs are moved to whatever device the model currently lives on, so this
# cell does not need its own hard-coded device string.
inputs = processor(image, return_tensors="pt").to(blip_model.device)

out = blip_model.generate(**inputs)
out2 = processor.decode(out[0], skip_special_tokens=True)
print(out2)

two little girls sitting at a table eating pizza


In [56]:
# Score candidate captions against the image with CLIP.
# Neither the model nor its inputs are moved to a GPU in this cell.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# NOTE: this rebinds `processor` from the BLIP processor to the CLIP processor.
# The name is kept because the following cell reads `processor` as the CLIP one.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Reference caption for this image, written once instead of being repeated inline.
dataset_caption = "two little girls eating a piece of pizza at a table"

# Each entry is (line printed before the scores, captions to compare).
# `out1` / `out2` are the conditional / unconditional BLIP captions.
comparisons = [
    (f'Caption 1: {out1}, caption 2: {out2}', [out1, out2]),
    (f'Caption from BLIP model: {out2}, from dataset: "{dataset_caption}"', [out2, dataset_caption]),
]

for header, captions in comparisons:
    inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)

    print(header)
    # Pure inference: skip autograd so no graph is built or kept alive.
    with torch.no_grad():
        outputs = clip_model(**inputs)
    logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    probs = logits_per_image.softmax(dim=1)  # softmax across the compared captions
    print(logits_per_image)
    print(probs)

Caption 1: a picture of two kids eating pizza, caption 2: two little girls sitting at a table eating pizza
tensor([[32.6343, 34.2727]], grad_fn=<TBackward0>)
tensor([[0.1627, 0.8373]], grad_fn=<SoftmaxBackward0>)
Caption from clip model: two little girls sitting at a table eating pizza, from dataset: "two little girls eating a piece of pizza at a table"
tensor([[34.2727, 35.4178]], grad_fn=<TBackward0>)
tensor([[0.2414, 0.7586]], grad_fn=<SoftmaxBackward0>)


In [57]:
# Raw CLIP image-text similarity for a single caption (the unconditional BLIP one).
inputs = processor(text=[out2], images=image, return_tensors="pt", padding=True)

print(f'Caption 1: {out2}')
# Pure inference: skip autograd so no graph is built or kept alive.
with torch.no_grad():
    outputs = clip_model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
# With only one caption the softmax runs over a single entry, so it is always
# 1.0; the logit above is the only informative number in this cell.
probs = logits_per_image.softmax(dim=1)
print(logits_per_image)
print(probs)

Caption 1: two little girls sitting at a table eating pizza
tensor([[34.2727]], grad_fn=<TBackward0>)
tensor([[1.]], grad_fn=<SoftmaxBackward0>)


In [58]:
# get models
# NOTE(review): this cell does not run to completion. Its recorded output shows
# the AutoModelForSeq2SeqLMWithValueHead.from_pretrained(...) call below raising
# ValueError, because BlipConfig is not one of the configuration classes that
# AutoModelForSeq2SeqLM accepts. Nothing after that line is executed.
# TODO: find or write a value-head wrapper that accepts BLIP before this PPO
# step can actually be tried.
# `processor` is rebound here from the CLIP processor back to the BLIP processor.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
tokenizer = processor.tokenizer
blip_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")
# Reference model built from the policy model and handed to PPOTrainer below.
model_ref = create_reference_model(blip_model)

# model = AutoModelForCausalLMWithValueHead.from_pretrained("Salesforce/blip-image-captioning-base")
# model_ref = create_reference_model(model)

#tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-base")

# initialize trainer
ppo_config = PPOConfig(
    batch_size=1,
)

# encode a query
# query_txt = "This morning I went to the "
# query_tensor = tokenizer.encode(query_txt, return_tensors="pt")
# Same image URL and text prompt as the earlier captioning cell.
url = "http://farm3.staticflickr.com/2519/4377463269_6c0e733b1b_z.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "A picture of"

inputs = processor(image, text, return_tensors="pt").to("cuda")
# NOTE(review): unlike `inputs`, query_tensor is not moved to "cuda" here --
# confirm whether ppo_trainer.step needs query and response on the same device.
query_tensor = tokenizer.encode(text, return_tensors="pt")

# get model response
response_tensor  = blip_model.generate(**inputs)

# create a ppo trainer
ppo_trainer = PPOTrainer(ppo_config, blip_model, model_ref, tokenizer)

# define a reward for response
# (this could be any reward such as human feedback or output from another model)
reward = [torch.tensor(1.0)]

# train model for one step with ppo
# NOTE(review): only the tokenized text prompt and the generated tokens are
# passed to step(); the processed image in `inputs` is not -- confirm that is
# intended for an image-captioning policy.
train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)

ValueError: Unrecognized configuration class <class 'transformers.models.blip.configuration_blip.BlipConfig'> for this kind of AutoModel: AutoModelForSeq2SeqLM.
Model type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallConfig, EncoderDecoderConfig, FSMTConfig, GPTSanJapaneseConfig, LEDConfig, LongT5Config, M2M100Config, MarianConfig, MBartConfig, MT5Config, MvpConfig, NllbMoeConfig, PegasusConfig, PegasusXConfig, PLBartConfig, ProphetNetConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SwitchTransformersConfig, T5Config, UMT5Config, XLMProphetNetConfig.