In [None]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TextStreamer, TextIteratorStreamer
from PIL import Image
import requests
import torch
from time import time
from threading import Thread
from accelerate import cpu_offload, disk_offload

In [None]:
# Download a sample image from the Hugging Face documentation dataset and display it.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
# A timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of handing an error body to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
image

In [None]:
# Load the original PaliGemma snapshot with an 8-bit bitsandbytes quantization
# config, load its processor, and report how long loading took.
model_id = '/home/jovyan/models/paligemma_original/snapshots/d1d8734c9c3ad0ccfeea4afc270faa356c2ba515'
device = "cuda:0"
# NOTE(review): dtype is defined here but is not passed to from_pretrained in this cell.
dtype = torch.bfloat16

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

t1 = time()
model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,
    # Pin placement explicitly, matching the cell that reloads the saved
    # quantized checkpoint, instead of relying on the implicit default device.
    device_map=device,
).eval()
processor = AutoProcessor.from_pretrained(model_id)
t2 = time()
print(f'time spent loading: {t2 - t1}')

In [None]:
# IPython introspection: show the docstring of model.save_pretrained (development aid).
model.save_pretrained?

In [None]:
# IPython introspection: show the docstring of processor.save_pretrained (development aid).
processor.save_pretrained?

In [None]:
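# One-off export of the quantized model and its processor; kept commented out.
# Uncomment the pair whose directory name matches the quantization_config that
# was used when the model was loaded.
# model.save_pretrained('./models/paligemma_4bit', safe_serialization=False)
# processor.save_pretrained('./models/paligemma_4bit', safe_serialization=False)

# model.save_pretrained('./models/paligemma_8bit', safe_serialization=False)
# processor.save_pretrained('./models/paligemma_8bit', safe_serialization=False)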

In [None]:
# Reload the quantized checkpoint saved on disk, together with its processor,
# and print the wall-clock loading time.
model_id = '/home/jovyan/models/paligemma_4bit'
device = "cuda:0"
dtype = torch.bfloat16  # NOTE(review): not passed to from_pretrained in this cell

t1 = time()
model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id,
    device_map=device,
    low_cpu_mem_usage=True,
)
# eval() returns the module itself, so calling it as a statement leaves `model`
# bound to the same object as chaining it onto from_pretrained would.
model.eval()
processor = AutoProcessor.from_pretrained(model_id)
t2 = time()
print(f'time spent loading: {t2 - t1}')

In [None]:
# Caption a local image with sampling and show the elapsed wall-clock time.
prompt = "caption en in great detail"
# Open inside a context manager so the file handle is released. convert() loads
# the pixel data and returns a new image, so `image` stays usable after the file
# is closed, and non-RGB files (grayscale, CMYK, palette) become 3-channel RGB.
with Image.open('./408Importance of taking a vacation.jpg') as image_file:
    image = image_file.convert("RGB")
model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
input_len = model_inputs["input_ids"].shape[-1]

t1 = time()
with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
    # Skip the first input_len tokens (the prompt) so only the newly generated
    # part is decoded.
    generation_c = generation[0][input_len:]
    decoded = processor.decode(generation_c, skip_special_tokens=True)
    print(decoded)
t2 = time()
t2 - t1

In [None]:
# Caption a local image while streaming the text through a TextStreamer, then
# show the elapsed wall-clock time.
prompt = "caption en in great detail"
# Open inside a context manager so the file handle is released. convert() loads
# the pixel data and returns a new image, so `image` stays usable after the file
# is closed, and non-RGB files (grayscale, CMYK, palette) become 3-channel RGB.
with Image.open('./408Importance of taking a vacation.jpg') as image_file:
    image = image_file.convert("RGB")
model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
input_len = model_inputs["input_ids"].shape[-1]
streamer = TextStreamer(processor, skip_prompt=True, skip_special_tokens=True)

t1 = time()
with torch.inference_mode():
    # The streamer emits the decoded text during generation, so the returned
    # token ids are not decoded manually in this cell.
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=True, streamer=streamer)
t2 = time()
t2 - t1

In [None]:
# Rebuild the model inputs from the current `prompt` and `image`. Move them to
# model.device, as the other cells do, rather than a bare 'cuda', so the tensors
# always land on the device the model is on.
model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

In [None]:
# Re-run generation on the current `model_inputs` with the existing `streamer`
# and time it.
t1 = time()
with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=True, streamer=streamer)
t2 = time()
# Display the elapsed time like the other timing cells; it was previously
# measured but never shown.
t2 - t1

In [None]:
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)


def threaded_gen(streamer):
    """Generate a caption for the local image, feeding text into ``streamer``.

    Meant to be used as a ``Thread`` target while another thread iterates over
    the streamer to consume the generated text.

    Args:
        streamer: Streamer object handed to ``model.generate``; it must provide
            an ``end()`` method that signals the end of the stream.

    Raises:
        Exception: Any error from image loading, preprocessing or generation is
            re-raised after the stream has been closed.
    """
    try:
        with torch.inference_mode():
            prompt = "caption en in great detail."
            # Context manager releases the file handle; convert() returns a
            # loaded RGB copy that remains valid after the file is closed.
            with Image.open('./408Importance of taking a vacation.jpg') as image_file:
                image = image_file.convert("RGB")
            model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
            model.generate(**model_inputs, max_new_tokens=100, do_sample=True, streamer=streamer)
    except Exception:
        # Without this, a failure before generate() finishes leaves the consumer
        # blocked forever waiting for an end-of-stream signal that never comes.
        streamer.end()
        raise

In [None]:
# Run generation on a background thread and consume the streamed text here.
out = ''
t = Thread(target=threaded_gen, kwargs={"streamer": streamer})
t.start()
for new_text in streamer:
    out = out + new_text
    # Print everything accumulated so far after each new chunk arrives.
    print(out)
# Wait for the worker thread to finish before the cell returns.
t.join()