In [1]:
import os
# Restrict this process to GPU index 0. This runs before torch is imported
# further down, so the setting is in place when CUDA is first queried.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [2]:
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the previously saved image/description dataset from its directory on disk.
dataset = load_from_disk(
    "../image_story_description/image_story_dataset",
)

In [4]:
# Hold out 10% of the records for evaluation; the fixed seed keeps the split repeatable.
split_dataset = dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

In [5]:
# System turn: sets the storyteller persona used for every conversation.
# The first sentence intentionally ends with a space before the line break,
# matching the original text exactly.
system_message = (
    "You are a poetic and vivid storyteller guiding a blind person through what the camera sees. \n"
    "Describe the images like a flowing story, using rich adjectives, sensory details, "
    "and present continuous tense. Avoid guessing."
)

# User turn: the instruction sent alongside each image.
prompt = (
    "Describe the image vividly in present continuous tense, "
    "using rich adjectives and sensory details. Write like a story."
)

In [6]:
def format_data(sample):
    """Turn one dataset record into a three-turn chat conversation.

    Args:
        sample: Mapping that provides an "image" entry and a "description" entry.

    Returns:
        A list of three message dicts, in order: the system turn (module-level
        ``system_message``), the user turn (the record's image followed by the
        module-level ``prompt``), and the assistant turn (the record's
        description, i.e. the reference answer).
    """
    def text_part(text):
        # Every textual content item shares this {"type", "text"} shape.
        return {"type": "text", "text": text}

    system_turn = {"role": "system", "content": [text_part(system_message)]}
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": sample["image"]},
            text_part(prompt),
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [text_part(sample["description"])],
    }
    return [system_turn, user_turn, assistant_turn]

In [7]:
# Convert every record of both splits into its chat-message form.
train_dataset_formatted = list(map(format_data, train_dataset))
eval_dataset_formatted = list(map(format_data, eval_dataset))

In [8]:
# Sanity check: show the first formatted evaluation conversation
# (system turn, user turn with image + prompt, assistant turn with the description).
eval_dataset_formatted[0]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a poetic and vivid storyteller guiding a blind person through what the camera sees. \nDescribe the images like a flowing story, using rich adjectives, sensory details, and present continuous tense. Avoid guessing.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x333>},
   {'type': 'text',
    'text': 'Describe the image vividly in present continuous tense, using rich adjectives and sensory details. Write like a story.'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': 'A beautiful woman holding a tennis racquet stands on a tennis court, her eyes fixed on the ball in flight. She swings the racquet with precision, aiming to send the ball back to the opponent. The court is alive with the sound of the ball hitting the strings and the rhythmic motion of her feet. The woman in a white outfit moves fluidly across the court, d

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [11]:
# Checkpoint whose raw (base) responses are collected in this notebook.
model_id = "HuggingFaceTB/SmolVLM-500M-Instruct"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# device_map places the weights on DEVICE while they are being loaded, so the
# model is not moved a second time with .to(DEVICE) afterwards.
base_model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map=DEVICE,
    torch_dtype=torch.bfloat16,
    _attn_implementation="eager",
)

# The processor builds the chat prompt and the image/text inputs for the model.
processor = AutoProcessor.from_pretrained(model_id)

In [10]:
def generate_text_from_sample(model, processor, sample, max_new_tokens=200, DEVICE="cuda"):
    """Generate the model's description for one formatted conversation.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        processor: Object providing ``apply_chat_template``, ``batch_decode`` and a
            call interface that builds model inputs from text and images.
        sample: Conversation as produced by ``format_data``; ``sample[1]`` must be
            the user turn whose first content item holds the image.
        max_new_tokens: Upper bound on generated tokens, forwarded to ``model.generate``.
        DEVICE: Device the processed inputs are moved to before generation.

    Returns:
        The decoded text of the first generated sequence, with the prompt tokens
        stripped off.
    """
    # Only the system and user turns are templated; the assistant's reference
    # answer (sample[2]) is left out so the model has to produce its own.
    chat_text = processor.apply_chat_template(sample[0:2], add_generation_prompt=True)

    picture = sample[1]["content"][0]["image"]
    picture = picture if picture.mode == "RGB" else picture.convert("RGB")

    inputs = processor(text=chat_text, images=[picture], return_tensors="pt").to(DEVICE)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Each generated sequence is sliced past the length of its input ids so that
    # only the newly produced tokens are decoded.
    completions = []
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

In [13]:
# Open the results CSV and write its header row.
# The handle is deliberately left open: the generation loop in a later cell
# appends one row per evaluation sample and closes the file when it finishes.
import csv

csv_file_path = "base_model_response.csv"
# newline="" lets the csv module control line endings itself.
csv_file = open(csv_file_path, "w", encoding="utf-8", newline="")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(("image", "description", "model_output"))

32

In [14]:
# Run the base model on every evaluation conversation and append one CSV row per
# sample: the image cell, the reference description, and the generated text.
# try/finally guarantees the file is flushed and closed even if a generation
# call raises part-way through, so rows already written are not lost.
# NOTE(review): `image` is a PIL image object, so the csv module writes str(image)
# (an object repr) into the "image" column — confirm whether an identifier such
# as the sample index would be more useful there.
try:
    for sample in tqdm(eval_dataset_formatted, total=len(eval_dataset_formatted), desc="Generating model outputs"):
        image = sample[1]['content'][0]['image']
        description = sample[2]['content'][0]['text']
        model_output = generate_text_from_sample(base_model, processor, sample, max_new_tokens=200, DEVICE=DEVICE)
        csv_writer.writerow([image, description, model_output])
finally:
    csv_file.close()

Generating model outputs: 100%|██████████| 150/150 [13:12<00:00,  5.28s/it]
