In [None]:
!pip install --upgrade bitsandbytes
!pip install --upgrade datasets tokenizers
!pip install --upgrade transformers

In [None]:
import torch
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from transformers import LlavaForConditionalGeneration
from transformers import AutoProcessor
from transformers import BitsAndBytesConfig
from transformers import GenerationConfig

from PIL import Image

In [None]:
# Peek at the raw annotation file to see its line format before parsing it.
# The `with` block closes the file handle as soon as the lines have been read
# (the original left the handle open for the lifetime of the kernel).
with open('/kaggle/input/coco-vqa-dataset/vaq2.0.TrainImages.txt', 'r') as data:
    lines = data.readlines()
print(lines[:5])  # Print the first 5 lines

In [None]:
# Load train data
#
# Builds `train_data`, a list of dicts with the keys 'Image Path', 'Question'
# and 'Answer', one per non-blank line of the annotation file. The intermediate
# values of the first three lines are printed as a sanity check.
train_data = []
train_path = '/kaggle/input/coco-vqa-dataset/vaq2.0.TrainImages.txt'
with open(train_path, 'r') as f:
    # Iterate over the file object directly so the whole file is not
    # materialised as a list first.
    for i, line in enumerate(f):
        # Blank lines (e.g. a trailing empty line) carry no sample; without
        # this guard the indexing below fails on them with an IndexError.
        if not line.strip():
            continue

        # Only echo the parsing steps for the first three lines of the file.
        show = i < 3

        # Tab-separated fields: field 0 is the image reference, field 1 is the
        # question text followed by the answer.
        full_sentence = line.split('\t')
        if show:
            print("Full sentence: ", full_sentence)
        if len(full_sentence) < 2:
            raise ValueError(
                f"Line {i + 1} of {train_path} has no tab-separated "
                f"question/answer field: {line!r}"
            )

        # Drop the last two characters of the image field.
        # NOTE(review): presumably a two-character suffix after the file name
        # (something like '#0') -- confirm against the printed sample lines.
        img_path = full_sentence[0][:-2]
        if show:
            print("Image Path: ", img_path)

        qa = full_sentence[1].split('?')
        if len(qa) < 2:
            raise ValueError(
                f"Line {i + 1} of {train_path} contains no '?' separating "
                f"question and answer: {line!r}"
            )

        # The question is the text before the first '?'.
        question = qa[0]
        if show:
            print("Question: ", question)

        # The answer is whatever follows the LAST '?', however many question
        # marks the line contains (the original only coped with one or two).
        # strip() removes the trailing newline and surrounding spaces.
        answer = qa[-1].strip()
        if show:
            print("Answer: ", answer)
            print(" ")

        data_sample = {
            'Image Path': img_path,
            'Question': question + '?',  # restore the '?' consumed by split()
            'Answer': answer  # No trailing newline
        }
        train_data.append(data_sample)

In [None]:
# Define quantization configuration: load the weights in 4-bit and run the
# de-quantized matmuls in float16.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Define model ID
model_id = "llava-hf/llava-1.5-7b-hf"

# Set device (cuda if available, else cpu); also used later to place the inputs
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the processor that from_pretrained returns for this checkpoint
processor = AutoProcessor.from_pretrained(model_id)

# Load the quantized model directly onto the target device via `device_map`.
# A bitsandbytes-quantized model should not be moved afterwards with
# `model.to(device)`: several transformers/bitsandbytes versions raise a
# ValueError for that call, so placement is done at load time instead.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,  # 4-bit bitsandbytes settings
    torch_dtype=torch.float16,                # dtype for the non-quantized modules
    device_map=device,                        # place every module on `device`
)


In [None]:
# Prompt template from the guide; leave the wording as-is unless you are
# deliberately experimenting with prompt engineering.
def create_prompt(question):
    """Return the full text prompt for one visual question.

    The prompt consists of five newline-separated parts: an instruction
    header (with a leading space), the task description, the user turn
    containing the ``<image>`` placeholder, the formatted ``question`` and
    the assistant header that the model is expected to continue.

    NOTE(review): the task description says 'yes' or 'not' -- possibly a
    typo for 'no'; left untouched because it changes what the model sees.
    """
    segments = (
        " ### INSTRUCTION:",
        "Your task is to answer the question based on the given image. You can only answer 'yes' or 'not'.",
        "### USER: <image>",
        f"{question}",
        "### ASSISTANT:",
    )
    return "\n".join(segments)

In [None]:
# Decoding settings for the answer: at most 10 new tokens, sampled at a low
# temperature with top-p / top-k filtering.
generation_config = GenerationConfig(
    max_new_tokens=10,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    top_k=50,
    eos_token_id=model.config.eos_token_id,
    # The generation parameter is `pad_token_id`; the original passed
    # `pad_token=`, which is not a generation parameter, so the pad id was
    # never actually configured.
    pad_token_id=model.config.pad_token_id,
)

In [None]:
# Run the model on one training sample and show the image next to the
# question, the ground-truth label and the model's prediction.
idx = 0
question = train_data[idx]['Question']
img = train_data[idx]['Image Path']
img_path = os.path.join('/kaggle/input/coco-vqa-dataset/val2014-resised', img)
label = train_data[idx]['Answer']
image = Image.open(img_path)

prompt = create_prompt(question)
# Pass text and image by keyword: the positional order of these two arguments
# has differed between transformers releases, so positional use is fragile.
inputs = processor(text=prompt,
                   images=image,
                   padding=True,
                   return_tensors='pt').to(device)

output = model.generate(**inputs, generation_config=generation_config)
generated_text = processor.decode(output[0], skip_special_tokens=True)

# The decoded text is split on the assistant header and the last piece is kept
# as the answer. Split on the header without a trailing space (the prompt ends
# right after the colon) and strip the surrounding whitespace instead.
prediction = generated_text.split('### ASSISTANT:')[-1].strip()

plt.imshow(image)
plt.axis("off")
plt.show()
print(f"Question: {question}")
print(f"Label: {label}")
print(f"Prediction: {prediction}")