<a href="https://colab.research.google.com/github/Michael-L-i-1/CS231N-Final-Project/blob/main/Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Model

We will be using SmolVLM (the `HuggingFaceTB/SmolVLM-500M-Instruct` checkpoint).

In [None]:
!pip install hf_xet
!pip install flash-attn

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
# Use the GPU when one is available; the model and all inputs are moved to DEVICE.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Single place to change the checkpoint used by both the processor and the model.
MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    # FlashAttention-2 is only requested on CUDA; fall back to the eager implementation on CPU.
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)
# NOTE: the model is already on DEVICE here. The previous unconditional `model.to('cuda')`
# was removed because it raised on CPU-only runtimes, defeating the DEVICE fallback above.

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; later cells read the dataset from
# paths underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Test Single Image

In [None]:
from PIL import Image
from transformers.image_utils import load_image

# Load the single test image used for a quick sanity check of the model.
image = Image.open("/content/test.png")

# Instruction sent to the model together with the image.
# (Fixed the "cicle" typo and the stray capital "To" that were being fed to the model.)
question = """Given the diagram, list the labels of the circles in order from leftmost to rightmost
          (provide name only). You should have all the names included. The name for the circle corresponds
          to the arrow that the name points to."""

# One user turn: an image slot followed by the text question. The image itself is
# supplied separately to the processor below.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question}
        ]
    },
]

# Render the chat template into a prompt string (ending with the generation prompt),
# then tokenize it together with the image and move the tensors to the model's device.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(DEVICE)

In [None]:
# Run generation on the inputs prepared in the previous cell. Sampling is enabled
# (do_sample=True), so the printed text can differ from run to run.
generated_ids = model.generate(**inputs, do_sample=True, max_new_tokens=500)

# Turn the generated token ids back into text, dropping special tokens.
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

first_text = generated_texts[0]
print(first_text)


# Evaluating Baseline on Dataset

In [None]:
import json
import os
from tqdm.notebook import tqdm


# Locations on the mounted Drive: metadata.json lives in dataset/, while each entry's
# 'image_path' is resolved relative to the project folder (the parent of dataset/).
project_root = '/content/drive/My Drive/CS231N Colabs'
base_drive_path = os.path.join(project_root, 'dataset')
json_file_path = os.path.join(base_drive_path, 'metadata.json')

# Upper bound on how many metadata entries are evaluated.
MAX_EVAL_SAMPLES = 250

# load in the dataset
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Evaluate only the first MAX_EVAL_SAMPLES entries (assumes metadata.json is a JSON array).
# Slicing up front, instead of breaking out of the loop, lets the progress bar show the
# real number of images being processed.
# NOTE(review): the split note at the end of this notebook lists images 0-1999 as train.
# If metadata.json is in index order, these entries are training images - confirm that
# this is the intended evaluation subset.
eval_entries = data[:MAX_EVAL_SAMPLES]

# The prompt is identical for every image, so it is built once outside the loop.
# The question text (including its embedded newline and indentation) is kept exactly
# as before so the baseline numbers stay comparable.
question = """Given the diagram, list the labels of the circles in order from leftmost to rightmost
            (provide name only)"""

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question}
        ]
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

correct = 0
total = 0

# process the images
for entry in tqdm(eval_entries, desc="Processing Images"):
    image_full_path = os.path.join(project_root, entry['image_path'])

    # Open the image in a context manager so its file handle is released on every
    # iteration instead of accumulating until garbage collection.
    with Image.open(image_full_path) as image:
        inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to(DEVICE)

    # generate outputs (default decoding settings, no gradients needed)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_texts = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )

    # Keep only the text after the last "Assistant:" marker, then split the
    # comma-separated answer into a list of trimmed names.
    predicted_text = generated_texts[0].split("Assistant:")[-1].strip()
    predicted_order = [name.strip() for name in predicted_text.split(",")]

    expected_order = entry['order']

    # Exact match: every name must be present and in the same position.
    if predicted_order == expected_order:
        correct += 1
    total += 1

# Guard against an empty metadata file, which previously raised ZeroDivisionError.
if total:
    print(f"Accuracy: {correct / total}")
else:
    print("Accuracy: n/a (no entries were evaluated)")

In [None]:
import os

# Sanity check that the dataset folder on the mounted Drive is readable and
# contains the first image.
dataset_path = '/content/drive/My Drive/CS231N Colabs/dataset'
# List the directory once and reuse the result; the previous version listed it twice
# and discarded the first listing.
dataset_files = os.listdir(dataset_path)
print("image_0.png" in dataset_files)

# Baseline Supervised Fine Tuning w/ Cross Entropy Loss

Dataset: `CS231N Colabs/dataset` contains 2500 images, split into 2000 train, 250 val, and 250 test.

Image index ranges: train 0-1999, val 2000-2249, test 2250-2499.

