<a href="https://colab.research.google.com/github/Michael-L-i-1/CS231N-Final-Project/blob/main/Supervised_Learning_Attempt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Model

We will be using SmolVLM

In [None]:
# Install optional runtime dependencies for this Colab session.
# flash-attn backs the "flash_attention_2" attention implementation that the model-loading
# cell requests when a CUDA device is available; hf_xet is presumably a Hugging Face Hub
# download backend (TODO confirm).
!pip install hf_xet
!pip install flash-attn

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
# Pick the accelerator once; the model and every input tensor are moved to this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Processor (tokenizer + image preprocessing) and the bfloat16 model weights.
# FlashAttention-2 is only requested on CUDA; the CPU fallback uses the eager attention path.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-256M-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)
# The model is already on DEVICE here. The former unconditional `model.to('cuda')` was
# removed because it raised on CPU-only runtimes and was redundant on GPU runtimes.

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used by later cells
# (under '/content/drive/My Drive/...') resolve. Only works inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Test Single Image

In [None]:
from PIL import Image
from transformers.image_utils import load_image


# Read the sample diagram the model will be asked about.
image = Image.open("/content/test.png")

# Instruction text, kept verbatim (including the line break and indentation inside it).
question = (
    "Given the diagram, list the labels of the circles in order from leftmost to rightmost\n"
    "          (provide name only)"
)

# One user turn: an image placeholder followed by the question.
user_content = [
    {"type": "image"},
    {"type": "text", "text": question},
]
messages = [{"role": "user", "content": user_content}]

# Render the chat template (ending with the assistant generation prompt), then
# tokenize text + image together and move the resulting tensors to DEVICE.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)

In [None]:
# Let the model continue the prompt (at most 500 new tokens), then turn the token ids
# back into plain text with special tokens stripped.
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

# Only one prompt was submitted, so the first decoded string is the whole answer.
first_text = generated_texts[0]
print(first_text)


# Evaluating Baseline on Dataset

In [None]:
import json
import os
from tqdm.notebook import tqdm


base_drive_path = '/content/drive/My Drive/CS231N Colabs/dataset'
json_file_path = os.path.join(base_drive_path, 'metadata.json')

# load in the dataset (a JSON list of entries with 'image_path' and 'order' keys)
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Only the first 250 entries are scored. Slicing up front (instead of counting and
# breaking inside the loop) lets tqdm show the real total.
MAX_EVAL_EXAMPLES = 250
eval_entries = data[:MAX_EVAL_EXAMPLES]

# The prompt is identical for every image, so build it once outside the loop.
question = """Given the diagram, list the labels of the circles in order from leftmost to rightmost
            (provide name only)"""
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question}
        ]
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

correct = 0
total = 0

# process all the images
for entry in tqdm(eval_entries, desc="Processing Images"):
    image_full_path = os.path.join('/content/drive/My Drive/CS231N Colabs', entry['image_path'])

    # The context manager closes the file handle once the processor has read the pixels.
    with Image.open(image_full_path) as image:
        inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to(DEVICE)

    # generate outputs
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_texts = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )

    # The decoded text still contains the prompt; keep only what follows the last
    # "Assistant:" marker and split it into individual names.
    answer_text = generated_texts[0].split("Assistant:")[-1].strip()
    predicted_order = [name.strip() for name in answer_text.split(",")]

    expected_order = entry['order']

    # Exact-match scoring: every name must be present and in the right position.
    if predicted_order == expected_order:
        correct += 1
    total += 1

# Guard against an empty metadata file instead of raising ZeroDivisionError.
if total:
    print(f"Accuracy: {correct / total}")
else:
    print("Accuracy: n/a (no examples were evaluated)")

In [None]:
import os

# Sanity check that the dataset folder is reachable and contains the first image.
# The directory is listed once and reused; previously it was listed twice and the
# first result was thrown away.
dataset_path = '/content/drive/My Drive/CS231N Colabs/dataset'
dataset_files = os.listdir(dataset_path)
print("image_0.png" in dataset_files)

# Baseline Supervised Fine Tuning w/ Cross Entropy Loss

Dataset: `CS231N Colabs/dataset` (2500 images), split into 2000 train / 250 val / 250 test.

Index ranges: train 0–1999, val 2000–2249, test 2250–2499.



In [None]:
import os
import json
from PIL import Image
from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# Assume `processor` and `model` are already defined (and moved to DEVICE).
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

#######################################
# 1) Define a custom Dataset class  ##
#######################################

class VLMOrderDataset(Dataset):
    """Supervised fine-tuning examples for the left-to-right circle-ordering task.

    Each item pairs one diagram image and a fixed question with the expected answer
    (the circle labels joined by ", ") and returns tensors laid out for a causal-LM
    loss: the answer tokens are appended to the prompt tokens, and ``labels`` has the
    same length as ``input_ids`` with every non-answer position set to -100.
    """

    def __init__(self, metadata_json_path, base_image_dir, processor, max_length=128):
        """
        Args:
            metadata_json_path (str): path to metadata.json
            base_image_dir (str): root folder where images live
            processor (AutoProcessor): the Hugging Face processor for SmolVLM
            max_length (int): number of token slots reserved for the answer text;
                shorter answers are padded and longer ones are truncated to this length
        """
        with open(metadata_json_path, 'r') as f:
            self.data = json.load(f)

        self.base_image_dir = base_image_dir
        self.processor = processor
        self.max_length = max_length

        # Fixed question text (same for every example):
        self.question_text = (
            "Given the diagram, list the labels of the circles in order from leftmost to rightmost "
            "(provide name only)."
        )

    def __len__(self):
        """Number of entries loaded from metadata.json."""
        return len(self.data)

    @staticmethod
    def _pack_example(prompt_ids, prompt_mask, target_ids, pad_token_id, max_length):
        """Concatenate prompt and answer tokens into one fixed-tail training sequence.

        Args:
            prompt_ids (Tensor): 1-D token ids of the prompt (image placeholders included).
            prompt_mask (Tensor): 1-D attention mask matching ``prompt_ids``.
            target_ids (Tensor): 1-D token ids of the answer.
            pad_token_id (int): id used to fill unused answer slots.
            max_length (int): number of answer slots; longer answers are truncated.

        Returns:
            (input_ids, attention_mask, labels): three 1-D tensors of length
            ``len(prompt_ids) + max_length``. ``labels`` is -100 on prompt and padding
            positions so the loss is computed on the answer tokens only.
        """
        target_ids = target_ids[:max_length].to(prompt_ids.dtype)
        n_prompt = prompt_ids.shape[0]
        n_target = target_ids.shape[0]
        n_pad = max_length - n_target

        input_ids = torch.cat([
            prompt_ids,
            target_ids,
            torch.full((n_pad,), pad_token_id, dtype=prompt_ids.dtype),
        ])
        attention_mask = torch.cat([
            prompt_mask,
            torch.ones(n_target, dtype=prompt_mask.dtype),
            torch.zeros(n_pad, dtype=prompt_mask.dtype),
        ])
        labels = torch.cat([
            torch.full((n_prompt,), -100, dtype=prompt_ids.dtype),
            target_ids,
            torch.full((n_pad,), -100, dtype=prompt_ids.dtype),
        ])
        return input_ids, attention_mask, labels

    def __getitem__(self, idx):
        """Build the training tensors for entry ``idx``.

        Returns:
            dict with "pixel_values", "input_ids", "attention_mask" and "labels".
            All sequence tensors share one length, so the default DataLoader collation
            can stack them as long as every image produces the same number of prompt
            tokens (assumes all images have the same size — TODO confirm).
        """
        entry = self.data[idx]
        image_rel_path = entry["image_path"].replace("dataset/", "")
        order_list = entry["order"]  # e.g. ["Hannah","Grace","Alice","Emily","David"]

        # 1) Load the image from disk. convert() returns an independent copy, so the
        #    file handle can be closed right away.
        image_path = os.path.join(self.base_image_dir, image_rel_path)
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # 2) Build the prompt exactly as at inference (ends with the generation prompt):
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": self.question_text}
                ]
            }
        ]
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)

        # 3) Tokenize prompt + image with no padding or truncation, so none of the
        #    image placeholder tokens can be dropped.
        encoding = self.processor(text=prompt, images=[image], return_tensors="pt")

        # 4) Tokenize the answer as a continuation of the prompt: a leading space after
        #    the generation prompt, the comma-separated labels, then the tokenizer's EOS
        #    token so the model learns where to stop.
        #    NOTE(review): assumes the tokenizer's eos_token is the end-of-turn marker
        #    used by the chat template — confirm for this checkpoint.
        tokenizer = self.processor.tokenizer
        eos_text = tokenizer.eos_token or ""
        target_text = " " + ", ".join(order_list) + eos_text
        target_ids = tokenizer(
            target_text,
            add_special_tokens=False,
            return_tensors="pt",
        )["input_ids"].squeeze(0)

        # The pad id only fills masked-out positions, so any valid id is acceptable.
        pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

        input_ids, attention_mask, labels = self._pack_example(
            encoding["input_ids"].squeeze(0),
            encoding["attention_mask"].squeeze(0),
            target_ids,
            pad_id,
            self.max_length,
        )

        # Drop only the processor's leading batch dimension; the DataLoader adds its own
        # and any per-image dimension is kept.
        pixel_values = encoding["pixel_values"].squeeze(0)

        return {
            "pixel_values": pixel_values,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

###############################################
# 2) Instantiate train/val splits & DataLoaders ##
###############################################

# Paths—adjust to wherever you mounted your drive in Colab:
from torch.utils.data import Subset

# Drive layout: metadata.json sits in dataset/, and images are looked up under dataset/images.
base_drive_path = "/content/drive/My Drive/CS231N Colabs"
metadata_path = os.path.join(base_drive_path, "dataset/metadata.json")
images_base_dir = os.path.join(base_drive_path, "dataset", "images")

# One dataset object over every entry; the splits below are index views into it.
full_dataset = VLMOrderDataset(metadata_path, images_base_dir, processor, max_length=128)

num_examples = len(full_dataset)
assert num_examples == 2500, "Check that metadata.json has 2500 entries."

# 0–1999 → train, 2000–2249 → val (2250–2499 is held out for testing).
train_indices = [i for i in range(2000)]
val_indices = [i for i in range(2000, 2250)]
train_dataset = Subset(full_dataset, train_indices)
val_dataset = Subset(full_dataset, val_indices)

# Both loaders share every setting except shuffling.
_loader_options = dict(batch_size=4, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

###################################################
# 3) Set up optimizer, training loop, and metrics ##
###################################################

#  We’ll use AdamW and a small learning rate. Feel free to tune later.
# AdamW over all parameters with a small learning rate; three passes over the training set.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
best_val_loss = float("inf")

_BATCH_KEYS = ("pixel_values", "input_ids", "attention_mask", "labels")


def _batch_loss(batch):
    """Move one collated batch to DEVICE and return the loss the model computes from `labels`."""
    on_device = {key: batch[key].to(DEVICE) for key in _BATCH_KEYS}
    return model(**on_device).loss


for epoch in range(num_epochs):
    epoch_tag = f"Epoch {epoch+1} / {num_epochs}"

    # ---- training pass ----
    model.train()
    train_loss, train_steps = 0.0, 0
    pbar = tqdm(train_loader, desc=f"{epoch_tag} (train)", leave=False)
    for batch in pbar:
        loss = _batch_loss(batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_steps += 1
        # Running mean of the loss so far in this epoch.
        pbar.set_postfix_str(f"loss={train_loss/train_steps:.4f}")

    avg_train_loss = train_loss / train_steps
    print(f"\nEpoch {epoch+1} — Avg train loss: {avg_train_loss:.4f}")

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss, val_steps = 0.0, 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"{epoch_tag} (val)", leave=False):
            val_loss += _batch_loss(batch).item()
            val_steps += 1

    avg_val_loss = val_loss / val_steps
    print(f"Epoch {epoch+1} — Avg val loss: {avg_val_loss:.4f}\n")

    # Keep only the weights from the epoch with the lowest validation loss so far.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        save_path = "./best_smolvlm_baseline.pt"
        torch.save(model.state_dict(), save_path)
        print(f"  → New best model saved at epoch {epoch+1} (val loss {avg_val_loss:.4f})\n")

print("Training complete.")


In [None]:
# Cell 1: Install dependencies (if not already installed)
!pip install --quiet hf_xet flash-attn
!pip install --quiet transformers torch pillow tqdm
!pip install --quiet torch torchvision transformers pillow tqdm
!pip install --quiet --upgrade torch torchvision transformers pillow tqdm

In [None]:
import os
import json
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from torch.optim import AdamW
from tqdm.notebook import tqdm

In [None]:
# Cell 2: Imports and Paths
import os
import json
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.notebook import tqdm
from transformers import AutoProcessor, AutoModelForVision2Seq

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# ----------------------------
# Adjust the dataset location here:
# ----------------------------
# The images (e.g. image_0.png, image_1.png, ...) and metadata.json share one Drive folder.
_DATASET_ROOT = "/content/drive/My Drive/CS231N Colabs/dataset"
IMAGE_DIR = _DATASET_ROOT
METADATA_PATH = _DATASET_ROOT + "/metadata.json"

# (Optional, currently disabled) a place to save checkpoints:
# CHECKPOINT_DIR = "/content/checkpoints"
# os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Echo the resolved configuration so a wrong path is visible before training starts.
for _caption, _value in (
    ("Device:", DEVICE),
    ("Image directory:", IMAGE_DIR),
    ("Metadata path:", METADATA_PATH),
):
    print(_caption, _value)


In [None]:
# Cell 3 – Load the pre-trained processor & model.
# This checkpoint id ("SmolVLM-Instruct") is not the "SmolVLM-256M-Instruct" id loaded in
# the first cell of the notebook; both `processor` and `model` are rebound here.
_CHECKPOINT_ID = "HuggingFaceTB/SmolVLM-Instruct"
processor = AutoProcessor.from_pretrained(_CHECKPOINT_ID)
model = AutoModelForVision2Seq.from_pretrained(_CHECKPOINT_ID)
model = model.to(DEVICE)
print("✓ Loaded SmolVLM‐Instruct")


In [None]:
# Cell 4 – Define Dataset that returns raw PIL images + chat‐prompt + target string
class CircleOrderDataset(Dataset):
    """Yields un-tokenized examples: a PIL image, a chat prompt string and a target string.

    Tokenization is left to the collate function, so each item stays cheap to build.
    """

    def __init__(self, metadata_list, image_dir, processor):
        """Keep the metadata entries, the image folder and the processor used for templating."""
        self.entries, self.image_dir, self.processor = metadata_list, image_dir, processor

    def __len__(self):
        """Number of metadata entries."""
        return len(self.entries)

    def __getitem__(self, idx):
        """Return {"image", "chat_prompt", "target_text"} for entry `idx`."""
        record = self.entries[idx]

        # Only the file name from the metadata path is used; it is resolved inside image_dir.
        file_name = os.path.basename(record["image_path"])
        image = Image.open(os.path.join(self.image_dir, file_name)).convert("RGB")

        # Single user turn with one image placeholder followed by the question.
        user_content = [
            {"type": "image"},
            {
                "type": "text",
                "text": "Given the diagram, list the labels of the circles in order from leftmost to rightmost.",
            },
        ]
        chat_prompt = self.processor.apply_chat_template(
            [{"role": "user", "content": user_content}],
            add_generation_prompt=False,
        )

        # Expected answer: the labels joined as "A, B, C".
        return {
            "image": image,
            "chat_prompt": chat_prompt,
            "target_text": ", ".join(record["order"]),
        }


In [None]:
# Cell 5 – Load metadata.json & split into train/val/test lists
# Read every metadata entry and make sure the expected 2500 are present.
with open(METADATA_PATH, "r") as f:
    all_metadata = json.load(f)
assert len(all_metadata) == 2500, f"Expected 2500 entries, got {len(all_metadata)}"

# Contiguous split by position: 2000 train / 250 val / 250 test.
train_metadata, val_metadata, test_metadata = (
    all_metadata[:2000],
    all_metadata[2000:2250],
    all_metadata[2250:2500],
)

# One dataset per split, all reading images from the same folder.
train_dataset, val_dataset, test_dataset = (
    CircleOrderDataset(split_entries, IMAGE_DIR, processor)
    for split_entries in (train_metadata, val_metadata, test_metadata)
)

print("✓ Train size:", len(train_dataset))
print("✓ Val   size:", len(val_dataset))
print("✓ Test  size:", len(test_dataset))


In [None]:
def data_collator(batch):
    """Collate raw examples into model-ready tensors for causal-LM fine-tuning.

    The target text is appended to each prompt as the assistant's reply and the whole
    sequence is tokenized together with the image. ``labels`` is a copy of
    ``input_ids`` with padding and image placeholder positions set to -100, so it is
    aligned with ``input_ids`` position by position. Tokenizing the targets separately
    produced labels of an unrelated length and never fed the answer to the model.

    batch: list of dicts, each with keys:
      - "image"       : PIL.Image
      - "chat_prompt" : str (contains exactly one <image> token)
      - "target_text" : str (comma-separated labels)

    Returns:
      {
        pixel_values:   Tensor as produced by the processor,
        input_ids:      Tensor (B, L),
        attention_mask: Tensor (B, L),
        labels:         Tensor (B, L) with padding / image positions → -100
      }
      All tensors are moved to DEVICE.
    """
    tokenizer = processor.tokenizer
    # NOTE(review): assumes the tokenizer's eos_token is the end-of-turn marker the model
    # should emit after its answer — confirm for this checkpoint.
    end_token = tokenizer.eos_token or ""

    images = [item["image"] for item in batch]

    # 1) Build "<prompt>Assistant: <target><eos>" for every example. Prompts rendered
    #    without a generation prompt get the "Assistant:" marker added here.
    full_texts = []
    for item in batch:
        prompt = item["chat_prompt"]
        if not prompt.rstrip().endswith("Assistant:"):
            prompt = prompt + "Assistant:"
        full_texts.append(f"{prompt} {item['target_text']}{end_token}")

    # 2) Multimodal tokenization. No truncation: cutting the sequence could drop
    #    image placeholder tokens or the answer itself.
    model_inputs = processor(
        text=full_texts,
        images=images,
        return_tensors="pt",
        padding=True,       # pad to longest in batch
    )
    pixel_values   = model_inputs.pixel_values
    input_ids      = model_inputs.input_ids       # (B, L)
    attention_mask = model_inputs.attention_mask  # (B, L)

    # 3) Labels mirror the inputs. Padding is masked via the attention mask rather than
    #    the pad id, so a real EOS is not lost if pad and EOS share an id. Image
    #    placeholder positions are masked because they are not text to predict.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
    if image_token_id is not None:
        labels[labels == image_token_id] = -100

    return {
        "pixel_values":   pixel_values.to(DEVICE),
        "input_ids":      input_ids.to(DEVICE),
        "attention_mask": attention_mask.to(DEVICE),
        "labels":         labels.to(DEVICE),
    }


In [None]:
# Cell 7 – Create DataLoaders
from torch.utils.data import DataLoader

def _single_example_loader(dataset, shuffle):
    """DataLoader with one example per batch, collated by `data_collator`."""
    return DataLoader(dataset, batch_size=1, shuffle=shuffle, collate_fn=data_collator)


# Only the training split is shuffled; val/test keep metadata order.
train_loader = _single_example_loader(train_dataset, shuffle=True)
val_loader = _single_example_loader(val_dataset, shuffle=False)
test_loader = _single_example_loader(test_dataset, shuffle=False)

print("✓ # train batches:", len(train_loader))
print("✓ # val   batches:", len(val_loader))


In [None]:
# Memory-reduced training attempt: smaller images, gradient checkpointing, batch size 1
# and mixed precision (autocast + GradScaler).

# 1) Ask the image processor for 128×128 inputs.
#    NOTE(review): confirm this processor honours a {"height", "width"} size dict.
processor.image_processor.size = {"height": 128, "width": 128}

# 2) Reload the model (replacing any previously loaded `model`) and enable gradient checkpointing
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct").to(DEVICE)
model.gradient_checkpointing_enable()

# 3) Rebuild the train/val DataLoaders with batch_size=1
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=data_collator)
val_loader   = DataLoader(val_dataset,   batch_size=1, shuffle=False, collate_fn=data_collator)

# 4) Use mixed precision + smaller prompt length in collator
#    (modify your collator to include max_length=128 if desired)

from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()
# The optimizer is created after the reload so it tracks the new model's parameters.
optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        # Batch tensors are passed to the model as-is (no .to(DEVICE) here), so this loop
        # relies on the collate function having placed them on the model's device.
        with autocast():
            outputs = model(
                pixel_values=batch["pixel_values"],
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
            )
            loss = outputs.loss

        # Scaled backward pass; scaler.step runs the optimizer step and scaler.update
        # adjusts the scale for the next iteration.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
        # Release cached GPU blocks after every step (trades speed for memory headroom).
        torch.cuda.empty_cache()

    # Mean loss per batch over the epoch.
    print(f"Epoch {epoch+1} train loss: {total_loss/len(train_loader):.4f}")

    model.eval()
    total_val = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            with autocast():
                outputs = model(
                    pixel_values=batch["pixel_values"],
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"],
                )
                total_val += outputs.loss.item()
            torch.cuda.empty_cache()

    print(f"Epoch {epoch+1} val   loss: {total_val/len(val_loader):.4f}")


In [None]:
import gc
from torch.cuda.amp import autocast, GradScaler

# (Assume processor, model, train_loader, val_loader, optimizer, etc. are already defined)

# Mixed-precision training with aggressive memory cleanup after every step.
# `processor`, `model`, `train_loader`, `val_loader` and `optimizer` must already exist.
scaler = GradScaler()

for epoch in range(3):
    # ----- Training -----
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()

        # 1) Move all inputs to the same device as the model
        pixel_values   = batch["pixel_values"].to(DEVICE)
        input_ids      = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels         = batch["labels"].to(DEVICE)

        # 2) Forward pass under autocast; the model computes the loss from `labels`
        with autocast():
            outputs = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

        # 3) Scaled backward pass and optimizer step
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        # 4) Drop references to the step's tensors, then collect garbage and release
        #    any cached GPU memory that is no longer in use
        del outputs, loss
        gc.collect()
        torch.cuda.empty_cache()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} — Avg Train Loss: {avg_train_loss:.4f}")

    # ----- Validation -----
    model.eval()
    total_val = 0.0

    with torch.no_grad():
        for batch in val_loader:
            pixel_values   = batch["pixel_values"].to(DEVICE)
            input_ids      = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels         = batch["labels"].to(DEVICE)

            with autocast():
                outputs = model(
                    pixel_values=pixel_values,
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                )

            # Accumulate the batch loss. Previously it was assigned to a throwaway
            # variable, so `total_val` stayed 0 and the reported average was always 0.
            total_val += outputs.loss.item()

    avg_val_loss = total_val / len(val_loader)
    print(f"Epoch {epoch+1} — Avg Val Loss: {avg_val_loss:.4f}\n")

    # Free memory once more before the next epoch starts
    gc.collect()
    torch.cuda.empty_cache()


In [None]:
# Cell X: Verify that all inputs and model parameters are on the same device

import torch

# Grab one batch from the training DataLoader
batch = next(iter(train_loader))

_TENSOR_KEYS = ("pixel_values", "input_ids", "attention_mask", "labels")

# 1) Print devices of raw batch tensors
print("Raw batch tensor devices:")
print("  pixel_values:   ", batch["pixel_values"].device)
print("  input_ids:      ", batch["input_ids"].device)
print("  attention_mask: ", batch["attention_mask"].device)
print("  labels:         ", batch["labels"].device)

# 2) Move each tensor to the target DEVICE
pixel_values, input_ids, attention_mask, labels = (
    batch[key].to(DEVICE) for key in _TENSOR_KEYS
)

# 3) Print devices after .to(DEVICE)
print("\nAfter .to(DEVICE):")
print("  pixel_values:   ", pixel_values.device)
print("  input_ids:      ", input_ids.device)
print("  attention_mask: ", attention_mask.device)
print("  labels:         ", labels.device)

# 4) Check every model parameter's device.
#    Compare device *types*: a parameter on the GPU reports "cuda:0", which does not
#    compare equal to torch.device("cuda"), so a full-device comparison would flag
#    every correctly placed parameter.
print("\nModel parameter devices (non-GPU parameters, if any):")
_expected_type = torch.device(DEVICE).type
for name, param in model.named_parameters():
    if param.device.type != _expected_type:
        print(f"  ⚠️  {name} is on {param.device}")


In [None]:
# Cell Z: Move the entire model (all submodules) to DEVICE
model = model.to(DEVICE)

# Verify that no parameters remain on a different kind of device.
# Device *types* are compared because a GPU parameter reports "cuda:0", which is not
# equal to torch.device("cuda").
import torch
_expected_type = torch.device(DEVICE).type
_misplaced = [
    (name, param.device)
    for name, param in model.named_parameters()
    if param.device.type != _expected_type
]
for name, device in _misplaced:
    print(f"⚠️ {name} is still on {device}")

# Only report success when the check actually passed.
if not _misplaced:
    print("✓ All model parameters are now on", DEVICE)


In [None]:
import torch, gc
from transformers import AutoProcessor, AutoModelForVision2Seq

def _report_free_cuda_memory(caption):
    """Print free GPU memory in GB; does nothing when CUDA is not available."""
    if torch.cuda.is_available():
        print(caption, torch.cuda.mem_get_info()[0] / (1024**3), "GB")


# 1) Drop unreachable Python objects first, then release cached GPU blocks
gc.collect()
torch.cuda.empty_cache()

# 2) Check how much GPU memory is free before loading
_report_free_cuda_memory("Free CUDA memory before loading:")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 3) Load processor and request 32×32 (tiny) images
#    NOTE(review): confirm this processor honours a {"height", "width"} size dict.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
processor.image_processor.size = {"height": 32, "width": 32}

# 4) Load model in FP16, move it to DEVICE, and enable gradient checkpointing
#    NOTE(review): the training cells use GradScaler, which is normally paired with
#    FP32 master weights — confirm that `.half()` weights work with it.
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct").half().to(DEVICE)
model.gradient_checkpointing_enable()

# 5) Free temporaries created during loading. The processor is kept on purpose: the
#    collate function reads the global `processor`, so deleting it would break every
#    DataLoader built from it.
gc.collect()
torch.cuda.empty_cache()

# 6) Check GPU usage now
_report_free_cuda_memory("Free CUDA memory after loading:")


In [None]:
# Rebuild FlashAttention from source, then reload SmolVLM.
# First, uninstall any existing flash-attn that may be incompatible:
!pip uninstall -y flash-attn

# Clone the FlashAttention repo and build it against the current PyTorch.
# NOTE(review): re-running this cell will presumably fail at `git clone`, because the
# target directory already exists after the first run.
!git clone https://github.com/Dao-AILab/flash-attention.git
%cd flash-attention
!pip install .

# Go back to the notebook root and clear cached GPU memory
%cd ..
import torch, gc
torch.cuda.empty_cache()
gc.collect()

# Now retry loading SmolVLM
from transformers import AutoProcessor, AutoModelForVision2Seq
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Request 64×64 images from the image processor.
# NOTE(review): confirm this processor honours a {"height", "width"} size dict.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
processor.image_processor.size = {"height": 64, "width": 64}

# FP16 weights on DEVICE with gradient checkpointing enabled.
# NOTE(review): no attention implementation is passed here, unlike the first model-loading
# cell, so it is not evident that the freshly built FlashAttention is actually used.
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct").half().to(DEVICE)
model.gradient_checkpointing_enable()

print("✅ FlashAttention built and model loaded on:", DEVICE)


In [None]:
# Cell: Train SmolVLM‐Instruct on CPU only (no CUDA)

import os
import json
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from PIL import Image
from tqdm.notebook import tqdm
from transformers import AutoProcessor, AutoModelForVision2Seq

# 1) This cell never touches CUDA: every tensor and the model stay on the CPU.
DEVICE = "cpu"

# 2) Fresh processor with the image size set to 128×128.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
processor.image_processor.size = dict(height=128, width=128)

# 3) Define Dataset (returns PIL images, chat‐style prompts, target texts)
class CircleOrderDataset(Dataset):
    """Yields un-tokenized examples: a PIL image, a chat prompt string and a target string."""

    def __init__(self, metadata_list, image_dir, processor):
        """Keep the metadata entries, the image folder and the processor used for templating."""
        self.entries, self.image_dir, self.processor = metadata_list, image_dir, processor

    def __len__(self):
        """Number of metadata entries."""
        return len(self.entries)

    def __getitem__(self, idx):
        """Return {"image", "chat_prompt", "target_text"} for entry `idx`."""
        record = self.entries[idx]

        # Only the file name from the metadata path is used; it is resolved inside image_dir.
        file_name = os.path.basename(record["image_path"])
        image = Image.open(os.path.join(self.image_dir, file_name)).convert("RGB")

        # Single user turn: image placeholder first, then the question.
        user_turn = {
            "role": "user",
            "content": [
                {"type": "image"},
                {
                    "type": "text",
                    "text": "Given the diagram, list the labels of the circles in order from leftmost to rightmost.",
                },
            ],
        }
        chat_prompt = self.processor.apply_chat_template([user_turn], add_generation_prompt=False)

        return {
            "image": image,
            "chat_prompt": chat_prompt,
            "target_text": ", ".join(record["order"]),
        }

# 4) Data collator: tokenize on CPU (no .to(cuda))
def data_collator(batch):
    """Collate raw examples into CPU tensors for causal-LM fine-tuning.

    The target text is appended to each prompt as the assistant's reply and the whole
    sequence is tokenized together with the image. ``labels`` is a copy of
    ``input_ids`` with padding and image placeholder positions set to -100, so it is
    aligned with ``input_ids`` position by position. Tokenizing the targets separately
    produced labels of an unrelated length and never fed the answer to the model.

    Args:
        batch: list of dicts with "image" (PIL.Image), "chat_prompt" (str) and
            "target_text" (str).

    Returns:
        dict with "pixel_values", "input_ids" (B, L), "attention_mask" (B, L) and
        "labels" (B, L). Tensors are left on the CPU (no device transfer here).
    """
    tokenizer = processor.tokenizer
    # NOTE(review): assumes the tokenizer's eos_token is the end-of-turn marker the model
    # should emit after its answer — confirm for this checkpoint.
    end_token = tokenizer.eos_token or ""

    images = [item["image"] for item in batch]

    # Build "<prompt>Assistant: <target><eos>"; prompts rendered without a generation
    # prompt get the "Assistant:" marker added here.
    full_texts = []
    for item in batch:
        prompt = item["chat_prompt"]
        if not prompt.rstrip().endswith("Assistant:"):
            prompt = prompt + "Assistant:"
        full_texts.append(f"{prompt} {item['target_text']}{end_token}")

    # Encode text+image together. No truncation: cutting the sequence could drop image
    # placeholder tokens or the answer itself.
    model_inputs = processor(
        text=full_texts,
        images=images,
        return_tensors="pt",
        padding=True,
    )
    pixel_values   = model_inputs.pixel_values
    input_ids      = model_inputs.input_ids      # (B, L)
    attention_mask = model_inputs.attention_mask # (B, L)

    # Labels mirror the inputs. Padding is masked via the attention mask rather than the
    # pad id, so a real EOS is not lost if pad and EOS share an id. Image placeholder
    # positions are masked because they are not text to predict.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
    if image_token_id is not None:
        labels[labels == image_token_id] = -100

    return {
        "pixel_values":   pixel_values,
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "labels":         labels,
    }

# 5) Load metadata.json and split
# 5) Local (non-Drive) layout: metadata.json directly under BASE_DIR, images in BASE_DIR/dataset.
BASE_DIR = "/content"                         # adjust if needed
IMAGE_DIR, METADATA_PATH = (
    os.path.join(BASE_DIR, leaf) for leaf in ("dataset", "metadata.json")
)

with open(METADATA_PATH, "r") as f:
    all_metadata = json.load(f)

# Entries 0–1999 train the model, 2000–2249 validate it.
train_meta = all_metadata[:2000]
val_meta = all_metadata[2000:2250]

train_dataset = CircleOrderDataset(train_meta, IMAGE_DIR, processor)
val_dataset = CircleOrderDataset(val_meta, IMAGE_DIR, processor)

# 6) One example per batch keeps CPU RAM usage low; only training data is shuffled.
_cpu_loader_options = dict(batch_size=1, collate_fn=data_collator)
train_loader = DataLoader(train_dataset, shuffle=True, **_cpu_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_cpu_loader_options)

print("✓ Train batches:", len(train_loader), "| Val batches:", len(val_loader))

# 7) Model on the CPU with gradient checkpointing, plus optimizer and a loss object.
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct").to(DEVICE)
model.gradient_checkpointing_enable()
optimizer = AdamW(model.parameters(), lr=5e-5)
# The loops below take the loss from the model output, so `loss_fn` is not called in this cell.
loss_fn = CrossEntropyLoss(ignore_index=-100)


def _cpu_batch_loss(batch):
    """Run the model on one collated batch (already on the CPU) and return its loss."""
    return model(
        pixel_values=batch["pixel_values"],
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        labels=batch["labels"],
    ).loss


# 8) Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
        optimizer.zero_grad()
        loss = _cpu_batch_loss(batch)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1} ▶ Avg Train Loss: {avg_train_loss:.4f}")

    # 9) Validation pass without gradient tracking
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            total_val_loss += _cpu_batch_loss(batch).item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1} ▶ Avg Val Loss: {avg_val_loss:.4f}\n")
