In [1]:
!pip install transformers datasets accelerate



In [13]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_from_disk
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AutoFeatureExtractor,
    AutoImageProcessor,
    AutoModelForImageClassification,
)

In [3]:
# --- Notebook configuration: everything a reader might want to tune ---------
DATA_PATH = "processed_bird_data"            # training/validation dataset directory
TEST_DATA_PATH = "processed_bird_test_data"  # test dataset directory
MODEL_NAME = "google/mobilenet_v2_1.0_224"   # pretrained checkpoint identifier
BATCH_SIZE = 32
EPOCHS = 10
SEED = 42

# Seed torch's global RNG so a Restart & Run All is repeatable.
# NOTE(review): some accelerator kernels may still be non-deterministic.
torch.manual_seed(SEED)

# Pick the best available backend: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

print(f"Using device: {DEVICE}")

Using device: cpu


In [4]:
# Load the dataset stored at DATA_PATH; later cells index it with the
# "train" and "validation" keys.
print("Loading data...")
try:
    dataset = load_from_disk(DATA_PATH)
except FileNotFoundError:
    # Print a short hint naming the missing path, then re-raise so the
    # notebook stops here instead of failing later on an undefined `dataset`.
    print(f"Error: {DATA_PATH} not found.")
    raise

Loading data...


In [5]:
# AutoImageProcessor is the current transformers entry point for vision
# preprocessing configs; the vision feature-extractor classes are deprecated.
# The variable keeps its original name because later cells read
# feature_extractor.image_mean and feature_extractor.image_std.
feature_extractor = AutoImageProcessor.from_pretrained(MODEL_NAME)



In [6]:
from torchvision import transforms

# Channel-wise normalisation using the statistics stored with the checkpoint's
# preprocessing config.
normalize = transforms.Normalize(
    feature_extractor.image_mean,
    feature_extractor.image_std,
)

# Training pipeline: light augmentation, then tensor conversion + normalisation.
_train_steps = [
    transforms.RandomResizedCrop(size=224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    normalize,
]
train_transforms = transforms.Compose(_train_steps)

# Validation pipeline: deterministic resize only, no augmentation.
_val_steps = [
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    normalize,
]
val_transforms = transforms.Compose(_val_steps)

def train_transform_func(batch):
    """Apply the training augmentation pipeline to a batch of examples.

    Args:
        batch: Mapping with an "image" sequence (each item must support
            ``.convert("RGB")``) and a "label" entry.

    Returns:
        Dict with "pixel_values" (one ``train_transforms`` output per image)
        and "label" (passed through unchanged).
    """
    return {
        "pixel_values": [
            train_transforms(image.convert("RGB")) for image in batch["image"]
        ],
        "label": batch["label"],
    }

def val_transform_func(batch):
    """Apply the deterministic validation pipeline to a batch of examples.

    Args:
        batch: Mapping with an "image" sequence (each item must support
            ``.convert("RGB")``) and a "label" entry.

    Returns:
        Dict with "pixel_values" (one ``val_transforms`` output per image)
        and "label" (passed through unchanged).
    """
    return {
        "pixel_values": [
            val_transforms(image.convert("RGB")) for image in batch["image"]
        ],
        "label": batch["label"],
    }


# Attach the matching transform to each split; with_transform returns a dataset
# object that is stored back under the same key.
for split_name, transform_fn in (
    ("train", train_transform_func),
    ("validation", val_transform_func),
):
    dataset[split_name] = dataset[split_name].with_transform(transform_fn)

def collate_fn(examples):
    """Merge a list of per-example dicts into a single batch dict.

    Args:
        examples: Sequence of dicts, each with a "pixel_values" tensor and a
            "label" value.

    Returns:
        Dict with "pixel_values" (tensors stacked along a new leading
        dimension) and "labels" (a tensor built from the labels).
    """
    images, targets = [], []
    for example in examples:
        images.append(example["pixel_values"])
        targets.append(example["label"])
    return {
        "pixel_values": torch.stack(images),
        "labels": torch.tensor(targets),
    }

# Both loaders share batch size and collate function; only the training
# loader shuffles.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)
train_loader = DataLoader(dataset["train"], shuffle=True, **_loader_kwargs)
val_loader = DataLoader(dataset["validation"], shuffle=False, **_loader_kwargs)

print(f"Train batches: {len(train_loader)} | Val batches: {len(val_loader)}")

Train batches: 105 | Val batches: 19


In [7]:
print("Initializing Baseline Model (MobileNetV2)...")

# Load the pretrained backbone with a 200-way classification head.
# ignore_mismatched_sizes=True lets the head be re-initialised at the new size
# instead of failing on the checkpoint's differently shaped classifier.
model = AutoModelForImageClassification.from_pretrained(
    MODEL_NAME, num_labels=200, ignore_mismatched_sizes=True
)
model.to(DEVICE)

# Loss and optimiser used by the training loop.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-4)

print("Model ready.")

Initializing Baseline Model (MobileNetV2)...


Some weights of MobileNetV2ForImageClassification were not initialized from the model checkpoint at google/mobilenet_v2_1.0_224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1001]) in the checkpoint and torch.Size([200]) in the model instantiated
- classifier.weight: found shape torch.Size([1001, 1280]) in the checkpoint and torch.Size([200, 1280]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model ready.


In [8]:
# File the training loop writes the best model's state_dict to.
save_path = "baseline_best_model.pth"
# Validation accuracy recorded for the best epoch so far.
best_val_acc = 0.0

In [None]:
# Early-stopping bookkeeping. Training stops once the validation loss has not
# improved for `patience` consecutive epochs.
patience = 5
counter = 0
best_val_loss = float('inf')
# Initialised here as well so re-running this cell alone starts clean.
best_val_acc = 0.0

print("Starting training loop (with early stopping)...\n")

for epoch in range(EPOCHS):
    # ---- training phase ----
    model.train()
    running_loss = 0.0  # sum of per-sample losses over the epoch
    correct_train = 0
    total_train = 0

    for i, batch in enumerate(train_loader):
        pixel_values = batch["pixel_values"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        n_samples = labels.size(0)

        optimizer.zero_grad()

        # forward pass
        outputs = model(pixel_values=pixel_values)
        logits = outputs.logits

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # stats: criterion returns the batch mean, so weight it by batch size.
        # Otherwise a short final batch counts as much as a full one.
        running_loss += loss.item() * n_samples
        predicted = logits.argmax(dim=1)
        total_train += n_samples
        correct_train += (predicted == labels).sum().item()

        if i % 20 == 0:
            print(f"[Epoch {epoch+1}] Batch {i}/{len(train_loader)} | loss={loss.item():.4f}")

    train_epoch_loss = running_loss / total_train
    train_epoch_acc = correct_train / total_train

    # ---- validation phase ----
    model.eval()
    val_running_loss = 0.0  # sum of per-sample losses over the validation set
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for batch in val_loader:
            pixel_values = batch["pixel_values"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            n_samples = labels.size(0)

            outputs = model(pixel_values=pixel_values)
            val_loss = criterion(outputs.logits, labels)

            val_running_loss += val_loss.item() * n_samples
            predicted = outputs.logits.argmax(dim=1)
            total_val += n_samples
            correct_val += (predicted == labels).sum().item()

    val_epoch_loss = val_running_loss / total_val
    val_epoch_acc = correct_val / total_val

    # status report
    print(f"Train: loss={train_epoch_loss:.4f}, acc={train_epoch_acc:.4f}")
    print(f"Val:   loss={val_epoch_loss:.4f}, acc={val_epoch_acc:.4f}")

    if val_epoch_loss < best_val_loss:
        # Model selection is driven by validation loss; best_val_acc is the
        # accuracy at that best-loss epoch, not the maximum accuracy seen.
        best_val_loss = val_epoch_loss
        best_val_acc = val_epoch_acc
        counter = 0  # reset because an improvement was found

        # saving the best model
        torch.save(model.state_dict(), save_path)
        print(f"Validation loss improved. Model saved to {save_path}")
    else:
        counter += 1
        print(f"No improvement in validation loss for {counter}/{patience} epochs.")

        if counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}.")
            break

    print("-" * 30)

print(f"\nTraining finished. Best Validation Accuracy: {best_val_acc:.4f}")

Starting training loop (with early stopping)...

[Epoch 1] Batch 0/105 | loss=5.3042
[Epoch 1] Batch 20/105 | loss=5.1404
[Epoch 1] Batch 40/105 | loss=5.0564
[Epoch 1] Batch 60/105 | loss=4.3242
[Epoch 1] Batch 80/105 | loss=3.9000
[Epoch 1] Batch 100/105 | loss=3.6477
Train: loss=4.4982, acc=0.1250
Val:   loss=3.6576, acc=0.2207
Validation loss improved. Model saved to baseline_best_model.pth
------------------------------
[Epoch 2] Batch 0/105 | loss=3.1088
[Epoch 2] Batch 20/105 | loss=3.0372
[Epoch 2] Batch 40/105 | loss=2.4082
[Epoch 2] Batch 60/105 | loss=2.6759
[Epoch 2] Batch 80/105 | loss=2.3149
[Epoch 2] Batch 100/105 | loss=2.3968
Train: loss=2.6193, acc=0.4402
Val:   loss=2.7232, acc=0.3599
Validation loss improved. Model saved to baseline_best_model.pth
------------------------------
[Epoch 3] Batch 0/105 | loss=1.9641
[Epoch 3] Batch 20/105 | loss=1.7902
[Epoch 3] Batch 40/105 | loss=1.6156
[Epoch 3] Batch 60/105 | loss=2.1417
[Epoch 3] Batch 80/105 | loss=1.7953
[Epoch 

In [15]:
from torchvision import transforms
import pandas as pd

TEST_DATA_PATH = "processed_bird_test_data"
WEIGHTS_PATH = "baseline_best_model.pth"
OUTPUT_FILENAME = "baseline_submission.csv"

print(f"Loading test data from {TEST_DATA_PATH}...")
# Only the disk read is guarded, so errors in the split/id handling below are
# not reported as loading errors.
try:
    dataset_raw = load_from_disk(TEST_DATA_PATH)
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise  # bare raise keeps the original traceback

# load_from_disk may return a single dataset or a dict of splits.
if isinstance(dataset_raw, dict):
    if "test" not in dataset_raw:
        # Fail here with a clear message instead of later with an obscure
        # indexing error inside the DataLoader.
        raise KeyError(
            f"No 'test' split found in {TEST_DATA_PATH}; "
            f"available splits: {list(dataset_raw.keys())}"
        )
    test_ds = dataset_raw["test"]
else:
    test_ds = dataset_raw

# Use the dataset's own ids when it has an "id" column, else row positions.
if "id" in test_ds.column_names:
    submission_ids = list(test_ds["id"])
else:
    submission_ids = list(range(len(test_ds)))

# Normalisation statistics come from the checkpoint's preprocessing config.
normalize = transforms.Normalize(
    feature_extractor.image_mean,
    feature_extractor.image_std,
)

# Deterministic inference pipeline: resize, convert to tensor, normalise.
_test_steps = [
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    normalize,
]
test_transforms = transforms.Compose(_test_steps)

class BaselineTestDataset(Dataset):
    """Map-style dataset that yields only the image of each row.

    Args:
        hf_dataset: Indexable collection whose rows are mappings with an
            "image" entry supporting ``.convert("RGB")``.
        transform: Optional callable applied to the RGB image; when falsy the
            converted image is returned as is.
    """

    def __init__(self, hf_dataset, transform=None):
        self.hf_dataset = hf_dataset
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        """Return the (optionally transformed) RGB image at position ``idx``."""
        rgb_image = self.hf_dataset[idx]["image"].convert("RGB")
        return self.transform(rgb_image) if self.transform else rgb_image

# shuffle=False keeps predictions in dataset order so they line up with the
# ids collected for the submission file.
test_dataset = BaselineTestDataset(test_ds, transform=test_transforms)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

if os.path.exists(WEIGHTS_PATH):
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a saved state_dict needs, and avoids running arbitrary
    # pickled code from the checkpoint file.
    state_dict = torch.load(WEIGHTS_PATH, map_location=DEVICE, weights_only=True)
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()

    all_preds = []
    with torch.no_grad():
        for imgs in tqdm(test_loader, desc="Predicting"):
            imgs = imgs.to(DEVICE)
            outputs = model(pixel_values=imgs)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            # tolist() stores plain Python ints rather than numpy scalars.
            all_preds.extend(preds.tolist())

    # NOTE(review): "label" holds the raw argmax class indices produced by the
    # model; confirm the submission format expects the same index convention.
    df = pd.DataFrame({"id": submission_ids, "label": all_preds})
    df.to_csv(OUTPUT_FILENAME, index=False)
    print(f"Saved to {OUTPUT_FILENAME}")
else:
    # Report the skipped inference instead of silently doing nothing.
    print(f"Weights file {WEIGHTS_PATH} not found; no predictions were written.")

Loading test data from processed_bird_test_data...


Predicting: 100%|████████████████████████████████████████████████████████████████████| 125/125 [01:55<00:00,  1.08it/s]

Saved to baseline_submission.csv





In [15]:
# ? blind 