In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from datasets import load_from_disk
from PIL import Image
import pandas as pd

# Seed torch's global RNG so repeated runs start from the same random state.
torch.manual_seed(42)

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cpu


In [5]:
# Load the dataset saved under "processed_bird_data" and pull out the two
# splits used in this notebook.
dataset = load_from_disk("processed_bird_data")
train_ds, val_ds = dataset["train"], dataset["validation"]

# Report the split sizes as a quick sanity check.
print("Train samples:", len(train_ds))
print("Val samples:  ", len(val_ds))

Train samples: 3337
Val samples:   589


In [6]:
# For consistency across models, the training augmentation uses RandomResizedCrop to produce a random "zoom" effect, so that every model is trained with the same setup.

class BirdsTrainValDataset(Dataset):
    """Torch ``Dataset`` wrapper that yields ``(image_tensor, label)`` pairs.

    Each record of the wrapped dataset is expected to expose an ``"image"``
    entry (an object with ``.mode`` and ``.convert``, e.g. a PIL image) and a
    ``"label"`` entry that can be converted with ``int()``.

    Args:
        hf_dataset: Indexable dataset of records; stored as ``self.ds``.
        augment: When truthy, apply the random training augmentations
            (resized crop, horizontal flip, rotation, colour jitter).
            Otherwise only resize to 224x224. Both pipelines end with
            ``ToTensor``.
    """

    def __init__(self, hf_dataset, augment=False):
        self.ds = hf_dataset

        # Pick the geometric/photometric steps for the requested mode ...
        steps = (
            [
                transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(15),
                transforms.ColorJitter(brightness=0.1, contrast=0.1),
            ]
            if augment
            else [transforms.Resize((224, 224))]
        )
        # ... and finish both pipelines with the tensor conversion.
        steps.append(transforms.ToTensor())
        self.transform = transforms.Compose(steps)

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Return the transformed image and integer label for record ``idx``."""
        record = self.ds[idx]
        picture = record["image"]

        # Convert any non-RGB image to RGB before the transform pipeline runs.
        rgb = picture if picture.mode == 'RGB' else picture.convert('RGB')

        target = int(record["label"])
        return self.transform(rgb), target

In [7]:
BATCH_SIZE = 64

# Training data: augmented samples, reshuffled by the loader.
train_set = BirdsTrainValDataset(train_ds, augment=True)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

# Validation data: no augmentation, fixed order.
val_set = BirdsTrainValDataset(val_ds, augment=False)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

print("Train batches:", len(train_loader))
print("Val batches:  ", len(val_loader))

Train batches: 53
Val batches:   10


In [8]:
NUM_CLASSES = 200


class BirdCNN(nn.Module):
    """Small convolutional classifier producing ``num_classes`` logits.

    The feature extractor is four Conv -> BatchNorm -> ReLU -> MaxPool(2)
    stages (channels 3 -> 32 -> 64 -> 128 -> 256) followed by a global average
    pool, giving a 256-dimensional vector per image. For a 224x224 input the
    spatial size after each stage is 112, 56, 28 and 14. Dropout (p=0.2) is
    applied to the pooled vector, then a two-layer classifier head maps it to
    class logits.

    Args:
        num_classes: Size of the output layer; defaults to ``NUM_CLASSES``.
    """

    def __init__(self, num_classes=NUM_CLASSES):
        super().__init__()

        # (in_channels, out_channels, kernel_size, padding) per conv stage.
        stage_specs = [
            (3, 32, 5, 2),
            (32, 64, 3, 1),
            (64, 128, 3, 1),
            (128, 256, 3, 1),
        ]

        # Layers are created in the same order as a hand-written stack so the
        # Sequential indices (and therefore state_dict keys) stay 0..16.
        layers = []
        for c_in, c_out, kernel, pad in stage_specs:
            layers += [
                nn.Conv2d(c_in, c_out, kernel_size=kernel, stride=1, padding=pad),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2),  # halves the spatial resolution
            ]
        layers.append(nn.AdaptiveAvgPool2d((1, 1)))  # -> [B, 256, 1, 1]
        self.features = nn.Sequential(*layers)

        self.dropout = nn.Dropout(0.2)

        self.classifier = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Map a batch of images ``x`` to logits of shape ``[B, num_classes]``."""
        pooled = self.features(x)                   # [B, 256, 1, 1]
        flat = pooled.view(pooled.shape[0], -1)     # [B, 256]
        return self.classifier(self.dropout(flat))

# Build the network with its default output size, move it to the selected
# device, and print the layer summary.
model = BirdCNN()
model = model.to(device)
print(model)

BirdCNN(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 

In [9]:
# Training hyperparameters.
EPOCHS = 20
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4  # L2 regularization

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Cosine learning-rate schedule with its period set to the number of epochs.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

In [10]:
def train_one_epoch(epoch):
    """Run one optimisation pass over ``train_loader``.

    Reads the notebook-level ``model``, ``train_loader``, ``optimizer``,
    ``criterion`` and ``device``. Prints the batch loss every 20th batch.

    Args:
        epoch: Epoch number; only used in the progress messages.

    Returns:
        ``(avg_loss, acc)``: the sample-weighted mean loss and the fraction of
        correctly classified samples over the epoch.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    for step, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so the epoch figure is a
        # per-sample mean even when the last batch is smaller.
        batch_size = labels.size(0)
        loss_value = batch_loss.item()
        loss_sum += loss_value * batch_size

        n_correct += (logits.argmax(dim=1) == labels).sum().item()
        n_seen += batch_size

        # Logging batch
        if step % 20 == 0:
            print(f"[Epoch {epoch}] Batch {step}/{len(train_loader)} | loss={loss_value:.4f}")

    return loss_sum / n_seen, n_correct / n_seen

In [11]:
def evaluate(epoch):
    """Measure loss and accuracy on ``val_loader`` without updating weights.

    Reads the notebook-level ``model``, ``val_loader``, ``criterion`` and
    ``device``. The model is switched to eval mode and gradients are disabled.

    Args:
        epoch: Epoch number; not used inside the function.

    Returns:
        ``(avg_loss, acc)``: the sample-weighted mean loss and the fraction of
        correctly classified samples over the validation loader.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            logits = model(images)
            batch_size = labels.size(0)

            # Weight by batch size so the result is a per-sample mean.
            loss_sum += criterion(logits, labels).item() * batch_size
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
            n_seen += batch_size

    return loss_sum / n_seen, n_correct / n_seen

In [12]:
# Highest validation accuracy seen so far; a checkpoint is written whenever
# an epoch beats it.
best_val_acc = 0.0

for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")

    # One training pass, one validation pass, then advance the LR schedule.
    train_loss, train_acc = train_one_epoch(epoch)
    val_loss, val_acc = evaluate(epoch)
    scheduler.step()

    print(f"Train: loss={train_loss:.4f}, acc={train_acc:.4f}")
    print(f"Val:   loss={val_loss:.4f}, acc={val_acc:.4f}")

    # Skip checkpointing unless validation accuracy strictly improved.
    if not val_acc > best_val_acc:
        continue

    best_val_acc = val_acc
    torch.save(model.state_dict(), "cnn_best_model.pth")
    print("Best CNN model saved")


Epoch 1/20
[Epoch 1] Batch 0/53 | loss=5.3342
[Epoch 1] Batch 20/53 | loss=5.2691
[Epoch 1] Batch 40/53 | loss=5.2660
Train: loss=5.2082, acc=0.0141
Val:   loss=5.1449, acc=0.0102
Best CNN model saved

Epoch 2/20
[Epoch 2] Batch 0/53 | loss=4.9332
[Epoch 2] Batch 20/53 | loss=5.0259
[Epoch 2] Batch 40/53 | loss=4.9338
Train: loss=4.9935, acc=0.0249
Val:   loss=5.4091, acc=0.0119
Best CNN model saved

Epoch 3/20
[Epoch 3] Batch 0/53 | loss=4.9605
[Epoch 3] Batch 20/53 | loss=4.9847
[Epoch 3] Batch 40/53 | loss=4.7989
Train: loss=4.8702, acc=0.0300
Val:   loss=5.0398, acc=0.0238
Best CNN model saved

Epoch 4/20
[Epoch 4] Batch 0/53 | loss=4.7060
[Epoch 4] Batch 20/53 | loss=4.7029
[Epoch 4] Batch 40/53 | loss=4.9069
Train: loss=4.7910, acc=0.0366
Val:   loss=5.0574, acc=0.0221

Epoch 5/20
[Epoch 5] Batch 0/53 | loss=4.5046
[Epoch 5] Batch 20/53 | loss=4.5583
[Epoch 5] Batch 40/53 | loss=4.6030
Train: loss=4.7068, acc=0.0378
Val:   loss=5.0089, acc=0.0255
Best CNN model saved

Epoch 6/20

In [1]:
# TODO: handle the test data (not implemented yet).