In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from datasets import load_from_disk
from PIL import Image
import pandas as pd

torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [2]:
dataset = load_from_disk("processed_bird_data")
train_ds = dataset["train"]
val_ds   = dataset["validation"]

print("Train samples:", len(train_ds))
print("Val samples:  ", len(val_ds))

Train samples: 3337
Val samples:   589


In [3]:
class BirdsTrainValDataset(Dataset):
    def __init__(self, hf_dataset, augment=False):
        self.ds = hf_dataset
        if augment:
            self.transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(15),
                transforms.ToTensor(),
            ])
        else:
            self.transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
            ])

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        sample = self.ds[idx]
        img = sample["image"] #todo: check rezi's notebook
        label = int(sample["label"])

        img = self.transform(img)
        return img, label

In [4]:
BATCH_SIZE = 64

train_loader = DataLoader(
    BirdsTrainValDataset(train_ds, augment=True),
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_loader = DataLoader(
    BirdsTrainValDataset(val_ds, augment=False),
    batch_size=BATCH_SIZE,
    shuffle=False
)

print("Train batches:", len(train_loader))
print("Val batches:  ", len(val_loader))

Train batches: 53
Val batches:   10


In [5]:
NUM_CLASSES = 200

class BirdCNN(nn.Module):
    def __init__(self, num_classes=NUM_CLASSES):
        super().__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2), #112x112

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2), #56x56

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2), #28x28

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2), #14x14

            nn.AdaptiveAvgPool2d((1,1)) #256-dim vector
        )

        self.dropout = nn.Dropout(0.2)

        self.classifier = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        feat = self.features(x) #[B,256,1,1]
        feat = feat.view(feat.size(0), -1) #[B,256]
        feat = self.dropout(feat)
        logits = self.classifier(feat)
        return logits

model = BirdCNN().to(device)
print(model)

BirdCNN(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 

In [6]:
EPOCHS = 12
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4 #L2 regularization
)

scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=EPOCHS 
)

In [7]:
def train_one_epoch(epoch):
    model.train()
    total_loss = 0.0
    samples = 0

    for batch_idx, (imgs, labels) in enumerate(train_loader):
        imgs, labels = imgs.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(imgs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * imgs.size(0)
        samples += imgs.size(0)

        if batch_idx % 20 == 0:
            print(f"[Epoch {epoch}] Batch {batch_idx}/{len(train_loader)} | loss={loss.item():.4f}")

    return total_loss / samples


def evaluate(epoch):
    model.eval()
    total_loss = 0.0
    correct = 0
    samples = 0

    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)

            logits = model(imgs)
            loss = criterion(logits, labels)

            total_loss += loss.item() * imgs.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            samples += imgs.size(0)

    val_loss = total_loss / samples
    val_acc  = correct / samples
    return val_loss, val_acc


In [8]:
best_val = 0.0

for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")
    train_loss = train_one_epoch(epoch)
    val_loss, val_acc = evaluate(epoch)
    scheduler.step()

    print(f"Epoch {epoch} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f} | val_acc={val_acc:.4f}")

    if val_acc > best_val:
        best_val = val_acc
        torch.save(model.state_dict(), "best_cnn_model.pth")
        print("Best CNN model saved")


Epoch 1/12
[Epoch 1] Batch 0/53 | loss=5.3814
[Epoch 1] Batch 20/53 | loss=5.1986
[Epoch 1] Batch 40/53 | loss=5.3136
Epoch 1 | train_loss=5.2154 | val_loss=5.1550 | val_acc=0.0153
Best CNN model saved

Epoch 2/12
[Epoch 2] Batch 0/53 | loss=5.0308
[Epoch 2] Batch 20/53 | loss=4.9571
[Epoch 2] Batch 40/53 | loss=4.9432
Epoch 2 | train_loss=4.9876 | val_loss=5.1802 | val_acc=0.0204
Best CNN model saved

Epoch 3/12
[Epoch 3] Batch 0/53 | loss=4.8051
[Epoch 3] Batch 20/53 | loss=4.7576
[Epoch 3] Batch 40/53 | loss=4.7544
Epoch 3 | train_loss=4.8604 | val_loss=5.0371 | val_acc=0.0221
Best CNN model saved

Epoch 4/12
[Epoch 4] Batch 0/53 | loss=4.7391
[Epoch 4] Batch 20/53 | loss=4.7848
[Epoch 4] Batch 40/53 | loss=4.7311
Epoch 4 | train_loss=4.7757 | val_loss=5.0707 | val_acc=0.0272
Best CNN model saved

Epoch 5/12
[Epoch 5] Batch 0/53 | loss=4.7957
[Epoch 5] Batch 20/53 | loss=4.6534
[Epoch 5] Batch 40/53 | loss=4.6646
Epoch 5 | train_loss=4.7165 | val_loss=5.0702 | val_acc=0.0170

Epoch

In [9]:
print(">>> Loading processed test data from 'processed_bird_test_data'")
test_ds = load_from_disk("processed_bird_test_data")
print("Test samples:", len(test_ds))

>>> Loading processed test data from 'processed_bird_test_data'
Test samples: 4000


In [10]:
class BirdsTestDataset(Dataset):
    def __init__(self, hf_dataset):
        self.ds = hf_dataset
        self.transform = transforms.Compose([
            transforms.Resize((224,224)),
            transforms.ToTensor()
        ])

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        sample = self.ds[idx]
        img = sample["image"]
        img_id = sample["id"]
        img = self.transform(img)
        return img, img_id

test_loader = DataLoader(
    BirdsTestDataset(test_ds),
    batch_size=64,
    shuffle=False
)
print("Test batches:", len(test_loader))

Test batches: 63


In [11]:
model = BirdCNN().to(device)
model.load_state_dict(torch.load("best_cnn_model.pth", map_location=device))
model.eval()
print("loaded best CNN model")

loaded best CNN model


In [14]:
all_ids = []
all_preds = []

with torch.no_grad():
    for imgs, img_ids in test_loader:
        imgs = imgs.to(device)

        logits = model(imgs)
        preds = logits.argmax(dim=1).cpu().numpy()

        all_ids.extend(list(img_ids))
        all_preds.extend(list(preds))

submission = pd.DataFrame({
    "id": all_ids,
    "label": all_preds
})

submission.head()

Unnamed: 0,id,label
0,tensor(1),16
1,tensor(2),34
2,tensor(3),70
3,tensor(4),21
4,tensor(5),32


In [15]:
submission.to_csv("cnn_submission.csv", index=False)
print("Saved cnn_submission.csv")

Saved cnn_submission.csv
