### 1. Data preparation:

##### 1.1. Download data from Kaggle:

In [3]:
import kagglehub

# Download latest version
data_dir = kagglehub.dataset_download("andrewmvd/dog-and-cat-detection")

print("Path to dataset files:", data_dir)

Path to dataset files: C:\Users\admin\.cache\kagglehub\datasets\andrewmvd\dog-and-cat-detection\versions\1


##### 1.2. Create dataset:

In [4]:
from torch.utils.data import Dataset
import os
from xml.etree import ElementTree as ET
from PIL import Image

class ImageDataset(Dataset):
    def __init__(self, image_dir, annotation_dir, transform=None):
        self.image_dir = image_dir
        self.annotation_dir = annotation_dir
        self.transform = transform
        self.image_files = self._filter_images_with_single_object()

    def _filter_images_with_single_object(self):
        valid_image_files = []
        for file in os.listdir(self.image_dir):
            if os.path.isfile(os.path.join(self.image_dir, file)):
                image_name = file
                annotation_name = os.path.splitext(image_name)[0] + '.xml'
                annotation_path = os.path.join(self.annotation_dir, annotation_name)

                if self._count_object_in_annotation(annotation_path) <= 1:
                    valid_image_files.append(image_name)
                else:
                    print(f'Image {image_name} has more than 1 object and will be excluded from dataset.')
        return valid_image_files
    
    def _count_object_in_annotation(self, annotation_path):
        try:
            tree = ET.parse(annotation_path)
            root = tree.getroot()
            count = 0
            for obj in root.findall('object'):
                count += 1
            return count
        except FileNotFoundError:
            return 0

    def _parse_annotation(self, annotation_path):
        try:
            tree = ET.parse(annotation_path)
            root = tree.getroot()
            label = None
            for obj in root.findall('object'):
                name = obj.find('name').text
                # we only consider image with 1 object at the moment
                if label is None:
                    label = name
                    break

            # convert label to numerical representation
            return 0 if label == 'cat' else 1 if label == 'dog' else -1
        except FileNotFoundError:
            print(f'File {annotation_path} not found.')

    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        image_name = self.image_files[idx]
        image_path = os.path.join(self.image_dir, image_name)
        annotation_name = os.path.splitext(image_name)[0] + '.xml'
        annotation_path = os.path.join(self.annotation_dir, annotation_name)

        # load image
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        label = self._parse_annotation(annotation_path)
        return image, label

##### 1.3. Data loader, train test split:

In [8]:
from torch.utils.data import DataLoader, random_split
import torchvision.transforms as transforms
import torch

annotation_dir = os.path.join(data_dir, 'annotations')
image_dir = os.path.join(data_dir, 'images')

data_transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

dataset = ImageDataset(image_dir, annotation_dir, data_transform)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
generator = torch.Generator().manual_seed(42)
train_set, val_set = random_split(dataset=dataset, lengths=[train_size, val_size], generator=generator)

# data loader
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False)

Image Cats_Test736.png has more than 1 object and will be excluded from dataset.


### 2. Build model:

Finetune ResNet18

In [14]:
import torchvision
import torch

model = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT)
num_features = model.fc.in_features
model.fc = torch.nn.Linear(num_features, 2)   # 2 classes for cat and dog

# device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# summary
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

### 3. Train model:

In [15]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    # train
    current_items = 0
    train_loss = 0.0
    for step, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # forward pass
        outputs = model(images)
        train_loss = criterion(outputs, labels)

        # update parameters
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # print loss
        current_items += labels.size(0)
        print(f'Epoch: {epoch+1}/{num_epochs}, num_items: {current_items}/{len(train_set)}, loss: {train_loss.item()}')
    
    # eval
    model.eval()
    val_loss = 0.0
    accuracy = 0.0
    with torch.no_grad():
        correct = 0
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            val_loss += criterion(outputs, labels)
            _, predict_label = torch.max(outputs, 1)
            correct += (predict_label == labels).sum()
        accuracy = 100 * correct / len(val_set)
    print(f'Epoch {epoch+1}/{num_epochs}, loss: {train_loss.item()}, val_loss: {val_loss}, accuracy: {accuracy:.2f}')

Epoch: 1/10, num_items: 32/2948, loss: 0.9179412722587585
Epoch: 1/10, num_items: 64/2948, loss: 0.27484041452407837
Epoch: 1/10, num_items: 96/2948, loss: 0.28936800360679626
Epoch: 1/10, num_items: 128/2948, loss: 0.4699711203575134
Epoch: 1/10, num_items: 160/2948, loss: 0.04625909775495529
Epoch: 1/10, num_items: 192/2948, loss: 0.2624463438987732
Epoch: 1/10, num_items: 224/2948, loss: 0.4034675657749176
Epoch: 1/10, num_items: 256/2948, loss: 0.24559076130390167
Epoch: 1/10, num_items: 288/2948, loss: 0.06946821510791779
Epoch: 1/10, num_items: 320/2948, loss: 0.5868371725082397
Epoch: 1/10, num_items: 352/2948, loss: 0.20336933434009552
Epoch: 1/10, num_items: 384/2948, loss: 0.4686952531337738
Epoch: 1/10, num_items: 416/2948, loss: 0.6834747791290283
Epoch: 1/10, num_items: 448/2948, loss: 0.2868286371231079
Epoch: 1/10, num_items: 480/2948, loss: 0.33878597617149353
Epoch: 1/10, num_items: 512/2948, loss: 0.23030780255794525
Epoch: 1/10, num_items: 544/2948, loss: 0.496438860

### 4. Save model:

In [16]:
torch.save(model.state_dict(), 'resnet_cat_and_dog.ckpt')