In [1]:
"""
VGG Architecture.

224x224 RGB image

16 Conv layers with 3x3 filters
Stride 1
Padding 1
5 Maxpooling layers that are 2x2 with stride 2

3 FC layers, the first 2 have 4096 channels
The third performs the 1000 class classification
Softmax Layer

Layer Layout
  Channels(I, O)      |   Output Dimensions (Ch, W, H)
Conv1: (3, 64)        |   (64, 224, 224)256
Conv2: (64, 64)       |   (64, 224, 224)256
Maxpool1              |   (64, 112, 112)128
Conv3 (64, 128)       |   (128, 112, 112)128
Conv4 (128, 128)      |   (128, 112, 112)128
Maxpool2              |   (128, 56, 56)64
Conv5 (128, 256)      |   (256, 56, 56)
Conv6 (256, 256)      |   (256, 56, 56)
Conv7 (256, 256)      |   (256, 56, 56)
Conv8 (256, 256)      |   (256, 56, 56)
Maxpool3              |   (256, 28, 28)32
Conv9 (256, 512)      |   (512, 28, 28)
Conv10 (512, 512)     |   (512, 28, 28)
Conv11 (512, 512)     |   (512, 28, 28)
Conv12 (512, 512)     |   (512, 28, 28)
Maxpool4              |   (512, 14, 14)16
Conv13 (512, 512)     |   (512, 14, 14)
Conv14 (512, 512)     |   (512, 14, 14)
Conv15 (512, 512)     |   (512, 14, 14)
Conv16 (512, 512)     |   (512, 14, 14)
Maxpool5              |   (512, 7, 7)8
Flatten (512, 25088)  |   (1, 1, 25088)
FC17 (25088, 1)       |   (1, 1, 4096)
FC18 (1, 1)           |   (1, 1, 4096)
FC19 (1, 1)           |   (1, 1, 1000)
Softmax
"""

'\nVGG Architecture.\n\n224x224 RGB image\n\n16 Conv layers with 3x3 filters\nStride 1\nPadding 1\n5 Maxpooling layers that are 2x2 with stride 2\n\n3 FC layers, the first 2 have 4096 channels\nThe third performs the 1000 class classification\nSoftmax Layer\n\nLayer Layout\n  Channels(I, O)      |   Output Dimensions (Ch, W, H)\nConv1: (3, 64)        |   (64, 224, 224)256\nConv2: (64, 64)       |   (64, 224, 224)256\nMaxpool1              |   (64, 112, 112)128\nConv3 (64, 128)       |   (128, 112, 112)128\nConv4 (128, 128)      |   (128, 112, 112)128\nMaxpool2              |   (128, 56, 56)64\nConv5 (128, 256)      |   (256, 56, 56)\nConv6 (256, 256)      |   (256, 56, 56)\nConv7 (256, 256)      |   (256, 56, 56)\nConv8 (256, 256)      |   (256, 56, 56)\nMaxpool3              |   (256, 28, 28)32\nConv9 (256, 512)      |   (512, 28, 28)\nConv10 (512, 512)     |   (512, 28, 28)\nConv11 (512, 512)     |   (512, 28, 28)\nConv12 (512, 512)     |   (512, 28, 28)\nMaxpool4              |   (5

In [3]:
!wget "http://aisdatasets.informatik.uni-freiburg.de/freiburg_groceries_dataset/freiburg_groceries_dataset.tar.gz"

--2025-06-27 08:11:43--  http://aisdatasets.informatik.uni-freiburg.de/freiburg_groceries_dataset/freiburg_groceries_dataset.tar.gz
Resolving aisdatasets.informatik.uni-freiburg.de (aisdatasets.informatik.uni-freiburg.de)... 132.230.105.132
Connecting to aisdatasets.informatik.uni-freiburg.de (aisdatasets.informatik.uni-freiburg.de)|132.230.105.132|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 541562880 (516M) [application/x-gzip]
Saving to: ‘freiburg_groceries_dataset.tar.gz.1’


2025-06-27 08:12:23 (13.0 MB/s) - ‘freiburg_groceries_dataset.tar.gz.1’ saved [541562880/541562880]



In [4]:
!tar -xvf "freiburg_groceries_dataset.tar.gz"

images/
images/BEANS/
images/BEANS/BEANS0000.png
images/BEANS/BEANS0001.png
images/BEANS/BEANS0002.png
images/BEANS/BEANS0003.png
images/BEANS/BEANS0004.png
images/BEANS/BEANS0005.png
images/BEANS/BEANS0006.png
images/BEANS/BEANS0007.png
images/BEANS/BEANS0008.png
images/BEANS/BEANS0009.png
images/BEANS/BEANS0010.png
images/BEANS/BEANS0011.png
images/BEANS/BEANS0012.png
images/BEANS/BEANS0013.png
images/BEANS/BEANS0014.png
images/BEANS/BEANS0015.png
images/BEANS/BEANS0016.png
images/BEANS/BEANS0017.png
images/BEANS/BEANS0018.png
images/BEANS/BEANS0019.png
images/BEANS/BEANS0020.png
images/BEANS/BEANS0021.png
images/BEANS/BEANS0022.png
images/BEANS/BEANS0023.png
images/BEANS/BEANS0024.png
images/BEANS/BEANS0025.png
images/BEANS/BEANS0026.png
images/BEANS/BEANS0027.png
images/BEANS/BEANS0028.png
images/BEANS/BEANS0029.png
images/BEANS/BEANS0030.png
images/BEANS/BEANS0031.png
images/BEANS/BEANS0032.png
images/BEANS/BEANS0033.png
images/BEANS/BEANS0034.png
images/BEANS/BEANS0035.png
images

In [5]:
# Import image paths in a way where we have a dataframe with the structure --> Path|Folder_name (aka label)
import glob
import pandas as pd

In [6]:
glob.glob("images/*/*.png")

['images/CAKE/CAKE0102.png',
 'images/CAKE/CAKE0035.png',
 'images/CAKE/CAKE0140.png',
 'images/CAKE/CAKE0155.png',
 'images/CAKE/CAKE0101.png',
 'images/CAKE/CAKE0116.png',
 'images/CAKE/CAKE0076.png',
 'images/CAKE/CAKE0062.png',
 'images/CAKE/CAKE0031.png',
 'images/CAKE/CAKE0006.png',
 'images/CAKE/CAKE0113.png',
 'images/CAKE/CAKE0134.png',
 'images/CAKE/CAKE0111.png',
 'images/CAKE/CAKE0026.png',
 'images/CAKE/CAKE0053.png',
 'images/CAKE/CAKE0123.png',
 'images/CAKE/CAKE0004.png',
 'images/CAKE/CAKE0096.png',
 'images/CAKE/CAKE0002.png',
 'images/CAKE/CAKE0109.png',
 'images/CAKE/CAKE0033.png',
 'images/CAKE/CAKE0125.png',
 'images/CAKE/CAKE0064.png',
 'images/CAKE/CAKE0057.png',
 'images/CAKE/CAKE0052.png',
 'images/CAKE/CAKE0127.png',
 'images/CAKE/CAKE0104.png',
 'images/CAKE/CAKE0058.png',
 'images/CAKE/CAKE0045.png',
 'images/CAKE/CAKE0107.png',
 'images/CAKE/CAKE0039.png',
 'images/CAKE/CAKE0072.png',
 'images/CAKE/CAKE0150.png',
 'images/CAKE/CAKE0131.png',
 'images/CAKE/

In [7]:
folder_names = glob.glob("images/*")

In [8]:
image_paths = glob.glob("images/*/*")
labels = pd.Series(image_paths)

In [9]:
labels

Unnamed: 0,0
0,images/CAKE/CAKE0102.png
1,images/CAKE/CAKE0035.png
2,images/CAKE/CAKE0140.png
3,images/CAKE/CAKE0155.png
4,images/CAKE/CAKE0101.png
...,...
4942,images/CEREAL/CEREAL0017.png
4943,images/CEREAL/CEREAL0154.png
4944,images/CEREAL/CEREAL0066.png
4945,images/CEREAL/CEREAL0141.png


In [10]:
labels = labels.str.split(pat="/", expand=True)

In [11]:
image_paths = pd.DataFrame(image_paths)
image_paths

Unnamed: 0,0
0,images/CAKE/CAKE0102.png
1,images/CAKE/CAKE0035.png
2,images/CAKE/CAKE0140.png
3,images/CAKE/CAKE0155.png
4,images/CAKE/CAKE0101.png
...,...
4942,images/CEREAL/CEREAL0017.png
4943,images/CEREAL/CEREAL0154.png
4944,images/CEREAL/CEREAL0066.png
4945,images/CEREAL/CEREAL0141.png


In [12]:
images_dict = dict()
images_dict["image_paths"] = image_paths[0].values
images_dict["labels"] = labels[1].values
images_dict

{'image_paths': array(['images/CAKE/CAKE0102.png', 'images/CAKE/CAKE0035.png',
        'images/CAKE/CAKE0140.png', ..., 'images/CEREAL/CEREAL0066.png',
        'images/CEREAL/CEREAL0141.png', 'images/CEREAL/CEREAL0099.png'],
       dtype=object),
 'labels': array(['CAKE', 'CAKE', 'CAKE', ..., 'CEREAL', 'CEREAL', 'CEREAL'],
       dtype=object)}

In [13]:
labels

Unnamed: 0,0,1,2
0,images,CAKE,CAKE0102.png
1,images,CAKE,CAKE0035.png
2,images,CAKE,CAKE0140.png
3,images,CAKE,CAKE0155.png
4,images,CAKE,CAKE0101.png
...,...,...,...
4942,images,CEREAL,CEREAL0017.png
4943,images,CEREAL,CEREAL0154.png
4944,images,CEREAL,CEREAL0066.png
4945,images,CEREAL,CEREAL0141.png


In [14]:
dataset = pd.DataFrame(images_dict)
dataset

Unnamed: 0,image_paths,labels
0,images/CAKE/CAKE0102.png,CAKE
1,images/CAKE/CAKE0035.png,CAKE
2,images/CAKE/CAKE0140.png,CAKE
3,images/CAKE/CAKE0155.png,CAKE
4,images/CAKE/CAKE0101.png,CAKE
...,...,...
4942,images/CEREAL/CEREAL0017.png,CEREAL
4943,images/CEREAL/CEREAL0154.png,CEREAL
4944,images/CEREAL/CEREAL0066.png,CEREAL
4945,images/CEREAL/CEREAL0141.png,CEREAL


In [15]:
len(dataset)


4947

In [25]:
# Torch imports
import torch
import torchvision
from torchvision.transforms import ToTensor, Lambda, Compose, v2
from torch.nn.functional import one_hot


In [17]:
label_encoder = pd.DataFrame(dataset["labels"].unique())
label_encoder

Unnamed: 0,0
0,CAKE
1,NUTS
2,PASTA
3,CORN
4,FISH
5,RICE
6,SUGAR
7,OIL
8,SPICES
9,WATER


In [18]:
label = label_encoder[label_encoder[0] == 'PASTA'].index.values
label.item()

2

In [26]:
class groceries_dataset_class(Dataset):
  def __init__(self, dataframe, transform = None):
    self.dataframe = dataframe
    self.transform = transform

  def __len__(self):
    return len(self.dataframe)

  def __getitem__(self, idx):
    image = torchvision.io.read_image(self.dataframe["image_paths"][idx])
    label = self.dataframe["labels"][idx]
    encoded_label = label_encoder[label_encoder[0] == label].index.values.item()
    if self.transform:
      image = self.transform(image)
    return image, encoded_label


In [28]:
batch_size = 32
transforms = Compose([
#    v2.RandomResizedCrop(size=(227, 227), antialias=True),
#    v2.RandomHorizontalFlip(p=0.5),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
# Train test split
groceries_dataset = groceries_dataset_class(dataset, transform=transforms)
train_data, test_data = torch.utils.data.random_split(groceries_dataset, [0.80, 0.20])
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [29]:
# Test dataset
image, label = groceries_dataset[3]

In [30]:
image.shape

torch.Size([3, 256, 256])

In [31]:
import torch.nn as nn
import torch.nn.functional as F

In [33]:
class VGG(nn.Module):
  def __init__(self):
    super(VGG, self).__init__()
    self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
    self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
    self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
    self.conv4 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
    self.conv5 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
    self.conv6 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
    self.conv9 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
    self.conv10 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
    self.conv13 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
    self.drop = nn.Dropout(p=0.5)
    self.fc17 = nn.Linear(8*8*512, 4096)
    self.fc18 = nn.Linear(4096, 512)
    self.fc19 = nn.Linear(512, 25)

  def forward(self, x):
    # TODO: Add Relu after convs instead of around them
    x = F.relu(self.conv1(x))
    x = F.relu(self.conv2(x))
    x = F.max_pool2d(x, kernel_size=2, stride=2)
    x = F.relu(self.conv3(x))
    x = F.relu(self.conv4(x))
    x = F.max_pool2d(x, kernel_size=2, stride=2)
    x = F.relu(self.conv5(x))
    x = F.relu(self.conv6(x))
    x = F.max_pool2d(x, kernel_size=2, stride=2)
    x = F.relu(self.conv9(x))
    x = F.relu(self.conv10(x))
    x = F.max_pool2d(x, kernel_size=2, stride=2)
    x = F.relu(self.conv13(x))
    x = F.max_pool2d(x, kernel_size=2, stride=2)
    x = x.view(-1, 512*8*8)
    x = F.relu(self.fc17(x))
    x = self.drop(x)
    x = F.relu(self.fc18(x))
    x = self.drop(x)
    x = self.fc19(x)
    return x

In [38]:
model = VGG().cuda()
epochs = 74
loss = nn.CrossEntropyLoss( )
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

In [35]:
def accuracy(y_pred, y_true):
  y_pred_classes = y_pred.argmax(dim=1)
  return (y_pred_classes == y_true).float().mean()

data_iter = iter(train_loader)
image, label = next(data_iter)
output = model(image.cuda()).cpu()
output.shape

torch.Size([32, 25])

In [36]:
for batch in train_loader:
    print(type(batch), len(batch))  # Check if it's a tuple & how many items it has
    print([type(x) for x in batch])  # Print the type of each element
    images, labels = batch
    print(labels)
    print(model(images.cuda()).cpu())
    break  # Just look at one batch
print(labels.min(), labels.max(), labels.dtype)

<class 'list'> 2
[<class 'torch.Tensor'>, <class 'torch.Tensor'>]
tensor([ 0,  0, 15, 17, 21,  7, 14, 19, 10, 19, 11,  2,  0,  2, 11,  7, 23, 23,
         7, 17, 14,  7,  1,  2, 11,  4, 11,  8, 11, 11, 19,  9])
tensor([[-3.5357e-02,  8.0775e-03, -1.1860e-02, -5.4234e-03, -2.9763e-02,
         -4.0553e-02,  1.0167e-02, -4.1349e-02,  2.0385e-02,  6.4444e-03,
          3.7999e-02, -1.6528e-02, -3.3211e-03, -2.3890e-02,  1.9153e-03,
          3.2532e-02,  3.3920e-02,  2.7450e-02, -3.7339e-02, -3.3353e-02,
          2.1387e-02, -1.9618e-02,  8.3177e-03, -1.1510e-02,  3.8524e-02],
        [-3.2162e-02,  6.1534e-03, -5.6494e-03,  4.8104e-03, -4.3092e-02,
         -3.8806e-02,  1.4526e-02, -3.9282e-02,  2.6240e-02,  1.2352e-02,
          3.2778e-02, -1.5933e-02,  1.5344e-02, -2.4395e-02,  4.8971e-03,
          3.2097e-02,  2.7671e-02,  3.3795e-02, -3.5823e-02, -2.4702e-02,
          1.6800e-02, -2.0447e-02,  1.0476e-02, -2.3670e-02,  4.1157e-02],
        [-3.3528e-02,  2.8641e-03, -1.3531e-02,

In [39]:
for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()

        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")
    scheduler.step()

Epoch 1: Test Loss = 3.185888, Train Loss = 3.182579
Epoch 1: Test Accuracy = 0.0568, Train Accuracy = 0.0646
Epoch 2: Test Loss = 3.160492, Train Loss = 3.158033
Epoch 2: Test Accuracy = 0.0568, Train Accuracy = 0.0775
Epoch 3: Test Loss = 3.143083, Train Loss = 3.142206
Epoch 3: Test Accuracy = 0.0729, Train Accuracy = 0.0847
Epoch 4: Test Loss = 3.013596, Train Loss = 3.060113
Epoch 4: Test Accuracy = 0.1265, Train Accuracy = 0.1101
Epoch 5: Test Loss = 2.934595, Train Loss = 2.951020
Epoch 5: Test Accuracy = 0.1438, Train Accuracy = 0.1450
Epoch 6: Test Loss = 2.958338, Train Loss = 2.902854
Epoch 6: Test Accuracy = 0.1347, Train Accuracy = 0.1510
Epoch 7: Test Loss = 2.833401, Train Loss = 2.820630
Epoch 7: Test Accuracy = 0.1539, Train Accuracy = 0.1742
Epoch 8: Test Loss = 2.801867, Train Loss = 2.779908
Epoch 8: Test Accuracy = 0.1662, Train Accuracy = 0.1940
Epoch 9: Test Loss = 2.763957, Train Loss = 2.707998
Epoch 9: Test Accuracy = 0.1769, Train Accuracy = 0.2046
Epoch 10: 

In [None]:
model.eval()
test_loss_val = 0.0
test_acc_val = 0.0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.cuda(), labels.cuda()
        outputs = model(images)
        test_loss_val += loss(outputs, labels).item()
        test_acc_val += accuracy(outputs, labels).item()
        break
test_loss_val /= len(test_loader)
test_acc_val /= len(test_loader)

print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")

Epoch 5: Test Loss = 0.103360, Train Loss = 63.410877
Epoch 5: Test Accuracy = 0.0000, Train Accuracy = 1.2500


In [None]:
outputs

tensor([[ 0.0532, -0.4306, -0.3060, -0.5212,  0.3319,  0.1748,  0.3581,  0.1234,
          0.3313, -0.1901, -0.1837, -0.2378, -0.5114, -0.1212, -0.0268,  0.3273,
         -0.1978,  0.2822, -0.4360, -0.1626, -0.1147, -0.1785, -0.2443,  0.5137,
         -0.0729],
        [ 0.0532, -0.4306, -0.3060, -0.5212,  0.3319,  0.1748,  0.3581,  0.1234,
          0.3313, -0.1901, -0.1837, -0.2378, -0.5114, -0.1212, -0.0268,  0.3273,
         -0.1978,  0.2822, -0.4360, -0.1626, -0.1147, -0.1785, -0.2443,  0.5137,
         -0.0729],
        [ 0.0532, -0.4306, -0.3060, -0.5212,  0.3319,  0.1748,  0.3581,  0.1234,
          0.3313, -0.1901, -0.1837, -0.2378, -0.5114, -0.1212, -0.0268,  0.3273,
         -0.1978,  0.2822, -0.4360, -0.1626, -0.1147, -0.1785, -0.2443,  0.5137,
         -0.0729],
        [ 0.0532, -0.4306, -0.3060, -0.5212,  0.3319,  0.1748,  0.3581,  0.1234,
          0.3313, -0.1901, -0.1837, -0.2378, -0.5114, -0.1212, -0.0268,  0.3273,
         -0.1978,  0.2822, -0.4360, -0.1626, -0.1147

In [None]:
labels

tensor([20, 13,  6, 24, 24, 22, 24, 22,  6,  7, 17, 14,  3, 18, 17,  7, 11,  0,
         8, 22,  0,  5, 24, 15,  6, 10,  6, 19, 11,  0, 11,  0],
       device='cuda:0')

In [None]:
y_pred_classes = outputs.argmax(dim=1)
(y_pred_classes == labels).float().mean()

tensor(0., device='cuda:0')

In [None]:
y_pred_classes

tensor([23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
        23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23],
       device='cuda:0')