In [1]:
from preprocessing.preprocessing import image_scaling, convert_image, label_conversion
import torch
import torchinfo
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import DataLoader, Dataset
import os
import json
import gc
from PIL import Image
import numpy as np
import timm
import pandas as pd

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# --- Tunable constants -------------------------------------------------------
image_folder = 'images'
BATCH_SIZE = 32
IMG_FEATURES = 512

# --- Food labels -------------------------------------------------------------
# The keys of the calorie database are used as the food labels.
CALORIE_DB_FILE = os.path.join('preprocessing', "calories_database.json")
with open(CALORIE_DB_FILE, "r") as f:
    calorie_db = json.load(f)

# Sorted so that every label keeps a stable index between runs.
FOOD_LABELS = sorted(calorie_db.keys())
NUM_CLASSES = len(FOOD_LABELS)

In [3]:
def _pretrained_backbone(arch):
    """Create a pretrained timm model for ``arch`` with ``num_classes=IMG_FEATURES``."""
    return timm.create_model(arch, pretrained=True, num_classes=IMG_FEATURES)


# ResNet backbones.
resnet18 = _pretrained_backbone('resnet18')
resnet26 = _pretrained_backbone('resnet26')
resnet34 = _pretrained_backbone('resnet34')
resnet50 = _pretrained_backbone('resnet50')

# ViT backbones (384x384 variants).
vittiny = _pretrained_backbone('vit_tiny_patch16_384')
vitsmall16 = _pretrained_backbone('vit_small_patch16_384')
vitsmall32 = _pretrained_backbone('vit_small_patch32_384')

# Currently disabled architectures: resnet101, resnet200,
# vit_base_patch16_384, vit_base_patch32_384,
# vit_large_patch16_384, vit_large_patch32_384.

# Display name -> backbone, in the order the experiments iterate over them.
resnet_models = dict([
    ('ResNet18', resnet18),
    ('ResNet26', resnet26),
    ('ResNet34', resnet34),
    ('ResNet50', resnet50),
])

vit_models = dict([
    ('ViT-Tiny', vittiny),
    ('ViT-Small 16', vitsmall16),
    ('ViT-Small 32', vitsmall32),
])

# print(timm.list_models(pretrained=True))

# torchinfo.summary(vitsmall16)
# torchinfo.summary(vitbase16)

In [4]:
# Define Custom Dataset
class FoodPortionDataset(Dataset):
    """Dataset of food images paired with per-class food and portion vectors.

    The JSON file at ``json_path`` is loaded once; each record is read for the
    keys ``"name"`` (image file stem), ``"food type"`` and ``"portion"``
    (parallel sequences). Images are read from ``img_dir`` as ``<name>.png``.

    Args:
        json_path: Path to the JSON file describing the samples.
        img_dir: Directory that contains the ``.png`` images.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, json_path, img_dir, transform=None):
        with open(json_path, "r") as f:
            self.data = json.load(f)
        self.img_dir = img_dir
        self.transform = transform
        # Label -> index map built once, so __getitem__ does one dict lookup
        # per food instead of two linear scans of FOOD_LABELS.
        # setdefault keeps the first index, matching list.index().
        self._label_to_idx = {}
        for label_idx, label in enumerate(FOOD_LABELS):
            self._label_to_idx.setdefault(label, label_idx)

    def __len__(self):
        """Return the number of records loaded from the JSON file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, food_vector, portion_vector)`` for record ``idx``.

        ``food_vector`` has a 1 at the index of every listed food that is a
        known label; ``portion_vector`` holds that food's portion at the same
        index. Foods that are not in the label set are skipped.
        """
        item = self.data[idx]
        img_path = os.path.join(self.img_dir, item["name"] + ".png")
        # Close the file deterministically; convert() returns an independent
        # image that stays valid after the source file is closed.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")

        # Multi-hot food indicator and the matching portion sizes
        food_vector = torch.zeros(NUM_CLASSES)
        portion_vector = torch.zeros(NUM_CLASSES)

        for food, portion in zip(item["food type"], item["portion"]):
            food_idx = self._label_to_idx.get(food)
            if food_idx is None:
                continue  # Unknown label: leave both vectors untouched
            food_vector[food_idx] = 1  # Detected food category
            portion_vector[food_idx] = float(portion)  # Ground truth portion size

        if self.transform:
            image = self.transform(image)

        return image, food_vector, portion_vector  # Return image, detected foods, and portion sizes

In [5]:
# Define Data Transforms
def _make_transform(side):
    """Resize to ``side`` x ``side``, convert to a tensor, normalize with mean/std 0.5."""
    return transforms.Compose([
        transforms.Resize((side, side)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])


def _make_dataset(json_name, image_transform):
    """Build a FoodPortionDataset for one JSON split using the resized images."""
    return FoodPortionDataset(
        json_path=os.path.join('preprocessing', json_name),
        img_dir="images_resized",
        transform=image_transform,
    )


def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader with the notebook-wide batch size."""
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)


# One pipeline per input resolution (400x400 and 384x384).
transform = _make_transform(400)
transform384 = _make_transform(384)

# Load Datasets
train_dataset = _make_dataset("train.json", transform)
test_dataset = _make_dataset("test.json", transform)
train_dataset384 = _make_dataset("train.json", transform384)
test_dataset384 = _make_dataset("test.json", transform384)

# Only the training loaders shuffle.
train_loader = _make_loader(train_dataset, shuffle=True)
test_loader = _make_loader(test_dataset, shuffle=False)
train_loader384 = _make_loader(train_dataset384, shuffle=True)
test_loader384 = _make_loader(test_dataset384, shuffle=False)

# For efficiency, load all data into device so it can be reused
# train_data = [data.to(device, non_blocking=True) for data, _ in train_loader]
# train_label = [label.to(device, non_blocking=True) for _, label in train_loader]
# test_data = [data.to(device, non_blocking=True) for data, _ in test_loader]
# test_label = [label.to(device, non_blocking=True) for _, label in test_loader]
#data.to(device, non_blocking=True)

In [6]:
class CustomModel(nn.Module):
    """Predict per-class portion sizes from an image and a food indicator vector.

    The backbone output is concatenated with a 64-dimensional embedding of the
    food vector and passed through a small MLP. The final ReLU means every
    predicted portion is non-negative.

    Args:
        model: Backbone called as ``model(img)``; it must return a tensor of
            shape ``(batch, img_features)``.
        num_classes: Length of the food vector and of the predicted portion
            vector.
        img_features: Width of the backbone output. Defaults to 512, the value
            that was previously hard-coded.
    """

    def __init__(self, model, num_classes, img_features=512):
        super().__init__()
        self.model = model
        self.fc_food = nn.Linear(num_classes, 64)
        self.fc_combine = nn.Sequential(
            nn.Linear(img_features + 64, 128),  # Image features + food class vector
            nn.ReLU(),
            nn.Linear(128, num_classes),  # Predict portion sizes for each food type
            nn.ReLU()
        )

    def forward(self, img, food_vec):
        """Return the predicted portions, shape ``(batch, num_classes)``."""
        model_out = self.model(img)
        food_vec_embed = self.fc_food(food_vec)
        # Join image features and food embedding along the feature axis.
        concat = torch.cat((model_out, food_vec_embed), dim=1)
        return self.fc_combine(concat)

In [7]:
def train(model, trainloader, optimizer, criterion, device, epochs=10):
    """Train ``model`` on ``trainloader`` and return the lowest mean epoch loss.

    Args:
        model: Module called as ``model(images, labels)``.
        trainloader: Iterable yielding ``(images, labels, portions)`` batches.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss called as ``criterion(out, portions)``.
        device: Device every batch is moved to before the forward pass.
        epochs: Number of passes over ``trainloader``.

    Returns:
        The smallest per-epoch mean batch loss (``np.inf`` when ``epochs`` is 0).

    Raises:
        ValueError: If ``trainloader`` yields no batches.
    """
    min_loss = np.inf
    for epoch in range(epochs):
        running_loss = 0
        num_batches = 0
        model.train()
        for images, labels, portions in trainloader:
            images, labels, portions = images.to(device), labels.to(device).float(), portions.to(device).float()
            optimizer.zero_grad()
            out = model(images, labels)
            loss = criterion(out, portions)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

            # delete from memory
            del images
            del labels
            torch.cuda.empty_cache()
            gc.collect()
        if num_batches == 0:
            raise ValueError("trainloader yielded no batches")
        # Average over the batches of the loader that was actually passed in,
        # not over a notebook-level global loader.
        epoch_loss = running_loss / num_batches
        if epoch_loss < min_loss:
            min_loss = epoch_loss
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}")
    return min_loss

In [8]:
def evaluate(model, testloader, device):
    """Return the sample-weighted mean MSE of ``model`` over ``testloader``.

    Each batch is moved to ``device``, the targets and predictions are printed,
    and the batch MSE is weighted by the batch size so the result is a
    per-sample average over everything the loader yields.
    """
    mse = nn.MSELoss()
    weighted_error_sum = 0
    samples_seen = 0

    model.eval()  # Evaluation mode for the whole pass
    with torch.no_grad():
        for images, labels, portions in testloader:
            images = images.to(device)
            labels = labels.to(device).float()
            portions = portions.to(device).float()

            predictions = model(images, labels)
            print(f"real out: {portions}")
            print(f"model out: {predictions}")

            batch_size = images.size(0)
            batch_mse = mse(predictions, portions).item()
            weighted_error_sum += batch_mse * batch_size
            samples_seen += batch_size

            # Release the batch before loading the next one.
            del images, labels
            torch.cuda.empty_cache()
            gc.collect()

    return weighted_error_sum / samples_seen

In [9]:
# Sanity check: wrap the smallest backbone and list its layers and parameter counts.
test_model = CustomModel(model=resnet18, num_classes=NUM_CLASSES)
torchinfo.summary(test_model)

Layer (type:depth-idx)                        Param #
CustomModel                                   --
├─ResNet: 1-1                                 --
│    └─Conv2d: 2-1                            9,408
│    └─BatchNorm2d: 2-2                       128
│    └─ReLU: 2-3                              --
│    └─MaxPool2d: 2-4                         --
│    └─Sequential: 2-5                        --
│    │    └─BasicBlock: 3-1                   73,984
│    │    └─BasicBlock: 3-2                   73,984
│    └─Sequential: 2-6                        --
│    │    └─BasicBlock: 3-3                   230,144
│    │    └─BasicBlock: 3-4                   295,424
│    └─Sequential: 2-7                        --
│    │    └─BasicBlock: 3-5                   919,040
│    │    └─BasicBlock: 3-6                   1,180,672
│    └─Sequential: 2-8                        --
│    │    └─BasicBlock: 3-7                   3,673,088
│    │    └─BasicBlock: 3-8                   4,720,640
│    └─SelectAda

In [10]:
results = []

# Each group pairs a dict of backbones with the train/test loaders it is run
# on; the ViT group uses the 384x384 loaders.
experiment_groups = [
    (resnet_models, train_loader, test_loader),
    (vit_models, train_loader384, test_loader384),
]

for backbones, group_train_loader, group_test_loader in experiment_groups:
    for model_name, backbone in backbones.items():
        model = CustomModel(backbone, NUM_CLASSES).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
        print(f"Training model {model_name}")
        train_loss = train(model, group_train_loader, optimizer, criterion, device, epochs=25)
        print("Training complete")
        avg_loss = evaluate(model, group_test_loader, device)
        print(f"{model_name} Average Loss: {avg_loss}")
        results.append({
            'model name': model_name,
            'best training loss': train_loss,
            'test avg loss': avg_loss
        })
        # Module.to() moves parameters in place, so the backbone that is still
        # referenced from the model dict would otherwise stay on the GPU after
        # `del model`. Move it back and drop the optimizer state before
        # emptying the cache.
        model.to("cpu")
        del model, optimizer
        torch.cuda.empty_cache()
        gc.collect()

results_df = pd.DataFrame(results)
results_df

Training model ResNet18
Epoch 1/25, Loss: 654.4452623639788
Epoch 2/25, Loss: 562.381723676409
Epoch 3/25, Loss: 559.8713171822684
Epoch 4/25, Loss: 544.2362365722656
Epoch 5/25, Loss: 540.6689818246024
Epoch 6/25, Loss: 481.3222759791783
Epoch 7/25, Loss: 446.10514068603516
Epoch 8/25, Loss: 422.67871802193775
Epoch 9/25, Loss: 391.6313798086984
Epoch 10/25, Loss: 374.5128786904471
Epoch 11/25, Loss: 371.56159700666154
Epoch 12/25, Loss: 360.18154689243863
Epoch 13/25, Loss: 337.8273296356201
Epoch 14/25, Loss: 326.8032896859305
Epoch 15/25, Loss: 317.01980100359236
Epoch 16/25, Loss: 319.7337728227888
Epoch 17/25, Loss: 306.4415702819824
Epoch 18/25, Loss: 303.7560795375279
Epoch 19/25, Loss: 288.875757762364
Epoch 20/25, Loss: 288.1258716583252
Epoch 21/25, Loss: 281.8733959197998
Epoch 22/25, Loss: 271.46838869367326
Epoch 23/25, Loss: 276.43562752859935
Epoch 24/25, Loss: 285.4316210065569
Epoch 25/25, Loss: 273.2831666128976
Training complete
real out: tensor([[  0.,   0.,   0., 

Unnamed: 0,model name,best training loss,test avg loss
0,ResNet18,271.468389,249.116926
1,ResNet26,168.743534,202.302115
2,ResNet34,235.151948,179.789096
3,ResNet50,200.245192,192.867321
4,ViT-Tiny,290.509898,295.151091
5,ViT-Small 16,327.386796,315.691064
6,ViT-Small 32,265.510029,266.850095
