# Assignment 04: Use pretrained ResNet50 from Hugging 

In [5]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tqdm import tqdm
from transformers import AutoImageProcessor, AutoModelForImageClassification

2025-11-06 09:32:05.058588: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762421525.279506      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762421525.341918      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [1]:
class VinaFoodDataLoader:
    def __init__(self, batch_size=64, num_workers=2, data_dir='/kaggle/input/vinafood21/VinaFood21', image_processor=None):
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.data_dir = data_dir

        if image_processor is None:
            raise ValueError("image_processor must be provided for VinaFoodDataLoader in fine-tuning setup")

        self.image_processor = image_processor
        self.transform_train = transforms.Compose([
            transforms.RandomResizedCrop(image_processor.size["shortest_edge"]),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(image_processor.image_mean, image_processor.image_std)
        ])

        self.transform_test = transforms.Compose([
            transforms.Resize(image_processor.size["shortest_edge"]),
            transforms.CenterCrop(image_processor.size["shortest_edge"]),
            transforms.ToTensor(),
            transforms.Normalize(image_processor.image_mean, image_processor.image_std)
        ])

    def get_train_loader(self):
        train_path = os.path.join(self.data_dir, 'train')
        train_dataset = datasets.ImageFolder(root=train_path, transform=self.transform_train)
        train_loader = DataLoader(dataset=train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)
        print(f"Loaded {len(train_dataset)} training samples from {train_path}. Found {len(train_dataset.classes)} classes")
        self.num_classes = len(train_dataset.classes)
        return train_loader

        # Mỗi thư mục con tương ứng với 1 nhãn (class) -> gán label cho từng ảnh dựa theo thứ tự thư mục con được sắp xếp alphabetically

    def get_test_loader(self):
        test_path = os.path.join(self.data_dir, 'test')
        test_dataset = datasets.ImageFolder(root=test_path, transform=self.transform_test)
        test_loader = DataLoader(dataset=test_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
        print(f"Loaded {len(test_dataset)} test samples from {test_path}")
        return test_loader


In [None]:
def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for data, target in tqdm(data_loader, desc="Evaluating"):
            data, target = data.to(device), target.to(device)
            output = model(data)
            logits  = output.logits
            probabilities = torch.softmax(logits, dim=1)

            predictions = probabilities.argmax(dim=1)
            all_preds.extend(predictions.cpu().numpy())
            all_targets.extend(target.cpu().numpy())

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)
    print("Overall evaluation metrics")
    overall_accuracy = accuracy_score(all_targets, all_preds)
    print(f"Accuracy: {overall_accuracy:.4f}")

    overall_recall = recall_score(all_targets, all_preds, average='macro', zero_division=0)
    print(f"Recall: {overall_recall:.4f}")

    overall_precision = precision_score(all_targets, all_preds, average='macro', zero_division=0)
    print(f"Precision: {overall_accuracy:.4f}")

    overall_f1 = f1_score(all_targets, all_preds, average='macro', zero_division=0)
    print(f"F1: {overall_accuracy:.4f}")

    num_classes = len(np.unique(all_targets))
    per_class_results = {}

    # print(f"Per-class evaluation metrics")
    # for i in range (num_classes):
    #     class_target = (all_targets == i).astype(int)
    #     class_pred = (all_preds == i).astype(int)

    #     accuracy = accuracy_score(class_target, class_pred)
    #     precision = precision_score(class_target, class_pred, zero_division=0)
    #     recall = recall_score(class_target, class_pred, zero_division=0)
    #     f1 = f1_score(class_target, class_pred, zero_division=0)

    #     per_class_results[i] = {
    #         'accuracy': accuracy,
    #         'precision': precision,
    #         'recall': recall,
    #         'f1': f1
    #     }
    #     print(f"Class {i}: \n Accuracy: {accuracy:.4f} \n Recall: {recall:.4f} \n Precision: {precision:.4f} \n F1: {f1:.4f}")
    return {
        'overall': {
            'accuracy': overall_accuracy,
            'precision': overall_precision,
            'recall': overall_recall,
            'f1': overall_f1
        }
        # 'per_class': per_class_results
    }


In [16]:
class Trainer:
    def __init__(self, model, train_loader, test_loader, device, learning_rate=0.01, epochs=5, save_dir='/kaggle/working/checkpoints/'):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.device = device
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.save_dir = save_dir

        self.best_accuracy = 0.0

        self.criterion = nn.CrossEntropyLoss()

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        os.makedirs(self.save_dir, exist_ok=True)

    def train_epoch(self, epoch):
        self.model.train()
        total_loss = 0
        for batch_idx, (data, target) in enumerate(tqdm(self.train_loader, desc=f'Training {epoch}/{self.epochs}')):
            data, target = data.to(self.device), target.to(self.device)

            self.optimizer.zero_grad()
            outputs = self.model(pixel_values=data)
            logits = outputs.logits
            loss = self.criterion(logits, target)
            loss.backward()
            self.optimizer.step()

            total_loss += loss

        avg_loss = total_loss/len(self.train_loader)
        print(f"Epoch {epoch} Training loss: {avg_loss:.4f}")
        return avg_loss

    def train(self):
        print(f"Full training on {self.device} for {self.epochs} with {self.learning_rate}")
        for epoch in range(1, self.epochs + 1):
            self.train_epoch(epoch)

            metrics = evaluate_model(self.model, self.test_loader, self.device)
            current_accuracy = metrics['overall']['accuracy']
            if current_accuracy > self.best_accuracy:
                self.best_accuracy = current_accuracy
                model_path = os.path.join(self.save_dir, 'best_model_assigment_02.pt')
                torch.save(self.model.state_dict(), model_path)
                print(f"Save new model version with accurcay = {self.best_accuracy:.4f}")
        print("Training_finished")
        print(f"Best model version is saved at {model_path} with accuracy = {self.best_accuracy}")

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

Using cuda


In [7]:
model_name = "microsoft/resnet-50"
print("Load Image Processor")
image_processor = AutoImageProcessor.from_pretrained(model_name)


Load Image Processor


preprocessor_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [8]:
print("Load data")
data_loader = VinaFoodDataLoader(image_processor = image_processor)
train_loader = data_loader.get_train_loader()
test_loader = data_loader.get_test_loader()

num_classes = data_loader.num_classes
print(f"Number of classes: {num_classes}")


Load data
Loaded 10044 training samples from /kaggle/input/vinafood21/VinaFood21/train. Found 21 classes
Loaded 6682 test samples from /kaggle/input/vinafood21/VinaFood21/test
Number of classes: 21


In [10]:
print("Build model")
model = AutoModelForImageClassification.from_pretrained(model_name)

print(model.classifier)

 # --- Điều chỉnh lớp phân loại cuối cùng ---
    # ResNet50 trong Hugging Face có lớp phân loại cuối cùng là `classifier`.
    # Kích thước đầu vào của lớp này thường là 2048 (đầu ra của Global Average Pooling).
    # --> Thay thế nó bằng một lớp tuyến tính mới có số đầu ra bằng `num_classes`.

num_features = model.classifier[1].in_features
model.classifier[1] = nn.Linear(num_features, num_classes)
model.to(device)
print(model)

Build model


config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=2048, out_features=1000, bias=True)
)
ResNetForImageClassification(
  (resnet): ResNetModel(
    (embedder): ResNetEmbeddings(
      (embedder): ResNetConvLayer(
        (convolution): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        (normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (activation): ReLU()
      )
      (pooler): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    )
    (encoder): ResNetEncoder(
      (stages): ModuleList(
        (0): ResNetStage(
          (layers): Sequential(
            (0): ResNetBottleNeckLayer(
              (shortcut): ResNetShortCut(
                (convolution): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )

In [11]:
print(model.classifier[1])

Linear(in_features=2048, out_features=21, bias=True)


In [None]:
print("Start training")
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    device=device,
    learning_rate=0.01,
    epochs=10,
    save_dir= '/kaggle/working/checkpoints/'
)
trainer.train()


Start training
Full training on cuda for 3 with 0.01


Training 1/3: 100%|██████████| 157/157 [01:58<00:00,  1.33it/s]


Epoch 1 Training loss: 2.3558


Evaluating: 100%|██████████| 105/105 [01:25<00:00,  1.23it/s]


Overall evaluation metrics
Accuracy: 0.2547
Recall: 0.2379
Precision: 0.2547
F1: 0.2547
Save new model version with accurcay = 0.2547


Training 2/3: 100%|██████████| 157/157 [01:54<00:00,  1.37it/s]


Epoch 2 Training loss: 1.8851


Evaluating: 100%|██████████| 105/105 [01:02<00:00,  1.69it/s]


Overall evaluation metrics
Accuracy: 0.3351
Recall: 0.3186
Precision: 0.3351
F1: 0.3351
Save new model version with accurcay = 0.3351


Training 3/3: 100%|██████████| 157/157 [01:54<00:00,  1.37it/s]


Epoch 3 Training loss: 1.5969


Evaluating: 100%|██████████| 105/105 [01:02<00:00,  1.68it/s]

Overall evaluation metrics
Accuracy: 0.3340
Recall: 0.3142
Precision: 0.3340
F1: 0.3340
Training_finished
Best model version is saved at /kaggle/working/checkpoints/best_model_assigment_02.pt with accuracy = 0.3350793175695899



