## 1. Import các thư viện cần thiết

In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, datasets
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import shutil
from torchvision.datasets import ImageFolder
from collections import Counter


## 2. Định nghĩa lớp và Tải dữ liệu

In [2]:
# Lớp AIDDataset tùy chỉnh
class AIDDataset(Dataset):
    def __init__(self, image_paths, labels, classes, class_to_idx, transform=None):
        """
        image_paths: Danh sách đường dẫn đến ảnh
        labels: Danh sách nhãn tương ứng
        classes: Danh sách tên các lớp
        class_to_idx: Ánh xạ từ tên lớp sang chỉ số
        transform: Các phép biến đổi (transforms) áp dụng lên ảnh
        """
        self.image_paths = image_paths
        self.labels = labels
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

In [3]:
# Hàm để tải toàn bộ dữ liệu từ thư mục, giống
def load_aid_data(root_dir):
    classes = sorted([d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))])
    class_to_idx = {cls_name: idx for idx, cls_name in enumerate(classes)}
    image_paths, labels = [], []
    image_files = [[] for _ in range(len(classes))]

    for cls_idx, cls_name in enumerate(classes):
        cls_dir = os.path.join(root_dir, cls_name)
        cls_images = [os.path.join(cls_dir, img_name) for img_name in os.listdir(cls_dir)
                     if img_name.lower().endswith(('.jpg', '.jpeg', '.png'))]

        image_files[cls_idx] = cls_images
        image_paths.extend(cls_images)
        labels.extend([class_to_idx[cls_name]] * len(cls_images))

    return image_paths, labels, classes, class_to_idx, image_files

# Tải dữ liệu
data_dir = '/kaggle/input/aid-scene-classification-datasets/AID'
image_paths, labels, class_names, class_to_idx, image_files = load_aid_data(data_dir)

# In thông tin dataset
image_width, image_height = Image.open(image_paths[0]).size
print('Total image count:', len(image_paths))
print("Image dimensions:", image_width, "x", image_height)
print("Label names:", class_names)
print("Label counts:", [len(image_files[i]) for i in range(len(class_names))])

Total image count: 10000
Image dimensions: 600 x 600
Label names: ['Airport', 'BareLand', 'BaseballField', 'Beach', 'Bridge', 'Center', 'Church', 'Commercial', 'DenseResidential', 'Desert', 'Farmland', 'Forest', 'Industrial', 'Meadow', 'MediumResidential', 'Mountain', 'Park', 'Parking', 'Playground', 'Pond', 'Port', 'RailwayStation', 'Resort', 'River', 'School', 'SparseResidential', 'Square', 'Stadium', 'StorageTanks', 'Viaduct']
Label counts: [360, 310, 220, 400, 360, 260, 240, 350, 410, 300, 370, 250, 390, 280, 290, 340, 350, 390, 370, 420, 380, 260, 290, 410, 300, 300, 330, 290, 360, 420]


## 3. Chia tập dữ liệu

In [4]:
# Đường dẫn dataset gốc và thư mục lưu kết quả
source_dataset_path = '/kaggle/input/aid-scene-classification-datasets/AID'
output_base_path = '/kaggle/working/split_aid_dataset'

# Tạo các thư mục train, val, test nếu chưa có
train_dir = os.path.join(output_base_path, 'train')
val_dir = os.path.join(output_base_path, 'val')
test_dir = os.path.join(output_base_path, 'test')

for directory in [train_dir, val_dir, test_dir]:
    os.makedirs(directory, exist_ok=True)

# Tải dataset
dataset = ImageFolder(source_dataset_path)
image_paths = [item[0] for item in dataset.samples]
labels = [item[1] for item in dataset.samples]
class_names = dataset.classes

# Phân loại ảnh theo từng lớp
class_images = {class_idx: [] for class_idx in range(len(class_names))}
for img_path, label in zip(image_paths, labels):
    class_images[label].append(img_path)

# Chia đều từng lớp theo tỷ lệ 6:2:2
train_paths, val_paths, test_paths = [], [], []
train_labels, val_labels, test_labels = [], [], []

for class_idx, class_name in enumerate(class_names):
    images = class_images[class_idx]
    total_images = len(images)

    train_size = int(total_images * 0.6)
    val_size = int(total_images * 0.2)
    test_size = total_images - train_size - val_size

    # Điều chỉnh để tránh sai lệch do làm tròn
    if train_size + val_size + test_size > total_images:
        diff = train_size + val_size + test_size - total_images
        if val_size > diff:
            val_size -= diff
        else:
            test_size -= diff
    elif train_size + val_size + test_size < total_images:
        diff = total_images - (train_size + val_size + test_size)
        test_size += diff

    train_split = images[:train_size]
    val_split = images[train_size:train_size + val_size]
    test_split = images[train_size + val_size:]

    train_paths.extend(train_split)
    val_paths.extend(val_split)
    test_paths.extend(test_split)
    train_labels.extend([class_idx] * len(train_split))
    val_labels.extend([class_idx] * len(val_split))
    test_labels.extend([class_idx] * len(test_split))

# Sao chép ảnh vào thư mục đích với cấu trúc thư mục con theo lớp
def copy_images(image_paths, labels, dest_dir, class_names):
    for class_name in class_names:
        class_dir = os.path.join(dest_dir, class_name)
        os.makedirs(class_dir, exist_ok=True)

    for img_path, label in zip(image_paths, labels):
        class_name = class_names[label]
        dest_path = os.path.join(dest_dir, class_name, os.path.basename(img_path))
        shutil.copy(img_path, dest_path)

copy_images(train_paths, train_labels, train_dir, class_names)
copy_images(val_paths, val_labels, val_dir, class_names)
copy_images(test_paths, test_labels, test_dir, class_names)

# Phân phối lớp trong các tập
def print_class_distribution(labels, dataset_name):
    counts = Counter(labels)
    print(f"\nPhân phối lớp trong tập {dataset_name}:")
    for class_idx, count in counts.items():
        print(f"  - {class_names[class_idx]}: {count} ảnh")

print_class_distribution(train_labels, "Train")
print_class_distribution(val_labels, "Validation")
print_class_distribution(test_labels, "Test")

# Tổng số ảnh trong mỗi tập
print(f"\nTổng số ảnh trong tập Train: {len(train_paths)}")
print(f"Tổng số ảnh trong tập Validation: {len(val_paths)}")
print(f"Tổng số ảnh trong tập Test: {len(test_paths)}")


Phân phối lớp trong tập Train:
  - Airport: 216 ảnh
  - BareLand: 186 ảnh
  - BaseballField: 132 ảnh
  - Beach: 240 ảnh
  - Bridge: 216 ảnh
  - Center: 156 ảnh
  - Church: 144 ảnh
  - Commercial: 210 ảnh
  - DenseResidential: 246 ảnh
  - Desert: 180 ảnh
  - Farmland: 222 ảnh
  - Forest: 150 ảnh
  - Industrial: 234 ảnh
  - Meadow: 168 ảnh
  - MediumResidential: 174 ảnh
  - Mountain: 204 ảnh
  - Park: 210 ảnh
  - Parking: 234 ảnh
  - Playground: 222 ảnh
  - Pond: 252 ảnh
  - Port: 228 ảnh
  - RailwayStation: 156 ảnh
  - Resort: 174 ảnh
  - River: 246 ảnh
  - School: 180 ảnh
  - SparseResidential: 180 ảnh
  - Square: 198 ảnh
  - Stadium: 174 ảnh
  - StorageTanks: 216 ảnh
  - Viaduct: 252 ảnh

Phân phối lớp trong tập Validation:
  - Airport: 72 ảnh
  - BareLand: 62 ảnh
  - BaseballField: 44 ảnh
  - Beach: 80 ảnh
  - Bridge: 72 ảnh
  - Center: 52 ảnh
  - Church: 48 ảnh
  - Commercial: 70 ảnh
  - DenseResidential: 82 ảnh
  - Desert: 60 ảnh
  - Farmland: 74 ảnh
  - Forest: 50 ảnh
  - Industr

## 4. Biến đổi dữ liệu

In [5]:
base_dir = '/kaggle/working/split_aid_dataset'

train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomResizedCrop((224, 224), scale=(0.8, 1.0)),
    transforms.ToTensor(),
    # transforms.RandomErasing(p=0.25, scale=(0.02, 0.33), ratio=(0.3, 3.3))
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

val_test_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [6]:
def load_dataset(root_dir, transform):
    if not os.path.exists(root_dir):
        raise FileNotFoundError(f"{root_dir} không tồn tại.")
    return datasets.ImageFolder(root=root_dir, transform=transform)

train_dataset = load_dataset(os.path.join(base_dir, 'train'), train_transforms)
val_dataset = load_dataset(os.path.join(base_dir, 'val'), val_test_transforms)
test_dataset = load_dataset(os.path.join(base_dir, 'test'), val_test_transforms)

batch_size = 32
num_workers = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

## 5. Khai báo mô hình

Sử dụng code có sẵn trên github để tiện việc sử dụng

In [7]:
!apt-get install -y git
!git clone https://github.com/microsoft/Cream.git
%cd Cream/EfficientViT/classification

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 87 not upgraded.
fatal: destination path 'Cream' already exists and is not an empty directory.
/kaggle/working/Cream/EfficientViT/classification


In [8]:
import sys
sys.path.append("/kaggle/working/Cream/EfficientViT/classification")

In [9]:
!pip install timm



Chỉnh sửa 1 vài lỗi trong file code đã clone để cho phù hợp với cách thực hiện

In [10]:
with open('/kaggle/working/Cream/EfficientViT/classification/engine.py', 'w') as f:
    f.write('''import torch
import torch.distributed as dist
from timm.utils import accuracy
from utils import MetricLogger, SmoothedValue
from losses import DistillationLoss

def train_one_epoch(model, criterion, data_loader, optimizer, device, epoch, loss_scaler, clip_grad, clip_mode, model_ema=None, mixup_fn=None, set_training_mode=True, set_bn_eval=False):
    model.train(set_training_mode)
    if set_bn_eval:
        for m in model.modules():
            if isinstance(m, torch.nn.BatchNorm2d):
                m.eval()

    metric_logger = MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter('loss', SmoothedValue(window_size=1, fmt='{value:.4f}'))
    header = f'Epoch: [{epoch}]'

    for batch_idx, (samples, targets) in enumerate(metric_logger.log_every(data_loader, 10, header)):
        samples = samples.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        if mixup_fn is not None:
            samples, targets = mixup_fn(samples, targets)

        with torch.cuda.amp.autocast():
            outputs = model(samples)
            if isinstance(criterion, DistillationLoss):
                loss = criterion(samples, outputs, targets)
            else:
                loss = criterion(outputs, targets)

        loss_value = loss.item()

        if not torch.isfinite(loss):
            print(f"Loss is {loss_value}, stopping training")
            return

        optimizer.zero_grad()
        # Use NativeScaler for mixed precision (timm==0.5.4)
        loss_scaler(
            loss,
            optimizer,
            clip_grad=clip_grad,
            clip_mode=clip_mode,
            parameters=model.parameters(),
            create_graph=False
        )

        if model_ema is not None:
            model_ema.update(model)

        torch.cuda.synchronize()
        metric_logger.update(loss=loss_value)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    metric_logger.synchronize_between_processes()
    print(f"Average stats: {metric_logger}")
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}

def evaluate(data_loader, model, device):
    model.eval()
    metric_logger = MetricLogger(delimiter="  ")
    metric_logger.add_meter('acc1', SmoothedValue(window_size=1, fmt='{value:.3f}'))
    metric_logger.add_meter('acc5', SmoothedValue(window_size=1, fmt='{value:.3f}'))

    with torch.no_grad():
        for samples, targets in metric_logger.log_every(data_loader, 10, 'Evaluation'):
            samples = samples.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            outputs = model(samples)
            acc1, acc5 = accuracy(outputs, targets, topk=(1, 5))

            batch_size = samples.shape[0]
            metric_logger.update(acc1=acc1.item(), n=batch_size)
            metric_logger.update(acc5=acc5.item(), n=batch_size)

    metric_logger.synchronize_between_processes()
    print(f" * Acc@1 {metric_logger.acc1.global_avg:.3f} Acc@5 {metric_logger.acc5.global_avg:.3f}")
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
''')

In [None]:
import datetime
import json
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import time
import os
from pathlib import Path
from types import SimpleNamespace
from timm.models import create_model
from timm.loss import LabelSmoothingCrossEntropy
from timm.optim import create_optimizer
from timm.scheduler import create_scheduler
from timm.utils import NativeScaler, get_state_dict
from model import build  # Assuming from EfficientViT repo
import utils
from engine import train_one_epoch, evaluate

config = {
    'batch_size': 32,
    'epochs': 100,
    'input_size': 224,
    'lr': 1e-3,
    'weight_decay': 0.025,
    'opt': 'adamw',
    'opt_eps': 1e-8,
    'opt_betas': None,
    'momentum': 0.9,
    'sched': 'cosine',
    'warmup_epochs': 5,
    'warmup_lr': 1e-6,
    'min_lr': 1e-5,
    'cooldown_epochs': 10,    # Added to fix AttributeError
    'decay_epochs': 30,       # Added for scheduler compatibility
    'decay_rate': 0.1,        # Added for scheduler compatibility
    'lr_noise': None,         # Added for scheduler compatibility
    'lr_noise_pct': 0.67,     # Added for scheduler compatibility
    'lr_noise_std': 1.0,      # Added for scheduler compatibility
    'patience_epochs': 10,    # Added for scheduler compatibility
    'output_dir': './output',
    'save_freq': 20,
    'distillation_type': 'none',
    'nb_classes': 30,
}
args = SimpleNamespace(**config)

# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cudnn.benchmark = True

# Check for data loaders
if train_loader is None or val_loader is None:
    raise ValueError("train_loader and val_loader must be defined. Please replace placeholders with actual DataLoaders.")

# Create model
model = create_model(
    'efficientvit_m5.r224_in1k',
    num_classes=args.nb_classes,
    pretrained=True,
)

model.to(device)

# Optimizer and scheduler
linear_scaled_lr = args.lr * args.batch_size / 512.0
args.lr = linear_scaled_lr  # Update lr in args
optimizer = create_optimizer(args, model)
lr_scheduler, _ = create_scheduler(args, optimizer)
loss_scaler = NativeScaler()

# Loss function
criterion = LabelSmoothingCrossEntropy()

# Create output directory
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

# Training loop
print(f"Start training for {args.epochs} epochs")
max_accuracy = 0.0
total_time = 0.0
for epoch in range(args.epochs):
    start_time = time.time()
    train_stats = train_one_epoch(
        model, criterion, train_loader,
        optimizer, device, epoch, loss_scaler,
        clip_grad=0.02, clip_mode='agc',
        set_training_mode=True
    )
    total_time += time.time() - start_time
    print(total_time)
    lr_scheduler.step(epoch)
    test_stats = evaluate(val_loader, model, device)
    
    if epoch % args.save_freq == 0 or epoch == args.epochs - 1:
        checkpoint_path = output_dir / f'checkpoint_{epoch}.pth'
        print(f"Saving checkpoint to {checkpoint_path}")
        torch.save(model.state_dict(), checkpoint_path)
    
        
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print(f'Training time {total_time_str}')


Creating model: EfficientViT_M5
Start training for 100 epochs
Epoch: [0]  [  0/188]  eta: 0:04:16  lr: 0.000001  loss: 3.3965  time: 1.3656  data: 0.8124  max mem: 1394
Epoch: [0]  [ 10/188]  eta: 0:01:03  lr: 0.000001  loss: 3.3808  time: 0.3572  data: 0.0744  max mem: 1394
Epoch: [0]  [ 20/188]  eta: 0:00:48  lr: 0.000001  loss: 3.3775  time: 0.2377  data: 0.0005  max mem: 1394
Epoch: [0]  [ 30/188]  eta: 0:00:42  lr: 0.000001  loss: 3.4196  time: 0.2238  data: 0.0002  max mem: 1394
Epoch: [0]  [ 40/188]  eta: 0:00:39  lr: 0.000001  loss: 3.4399  time: 0.2374  data: 0.0002  max mem: 1394
Epoch: [0]  [ 50/188]  eta: 0:00:36  lr: 0.000001  loss: 3.4270  time: 0.2484  data: 0.0002  max mem: 1394
Epoch: [0]  [ 60/188]  eta: 0:00:33  lr: 0.000001  loss: 3.4057  time: 0.2444  data: 0.0004  max mem: 1394
Epoch: [0]  [ 70/188]  eta: 0:00:29  lr: 0.000001  loss: 3.4116  time: 0.2324  data: 0.0003  max mem: 1394
Epoch: [0]  [ 80/188]  eta: 0:00:26  lr: 0.000001  loss: 3.4185  time: 0.2240  dat

## 6. Đánh giá mô hình

# 6.1. Đánh giá trên tập test

In [16]:
device_cpu = torch.device('cpu')
device_gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
evaluate(test_loader, model, device_gpu)

Evaluation  [ 0/63]  eta: 0:00:52  acc1: 100.000  acc5: 100.000  n: 32.0000 (32.0000)  time: 0.8373  data: 0.7266  max mem: 1394
Evaluation  [10/63]  eta: 0:00:09  acc1: 96.875  acc5: 100.000  n: 32.0000 (32.0000)  time: 0.1773  data: 0.1043  max mem: 1394
Evaluation  [20/63]  eta: 0:00:06  acc1: 96.875  acc5: 96.875  n: 32.0000 (32.0000)  time: 0.1290  data: 0.0571  max mem: 1394
Evaluation  [30/63]  eta: 0:00:05  acc1: 100.000  acc5: 100.000  n: 32.0000 (32.0000)  time: 0.1395  data: 0.0726  max mem: 1394
Evaluation  [40/63]  eta: 0:00:03  acc1: 96.875  acc5: 100.000  n: 32.0000 (32.0000)  time: 0.1302  data: 0.0649  max mem: 1394
Evaluation  [50/63]  eta: 0:00:01  acc1: 75.000  acc5: 96.875  n: 32.0000 (32.0000)  time: 0.1136  data: 0.0501  max mem: 1394
Evaluation  [60/63]  eta: 0:00:00  acc1: 100.000  acc5: 100.000  n: 32.0000 (32.0000)  time: 0.1158  data: 0.0694  max mem: 1394
Evaluation  [62/63]  eta: 0:00:00  acc1: 100.000  acc5: 100.000  n: 32.0000 (31.7460)  time: 0.1110  da

{'acc1': 94.7420634920635, 'acc5': 99.05753968253968, 'n': 31.746031746031747}

### 6.2. Đo thời gian inference

In [18]:
def measure_inference_time(model, val_loader, device):
    model.to(device)
    model.eval()

    total_time = 0.0
    total_samples = 0

    with torch.no_grad():
        for inputs, _ in val_loader:
            inputs = inputs.to(device)
            start_time = time.time()
            outputs = model(inputs)
            end_time = time.time()

            total_time += (end_time - start_time)
            total_samples += inputs.size(0)

    avg_time_per_sample = total_time / total_samples
    print(f"[{device}] Avg inference time per sample: {avg_time_per_sample * 1000:.4f} ms")
    return avg_time_per_sample

In [19]:
# Đo thời gian
print("Measuring on CPU...")
measure_inference_time(model, test_loader, device_cpu)

if torch.cuda.is_available():
    print("Measuring on GPU...")
    measure_inference_time(model, test_loader, device_gpu)

Measuring on CPU...
[cpu] Avg inference time per sample: 18.4609 ms
Measuring on GPU...
[cuda] Avg inference time per sample: 1.6685 ms
