In [1]:
!pip install -qq medmnist
!pip install -qq av
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import av

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m18.7 MB/s

In [2]:
# Constants
DATA_DIR = '/kaggle/input/hanwash/HandWashDataset/HandWashDataset'
CLASSES = ['Step_1', 'Step_2', 'Step_3', 'Step_4', 'Step_5', 'Step_6']
LABELS_TO_ID = {label: i for i, label in enumerate(CLASSES)}
TEST_SIZE = 0.2
VAL_SIZE = 0.2
DATASET_SEED = 81
FRAME_SIZE = (360, 240)
SCALE = 255.0
CLIP_LENGTH = 100
INPUT_SHAPE = (1, 210, 270)  # PyTorch: (C, H, W)
BATCH_SIZE = 64
NUM_CLASSES = len(CLASSES)
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-5
EPOCHS = 200
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def prepare_dataset(data_dir, classes, test_size=TEST_SIZE, val_size=VAL_SIZE, random_state=DATASET_SEED):
    video_lengths = []
    for class_name in classes:
        class_dir = os.path.join(data_dir, class_name)
        videos = os.listdir(class_dir)
        for video_file in videos:
            video_path = os.path.join(class_dir, video_file)
            video_reader = cv2.VideoCapture(video_path)
            frames_count = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
            video_lengths.append((class_name, video_path, frames_count))
            video_reader.release()
    train_videos, test_val_videos = train_test_split(video_lengths, test_size=test_size+val_size, random_state=random_state)
    val_videos, test_videos = train_test_split(test_val_videos, test_size=test_size/(test_size+val_size), random_state=random_state)
    return train_videos, val_videos, test_videos

def read_video_pyav(container, indices, new_size=FRAME_SIZE, scale=SCALE):
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            new_frame = frame.to_ndarray(format="rgb24")
            new_frame = cv2.cvtColor(new_frame, cv2.COLOR_RGB2GRAY)
            new_frame = new_frame / scale
            new_frame = cv2.resize(new_frame, new_size)
            frames.append(new_frame)
    return np.stack(frames, axis=0)  # Shape: (num_frames, H, W)

def sample_frame_indices(frame_sample_rate, seg_len, clip_len=CLIP_LENGTH):
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = seg_len
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

def download_and_prepare_dataset(video_directories: list):
    np_videos = []
    valid_indices = []
    for i, address in enumerate(video_directories):
        try:
            vid_container = av.open(address)
            vid_indices = sample_frame_indices(frame_sample_rate=1, seg_len=vid_container.streams.video[0].frames)
            video_frames = read_video_pyav(container=vid_container, indices=vid_indices)
            if video_frames.shape[0] != CLIP_LENGTH:
                print(f"Warning: Video {i} at {address} has {video_frames.shape[0]} frames, expected {CLIP_LENGTH}. Skipping.")
                continue
            np_videos.append(video_frames)
            valid_indices.append(i)
        except Exception as e:
            print(f"Error processing video {i} at {address}: {str(e)}")
    return np_videos, valid_indices

def preprocess_dataset(list_of_videos, list_of_labels, valid_indices, new_size=FRAME_SIZE, clip_len=CLIP_LENGTH):
    total_frames = len(list_of_videos) * clip_len
    resized_list = np.zeros((total_frames, new_size[1], new_size[0]), dtype=np.float32)
    resized_labels = np.zeros(total_frames, dtype=np.int64)
    valid_frame_count = 0
    for i, video in enumerate(list_of_videos):
        if video.shape[0] != clip_len:
            print(f"Warning: Video {i} has {video.shape[0]} frames, expected {clip_len}. Skipping.")
            continue
        for j in range(clip_len):
            resized_list[valid_frame_count] = video[j]
            # Ép kiểu nhãn thành int64
            resized_labels[valid_frame_count] = int(list_of_labels[valid_indices[i]])
            valid_frame_count += 1
    resized_list = resized_list[:valid_frame_count]
    resized_labels = resized_labels[:valid_frame_count]
    return resized_list, resized_labels

In [4]:
class HandWashDataset(Dataset):
    def __init__(self, videos, labels, transform=None):
        self.videos = videos
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        try:
            frame = self.videos[idx]
            label = self.labels[idx]
            if frame.ndim != 2:
                raise ValueError(f"Frame at index {idx} has shape {frame.shape}, expected 2D (H, W)")
            frame = frame[..., np.newaxis]
            frame = torch.from_numpy(frame).permute(2, 0, 1).float()
            if self.transform:
                frame = self.transform(frame)
            return frame, torch.tensor(label, dtype=torch.long)
        except Exception as e:
            print(f"Transform error at index {idx}: {str(e)}")
            raise e

In [5]:
# Transforms
train_transform = transforms.Compose([
    transforms.RandomCrop((210, 270)),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.2),
])
val_test_transform = transforms.Compose([
    transforms.Resize((210, 270)),  # Thêm Resize để khớp với INPUT_SHAPE
])

In [6]:
# Chuẩn bị dữ liệu
train_info, val_info, test_info = prepare_dataset(DATA_DIR, CLASSES)
train_addr = [info[1] for info in train_info]
train_names = [LABELS_TO_ID[info[0]] for info in train_info]
val_addr = [info[1] for info in val_info]
val_names = [LABELS_TO_ID[info[0]] for info in val_info]
test_addr = [info[1] for info in test_info]
test_names = [LABELS_TO_ID[info[0]] for info in test_info]

In [7]:
train_videos, train_valid_indices = download_and_prepare_dataset(train_addr)
train_videos, train_labels = preprocess_dataset(train_videos, train_names, train_valid_indices)
print("done with train")
val_videos, val_valid_indices = download_and_prepare_dataset(val_addr)
val_videos, val_labels = preprocess_dataset(val_videos, val_names, val_valid_indices)
print("done with val")
test_videos, test_valid_indices = download_and_prepare_dataset(test_addr)
test_videos, test_labels = preprocess_dataset(test_videos, test_names, test_valid_indices)
print("done with test")

done with train
done with val
done with test


In [8]:
train_dataset = HandWashDataset(train_videos, train_labels, transform=train_transform)
val_dataset = HandWashDataset(val_videos, val_labels, transform=val_test_transform)
test_dataset = HandWashDataset(test_videos, test_labels, transform=val_test_transform)

In [9]:
trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
validloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
testloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [10]:
import torchvision.models as models

class HandWashResNet(nn.Module):
    def __init__(self, num_classes=NUM_CLASSES):
        super(HandWashResNet, self).__init__()
        self.model = models.resnet18(weights='DEFAULT')  # Chuyển sang ResNet18
        self.model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        num_ftrs = self.model.fc.in_features
        self.model.fc = nn.Linear(num_ftrs, num_classes)

    def forward(self, x):
        return self.model(x)

# Hàm huấn luyện và đánh giá
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    train_loss = running_loss / len(dataloader)
    train_acc = 100 * correct / total
    return train_loss, train_acc

def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    correct_top5 = 0
    total = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            _, predicted_top5 = outputs.topk(5, dim=1)
            correct_top5 += sum([labels[i] in predicted_top5[i] for i in range(labels.size(0))])
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    avg_loss = running_loss / len(dataloader)
    accuracy = 100 * correct / total
    top5_accuracy = 100 * correct_top5 / total
    return avg_loss, accuracy, top5_accuracy, all_preds, all_labels

In [11]:
# Khởi tạo model mới và các thành phần khác
model = HandWashResNet().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()

# Huấn luyện
best_val_loss = float('inf')
best_val_acc = 0.0
for epoch in range(EPOCHS):
    train_loss, train_acc = train_one_epoch(model, trainloader, criterion, optimizer, DEVICE)
    val_loss, val_acc, val_top5_acc, _, _ = evaluate(model, validloader, criterion, DEVICE)
    print(f"Epoch {epoch+1}/{EPOCHS} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%, Val Top-5 Acc: {val_top5_acc:.2f}%")
    if val_loss < best_val_loss and val_acc > best_val_acc:
        best_val_loss = val_loss
        best_val_acc = val_acc
        torch.save(model, 'best_sequential_model.pt')
        print(f"Saved best full model at epoch {epoch+1} with val_loss: {val_loss:.4f}, val_acc: {val_acc:.2f}%")

# Đánh giá mô hình
test_loss, test_acc, test_top5_acc, test_preds, test_labels = evaluate(model, testloader, criterion, DEVICE)
print(f"Test accuracy: {test_acc:.2f}%")
print(f"Test top-5 accuracy: {test_top5_acc:.2f}%")
print("\nBáo cáo phân loại:")
print(classification_report(test_labels, test_preds, target_names=CLASSES))
print("\nMa trận nhầm lẫn:")
print(confusion_matrix(test_labels, test_preds))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 187MB/s]


Epoch 1/200 - Train Loss: 0.2414, Train Acc: 92.54%
Val Loss: 0.2816, Val Acc: 90.28%, Val Top-5 Acc: 100.00%
Saved best full model at epoch 1 with val_loss: 0.2816, val_acc: 90.28%
Epoch 2/200 - Train Loss: 0.0177, Train Acc: 99.61%
Val Loss: 0.2679, Val Acc: 91.08%, Val Top-5 Acc: 100.00%
Saved best full model at epoch 2 with val_loss: 0.2679, val_acc: 91.08%
Epoch 3/200 - Train Loss: 0.0154, Train Acc: 99.63%
Val Loss: 0.1430, Val Acc: 95.12%, Val Top-5 Acc: 100.00%
Saved best full model at epoch 3 with val_loss: 0.1430, val_acc: 95.12%
Epoch 4/200 - Train Loss: 0.0065, Train Acc: 99.84%
Val Loss: 0.1728, Val Acc: 93.90%, Val Top-5 Acc: 100.00%
Epoch 5/200 - Train Loss: 0.0099, Train Acc: 99.74%
Val Loss: 0.0900, Val Acc: 97.22%, Val Top-5 Acc: 100.00%
Saved best full model at epoch 5 with val_loss: 0.0900, val_acc: 97.22%
Epoch 6/200 - Train Loss: 0.0053, Train Acc: 99.89%
Val Loss: 0.1225, Val Acc: 96.28%, Val Top-5 Acc: 100.00%
Epoch 7/200 - Train Loss: 0.0025, Train Acc: 99.94%
