In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from sklearn.model_selection import train_test_split
from collections import Counter
import os
import time

# Fix torch's global RNG seed so that code drawing from it (e.g. weight
# initialisation and DataLoader shuffling) starts from the same state on
# every Restart & Run All. Bitwise-identical GPU results are not guaranteed.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

# Permit TF32 math for CUDA matmuls and cuDNN convolutions (trades a little
# precision for speed on GPUs that support it).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Root folder of the image dataset. Set the GESTURE_DATASET_DIR environment
# variable to point somewhere else; the original local path is the default.
path = os.environ.get(
    "GESTURE_DATASET_DIR",
    r"C:\Users\bluem\Downloads\mediapipe_version_dataset_full\mediapipe_version_dataset_full",
)
# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if not os.path.isdir(path):
    raise FileNotFoundError(f"Dataset path not found! ({path})")

# Settings shared by the training and validation pipelines.
IMAGE_SIZE = (224, 224)
NORM_MEAN = (0.5, 0.5, 0.5)
NORM_STD = (0.5, 0.5, 0.5)

# Training pipeline: resize, random horizontal flip and random rotation
# (argument 10), then convert to a tensor and normalise each channel as
# (x - 0.5) / 0.5.
train_transforms = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize(NORM_MEAN, NORM_STD),
    ]
)

# Evaluation pipeline: same resize / tensor / normalise steps, no augmentation.
val_transforms = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(NORM_MEAN, NORM_STD),
    ]
)


# Load the images without a transform; the per-split transforms are applied
# later by wrapping each subset. `allow_empty=True` is passed through to
# ImageFolder as in the original setup.
full_dataset = datasets.ImageFolder(root=path, allow_empty=True)
print("Total images:", len(full_dataset))
print("Classes:", full_dataset.classes)

# Integer class label of every sample, used both for the summary below and
# to stratify the split.
targets = full_dataset.targets
class_counts = Counter(targets)
print("Class counts:", class_counts)

# 90/10 train/test split over sample indices, stratified by label and seeded
# so the same split is produced on every run.
all_indices = range(len(full_dataset))
train_indices, test_indices = train_test_split(
    all_indices,
    test_size=0.1,
    random_state=42,
    stratify=targets,
)

class ApplyTransform(Dataset):
    """Dataset wrapper that applies a transform to the first item of each sample.

    Args:
        subset: Any indexable, sized collection whose items unpack as
            ``(x, y)`` pairs.
        transform: Optional callable applied to ``x``. When it is falsy
            (e.g. ``None``) the sample is returned unchanged.
    """

    def __init__(self, subset, transform=None):
        self.transform = transform
        self.subset = subset

    def __len__(self):
        """Return the number of samples in the wrapped collection."""
        return len(self.subset)

    def __getitem__(self, index):
        """Return ``(x, y)`` for ``index`` with the transform applied to ``x``."""
        sample, label = self.subset[index]
        if not self.transform:
            return sample, label
        return self.transform(sample), label

# Carve the two index subsets out of the full dataset and attach the
# matching transform pipeline to each.
train_subset = Subset(full_dataset, train_indices)
test_subset = Subset(full_dataset, test_indices)
train_dataset = ApplyTransform(train_subset, train_transforms)
test_dataset = ApplyTransform(test_subset, val_transforms)

# Both loaders share the same batching options; only the training loader
# shuffles its samples.
loader_options = {"batch_size": 32, "num_workers": 0, "pin_memory": False}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)


class ResidualBlock(nn.Module):
    """Residual unit: two 3x3 conv + BatchNorm layers added to a skip path.

    The skip path is the identity when the input and output shapes match
    (``stride == 1`` and equal channel counts); otherwise it is a 1x1
    convolution followed by BatchNorm that matches stride and channel count.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
        stride: Stride of the first 3x3 convolution (and of the 1x1
            projection on the skip path, when one is used).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        main_layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
        ]
        self.conv_path = nn.Sequential(*main_layers)

        # A projection is only needed when the skip tensor would not line up
        # with the main-path output.
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        """Return ``relu(conv_path(x) + shortcut(x))``."""
        main_out = self.conv_path(x)
        skip_out = self.shortcut(x)
        return torch.relu(main_out + skip_out)

class GesturesResNet(nn.Module):
    """Small ResNet-style image classifier.

    Layout: a 3x3 conv stem (3 -> 16 channels), four stages of two
    ``ResidualBlock``s each (the second block of every stage uses stride 2
    and doubles the channel count: 16 -> 32 -> 64 -> 128 -> 256), global
    average pooling, and a two-layer fully connected head with dropout.

    Args:
        num_classes: Size of the final linear layer's output.
    """

    def __init__(self, num_classes):
        super().__init__()

        stem_width = 16
        self.prep = nn.Sequential(
            nn.Conv2d(3, stem_width, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(stem_width),
            nn.ReLU(inplace=True),
        )

        self.layer1 = self._make_stage(16, 32)
        self.layer2 = self._make_stage(32, 64)
        self.layer3 = self._make_stage(64, 128)
        self.layer4 = self._make_stage(128, 256)

        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        )

    @staticmethod
    def _make_stage(width, next_width):
        """Build one stage: a stride-1 block at ``width``, then a stride-2 block to ``next_width``."""
        return nn.Sequential(
            ResidualBlock(width, width, 1),
            ResidualBlock(width, next_width, 2),
        )

    def forward(self, x):
        """Run the stem and the four stages in order, then pool and classify."""
        for stage in (self.prep, self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        pooled = self.avg_pool(x)
        return self.fc(pooled)

# Training hyper-parameters.
num_epochs = 50
# NOTE(review): best_acc is initialised here but not updated by the training
# loop in this cell -- confirm whether best-checkpoint tracking was intended.
best_acc = 0.0

# One output logit per class folder discovered by ImageFolder.
num_classes = len(full_dataset.classes)
model = GesturesResNet(num_classes=num_classes).to(device)

# Cross-entropy on raw logits, optimised with Adam plus weight decay.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4, weight_decay=1e-4)

# Training loop: one optimisation pass over train_loader per epoch, with a
# progress line every 20 batches and a checkpoint written after every epoch.
print("\n================ TRAINING START ================\n")
for epoch in range(num_epochs):
    model.train()
    # Sum of per-sample losses seen so far in this epoch. Dividing by `total`
    # gives a true per-sample mean; averaging the per-batch means instead
    # would over-weight a smaller final batch.
    running_loss = 0.0
    correct = 0
    total = 0
    start_time = time.time()

    for batch_idx, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        samples_in_batch = labels.size(0)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # `criterion` (nn.CrossEntropyLoss with default reduction) returns the
        # batch mean, so scale it back up by the batch size before summing.
        running_loss += loss.item() * samples_in_batch
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += samples_in_batch

        if batch_idx % 20 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Batch {batch_idx}/{len(train_loader)} "
                  f"| Loss: {running_loss/total:.4f} | Acc: {100*correct/total:.2f}%")

    epoch_loss = running_loss / total
    epoch_acc = 100 * correct / total
    print(f"\n✅ Epoch {epoch+1}/{num_epochs} DONE | Loss: {epoch_loss:.4f} | Acc: {epoch_acc:.2f}% "
          f"| Time: {time.time()-start_time:.1f}s\n")

    # Overwrite the rolling checkpoint so an interrupted run keeps the
    # weights from the most recently completed epoch.
    torch.save(model.state_dict(), "gesture_resnet_latest.pth")

print("================ TRAINING END ================\n")


# Final evaluation on the held-out test split: switch the model to eval mode
# and count correct top-1 predictions with gradient tracking disabled.
model.eval()
test_correct, test_total = 0, 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        preds = torch.argmax(outputs, dim=1)
        test_correct += int(preds.eq(labels).sum())
        test_total += len(labels)

test_accuracy = 100 * test_correct / test_total
print(f"✅ Final Test Accuracy: {test_accuracy:.2f}%")


Device: cuda
GPU: NVIDIA GeForce RTX 4050 Laptop GPU
Total images: 7944
Classes: ['Blank', 'Fist', 'Five', 'Four', 'One', 'Rock_On', 'Spider_man', 'Three', 'Thumb', 'Two']
Class counts: Counter({6: 989, 2: 986, 5: 986, 8: 949, 3: 927, 1: 851, 4: 781, 9: 753, 7: 722})


Epoch 1/50 | Batch 0/224 | Loss: 2.2753 | Acc: 9.38%
Epoch 1/50 | Batch 20/224 | Loss: 2.2109 | Acc: 16.22%
Epoch 1/50 | Batch 40/224 | Loss: 2.1396 | Acc: 20.96%
Epoch 1/50 | Batch 60/224 | Loss: 2.0819 | Acc: 23.21%
Epoch 1/50 | Batch 80/224 | Loss: 2.0100 | Acc: 26.89%
Epoch 1/50 | Batch 100/224 | Loss: 1.9374 | Acc: 30.11%
Epoch 1/50 | Batch 120/224 | Loss: 1.8548 | Acc: 34.19%
Epoch 1/50 | Batch 140/224 | Loss: 1.7729 | Acc: 38.01%
Epoch 1/50 | Batch 160/224 | Loss: 1.7000 | Acc: 41.61%
Epoch 1/50 | Batch 180/224 | Loss: 1.6209 | Acc: 45.37%
Epoch 1/50 | Batch 200/224 | Loss: 1.5486 | Acc: 48.62%
Epoch 1/50 | Batch 220/224 | Loss: 1.4876 | Acc: 51.26%

✅ Epoch 1/50 DONE | Loss: 1.4788 | Acc: 51.53% | Time: 58.5s

Ep

In [None]:
# Save the final weights. The model is moved to the CPU first (in place, so
# it stays on the CPU after this cell) and its state_dict is then written.
model.to("cpu")
final_weights = model.state_dict()
torch.save(final_weights, "Hand_Gestures_MediaPipe_ResNet_v1.pth")
print("Model saved successfully.")