In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader, Subset
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms

In [2]:
class VGG(nn.Module):
    def __init__(self):
        super(VGG, self).__init__()
        self.conv_pool = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        
        self.fc_layers = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 29),
        )
        
    def forward(self, x):
        x = self.conv_pool(x)
        print(x.shape)
        x = torch.flatten(x, 1)
        x = self.fc_layers(x)
        return x

In [3]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

In [4]:
train_data_dir = 'assets/asl/asl_alphabet_train'
test_data_dir = 'assets/asl/asl_alphabet_test'

training_data = ImageFolder(root=train_data_dir, transform=transform)
testing_data = ImageFolder(root=test_data_dir, transform=transform)

num_images_per_class = 100
subset_indices = []
for class_idx in range(len(training_data.classes)):
    indices = [idx for idx, (_, label_idx) in enumerate(training_data.samples) if label_idx == class_idx]
    selected_indices = random.sample(indices, num_images_per_class)  # Randomly select indices
    subset_indices.extend(selected_indices)

# Create a subset of the dataset with the selected indices
train_subset = Subset(training_data, subset_indices)

batch_size = 32
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(testing_data, batch_size=batch_size, shuffle=False)

In [5]:
model = VGG()
num_epochs = 10

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {correct / total}")

torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512, 7, 7])
torch.Size([32, 512,