### Convolutional Neural Network (CNN)

Imagine this scenario: You have 10k training images, each image will be RBG and 200pixels * 200, for a normal NN the input it would be $3(RGB)*200*200=120,000$ input nodes, that would be too much, for both computation and parameters.

Instead of connecting every pixel to every neuron, CNNs use convolutional filters(kernels).
- A filter is a small matrix (3x3 or 5x5) that slides across the image
- At each position, it computes a dot product between its weights and the local patch of the image
- The result forms one pixel in the feature map (activation map)

This way, one filter learns to detect on type of feature (eg. edge, texture). Multiple filters produce multiple feature maps, each spotting different pattern.

If the input image has size H * W, with filter size K, stride S, padding P, then the output feature map size given by:
$$
Output H = \left\lfloor \frac{H-K+2P}{s} \right\rfloor+1 \\

Outtput W = \left\lfloor \frac{W-K+2P}{s} \right\rfloor+1
$$

$k$: kernel size

$S$: stride, how many pixels to slide filter

$P$: add an extra border of 0, to prevent evetual information loss

If the input is RGB, then a filter has depth 3 (3* K* K), applying 32 filters will result an output of 32 feature maps

So the final result will be a 32 * 200 * 200

But wait, shouldn't it be reducing? Since this will only increment the input nodes

To reduce size and keep important info, we use pooling:
- Max pooling, takes the maximum value in each patch

So in a 2x2 pooling with stride 2, it halves the size into 100x100 but with 32 feature maps, so overall will still need $32*100*100 = 320,000$

And we need to find a balance between how many feature maps, and how much to scale the original image down

#### Balance between depth and resolution

Convolutions usually increase the number of feature maps (depth),  
while pooling reduces their width and height (resolution).  
A good CNN architecture carefully balances these two effects,  
so the network extracts rich features **without exploding in size**.

**Example pipeline:**
- Input: (3×200×200)  
- Conv (32 filters, stride=1, padding=1): (32×200×200)  
- Max Pooling (2×2, stride=2): (32×100×100)  
- Conv (64 filters): (64×100×100)  
- Pooling: (64×50×50)  

In [2]:
import torch
print(torch.__version__)
from torch.utils.data import DataLoader
from torchvision import datasets,transforms

# Resize,to tensor, normalize
transform = transforms.Compose([
    transforms.Resize((200,200)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5,0.5,0.5],std=[0.5,0.5,0.5]) # Scale to [-1,1]
])

dataset = datasets.ImageFolder(root="data/archive/asl_alphabet_train",transform=transform)

train_loader = DataLoader(dataset=dataset,batch_size=32,shuffle=True)

class_names = dataset.classes
num_classes = len(class_names)
print(class_names)

2.5.1+cu121
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space']


In [3]:
import torch.nn as nn

class ASLCNN(nn.Module):
    def __init__(self,num_classes=29):
        super(ASLCNN,self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),  # (3,200,200) → (32,200,200)
            nn.ReLU(),
            nn.MaxPool2d(2,2),  # → (32,100,100)

            nn.Conv2d(32, 64, kernel_size=3, padding=1), # → (64,100,100)
            nn.ReLU(),
            nn.MaxPool2d(2,2),  # → (64,50,50)

            nn.Conv2d(64, 128, kernel_size=3, padding=1),# → (128,50,50)
            nn.ReLU(),
            nn.MaxPool2d(2,2),  # → (128,25,25)

            nn.Conv2d(128, 128, kernel_size=3, padding=1), # → (128,25,25)
            nn.ReLU(),
            nn.MaxPool2d(2,2),  # → (128,12,12)

            nn.Conv2d(128, 256, kernel_size=3, padding=1), # → (256,12,12)
            nn.ReLU(),
            nn.MaxPool2d(2,2),  # → (256,6,6)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(), # 256*6*6 = 9216
            nn.Linear(9216,512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512,num_classes)
        )

    def forward(self,x):
        x = self.features(x)
        x = self.classifier(x)
        return x

In [9]:
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
model = ASLCNN(num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

for i in range(10):
    for images, labels in train_loader:
        imgs, lbels = images.to(device), labels.to(device)
        outputs = model(imgs)
        loss = criterion(outputs, lbels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {i+1}, Loss: {loss.item():.4f}")

PyTorch version: 2.5.1+cu121
CUDA available: True
GPU count: 1
GPU name: NVIDIA GeForce RTX 4070 Laptop GPU
Epoch 1, Loss: 3.3676
Epoch 2, Loss: 3.3579
Epoch 3, Loss: 3.3792
Epoch 4, Loss: 3.3868
Epoch 5, Loss: 3.3806
Epoch 6, Loss: 3.3794
Epoch 7, Loss: 3.3879
Epoch 8, Loss: 3.3756
Epoch 9, Loss: 3.3724
Epoch 10, Loss: 3.3864


In [10]:
# save
torch.save(model.state_dict(), "asl_cnn.pth")

# load later
model = ASLCNN(num_classes=num_classes)
model.load_state_dict(torch.load("asl_cnn.pth"))
model.eval()


  model.load_state_dict(torch.load("asl_cnn.pth"))


ASLCNN(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU()
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1):

In [11]:
import os
from PIL import Image
import torch
from torchvision import transforms

# same preprocessing as training
transform = transforms.Compose([
    transforms.Resize((200, 200)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

test_dir = "data/archive/asl_alphabet_test/asl_alphabet_test"
test_images = sorted(os.listdir(test_dir))  # [A_test.jpg, B_test.jpg, ...]


In [None]:
model.to(device)
model.eval()
correct = 0
results = []

from torch import tensor

with torch.no_grad():
    for filename in test_images:
        path = os.path.join(test_dir, filename)
        img = Image.open(path).convert("RGB")
        img_tensor = transform(img)
        img_tensor = img_tensor.unsqueeze(0).to(device)
        img_tensor = img_tensor

        output = model(img_tensor)
        _, predicted = torch.max(output, 1)
        pred_class = class_names[int(predicted.item())]

        # true label is the first character of filename ("A_test.jpg" → "A")
        true_class = filename.split("_")[0]

        is_correct = (pred_class == true_class)
        correct += is_correct
        results.append((filename, true_class, pred_class, is_correct))

accuracy = 100 * correct / len(test_images)
print(f"Accuracy on 29 test images: {accuracy:.2f}%")

for r in results:
    print(f"File: {r[0]} | True: {r[1]} | Predicted: {r[2]} | Correct: {r[3]}")


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same