In [1]:
!pip install torch torchvision



In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

In [3]:
# Preprocessing pipeline: the only step is converting each image to a tensor
transform = transforms.Compose([transforms.ToTensor()])

# EMNIST "letters" split, downloaded into ./data when it is not there yet
train_dataset = torchvision.datasets.EMNIST(
    root="./data",
    split="letters",
    train=True,
    download=True,
    transform=transform,
)
test_dataset = torchvision.datasets.EMNIST(
    root="./data",
    split="letters",
    train=False,
    download=True,
    transform=transform,
)

# Batches of 64 samples; only the training stream is shuffled
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

100%|██████████| 562M/562M [00:02<00:00, 242MB/s]


Train size: 124800, Test size: 20800


In [4]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [5]:
# Define CNN Model

class CNN_Model(nn.Module):
    """Small convolutional classifier for single-channel images.

    Two stages of ``3x3 conv (padding=1) -> ReLU -> 2x2 max-pool`` widen the
    input from 1 to 32 and then 64 channels while halving height and width
    twice. The flattened result feeds a 128-unit hidden layer and a final
    linear layer that emits one logit per class.

    ``fc1`` is sized for ``64 * 7 * 7`` features, i.e. the feature map left
    after two poolings of a 28x28 input; other input sizes will not match.

    Args:
        num_classes: Number of output logits. The default of 27 follows the
            original note "26 letters + 1 unknown".
            NOTE(review): torchvision's letters split presumably labels the
            letters 1-26 and leaves index 0 unused - confirm against the dataset.
    """

    def __init__(self, num_classes=27):
        super().__init__()
        # Feature extractor: both convolutions preserve height and width.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Classifier head.
        self.fc1 = nn.Linear(in_features=64 * 7 * 7, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        features = x
        for conv in (self.conv1, self.conv2):
            features = self.pool(self.relu(conv(features)))
        # Collapse channels and spatial dimensions into one vector per sample.
        flat = features.view(features.size(0), -1)
        hidden = self.relu(self.fc1(flat))
        return self.fc2(hidden)

In [6]:
# CNN Model Initialization

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the network, then move its parameters to the chosen device
cnn_model = CNN_Model()
cnn_model = cnn_model.to(device)

# Cross-entropy over the logits, optimised by Adam with a 1e-3 learning rate
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=1e-3)

In [7]:
# Training Loop

def train_cnn(model, epochs=5, loader=None, opt=None, loss_fn=None):
    """Train ``model`` and print the mean batch loss after every epoch.

    The loop originally read ``train_loader``, ``optimizer`` and ``criterion``
    straight from the notebook namespace. Those names are rebound further down
    for the ViT experiment, so a later call would silently step the wrong
    parameters on the wrong data. They can now be passed explicitly; leaving
    them out keeps the old behaviour of using the notebook-level objects.

    Args:
        model: Module mapping an image batch to class logits.
        epochs: Number of passes over ``loader``.
        loader: Iterable of ``(images, labels)`` batches. Defaults to the
            notebook-level ``train_loader`` looked up at call time.
        opt: Optimizer holding ``model``'s parameters. Defaults to the
            notebook-level ``optimizer``.
        loss_fn: Callable ``(logits, labels) -> scalar loss``. Defaults to the
            notebook-level ``criterion``.

    Returns:
        None. One line ``Epoch i/epochs, Loss: <mean>`` is printed per epoch;
        the mean is ``nan`` when the loader yields no batches.
    """
    # Fall back to the notebook globals only for arguments that were not given.
    if loader is None:
        loader = train_loader
    if opt is None:
        opt = optimizer
    if loss_fn is None:
        loss_fn = criterion

    # Send batches to wherever the model's weights live, so the loop does not
    # depend on the global `device`; that global is only used for a
    # parameter-free model.
    first_param = next(model.parameters(), None)
    batch_device = device if first_param is None else first_param.device

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for images, labels in loader:
            images, labels = images.to(batch_device), labels.to(batch_device)

            opt.zero_grad()
            output = model(images)
            loss = loss_fn(output, labels)
            loss.backward()
            opt.step()

            running_loss += loss.item()
            num_batches += 1

        # Counting batches avoids a ZeroDivisionError on an empty loader and
        # also supports iterables that do not implement __len__.
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss}")

# Train the CNN with the default of 5 epochs; the per-epoch mean loss is printed below.
train_cnn(cnn_model)

Epoch 1/5, Loss: 0.4971930516874179
Epoch 2/5, Loss: 0.2500816688304528
Epoch 3/5, Loss: 0.2066176892320315
Epoch 4/5, Loss: 0.17915347245736765
Epoch 5/5, Loss: 0.1581368944612451


In [None]:
from transformers import ViTForImageClassification, ViTFeatureExtractor
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

# Define device
# (same expression as in the CNN initialization cell; repeated here so this cell picks the device itself)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Pretrained ViT Model (Fix shape mismatch)
# The checkpoint ships a 1000-way classifier head. Asking for 27 labels with
# ignore_mismatched_sizes=True loads the rest of the weights and leaves the head
# newly initialised (see the warning in this cell's output), so it must be trained
# before its predictions are meaningful.
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=27,  # EMNIST has 26 letters + 1 unknown class
    ignore_mismatched_sizes=True
)
model.to(device)

# Load Feature Extractor
# NOTE(review): `feature_extractor` is not used anywhere in the visible code; the images
# are preprocessed by the torchvision `transform` defined below instead.
# NOTE(review): ViTFeatureExtractor is presumably deprecated in favour of ViTImageProcessor
# in recent transformers releases - confirm against the installed version.
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

# Define Optimizer & Loss Function
# NOTE(review): this rebinds the global `optimizer` and `criterion` that train_cnn() reads,
# so calling train_cnn() after this cell would step the ViT parameters, not the CNN's.
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Updated DataLoader with Transformations
# NOTE(review): rebinds the global `transform` used by the CNN data cell.
transform = transforms.Compose([
    transforms.Resize((224, 224)),   # Resize images for ViT
    transforms.Grayscale(num_output_channels=3),  # Convert 1-channel to 3-channel
    transforms.ToTensor(),
    # Single mean/std value; presumably broadcast across all three channels - TODO confirm
    transforms.Normalize((0.5,), (0.5,))
])

# Load EMNIST dataset (letters)
# NOTE(review): `torchvision` itself is not imported in this cell; it relies on the import cell
# at the top of the notebook having run.
# NOTE(review): rebinds train_dataset/test_dataset/train_loader/test_loader, replacing the
# single-channel loaders built for the CNN with resized 3-channel ones.
train_dataset = torchvision.datasets.EMNIST(root="./data", split="letters", train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.EMNIST(root="./data", split="letters", train=False, download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# Training Loop
def train_vit(model, epochs=3, loader=None, opt=None, loss_fn=None):
    """Fine-tune ``model`` and print the mean batch loss after every epoch.

    ``model`` is expected to return an object with a ``.logits`` attribute, as
    the Hugging Face classification model built in this notebook does. The data
    loader, optimizer and loss can be passed explicitly so the loop does not
    depend on whichever objects currently hold the notebook-level names;
    leaving them out keeps the old behaviour of using those globals.

    Args:
        model: Module whose forward pass returns an object exposing ``.logits``.
        epochs: Number of passes over ``loader``.
        loader: Iterable of ``(images, labels)`` batches. Defaults to the
            notebook-level ``train_loader`` looked up at call time.
        opt: Optimizer holding ``model``'s parameters. Defaults to the
            notebook-level ``optimizer``.
        loss_fn: Callable ``(logits, labels) -> scalar loss``. Defaults to the
            notebook-level ``criterion``.

    Returns:
        None. One line ``Epoch [i/epochs], Loss: <mean>`` (4 decimals) is
        printed per epoch; the mean is ``nan`` when the loader yields no batches.
    """
    # Fall back to the notebook globals only for arguments that were not given.
    if loader is None:
        loader = train_loader
    if opt is None:
        opt = optimizer
    if loss_fn is None:
        loss_fn = criterion

    # Send batches to wherever the model's weights live; the global `device`
    # is only used for a parameter-free model.
    first_param = next(model.parameters(), None)
    batch_device = device if first_param is None else first_param.device

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for images, labels in loader:
            images, labels = images.to(batch_device), labels.to(batch_device)

            opt.zero_grad()
            outputs = model(images).logits
            loss = loss_fn(outputs, labels)
            loss.backward()
            opt.step()

            running_loss += loss.item()
            num_batches += 1

        # Counting batches avoids a ZeroDivisionError on an empty loader and
        # also supports iterables that do not implement __len__.
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {mean_loss:.4f}")

# Fine-tune the ViT with the default of 3 epochs.
# NOTE(review): the captured output below ends with download/initialisation messages and shows
# no per-epoch loss lines, so this run does not appear to have finished an epoch.
train_vit(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([27]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([27, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

