In [2]:
from pathlib import Path
import os

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

from tqdm.auto import tqdm

# Choose the compute device: Apple-silicon "mps" is preferred, then CUDA, then CPU.
# The CUDA check is only evaluated when MPS is unavailable.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print("Using device:", device)

# Project + data roots: when the notebook is launched from a "notebooks" folder,
# the project root is its (resolved) parent; otherwise it is the working directory.
if Path.cwd().name == "notebooks":
    PROJECT_ROOT = Path.cwd().resolve().parent
else:
    PROJECT_ROOT = Path.cwd()
DATA_ROOT = PROJECT_ROOT.joinpath("data")

print("Project root:", PROJECT_ROOT)
print("Data root:", DATA_ROOT)

# Quick sanity check that both dataset splits are where later cells expect them
print("Devanagari Train exists:", DATA_ROOT.joinpath("ocr_datasets", "Devanagari", "Train").exists())
print("Devanagari Test exists :", DATA_ROOT.joinpath("ocr_datasets", "Devanagari", "Test").exists())


Using device: mps
Project root: /Users/jyotirmoy/Desktop/Image/ancient-script-ai
Data root: /Users/jyotirmoy/Desktop/Image/ancient-script-ai/data
Devanagari Train exists: True
Devanagari Test exists : True


In [3]:
class AncientScriptDataset(Dataset):
    """
    Generic dataset for character images arranged as:

        root_dir/
            class_name_1/
                img1.png
                img2.jpg
                ...
            class_name_2/
                ...

    Every immediate sub-directory of ``root_dir`` is one class. Class indices
    follow the sorted order of the directory names. Images are collected
    recursively inside each class directory; a file counts as an image when
    its suffix is .png, .jpg or .jpeg (case-insensitive).

    Attributes:
        root_dir: dataset root as a ``Path``.
        transform: optional callable applied to the grayscale PIL image.
        class_names: sorted list of class directory names.
        class_to_idx: mapping from class name to integer label.
        samples: list of ``(image_path, label_index)`` pairs, in sorted order
            so that a given index always refers to the same file.

    ``dataset[i]`` returns ``(image, label_index)`` where ``image`` is the
    grayscale image after ``transform`` (or the PIL image if no transform).
    """
    def __init__(self, root_dir, transform=None):
        self.root_dir = Path(root_dir)
        self.transform = transform

        # All subfolders are class names
        self.class_names = sorted(
            entry.name for entry in self.root_dir.iterdir() if entry.is_dir()
        )
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.class_names)}

        self.samples = []
        image_extensions = (".png", ".jpg", ".jpeg")

        for cls_name in self.class_names:
            label = self.class_to_idx[cls_name]
            # rglob() yields entries in filesystem order, which differs between
            # machines; sorting keeps sample indices reproducible.
            for img_path in sorted((self.root_dir / cls_name).rglob("*")):
                # is_file() skips directories that merely look like images by name
                if img_path.suffix.lower() in image_extensions and img_path.is_file():
                    self.samples.append((img_path, label))

        print(f"[Dataset] {self.root_dir.name}: {len(self.samples)} images, {len(self.class_names)} classes")

    def __len__(self):
        """Number of image files found under ``root_dir``."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` as grayscale and return ``(image, label_index)``."""
        img_path, label = self.samples[idx]

        # Image.open is lazy and keeps the file open; convert() loads the pixel
        # data into a new image, so the file can be closed right away instead
        # of waiting for garbage collection.
        with Image.open(img_path) as source:
            img = source.convert("L")  # 'L' = grayscale

        if self.transform is not None:
            img = self.transform(img)

        return img, label


In [4]:
img_size = 64  # images are resized to img_size x img_size before entering the network

# Training pipeline: resize, light augmentation (random rotation within ±5 degrees),
# then conversion to a float tensor.
train_transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.RandomRotation(5),
    transforms.ToTensor(),         # → tensor in [0, 1]
])

# Evaluation pipeline: same preprocessing, no augmentation.
test_transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
])

dev_train_dir = DATA_ROOT /"ocr_datasets"/ "Devanagari" / "Train"
dev_test_dir  = DATA_ROOT /"ocr_datasets"/ "Devanagari" / "Test"

dev_train_dataset = AncientScriptDataset(dev_train_dir, transform=train_transform)
dev_test_dataset  = AncientScriptDataset(dev_test_dir,  transform=test_transform)

# Each split derives its label indices from its own folder listing. If the two
# listings differ, the same integer label would mean different characters in
# Train and Test and every metric would be silently wrong.
assert dev_train_dataset.class_names == dev_test_dataset.class_names, (
    "Train and Test class folders differ; label indices would not line up"
)

print("Classes (first few):", dev_train_dataset.class_names[:10])
print("Total classes:", len(dev_train_dataset.class_names))


[Dataset] Train: 78200 images, 46 classes
[Dataset] Test: 13800 images, 46 classes
Classes (first few): ['character_10_yna', 'character_11_taa', 'character_12_thaa', 'character_13_daa', 'character_14_dhaa', 'character_15_adna', 'character_16_tabala', 'character_17_tha', 'character_18_da', 'character_19_dha']
Total classes: 46


In [5]:
batch_size = 32

# Only the training split is shuffled; the test split keeps its dataset order.
train_loader = DataLoader(
    dataset=dev_train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=dev_test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Number of batches per epoch for each split (displayed as the cell output)
(len(train_loader), len(test_loader))


(2444, 432)

In [6]:
class ScriptCNN(nn.Module):
    """
    Simple CNN for character classification:
    - 3x (Conv + ReLU + MaxPool)
    - 2x Fully Connected layers
    - Softmax (implicitly via CrossEntropyLoss)

    Args:
        num_classes: number of output logits.
        img_size: height/width of the square, single-channel input images.
            Defaults to 64, which reproduces the original fixed architecture
            (fc1 has 128 * 8 * 8 inputs). Must be at least 8 because the
            three 2x2 poolings shrink each side by a factor of 8.

    Raises:
        ValueError: if ``img_size`` is smaller than 8.

    ``forward`` expects a tensor of shape (batch, 1, img_size, img_size) and
    returns raw logits of shape (batch, num_classes).
    """
    def __init__(self, num_classes, img_size=64):
        super().__init__()
        if img_size < 8:
            raise ValueError(f"img_size must be >= 8 (three 2x2 poolings), got {img_size}")

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)

        self.pool = nn.MaxPool2d(2, 2)  # halves H and W (rounding down)

        # Padding=1 keeps the spatial size through each conv, so only the three
        # poolings shrink it: e.g. 64x64 → 32x32 → 16x16 → 8x8.
        # Three floor-halvings are the same as one floor-division by 8.
        feature_side = img_size // 8
        self.fc1 = nn.Linear(128 * feature_side * feature_side, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        # flatten everything but the batch dim; unlike .view this also works
        # when the pooled tensor is not contiguous in memory
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)               # logits
        return x


In [7]:
# One output logit per character class found in the training folder
num_classes = len(dev_train_dataset.class_names)

model = ScriptCNN(num_classes=num_classes)
model = model.to(device)

# CrossEntropyLoss applies log-softmax + NLL itself, so the model emits raw logits
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

print(model)


ScriptCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=8192, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=46, bias=True)
)


In [8]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    """
    Run one optimisation pass over ``loader`` and report epoch metrics.

    Args:
        model: network to train; switched to train mode here.
        loader: iterable yielding ``(images, labels)`` batches.
        optimizer: optimiser stepping the model's parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: sample-weighted mean loss and the
        fraction of correctly classified samples.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch_images, batch_labels in tqdm(loader, leave=False):
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so a smaller last batch does not
        # skew the epoch average (assumes criterion returns a per-batch mean).
        loss_sum += batch_loss.item() * batch_images.size(0)

        predicted = logits.max(1).indices
        n_correct += (predicted == batch_labels).sum().item()
        n_seen += batch_labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen


def evaluate(model, loader, criterion, device):
    """
    Score ``model`` on every batch of ``loader`` without updating it.

    Args:
        model: network to evaluate; switched to eval mode here.
        loader: iterable yielding ``(images, labels)`` batches.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: sample-weighted mean loss and the
        fraction of correctly classified samples.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    # Gradients are not needed for scoring
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            batch_loss = criterion(logits, batch_labels)

            # Weight by batch size (assumes criterion returns a per-batch mean)
            loss_sum += batch_loss.item() * batch_images.size(0)
            predicted = logits.max(1).indices
            n_correct += (predicted == batch_labels).sum().item()
            n_seen += batch_labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen


In [9]:
num_epochs = 5


def _snapshot_state(module):
    """Return an independent copy of ``module``'s weights (tensors cloned)."""
    # state_dict() hands back references to the live parameter tensors, which
    # the optimizer keeps updating in place. Without cloning, a stored "best"
    # state would silently follow the model to its final epoch.
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


best_val_acc = 0.0
# Start from the current weights so `best_state` is always defined for this run,
# even if no epoch improves on the initial accuracy.
best_state = _snapshot_state(model)

# NOTE(review): the Test split doubles as the validation set here, so
# "best validation accuracy" is selected on the same data it is reported on.
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, test_loader, criterion, device)

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_state = _snapshot_state(model)

    print(f"Epoch {epoch:02d}/{num_epochs} | "
          f"Train loss: {train_loss:.4f}, acc: {train_acc:.4f} | "
          f"Val loss: {val_loss:.4f}, acc: {val_acc:.4f}")

print("Best validation accuracy:", best_val_acc)


                                                                                     

Epoch 01/5 | Train loss: 0.4043, acc: 0.8818 | Val loss: 0.1380, acc: 0.9546


                                                                                     

Epoch 02/5 | Train loss: 0.1091, acc: 0.9666 | Val loss: 0.0914, acc: 0.9731


                                                                                     

Epoch 03/5 | Train loss: 0.0648, acc: 0.9794 | Val loss: 0.0908, acc: 0.9725


                                                                                     

Epoch 04/5 | Train loss: 0.0470, acc: 0.9849 | Val loss: 0.0834, acc: 0.9772


                                                                                     

Epoch 05/5 | Train loss: 0.0358, acc: 0.9887 | Val loss: 0.0722, acc: 0.9798
Best validation accuracy: 0.9797826086956521


In [10]:
models_dir = PROJECT_ROOT / "models"
models_dir.mkdir(exist_ok=True)

save_path = models_dir / "devanagari_cnn_v1.pth"

# Save the best-epoch weights when the training cell recorded them,
# otherwise fall back to the model's current weights.
if 'best_state' in locals():
    weights_to_save = best_state
else:
    weights_to_save = model.state_dict()

# The class list is stored next to the weights so that output indices can be
# mapped back to class names when the checkpoint is loaded.
checkpoint = {
    "model_state_dict": weights_to_save,
    "class_names": dev_train_dataset.class_names,
}
torch.save(checkpoint, save_path)

print("Model saved to:", save_path)


Model saved to: /Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/devanagari_cnn_v1.pth


In [11]:
import random

# Load the checkpoint once and take both the weights and the class list from it.
checkpoint = torch.load(save_path, map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()

# The model's output indices follow the class order it was trained with, which
# is the list stored in the checkpoint. Decode predictions with that list
# rather than with whatever folder listing the test split happens to have.
trained_class_names = checkpoint["class_names"]

# Pick one random test sample
idx = random.randint(0, len(dev_test_dataset) - 1)
img, label = dev_test_dataset[idx]

with torch.no_grad():
    # unsqueeze(0) adds the batch dimension the model expects
    logits = model(img.unsqueeze(0).to(device))
    probs = torch.softmax(logits, dim=1)
    pred_idx = probs.argmax(dim=1).item()

# The true label was assigned by the test dataset, so decode it with its list
true_class = dev_test_dataset.class_names[label]
pred_class = trained_class_names[pred_idx]

print(f"True class: {true_class}")
print(f"Predicted : {pred_class}")


True class: character_23_ba
Predicted : character_23_ba
