<a align="center" href="https://ultralytics.com/hub" target="_blank">
<img width="1024", src="https://github.com/ultralytics/assets/raw/main/im/ultralytics-hub.png"></a>

<div align="center">

[中文](https://docs.ultralytics.com/zh/hub/) | [한국어](https://docs.ultralytics.com/ko/hub/) | [日本語](https://docs.ultralytics.com/ja/hub/) | [Русский](https://docs.ultralytics.com/ru/hub/) | [Deutsch](https://docs.ultralytics.com/de/hub/) | [Français](https://docs.ultralytics.com/fr/hub/) | [Español](https://docs.ultralytics.com/es/hub/) | [Português](https://docs.ultralytics.com/pt/hub/) | [Türkçe](https://docs.ultralytics.com/tr/hub/) | [Tiếng Việt](https://docs.ultralytics.com/vi/hub/) | [العربية](https://docs.ultralytics.com/ar/hub/)

  <a href="https://github.com/ultralytics/hub/actions/workflows/ci.yml"><img src="https://github.com/ultralytics/hub/actions/workflows/ci.yml/badge.svg" alt="CI CPU"></a>
  <a href="https://colab.research.google.com/github/ultralytics/hub/blob/main/hub.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>

  <a href="https://ultralytics.com/discord"><img alt="Discord" src="https://img.shields.io/discord/1089800235347353640?logo=discord&logoColor=white&label=Discord&color=blue"></a>
  <a href="https://community.ultralytics.com"><img alt="Ultralytics Forums" src="https://img.shields.io/discourse/users?server=https%3A%2F%2Fcommunity.ultralytics.com&logo=discourse&label=Forums&color=blue"></a>
  <a href="https://reddit.com/r/ultralytics"><img alt="Ultralytics Reddit" src="https://img.shields.io/reddit/subreddit-subscribers/ultralytics?style=flat&logo=reddit&logoColor=white&label=Reddit&color=blue"></a>

Welcome to the [Ultralytics](https://ultralytics.com/) HUB notebook!

This notebook allows you to train Ultralytics [YOLO](https://github.com/ultralytics/ultralytics) 🚀 models using [HUB](https://hub.ultralytics.com/). Please browse the HUB <a href="https://docs.ultralytics.com/hub/">Docs</a> for details, raise an issue on <a href="https://github.com/ultralytics/hub/issues/new/choose">GitHub</a> for support, and join our <a href="https://ultralytics.com/discord">Discord</a> community for questions and discussions!
</div>

# Setup

Pip install `ultralytics` and [dependencies](https://github.com/ultralytics/ultralytics/blob/main/pyproject.toml) and check software and hardware.

[![PyPI - Version](https://img.shields.io/pypi/v/ultralytics?logo=pypi&logoColor=white)](https://pypi.org/project/ultralytics/) [![Downloads](https://static.pepy.tech/badge/ultralytics)](https://www.pepy.tech/projects/ultralytics) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/ultralytics?logo=python&logoColor=gold)](https://pypi.org/project/ultralytics/)

Ultralytics 8.3.99 🚀 Python-3.11.11 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Setup complete ✅ (2 CPUs, 12.7 GB RAM, 39.6/112.6 GB disk)


# Start

⚡ Login with your API key, load your YOLO 🚀 model, and start training in 3 lines of code!

In [2]:
import os

for root, dirs, files in os.walk("/root/.cache/kagglehub/datasets"):
    for dir_name in dirs:
        print(os.path.join(root, dir_name))

/root/.cache/kagglehub/datasets/iraqyomar
/root/.cache/kagglehub/datasets/iraqyomar/khatt-arabic-hand-written-lines
/root/.cache/kagglehub/datasets/iraqyomar/khatt-arabic-hand-written-lines/versions
/root/.cache/kagglehub/datasets/iraqyomar/khatt-arabic-hand-written-lines/versions/1
/root/.cache/kagglehub/datasets/iraqyomar/khatt-arabic-hand-written-lines/versions/1/images
/root/.cache/kagglehub/datasets/iraqyomar/khatt-arabic-hand-written-lines/versions/1/labels


In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("iraqyomar/khatt-arabic-hand-written-lines")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/iraqyomar/khatt-arabic-hand-written-lines?dataset_version_number=1...


100%|██████████| 295M/295M [00:02<00:00, 104MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/iraqyomar/khatt-arabic-hand-written-lines/versions/1


In [None]:
import os
import random
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# === paths ===
IMG_DIR = "/root/.cache/kagglehub/datasets/iraqyomar/khatt-arabic-hand-written-lines/versions/1/images"
LBL_DIR = "/root/.cache/kagglehub/datasets/iraqyomar/khatt-arabic-hand-written-lines/versions/1/labels"

# === hyperparams ===
IMG_W, IMG_H = 128, 32
BATCH_SIZE = 16
EPOCHS = 40
VALID_SPLIT = 0.1
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# === utility: prefix matching for labels ===
def build_prefix_map(label_dir):
    prefix_map = {}
    for fname in sorted(os.listdir(label_dir)):
        if not fname.lower().endswith(".txt"):
            continue
        prefix = fname.split("_", 1)[0]
        if prefix not in prefix_map:
            prefix_map[prefix] = os.path.join(label_dir, fname)
    return prefix_map

# === dataset ===
class KHATTDataset(Dataset):
    def __init__(self, img_dir, lbl_dir, char_to_idx, max_samples=None):
        self.img_dir = img_dir
        self.lbl_dir = lbl_dir
        self.char_to_idx = char_to_idx
        self.prefix_map = build_prefix_map(lbl_dir)
        self.img_files = sorted([f for f in os.listdir(img_dir) if f.lower().endswith((".jpg", ".jpeg", ".png"))])
        if max_samples:
            self.img_files = self.img_files[:max_samples]
        self.samples = []
        for fname in self.img_files:
            prefix = fname.split("_", 1)[0]
            lbl_path = self.prefix_map.get(prefix)
            if lbl_path is None:
                continue
            img_path = os.path.join(img_dir, fname)
            label = None
            for enc in ["windows-1256", "utf-8", "latin1"]:
                try:
                    with open(lbl_path, "r", encoding=enc) as f:
                        label = f.read().strip()
                    if label:
                        break
                except Exception:
                    continue
            if not label:
                continue
            self.samples.append((img_path, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, label = self.samples[idx]
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, (IMG_W, IMG_H))
        img = img.astype("float32") / 255.0  # normalize
        img = torch.from_numpy(img).unsqueeze(0)  # (1, H, W)

        # encode label (no blank; CTC uses 0 as blank implicitly)
        label_seq = [self.char_to_idx[c] for c in label if c in self.char_to_idx]
        label_tensor = torch.tensor(label_seq, dtype=torch.long)
        return img, label_tensor

# === character mapping ===
def build_char_map(dataset):
    all_text = "".join([lbl for _, lbl in dataset.samples])
    chars = sorted(set(all_text))
    # map chars -> 1..N, reserving 0 for blank
    char_to_idx = {c: i + 1 for i, c in enumerate(chars)}
    idx_to_char = {i + 1: c for i, c in enumerate(chars)}
    return char_to_idx, idx_to_char

# === collate for variable-length labels ===
def collate_fn(batch):
    imgs, labels = zip(*batch)
    imgs = torch.stack(imgs)  # (B,1,H,W)
    label_lengths = torch.tensor([len(l) for l in labels], dtype=torch.long)
    labels_concat = torch.cat(labels)  # needed by CTCLoss
    return imgs, labels_concat, label_lengths

# === model: CNN -> BiLSTM -> linear (vocab+1) ===
class OCRModel(nn.Module):
    def __init__(self, num_chars):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),  # (B,32,H,W)
            nn.ReLU(),
            nn.MaxPool2d(2, 2),              # (B,32,H/2,W/2)
            nn.Conv2d(32, 64, 3, padding=1), # (B,64,H/2,W/2)
            nn.ReLU(),
            nn.MaxPool2d(2, 2),              # (B,64,H/4,W/4)
        )
        # after two poolings: H' = IMG_H//4, W' = IMG_W//4
        self.rnn_input_size = (IMG_W // 4) * 64  # treating height as time
        self.bi_lstm = nn.LSTM(self.rnn_input_size, 128, bidirectional=True, batch_first=True)
        self.classifier = nn.Linear(128 * 2, num_chars + 1)  # +1 for CTC blank

    def forward(self, x):
        # x: (B,1,H,W)
        x = self.cnn(x)  # (B,64,H/4,W/4)
        # reshape: time dimension = height after conv (IMG_H//4)
        b, c, h, w = x.size()
        # collapse width*channels as features per time-step
        x = x.permute(0, 2, 1, 3)  # (B, H', C, W')
        x = x.contiguous().view(b, h, c * w)  # (B, time, feat)
        x, _ = self.bi_lstm(x)  # (B, time, 256)
        x = self.classifier(x)  # (B, time, num_chars+1)
        x = x.log_softmax(2)    # for CTCLoss expects log-probs
        return x  # (B, time, vocab+1)

# === training setup ===
# temporary dataset to build char maps
temp_ds = KHATTDataset(IMG_DIR, LBL_DIR, char_to_idx={})  # will ignore char_to_idx here
char_to_idx, idx_to_char = build_char_map(temp_ds)
dataset = KHATTDataset(IMG_DIR, LBL_DIR, char_to_idx)
# split
n = len(dataset)
indices = list(range(n))
random.shuffle(indices)
split = int(n * (1 - VALID_SPLIT))
train_idx, val_idx = indices[:split], indices[split:]
train_ds = torch.utils.data.Subset(dataset, train_idx)
val_ds = torch.utils.data.Subset(dataset, val_idx)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

model = OCRModel(num_chars=len(char_to_idx)).to(DEVICE)
ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters())

# time steps (input_length) is constant: IMG_H//4
input_length_val = torch.full((BATCH_SIZE,), IMG_H // 4, dtype=torch.long)

def greedy_decode(log_probs, idx_to_char):
    # log_probs: (B, time, vocab+1)
    preds = torch.argmax(log_probs, dim=2)  # (B, time)
    results = []
    for seq in preds:
        prev = -1
        chars = []
        for idx in seq.cpu().numpy():
            if idx != prev and idx != 0:  # skip blanks and repeats
                chars.append(idx_to_char.get(idx, ""))
            prev = idx
        results.append("".join(chars))
    return results

# === training loop ===
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for imgs, labels_concat, label_lengths in train_loader:
        imgs = imgs.to(DEVICE)
        # compute predicted log-probs
        log_probs = model(imgs)  # (B, time, vocab+1)
        batch_size, time_steps, _ = log_probs.size()
        log_probs = log_probs.permute(1, 0, 2)  # CTCLoss expects (T, N, C)

        # prepare inputs for CTCLoss
        input_lengths = torch.full((batch_size,), time_steps, dtype=torch.long).to(DEVICE)
        target_lengths = label_lengths.to(DEVICE)
        labels_concat = labels_concat.to(DEVICE)

        loss = ctc_loss(log_probs, labels_concat, input_lengths, target_lengths)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * batch_size

    avg = total_loss / len(train_ds)
    print(f"[Epoch {epoch}] Train loss: {avg:.4f}")

    # validation quick check (no gradient)
    model.eval()
    with torch.no_grad():
        for imgs, labels_concat, label_lengths in val_loader:
            imgs = imgs.to(DEVICE)
            log_probs = model(imgs)  # (B, time, vocab+1)
            decoded = greedy_decode(log_probs, idx_to_char)
            print("Sample decode:", decoded[:3])
            break  # just one batch preview

# === save model ===
torch.save({
    "model_state": model.state_dict(),
    "char_to_idx": char_to_idx,
    "idx_to_char": idx_to_char
}, "ocr_ctc_pytorch.pt")
print("Saved PyTorch model to ocr_ctc_pytorch.pt")


[Epoch 1] Train loss: 0.0019
Sample decode: ['', '', '']
[Epoch 2] Train loss: 0.0011
Sample decode: ['', '', '']
[Epoch 3] Train loss: 0.0009
Sample decode: ['نر', 'نر', 'نر']
[Epoch 4] Train loss: 0.0001
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 5] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 6] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 7] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 8] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 9] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 10] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 11] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 12] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 13] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 14] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']
[Epoch 15] Train loss: 0.0000
Sample decode: ['نسر', 'نسر', 'نسر']


In [4]:
import torch
import torch.nn.functional as F
import cv2
import numpy as np

# === Load model checkpoint ===
checkpoint = torch.load("/content/ocr_ctc_pytorch.pt", map_location='cpu')
char_to_idx = checkpoint["char_to_idx"]
idx_to_char = checkpoint["idx_to_char"]
num_chars = len(char_to_idx)

# === Match model architecture ===
class OCRModel(torch.nn.Module):
    def __init__(self, num_chars):
        super().__init__()
        self.cnn = torch.nn.Sequential(
            torch.nn.Conv2d(1, 32, 3, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, 2),
            torch.nn.Conv2d(32, 64, 3, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, 2),
        )
        self.bi_lstm = torch.nn.LSTM((128 // 4) * 64, 128, bidirectional=True, batch_first=True)
        self.classifier = torch.nn.Linear(128 * 2, num_chars + 1)

    def forward(self, x):
        x = self.cnn(x)  # [B, 64, H/4, W/4]
        b, c, h, w = x.size()
        x = x.permute(0, 2, 1, 3).contiguous().view(b, h, c * w)
        x, _ = self.bi_lstm(x)
        x = self.classifier(x)
        return F.log_softmax(x, dim=2)

# === Initialize model ===
model = OCRModel(num_chars=num_chars)
model.load_state_dict(checkpoint["model_state"])
model.eval()

# === Preprocessing ===
IMG_W, IMG_H = 128, 32

def preprocess_image(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (IMG_W, IMG_H))
    img = img.astype("float32") / 255.0
    img = torch.from_numpy(img).unsqueeze(0).unsqueeze(0)  # [1, 1, H, W]
    return img

# === Greedy decode ===
def greedy_decode(log_probs, idx_to_char):
    preds = torch.argmax(log_probs, dim=2)  # [B, T]
    results = []
    for seq in preds:
        prev = -1
        chars = []
        for idx in seq.numpy():
            if idx != prev and idx != 0:
                chars.append(idx_to_char.get(idx, ""))
            prev = idx
        results.append("".join(chars))
    return results

# === Inference ===
test_image_path = "/content/AHTD3A0083_Para3_4.jpg"  # 🔁 Change this path
img_tensor = preprocess_image(test_image_path)
with torch.no_grad():
    log_probs = model(img_tensor)
    decoded = greedy_decode(log_probs, idx_to_char)

print("📝 Predicted Text:", decoded[0])


📝 Predicted Text: نسر
