In [1]:
pip install paddlepaddle paddleocr


Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.6 kB)
Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.10.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting opencv-contrib-python (from paddleocr)
  Downloading opencv_contrib_python-4.10.0.84-cp37-abi3-macosx_11_0_arm64.whl.metadata (20 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.6.0.tar.gz (88 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading paddlepaddle-2.6.2-cp311-cp311-macosx_11_0_arm64.whl (65.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.6/65.6 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:

In [4]:
import os
from PIL import Image
import numpy as np
import paddle
import paddle.nn as nn
from paddle.io import Dataset, DataLoader
from paddle.vision.transforms import Compose, Resize, ToTensor

# Step 1: Define a Dataset
class LicensePlateDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.image_paths = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith('.png') or f.endswith('.jpg')]
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        # Load the image
        img = Image.open(img_path).convert('RGB')
        if self.transform:
            img = self.transform(img)

        # Get label from filename (assuming filename is label without extension)
        label = os.path.splitext(os.path.basename(img_path))[0]
        return np.array(img), label

# Step 2: Define the label processing for Arabic and English
def is_arabic_numeral(char):
    return '\u0660' <= char <= '\u0669'

def label_license_plate_text(recognized_text):
    labeled_output = {
        'English Number': '',
        'English Letter': '',
        'Arabic Number': '',
        'Arabic Letter': '',
        'Unknown': ''
    }

    for word in recognized_text.split():
        for char in word:
            if is_arabic_numeral(char):
                label = 'Arabic Number'
            elif char.isdigit():
                label = 'English Number'
            elif char.isalpha() and char.isascii():
                label = 'English Letter'
            elif char.isalpha():
                label = 'Arabic Letter'
            else:
                label = 'Unknown'
            
            labeled_output[label] += char

    return labeled_output

# Define your character dictionary with Arabic and English letters/numbers
english_chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
arabic_chars = 'ابتثجحخدذرزسشصضطظعغفقكلمنهوي' + ''.join(chr(i) for i in range(0x0660, 0x066A))  # Arabic letters + numerals

# Combine Arabic and English characters
char_dict = english_chars + arabic_chars
num_classes = len(char_dict) + 1  # +1 for the CTC blank label

# Create a mapping from characters to indices
char_to_idx = {char: idx for idx, char in enumerate(char_dict)}
blank_idx = num_classes - 1  # The index for the CTC blank label

def encode_labels(labels):
    labels_idx = []
    for label in labels:
        labeled_output = label_license_plate_text(label)  # Extract Arabic/English letters and numerals
        label_idx = []
        for category, text in labeled_output.items():
            for c in text:
                label_idx.append(char_to_idx.get(c, blank_idx))  # Map characters to indices
        labels_idx.append(label_idx)
    return labels_idx

# Step 3: Define the CRNN Model using ResNet18 as the backbone
class CRNN(nn.Layer):
    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        
        # Backbone: ResNet18
        self.backbone = paddle.vision.models.resnet18(pretrained=True)

        # Reduce channels and features
        self.conv = nn.Conv2D(in_channels=512, out_channels=256, kernel_size=1)

        # LSTM layer (Bidirectional)
        self.lstm = nn.LSTM(input_size=256, hidden_size=128, num_layers=2, direction='bidirectional')

        # Final fully connected layer (CTC Head)
        self.fc = nn.Linear(128 * 2, num_classes)  # 128 * 2 because of bidirectional LSTM

    def forward(self, x):
        # Backbone feature extraction
        x = self.backbone(x)  # Output shape: [batch_size, 512, H, W]

        # Reduce the feature size with a convolutional layer
        x = self.conv(x)  # Output shape: [batch_size, 256, H, W]

        # Reshape to fit LSTM input: flatten height and channels
        b, c, h, w = x.shape
        x = paddle.reshape(x, [b, c * h, w])  # Shape: [batch_size, feature_size, width]

        # Transpose to match LSTM input
        x = x.transpose([2, 0, 1])  # Shape: [width, batch_size, feature_size]

        # LSTM sequence encoding
        x, (h_n, c_n) = self.lstm(x)  # Output: [width, batch_size, hidden_size * 2]

        # Reshape for the fully connected layer
        x = paddle.reshape(x, [-1, x.shape[-1]])  # Shape: [width * batch_size, hidden_size * 2]

        # Apply the fully connected layer
        x = self.fc(x)  # Shape: [width * batch_size, num_classes]

        # Reshape back to the expected shape for CTC loss
        x = paddle.reshape(x, [-1, b, num_classes])  # Shape: [width, batch_size, num_classes]

        return x

# Step 4: Define the training loop
def train_model(model, dataloader, optimizer, loss_fn, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for images, labels in dataloader:
            images = paddle.to_tensor(images, dtype='float32')
            labels_idx = encode_labels(labels)
            labels_concat = [item for sublist in labels_idx for item in sublist]
            labels_tensor = paddle.to_tensor(labels_concat, dtype='int32')
            labels_lengths = paddle.to_tensor([len(label) for label in labels_idx], dtype='int64')

            # Forward pass
            preds = model(images)  # preds shape: [seq_len, batch_size, num_classes]

            # Prepare inputs for CTC Loss
            preds = preds.log_softmax(axis=2)  # Apply log softmax over classes
            preds_lengths = paddle.to_tensor([preds.shape[0]] * preds.shape[1], dtype='int64')  # All sequences have the same length

            # Compute loss
            loss = loss_fn(preds, labels_tensor, preds_lengths, labels_lengths)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            total_loss += loss.numpy()[0]

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}")

# Step 5: Create DataLoader
image_dir = "Dataset/"  # Update this to your dataset path
transform = Compose([
    Resize((32, 320)),  # Resize to match the input shape
    ToTensor()
])
dataset = LicensePlateDataset(image_dir, transform=transform)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Step 6: Initialize model, loss function, and optimizer
model = CRNN(num_classes)
loss_fn = nn.CTCLoss(blank=blank_idx)  # Blank index is num_classes - 1
optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.001)

# Step 7: Train the model
train_model(model, dataloader, optimizer, loss_fn, num_epochs=10)


I0925 19:21:56.641599 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.642033 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.642134 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.642212 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.643707 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.644301 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.644706 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.645464 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.647061 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.647202 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.648202 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:21:56.648403 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I092

ValueError: (InvalidArgument) The input of Op(Conv) should be a 4-D or 5-D Tensor. But received: input's dimension is 2, input's shape is [8, 1000].
  [Hint: Expected in_dims.size() == 4 || in_dims.size() == 5 == true, but received in_dims.size() == 4 || in_dims.size() == 5:0 != true:1.] (at /Users/paddle/xly/workspace/293efbd7-945c-47ab-96a0-e0093f12eab2/Paddle/paddle/phi/infermeta/binary.cc:504)


In [5]:
import os
from PIL import Image
import numpy as np
import paddle
import paddle.nn as nn
from paddle.io import Dataset, DataLoader
from paddle.vision.transforms import Compose, Resize, ToTensor

# Step 1: Define a Dataset
class LicensePlateDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.image_paths = [
            os.path.join(image_dir, f)
            for f in os.listdir(image_dir)
            if f.endswith('.png') or f.endswith('.jpg')
        ]
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        # Load the image
        img = Image.open(img_path).convert('RGB')
        if self.transform:
            img = self.transform(img)
        else:
            img = ToTensor()(img)

        # Get label from filename (assuming filename is label without extension)
        label = os.path.splitext(os.path.basename(img_path))[0]
        return img, label

# Step 2: Define the label processing for Arabic and English
def is_arabic_numeral(char):
    return '\u0660' <= char <= '\u0669'

def label_license_plate_text(recognized_text):
    labeled_output = {
        'English Number': '',
        'English Letter': '',
        'Arabic Number': '',
        'Arabic Letter': '',
        'Unknown': ''
    }

    for word in recognized_text.split():
        for char in word:
            if is_arabic_numeral(char):
                label = 'Arabic Number'
            elif char.isdigit():
                label = 'English Number'
            elif char.isalpha() and char.isascii():
                label = 'English Letter'
            elif char.isalpha():
                label = 'Arabic Letter'
            else:
                label = 'Unknown'
            
            labeled_output[label] += char

    return labeled_output

# Define your character dictionary with Arabic and English letters/numbers
english_chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
arabic_chars = 'ابتثجحخدذرزسشصضطظعغفقكلمنهوي' + ''.join(chr(i) for i in range(0x0660, 0x066A))  # Arabic letters + numerals

# Combine Arabic and English characters
char_dict = english_chars + arabic_chars
num_classes = len(char_dict) + 1  # +1 for the CTC blank label

# Create a mapping from characters to indices
char_to_idx = {char: idx for idx, char in enumerate(char_dict)}
blank_idx = num_classes - 1  # The index for the CTC blank label

def encode_labels(labels):
    labels_idx = []
    for label in labels:
        labeled_output = label_license_plate_text(label)  # Extract Arabic/English letters and numerals
        label_idx = []
        for category, text in labeled_output.items():
            for c in text:
                label_idx.append(char_to_idx.get(c, blank_idx))  # Map characters to indices
        labels_idx.append(label_idx)
    return labels_idx

# Step 3: Define the CRNN Model using ResNet18 as the backbone
class CRNN(nn.Layer):
    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        
        # Backbone: ResNet18
        resnet = paddle.vision.models.resnet18(pretrained=True)
        self.feature_extractor = nn.Sequential(*list(resnet.children())[:-2])  # Remove the last pooling and FC layers

        # Reduce channels and features
        self.conv = nn.Conv2D(in_channels=512, out_channels=256, kernel_size=1)

        # LSTM layer (Bidirectional)
        self.lstm = nn.LSTM(input_size=256 * h, hidden_size=128, num_layers=2, direction='bidirectional')

        # Final fully connected layer (CTC Head)
        self.fc = nn.Linear(128 * 2, num_classes)  # 128 * 2 because of bidirectional LSTM

    def forward(self, x):
        # Backbone feature extraction
        x = self.feature_extractor(x)  # Output shape: [batch_size, 512, H, W]

        # Reduce the feature size with a convolutional layer
        x = self.conv(x)  # Output shape: [batch_size, 256, H, W]

        # Reshape to fit LSTM input: flatten height and channels
        b, c, h, w = x.shape
        x = x.transpose([0, 3, 1, 2])  # [batch_size, W, C, H]
        x = x.reshape([b, w, c * h])   # [batch_size, W, feature_size]

        # Transpose to match LSTM input
        x = x.transpose([1, 0, 2])  # Shape: [W, batch_size, feature_size]

        # LSTM sequence encoding
        x, (h_n, c_n) = self.lstm(x)  # Output: [W, batch_size, hidden_size * 2]

        # Reshape for the fully connected layer
        x = x.reshape([-1, x.shape[-1]])  # Shape: [W * batch_size, hidden_size * 2]

        # Apply the fully connected layer
        x = self.fc(x)  # Shape: [W * batch_size, num_classes]

        # Reshape back to the expected shape for CTC loss
        x = x.reshape([-1, b, num_classes])  # Shape: [W, batch_size, num_classes]

        return x

# Step 4: Define a custom collate function
def custom_collate_fn(batch):
    images = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    images = paddle.stack(images, axis=0)
    return images, labels

# Step 5: Create DataLoader
image_dir = "Dataset/"  # Update this to your dataset path
transform = Compose([
    Resize((32, 320)),  # Resize to match the input shape
    ToTensor()
])
dataset = LicensePlateDataset(image_dir, transform=transform)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)

# Step 6: Initialize model, loss function, and optimizer
model = CRNN(num_classes)
loss_fn = nn.CTCLoss(blank=blank_idx)  # Blank index is num_classes - 1
optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.001)

# Step 7: Train the model
def train_model(model, dataloader, optimizer, loss_fn, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for images, labels in dataloader:
            images = images.astype('float32')
            labels_idx = encode_labels(labels)
            labels_concat = [item for sublist in labels_idx for item in sublist]
            labels_tensor = paddle.to_tensor(labels_concat, dtype='int32')
            labels_lengths = paddle.to_tensor([len(label) for label in labels_idx], dtype='int64')

            # Forward pass
            preds = model(images)  # preds shape: [seq_len, batch_size, num_classes]

            # Prepare inputs for CTC Loss
            preds = preds.log_softmax(axis=2)  # Apply log softmax over classes
            preds_lengths = paddle.to_tensor([preds.shape[0]] * preds.shape[1], dtype='int64')  # All sequences have the same length

            # Compute loss
            loss = loss_fn(preds, labels_tensor, preds_lengths, labels_lengths)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            total_loss += loss.numpy()[0]

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}")

train_model(model, dataloader, optimizer, loss_fn, num_epochs=10)


I0925 19:25:06.812199 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.814056 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.814163 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.814257 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.817037 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.817098 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.817189 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.817260 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.817907 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.818183 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.818241 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I0925 19:25:06.818287 4172401664 kernel_dispatch.h:102] Get BackendSet from tensor
I092

NameError: name 'h' is not defined