Create a Combined & Clean Training Dataset

In [17]:
# import pandas as pd
# import os

# # Adjust these paths
# train_images_path = "E:/OCR_project/Datasets/archive/Train_deskewed/Train_deskewed/"
# val_images_path = "E:/OCR_project/Datasets/archive/Validate_deskewed"

# # Load CSVs and drop row 0 if it's fake (123456789 row)
# train_df = pd.read_csv("E:/OCR_project/Datasets/archive/data_Set_converting/TrainLabels.csv")
# val_df = pd.read_csv("E:/OCR_project/Datasets/archive/data_Set_converting/ValidateLabels.csv")

# # Drop fake row (index 1 in your case)
# train_df = train_df.iloc[2:].reset_index(drop=True)
# val_df = val_df.iloc[2:].reset_index(drop=True)

# # Add full path to each image
# train_df["Image"] = train_df["Image"].apply(lambda x: os.path.join(train_images_path, x))
# val_df["Image"] = val_df["Image"].apply(lambda x: os.path.join(val_images_path, x))

# # Sort for consistency
# train_df = train_df.sort_values("Image").reset_index(drop=True)
# val_df = val_df.sort_values("Image").reset_index(drop=True)

# import pandas as pd
# import os

# # Paths
# train_images_path = r"E:/OCR_project/Datasets/archive/Train_deskewed/Train_deskewed"
# val_images_path = r"E:/OCR_project/Datasets/archive/Validate_deskewed"

# train_csv_path = r"E:/OCR_project/Datasets/archive/data_Set_converting/TrainLabels.csv"
# val_csv_path = r"E:/OCR_project/Datasets/archive/data_Set_converting/ValidateLabels.csv"

# # Load CSVs
# train_df = pd.read_csv(train_csv_path)
# val_df = pd.read_csv(val_csv_path)

# # Remove invalid rows (where Image column is not a real filename)
# valid_ext = ('.jpg', '.jpeg', '.png')
# train_df = train_df[train_df["Image"].str.lower().str.endswith(valid_ext)].reset_index(drop=True)
# val_df = val_df[val_df["Image"].str.lower().str.endswith(valid_ext)].reset_index(drop=True)

# # Add full path to images
# train_df["Image"] = train_df["Image"].apply(lambda x: os.path.join(train_images_path, x))
# val_df["Image"] = val_df["Image"].apply(lambda x: os.path.join(val_images_path, x))

# # Sort for consistency
# train_df = train_df.sort_values("Image").reset_index(drop=True)
# val_df = val_df.sort_values("Image").reset_index(drop=True)

# # Optional: quick check
# print(f"✅ Train images: {len(train_df)}")
# print(f"✅ Validation images: {len(val_df)}")
import pandas as pd
import os

# Paths (using your provided ones)
# NOTE: the files listed in TrainLabels.csv live in the "Validate_deskewed"
# folder, not in "Train_deskewed" -- the diagnostic cells further down in this
# notebook located them there, so this pairing is intentional.
IMG_DIR = r"E:/OCR_project/Datasets/archive/Validate_deskewed"
csv_path = r"E:/OCR_project/Datasets/archive/data_Set_converting/TrainLabels.csv"

# Load CSV
df = pd.read_csv(csv_path)

# Normalise the file names before filtering: cast to str and strip stray
# whitespace so a name such as "abc.jpg " is neither dropped by the extension
# filter nor turned into a path that does not exist on disk.
df["Image"] = df["Image"].astype(str).str.strip()

# Keep only valid image rows (this also removes the placeholder row whose
# "Image" value is "0")
valid_ext = ('.jpg', '.jpeg', '.png')
df = df[df["Image"].str.lower().str.endswith(valid_ext)].reset_index(drop=True)

# Add full path to images
df["Image"] = df["Image"].apply(lambda x: os.path.join(IMG_DIR, x))

# Sort for consistency
df = df.sort_values("Image").reset_index(drop=True)

# Final check
print(f"✅ Total training images: {len(df)}")
print(df.head())



✅ Total training images: 1900
                                               Image  \
0  E:/OCR_project/Datasets/archive/Validate_deske...   
1  E:/OCR_project/Datasets/archive/Validate_deske...   
2  E:/OCR_project/Datasets/archive/Validate_deske...   
3  E:/OCR_project/Datasets/archive/Validate_deske...   
4  E:/OCR_project/Datasets/archive/Validate_deske...   

                                                Text  
0  طفنا وسعينا مع شيخ. كان جاري في الخيم يتكلم وه...  
1  فإن الله تعالى خلق الخلق برحمته، ومن على عباده...  
2  ما يقدرون به على إصح معايشهم في الدنيا، ويدركو...  
3  من العذاب في اخر، وأفضل ما رزقهم الله تعالى وم...  
4  لجميع اشياء والذي  يقدر أحد في الدنيا على إصح ...  


Extract Character Set and Max Length

In [19]:
# Collect every distinct character that occurs in the training labels,
# in sorted order so the vocabulary is stable between runs
characters = sorted({ch for label in df["Text"] for ch in label})

# Length (in characters) of the longest label
max_length = df["Text"].map(len).max()

print("✅ Unique characters:", characters)
print("✅ Max text length:", max_length)


✅ Unique characters: [' ', '"', '#', '%', '(', ')', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', '_', '،', 'ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي']
✅ Max text length: 109


Prepare Train/Val Split

In [20]:
# X_train_path = train_df["Image"].values
# y_train = train_df["Text"].values

# X_val_path = val_df["Image"].values
# y_val = val_df["Text"].values
# Training data: labels and image paths as plain NumPy arrays
y_train = df["Text"].to_numpy()
X_train_path = df["Image"].to_numpy()



Define the Character Map for CTC (Connectionist Temporal Classification)

In [21]:
import tensorflow as tf
from tensorflow.keras import layers

# Forward lookup: character -> integer id (no mask token reserved)
char_to_num = layers.StringLookup(mask_token=None, vocabulary=characters)

# Inverse lookup: integer id -> character, built from the forward vocabulary
num_to_char = layers.StringLookup(
    invert=True,
    mask_token=None,
    vocabulary=char_to_num.get_vocabulary(),
)


Preprocessing Functions

In [22]:
def distortion_free_resize(image, img_size):
    """Resize an image to fit ``img_size`` without distorting it, then pad.

    Args:
        image: image tensor with three axes (height, width, channels).
        img_size: target size given as ``(width, height)``.

    Returns:
        The image resized with its aspect ratio preserved, padded (tf.pad
        default fill) up to ``(height, width)``, with its first two axes
        transposed and ``tf.image.flip_left_right`` applied afterwards.
    """
    target_w, target_h = img_size
    image = tf.image.resize(image, size=(target_h, target_w), preserve_aspect_ratio=True)

    # Space still missing along each axis after the aspect-preserving resize
    resized_shape = tf.shape(image)
    extra_h = target_h - resized_shape[0]
    extra_w = target_w - resized_shape[1]

    # Split the padding evenly; when the amount is odd the extra pixel goes
    # to the top / left side
    half_h = extra_h // 2
    half_w = extra_w // 2
    top, bottom = half_h + extra_h % 2, half_h
    left, right = half_w + extra_w % 2, half_w

    padded = tf.pad(image, paddings=[
        [top, bottom],
        [left, right],
        [0, 0],
    ])

    # Swap the first two axes, then flip
    swapped = tf.transpose(padded, perm=[1, 0, 2])
    return tf.image.flip_left_right(swapped)

# Target image size handed to the resize step as (width, height); the same two
# values define the model's image input shape.
image_width = 2882
image_height = 46
# Value used to right-pad every encoded label to a fixed length.
padding_token = 99
# Number of samples per batch in the tf.data pipeline.
batch_size = 64

def preprocess_image(image_path, img_size=(image_width, image_height)):
    """Load one image file and turn it into a normalised float tensor.

    Args:
        image_path: path of the image file to read.
        img_size: target ``(width, height)`` forwarded to
            ``distortion_free_resize``.

    Returns:
        A float32 tensor with a single channel, scaled by 1/255.
    """
    raw_bytes = tf.io.read_file(image_path)
    grayscale = tf.image.decode_jpeg(raw_bytes, channels=1)
    resized = distortion_free_resize(grayscale, img_size)
    return tf.cast(resized, tf.float32) / 255.0

def vectorize_label(label, max_len=128):
    """Encode a label string as a fixed-length vector of character ids.

    The string is split into Unicode characters, each character is mapped to
    its integer id with ``char_to_num``, and the result is right-padded with
    ``padding_token`` up to ``max_len`` entries.

    Args:
        label: scalar string tensor holding the UTF-8 encoded label text.
        max_len: length of the returned vector. Defaults to 128, the value
            that was previously hard-coded. It must be at least as large as
            the longest label; a longer label yields a negative pad amount,
            which ``tf.pad`` rejects.

    Returns:
        A 1-D integer tensor of length ``max_len``.
    """
    label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
    length = tf.shape(label)[0]
    # Pad only at the end so the real characters keep their positions
    pad_amount = max_len - length
    label = tf.pad(label, paddings=[[0, pad_amount]], constant_values=padding_token)
    return label

def process_images_labels(image_path, label):
    """Convert one (image path, label text) pair into the model's input dict.

    Returns:
        A dict with the preprocessed image under ``"image"`` and the encoded,
        padded label under ``"label"``.
    """
    return {
        "image": preprocess_image(image_path),
        "label": vectorize_label(label),
    }


Create Datasets with TensorFlow

In [24]:
def prepare_dataset(image_paths, labels):
    """Build the batched tf.data pipeline for a set of images and labels.

    Args:
        image_paths: sequence of image file paths.
        labels: sequence of label strings, aligned with ``image_paths``.

    Returns:
        A ``tf.data.Dataset`` of ``{"image", "label"}`` dicts that is mapped
        in parallel, batched with ``batch_size``, cached and prefetched.
    """
    samples = tf.data.Dataset.from_tensor_slices((image_paths, labels)).map(
        process_images_labels, num_parallel_calls=tf.data.AUTOTUNE
    )
    batched = samples.batch(batch_size)
    return batched.cache().prefetch(tf.data.AUTOTUNE)

# Batched training pipeline built from the path / label arrays
train_ds = prepare_dataset(image_paths=X_train_path, labels=y_train)
# validation_ds = prepare_dataset(X_val_path, y_val)


Modeling

In [28]:
import tensorflow as tf
from tensorflow import keras
import numpy as np


# Custom CTC Layer
class CTCLayer(keras.layers.Layer):
    """Layer that registers the CTC loss and passes the predictions through.

    The layer has no weights. When called it computes the CTC batch cost
    between the labels and the predictions, attaches it to the model with
    ``add_loss`` and returns the predictions unchanged.

    Args:
        name: optional layer name.
        **kwargs: any other base ``Layer`` argument (for example ``trainable``
            or ``dtype``). Accepting these lets the layer be re-created from
            the config stored with a saved model, which passes them back to
            the constructor.
    """

    def __init__(self, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for this batch and return ``y_pred`` as is.

        Args:
            y_true: label ids; axis 0 is the batch, axis 1 the label positions.
            y_pred: predictions; axis 0 is the batch, axis 1 the time steps.
        """
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # Every sample in the batch reports the same (full) sequence lengths,
        # expanded to one (batch, 1) column per length as the loss expects
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        return y_pred  # At test time, just return predictions

# Build Model Function
def build_model():
    """Create and compile the CTC-based OCR model.

    The network is: one convolution + max-pooling + batch-norm stage, a
    reshape that turns the feature map into a sequence, a small dense layer
    with batch-norm, a bidirectional LSTM, and a softmax over the character
    classes. A ``CTCLayer`` on top adds the training loss.

    Returns:
        A compiled ``keras.Model`` named ``"Arabic_OCR"`` taking the inputs
        ``"image"`` and ``"label"``.
    """
    # Inputs: the preprocessed image and the (variable-length) label ids
    image_input = keras.Input(shape=(image_width, image_height, 1), name="image")
    label_input = keras.layers.Input(name="label", shape=(None,))

    # Convolutional feature extractor
    features = keras.layers.Conv2D(
        64, (3, 3), activation="relu", padding="same", name="Conv1"
    )(image_input)
    features = keras.layers.MaxPooling2D((2, 2), name="pool1")(features)
    features = keras.layers.BatchNormalization()(features)

    # The 2x2 pooling halves both spatial dimensions; fold the second one
    # together with the 64 channels into a single feature axis per step
    steps = image_width // 2
    step_features = (image_height // 2) * 64
    features = keras.layers.Reshape(target_shape=(steps, step_features), name="reshape")(features)

    features = keras.layers.Dense(16, activation="relu", name="dense2")(features)
    features = keras.layers.BatchNormalization()(features)

    # Sequence model over the steps
    recurrent = keras.layers.Bidirectional(
        keras.layers.LSTM(128, return_sequences=True, dropout=0.35)
    )
    features = recurrent(features)

    # Per-step class probabilities: vocabulary size plus two extra classes
    num_classes = len(char_to_num.get_vocabulary()) + 2
    probabilities = keras.layers.Dense(num_classes, activation="softmax", name="dense3")(features)

    # Attach the CTC loss; the layer returns the probabilities unchanged
    output = CTCLayer(name="ctc_loss")(label_input, probabilities)

    model = keras.models.Model(
        inputs=[image_input, label_input], outputs=output, name="Arabic_OCR"
    )

    # Adam with an exponentially decaying learning rate
    schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.0001,
        decay_steps=10000,
        decay_rate=0.9
    )
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=schedule))
    return model

# Edit Distance Calculation Function
def calculate_edit_distance(labels, predictions):
    """Mean (unnormalised) edit distance between predictions and labels.

    Args:
        labels: dense tensor of label ids for one batch.
        predictions: per-step class probabilities for the same batch; axis 0
            is the batch, axis 1 the time steps.

    Returns:
        A scalar tensor with the mean edit distance over the batch.
    """
    num_samples, num_steps = predictions.shape[0], predictions.shape[1]
    # Every sample is decoded over the full number of time steps
    input_len = np.ones(num_samples) * num_steps

    # Beam-search decode, keep the best path and cut it to max_length
    decoded_paths = keras.backend.ctc_decode(
        predictions, input_length=input_len, greedy=False, beam_width=100
    )[0]
    decoded_preds = decoded_paths[0][:, :max_length]

    # tf.edit_distance works on sparse tensors
    sparse_labels = tf.sparse.from_dense(labels)
    sparse_preds = tf.sparse.from_dense(decoded_preds)

    per_sample = tf.edit_distance(sparse_preds, sparse_labels, normalize=False)
    return tf.reduce_mean(per_sample)

# Custom Callback to Print Edit Distance
# class EditDistanceCallback(keras.callbacks.Callback):
#     def __init__(self, pred_model):
#         super().__init__()
#         self.prediction_model = pred_model

#     def on_epoch_end(self, epoch, logs=None):
#         edit_distances = []

#         for i in range(len(validation_images)):
#             labels = validation_labels[i]
#             predictions = self.prediction_model.predict(validation_images[i])
#             edit_distances.append(calculate_edit_distance(labels, predictions).numpy())

#         print(f"Mean edit distance for epoch {epoch + 1}: {np.mean(edit_distances):.4f}")
#***********************************************************************
class EditDistanceCallback(keras.callbacks.Callback):
    """Print the mean edit distance on validation batches after each epoch.

    Args:
        pred_model: model that maps a batch of images to per-step class
            probabilities (used through ``predict``).
        val_images: optional sequence of image batches.
        val_labels: optional sequence of label batches, aligned with
            ``val_images``.

    When no validation data is supplied the callback does nothing.
    """

    def __init__(self, pred_model, val_images=None, val_labels=None):
        super().__init__()
        self.prediction_model = pred_model
        self.val_images = val_images
        self.val_labels = val_labels

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the mean edit distance for the finished epoch."""
        # Use explicit None / length checks instead of truthiness: `not x`
        # raises for NumPy arrays and tensors with more than one element.
        if self.val_images is None or self.val_labels is None:
            return  # skip if no validation data
        if len(self.val_images) == 0 or len(self.val_labels) == 0:
            return  # skip if validation data is empty
        edit_distances = []
        for i in range(len(self.val_images)):
            labels = self.val_labels[i]
            predictions = self.prediction_model.predict(self.val_images[i])
            edit_distances.append(calculate_edit_distance(labels, predictions).numpy())
        print(f"Mean edit distance for epoch {epoch + 1}: {np.mean(edit_distances):.4f}")



In [29]:
model = build_model()

# List the layer names, one per line, to confirm the architecture
layer_names = [layer.name for layer in model.layers]
for layer_name in layer_names:
    print(layer_name)


image
Conv1
pool1
batch_normalization_4
reshape
dense2
batch_normalization_5
bidirectional_2
label
dense3
ctc_loss


In [None]:
# 1. Build the model
model = build_model()

# 2. Extract the prediction model from the full model (before CTCLayer).
# Only the image input is used: the "label" input feeds nothing but the CTC
# loss, so wiring it in would force callers of predict() to supply labels.
# `model.inputs` keeps the order given to the constructor: [image, label].
prediction_model = keras.models.Model(
    model.inputs[0],
    model.get_layer(name="dense3").output,
)

# 3. Create the custom callback (no validation batches are passed, so it is
# a no-op until val_images / val_labels are supplied)
edit_distance_callback = EditDistanceCallback(prediction_model)

# 4. Train the model
# - No `validation_data`: `validation_ds` is not created anywhere in this
#   notebook (its construction is commented out), so referencing it raises
#   NameError on a fresh kernel. Re-add the argument once a validation
#   dataset is built.
# - No `shuffle`: Keras ignores that argument for tf.data inputs; shuffling
#   has to be done in the dataset pipeline itself.
history = model.fit(
    train_ds,
    epochs=30,
    callbacks=[edit_distance_callback],
)


Epoch 1/30
[1m 1/30[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m16:02[0m 33s/step - loss: 334656.5625

In [12]:
# Rows of the label table whose image file cannot be found on disk.
# The file names are stored in the "Image" column (the table has no
# "filename" column). os.path.join returns its second argument unchanged when
# that argument is already an absolute path, so this works for bare file names
# as well as for values that were already joined with IMG_DIR.
missing_files = df[~df["Image"].apply(lambda x: os.path.exists(os.path.join(IMG_DIR, x)))]
print(f"Total missing images: {len(missing_files)}")
print(missing_files.head())


NameError: name 'df' is not defined

Fixing the problem: why the image files cannot be found

| Issue                                                        | Fix                                             |
| ------------------------------------------------------------ | ----------------------------------------------- |
| Extra spaces in filename                                     | Strip whitespace: `x.strip()`                   |
| Wrong case (Windows is case-insensitive, but safer to check) | Ensure file names are exact                     |
| Wrong column name                                            | Confirm `Image` column has the filenames        |
| Double slashes or backslashes                                | Use `os.path.join()` instead of manual paths    |
| Encoding issues from CSV                                     | Open CSV in `utf-8` or `windows-1256` if Arabic |


In [1]:
import pandas as pd
import os

# Image directory to check against and the label CSV
IMG_DIR = "E:/OCR_project/Datasets/archive/Train_deskewed/Train_deskewed/"
csv_path = "E:/OCR_project/Datasets/archive/data_Set_converting/TrainLabels.csv"  # Adjust if needed

# Read the label table
df = pd.read_csv(csv_path)

# Flag each row whose image exists under IMG_DIR, then keep the others
is_on_disk = df["Image"].map(lambda name: os.path.exists(os.path.join(IMG_DIR, name)))
missing_files = df[~is_on_disk]

# Report: total count plus the first 10 missing rows
print(f"❌ Total missing image files: {len(missing_files)}")
print(missing_files.head(10))  # Show first 10 missing

# Optional: keep the full list in a CSV file for later review
missing_files.to_csv("missing_files_report.csv", index=False)


❌ Total missing image files: 1901
                    Image                                               Text
0                       0                                          123456789
1  AHTD3A0001_Para1_3.jpg  طفنا وسعينا مع شيخ. كان جاري في الخيم يتكلم وه...
2  AHTD3A0005_Para1_1.jpg  ذهب نوح مظفر ضرغام بصح ب رؤوف بن لؤي رايق ظافر...
3  AHTD3A0005_Para1_2.jpg  بدأت قوافل الحجيج حاج اثر اخر يلبي. عند وصولنا...
4  AHTD3A0005_Para1_3.jpg  يتكلم وهو نائم  بكلمات لا أفهمها مثل انقض بغلس...
5  AHTD3A0047_Para1_1.jpg  ذهب نوح مظفر ضرغام بصحب رؤوف بن لؤي رايق ظافر ...
6  AHTD3A0047_Para1_2.jpg  بدأت قوافل الحجيج حاج إثر آخر يلبي عند وصولنا ...
7  AHTD3A0047_Para1_3.jpg  يتكلم في الخيم يتكلم وهو نائم  بكلمات لا أفهمه...
8  AHTD3A0047_Para1_4.jpg  راجح هل بلغ أصحابنا ظ ع ك  ث خ ض ب س ش ص غ ه أ...
9  AHTD3A0047_Para1_5.jpg  الكلمات التالي لهذا النص مشمش دراق غيظ ناء   ب...


In [2]:
# Compare the first 10 file names from the CSV with the actual directory listing
all_images = set(os.listdir(IMG_DIR))
print("First few mismatches:")
for name in df["Image"].iloc[:10]:
    is_present = name in all_images
    print(f"CSV: '{name}'  →  Exists: {is_present}")


First few mismatches:
CSV: '0'  →  Exists: False
CSV: 'AHTD3A0001_Para1_3.jpg'  →  Exists: False
CSV: 'AHTD3A0005_Para1_1.jpg'  →  Exists: False
CSV: 'AHTD3A0005_Para1_2.jpg'  →  Exists: False
CSV: 'AHTD3A0005_Para1_3.jpg'  →  Exists: False
CSV: 'AHTD3A0047_Para1_1.jpg'  →  Exists: False
CSV: 'AHTD3A0047_Para1_2.jpg'  →  Exists: False
CSV: 'AHTD3A0047_Para1_3.jpg'  →  Exists: False
CSV: 'AHTD3A0047_Para1_4.jpg'  →  Exists: False
CSV: 'AHTD3A0047_Para1_5.jpg'  →  Exists: False


In [3]:
import os

IMG_DIR = r"E:/OCR_project/Datasets/archive/Train_deskewed/Train_deskewed"

# Read the directory listing once, then report its size and a short sample
files = os.listdir(IMG_DIR)
total_files = len(files)
sample_files = files[:10]
print(f"Total files found: {total_files}")
print("Sample:", sample_files)


Total files found: 9522
Sample: ['AHTD3A0008_Para1_1.jpg', 'AHTD3A0008_Para1_5.jpg', 'AHTD3A0008_Para1_6.jpg', 'AHTD3A0008_Para2_1.jpg', 'AHTD3A0008_Para2_2.jpg', 'AHTD3A0008_Para2_3.jpg', 'AHTD3A0008_Para2_4.jpg', 'AHTD3A0008_Para2_5.jpg', 'AHTD3A0008_Para2_6.jpg', 'AHTD3A0008_Para3_1.jpg']


In [4]:
import os

root_dir = r"E:/OCR_project/Datasets/archive"
search_name = "AHTD3A0001_Para1_3.jpg"

# Walk the tree lazily and stop at the first directory containing the file
matching_dirs = (
    dirpath
    for dirpath, _dirnames, filenames in os.walk(root_dir)
    if search_name in filenames
)
first_match = next(matching_dirs, None)
if first_match is not None:
    print("Found at:", first_match)


Found at: E:/OCR_project/Datasets/archive\Validate_deskewed


In [5]:
import pandas as pd
import os

# Image folder that actually holds the files, and the label CSV
IMG_DIR = r"E:/OCR_project/Datasets/archive/Validate_deskewed"
csv_path = r"E:/OCR_project/Datasets/archive/data_Set_converting/TrainLabels.csv"

# Read the labels and normalise the file names (string type, no stray spaces)
df = pd.read_csv(csv_path)
df["Image"] = df["Image"].astype(str).str.strip()

# Keep the rows whose image is not present in IMG_DIR
is_on_disk = df["Image"].map(lambda name: os.path.exists(os.path.join(IMG_DIR, name)))
missing_files = df.loc[~is_on_disk]

print(f"❌ Total missing image files: {len(missing_files)}")
print(missing_files.head(10))


❌ Total missing image files: 1
  Image       Text
0     0  123456789


In [6]:
import pandas as pd
import os

IMG_DIR = r"E:/OCR_project/Datasets/archive/Validate_deskewed"
csv_path = r"E:/OCR_project/Datasets/archive/data_Set_converting/TrainLabels.csv"

# Number of image files in the folder (matched by extension, case-insensitive)
image_exts = ('.jpg', '.jpeg', '.png')
image_count = sum(1 for entry in os.listdir(IMG_DIR) if entry.lower().endswith(image_exts))

# Number of rows in the label CSV
df = pd.read_csv(csv_path)
csv_count = len(df)

print(f"🖼 Images in folder: {image_count}")
print(f"📄 Rows in CSV: {csv_count}")


🖼 Images in folder: 1905
📄 Rows in CSV: 1901


In [28]:
# Number of non-missing values in each column of the label table
df.count()

Image    1901
Text     1901
dtype: int64