Data preprocessing:

Cleaning and Normalizing the Arabic text.

In [None]:
import os
import re

# Root of the directory tree whose .txt files are cleaned and rewritten in place below.
input_dir = "/lip reading/pretrain"

#preprocessing functions
def clean_txt(input_str):
    """Strip non-Arabic noise from a transcript string.

    Parenthesised asides, punctuation, Latin letters and digits are removed,
    then runs of whitespace are collapsed to a single space and the result
    is trimmed.

    Args:
        input_str: Text to clean. Falsy values (``None``, ``""``) and values
            that are not ``str`` are returned unchanged.

    Returns:
        The cleaned string, or ``input_str`` itself when it was not a
        non-empty ``str``.
    """
    if not input_str or not isinstance(input_str, str):
        return input_str

    # Drop parenthesised asides first, while their content is still intact;
    # stripping letters/digits beforehand would leave an empty "()" behind.
    text = re.sub(r" ?\([^)]+\)", "", input_str)
    # Separators become a space so the words on either side are not merged.
    text = re.sub(r"[؟!/:،؛]", " ", text)
    # Remaining punctuation and symbols are removed outright.
    text = re.sub(r"[?@#$%&*+~=><.,]", "", text)
    # Latin letters and digits; for str patterns \d matches every Unicode
    # decimal digit, which includes the Arabic-Indic digits.
    text = re.sub(r"[a-zA-Z\d]", "", text)
    # Collapse whitespace last so the removals above cannot leave double spaces.
    return re.sub(r"\s+", " ", text).strip()

def normalize_arabic(text):
    """Fold Arabic spelling variants onto one canonical form.

    Hamza-carrying alef forms become a bare alef, every word-final yeh /
    alef maqsura is written as ى, and ؤ, ئ, ة and گ are mapped to
    و, ي, ه and ك respectively.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Alef variants collapse onto a bare alef.
    folded = re.sub("[إأآ]", "ا", text)

    # Position-independent single-character folds. ى is mapped to ي here so
    # that the word-final rule below sees one spelling only.
    folded = folded.translate(str.maketrans({
        "ى": "ي",
        "ؤ": "و",
        "ة": "ه",
        "گ": "ك",
    }))

    # A yeh immediately before a word boundary is written as ى.
    folded = re.sub(r"ي\b", "ى", folded)

    # ئ is folded only after the word-final rule has run, so a word-final ئ
    # ends up as ي and is not turned into ى.
    return folded.replace("ئ", "ي")


def preprocess_text_file(file_path):
    """Clean and normalize one UTF-8 text file, overwriting it in place.

    The file is read and trimmed, passed through ``clean_txt`` and then
    ``normalize_arabic``, and the result is written back to the same path.

    Args:
        file_path: Path of the text file to rewrite.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read().strip()

    result = normalize_arabic(clean_txt(raw_text))

    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(result)


# Walk the whole tree under input_dir and rewrite every transcript in place.
for root, dirs, files in os.walk(input_dir):
    for file_name in files:
        if not file_name.endswith('.txt'):
            continue  # only transcripts are preprocessed
        file_path = os.path.join(root, file_name)
        preprocess_text_file(file_path)

print(f"Text preprocessing complete! All .txt files in {input_dir} have been updated.")

Organizing the dataset

In [None]:

import shutil


# Source tree to read from, and destination root for the per-class folders.
input_dir = "/lip reading/pretrain"
output_dir = "/lip reading/cleanD"

# Create the destination root up front; exist_ok makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

def _npy_belongs_to(npy_file, stem):
    """Return True if ``npy_file`` is named after ``stem``, optionally with a suffix.

    ``1.npy`` and ``1_lips.npy`` belong to stem ``1``; ``10.npy`` does not,
    because the character following the stem is alphanumeric.
    """
    npy_stem = os.path.splitext(npy_file)[0]
    if not npy_stem.startswith(stem):
        return False
    rest = npy_stem[len(stem):]
    return not rest or not rest[0].isalnum()


def process_folder(folder_path):
    """Copy each transcript's .npy files into a per-class folder under ``output_dir``.

    Every ``.txt`` file in ``folder_path`` is read as a class label. The
    ``.npy`` files named after that transcript are copied to
    ``output_dir/<label>/``, which is created if needed.

    Args:
        folder_path: Directory holding ``.txt`` files and their ``.npy`` files.
    """
    # List the directory once and split by extension.
    entries = os.listdir(folder_path)
    txt_files = [f for f in entries if f.endswith('.txt')]
    npy_files = [f for f in entries if f.endswith('.npy')]

    for txt_file in txt_files:
        txt_file_path = os.path.join(folder_path, txt_file)
        with open(txt_file_path, 'r', encoding='utf-8') as file:
            class_name = file.read().strip()

        if not class_name:
            # An empty label would resolve to output_dir itself and drop the
            # arrays into the dataset root instead of a class folder.
            print(f"Skipping {txt_file_path}: empty class label")
            continue

        class_folder = os.path.join(output_dir, class_name)
        os.makedirs(class_folder, exist_ok=True)

        # A plain prefix test would also hand "10.npy" to "1.txt"; require the
        # stem to end at a non-alphanumeric boundary.
        stem = os.path.splitext(txt_file)[0]
        for npy_file in npy_files:
            if not _npy_belongs_to(npy_file, stem):
                continue
            src_npy_path = os.path.join(folder_path, npy_file)
            # NOTE(review): the destination keeps the source file name, so two
            # source folders with the same file name and label overwrite each
            # other here -- confirm names are unique across the dataset.
            dest_npy_path = os.path.join(class_folder, npy_file)
            shutil.copy(src_npy_path, dest_npy_path)


# Hand every folder that holds at least one transcript to process_folder.
for root, dirs, files in os.walk(input_dir):
    if not any(f.endswith('.txt') for f in files):
        continue  # nothing to label in this folder
    process_folder(root)

print(f"Dataset processing complete! All files are organized in: {output_dir}")

In [None]:
import os

# Root of the organized dataset (one sub-folder per class)
output_dir = "/lip reading/cleanD"

# Keep only the entries of output_dir that are directories
class_folders = [
    folder
    for folder in os.listdir(output_dir)
    if os.path.isdir(os.path.join(output_dir, folder))
]

# Report how many classes were found, then list them one per line
print(f"Total number of classes: {len(class_folders)}")
print("Class names:")
for class_name in class_folders:
    print(f"- {class_name}")

In [None]:
main_dir = "/lip reading/cleanD"
# os.listdir returns entries in arbitrary order and also returns plain files.
# Keep only directories and sort them so that anything derived from this list
# (such as class indices) is reproducible between runs and machines.
classes = sorted(
    entry
    for entry in os.listdir(main_dir)
    if os.path.isdir(os.path.join(main_dir, entry))
)

In [None]:
# Bare expression: shows the class list as the cell output when run in a notebook.
classes

In [None]:
# Map every class name to its position in `classes`.
class_to_idx = {label: position for position, label in enumerate(classes)}

Finding the maximum shape

In [None]:
import numpy as np

max_shape = None

# Step 1: Identify the maximum shape among all .npy files
for class_dir in class_to_idx:
    class_path = os.path.join(main_dir, class_dir)
    if not os.path.isdir(class_path):  # Ensure it's a directory
        continue
    for npy_file in os.listdir(class_path):
        if not npy_file.endswith('.npy'):
            continue
        file_path = os.path.join(class_path, npy_file)
        data = np.load(file_path)
        if max_shape is None:
            max_shape = data.shape
        elif len(data.shape) != len(max_shape):
            # Comparing shapes of different rank dimension by dimension would
            # either fail with an IndexError or silently shorten max_shape.
            raise ValueError(
                f"{file_path} has shape {data.shape}, which does not have the "
                f"same number of dimensions as the running maximum {max_shape}"
            )
        else:
            # Element-wise maximum of the running shape and this file's shape.
            max_shape = tuple(max(seen, current) for seen, current in zip(max_shape, data.shape))
     


Padding the files to the maximum shape

In [None]:


# Step 2: Pad files to the maximum shape
for class_dir in class_to_idx:
    class_path = os.path.join(main_dir, class_dir)
    if not os.path.isdir(class_path):  # Ensure it's a directory
        continue
    for npy_file in os.listdir(class_path):
        if not npy_file.endswith('.npy'):
            continue
        file_path = os.path.join(class_path, npy_file)
        data = np.load(file_path)

        # Nothing to do for files that already have the target shape; this
        # avoids rewriting them and keeps a re-run of the cell cheap.
        if data.shape == max_shape:
            continue

        if data.ndim != len(max_shape):
            # zip() below would silently drop the extra dimensions.
            raise ValueError(
                f"{file_path} has shape {data.shape}, which cannot be padded to {max_shape}"
            )

        # Calculate padding sizes (zeros are appended at the end of each axis)
        padding = [(0, max_dim - curr_dim) for curr_dim, max_dim in zip(data.shape, max_shape)]
        if any(after < 0 for _, after in padding):
            # max_shape is stale or was computed on a different set of files.
            raise ValueError(
                f"{file_path} has shape {data.shape}, which exceeds the maximum shape {max_shape}"
            )

        # Apply padding
        padded_data = np.pad(data, pad_width=padding, mode='constant', constant_values=0)

        # Overwrite the original file with the padded array
        np.save(file_path, padded_data)

print("Padding completed. All files have the shape:", max_shape)
     
