# 1 - Import Required Packages

In [1]:
import time
import random
import numpy as np
import pickle as pkl
import tensorflow as tf

from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM, Bidirectional, TimeDistributed, Conv1D, Lambda, Concatenate, MultiHeadAttention, LayerNormalization, SpatialDropout1D, Add
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence
from tensorflow.keras.initializers import GlorotNormal
from tensorflow.keras.callbacks import ModelCheckpoint


# 2 - Load Constants

In [9]:
DATASET_PATH = '../Data'
CONSTANTS_PATH = '../helpers/constants'


def _load_pickle(name):
    """
    Load one pickled constant from the constants directory
    Input: name: File name inside CONSTANTS_PATH, without the '.pickle' extension
    Output: The unpickled Python object
    """

    # NOTE: pickle.load can execute arbitrary code while loading; only point
    # CONSTANTS_PATH at trusted, project-owned files.
    with open(CONSTANTS_PATH + '/' + name + '.pickle', 'rb') as file:
        return pkl.load(file)


ARABIC_LETTERS_LIST = _load_pickle('ARABIC_LETTERS_LIST')
DIACRITICS_LIST = _load_pickle('DIACRITICS_LIST')
CHARACTERS_MAPPING = _load_pickle('RNN_BIG_CHARACTERS_MAPPING')
CLASSES_MAPPING = _load_pickle('RNN_CLASSES_MAPPING')
REV_CLASSES_MAPPING = _load_pickle('RNN_REV_CLASSES_MAPPING')

# Inverse lookup of CHARACTERS_MAPPING (mapped value -> original key)
REV_CHARACTERS_MAPPING = {value: key for key, value in CHARACTERS_MAPPING.items()}

# 3 - Load Data

In [10]:
batch_size = 50  # number of files read per chunk
Number_of_training_files = 280  # exclusive upper bound: files 001..279 are read
train_raw = []

# Walk the file numbers chunk by chunk; every line of every file ends up in train_raw
for chunk_start in range(1, Number_of_training_files, batch_size):
    # Clamp the chunk so it never runs past the last file number
    chunk_stop = min(chunk_start + batch_size, Number_of_training_files)
    batch_files = []
    for j in range(chunk_start, chunk_stop):
        filename = f"/tashkeela_train/tashkeela_train_{j:03}.txt"
        with open(DATASET_PATH + filename, 'r', encoding='utf-8') as file:
            batch_files += file.readlines()
    train_raw += batch_files

In [11]:
val_raw = []
Number_of_validation_files = 16  # exclusive upper bound: files 001..015 are read

# Collect every line of every validation file into one flat list
for i in range(1, Number_of_validation_files):
    filename = f"/tashkeela_val/tashkeela_val_{i:03}.txt"
    with open(DATASET_PATH + filename, 'r', encoding='utf-8') as file:
        val_raw += file.readlines()

In [12]:
test_raw = []
Number_of_test_files = 16  # exclusive upper bound: files 001..015 are read

# Collect every line of every test file into one flat list
for i in range(1, Number_of_test_files):
    filename = f"/tashkeela_test/tashkeela_test_{i:03}.txt"
    with open(DATASET_PATH + filename, 'r', encoding='utf-8') as file:
        test_raw += file.readlines()

In [13]:
# Number of raw lines per split, one count per output line (train, val, test)
print(len(train_raw), len(val_raw), len(test_raw), sep='\n')

2790000
150000
150000


# 4 - Helper Functions

In [14]:
def remove_diacritics(Diacritized_data):
    """
    Strip every diacritic listed in DIACRITICS_LIST from a string
    Input: Diacritized_data: String that may contain diacritics
    Output: The same string with all occurrences of each diacritic removed
    """

    stripped = Diacritized_data
    for mark in DIACRITICS_LIST:
        # Replacing an absent mark is a no-op, so only touch the string when needed
        if mark in stripped:
            stripped = stripped.replace(mark, '')
    return stripped

In [15]:
def to_one_hot(data, size):
    """
    One-hot encode a list of class indices
    Used to turn the integer labels into the vectors expected by the model during training
    Input: data: List of integers (class indices)
    size: Length of each one hot vector
    Output: List with one one-hot vector (list of 0/1 integers) per element of data
    """

    def encode(index):
        # A fresh row per label so the rows never share storage
        row = [0] * size
        row[index] = 1
        return row

    return [encode(index) for index in data]

# 5 - Data Pre-processing

In [23]:
# Fixed sequence length: used as the model's input length and as the length
# every encoded line is truncated/padded to.
max_seq_len = 400

In [17]:
def split_data(data_raw, max_seq_len=400):
    """
    Break the raw lines into pieces whose undiacritized length fits in max_seq_len
    Lines that already fit are kept whole (stripped); longer lines are rebuilt greedily
    word by word, starting a new piece whenever adding the next word would exceed the limit
    Input: data_raw: List of strings
    max_seq_len: Maximum length of a piece, measured without diacritics
    Output: List of non-empty strings; a single word longer than max_seq_len is kept as its own piece
    """

    def bare_len(text):
        # Length of the text once diacritics and surrounding whitespace are removed
        return len(remove_diacritics(text).strip())

    pieces = []

    for raw_line in data_raw:
        for segment in raw_line.split('\n'):
            segment_len = bare_len(segment)

            # Nothing left but diacritics/whitespace: drop it
            if segment_len == 0:
                continue

            # Short enough: keep the segment as is
            if segment_len <= max_seq_len:
                pieces.append(segment.strip())
                continue

            # Too long: accumulate words until the next one would overflow
            buffer = ''
            for word in segment.split():
                buffer_len = bare_len(buffer)
                # The + 1 accounts for the joining space
                if buffer_len + bare_len(word) + 1 > max_seq_len:
                    if buffer_len > 0:
                        pieces.append(buffer.strip())
                    buffer = word
                elif buffer == '':
                    buffer = word
                else:
                    buffer = buffer + ' ' + word

            # Flush whatever is left after the last word
            if bare_len(buffer) > 0:
                pieces.append(buffer.strip())

    return pieces

In [19]:
# Use the shared constant so the split length cannot drift from the model input length
train_split = split_data(train_raw, max_seq_len)
val_split = split_data(val_raw, max_seq_len)

In [20]:
# Report how many examples each split holds after splitting long lines
for label, examples in (('Training examples (split):', train_split),
                        ('Validation examples (split):', val_split)):
    print(label, len(examples))

Training examples (split): 2795219
Validation examples (split): 150270


In [21]:
def map_data(data_raw, max_seq_len=400):
    """
    Map the data to the required format for training
    Every line becomes a sequence of character ids (diacritics removed) wrapped in <SOS>/<EOS>,
    together with one diacritic class per character, both truncated/padded to max_seq_len
    Input: data_raw: List of strings
    max_seq_len: Length every output sequence is truncated or padded to
    Output: X: List of mapped strings without diacritics and with <SOS> and <EOS> tokens
    Y: List of one hot encoded vectors for each character in X

    Notes:
    - Characters missing from CHARACTERS_MAPPING are reported and skipped in both X and Y,
      so callers aligning the outputs with the original text must skip them as well
    - Sequences longer than max_seq_len are cut after <EOS> is appended, so the <EOS> token
      (and trailing characters) can be dropped for long lines
    """

    X = list()
    Y = list()

    for line in data_raw:
        x = [CHARACTERS_MAPPING['<SOS>']]
        y = [CLASSES_MAPPING['<SOS>']]

        for idx, char in enumerate(line):
            # Diacritics are never inputs; they are read as labels of the preceding letter below
            if char in DIACRITICS_LIST:
                continue

            # Unknown character: report it and skip it entirely. Appending a label without
            # an input would break the x/y alignment and trip the assertion below.
            if char not in CHARACTERS_MAPPING:
                print(f"Error: Character '{char}' not found in CHARACTERS_MAPPING at index {idx} in line: {line}")
                continue

            x.append(CHARACTERS_MAPPING[char])

            # if char wasn't a diacritic and wasn't an arabic letter add '' to y (no diacritic)
            if char not in ARABIC_LETTERS_LIST:
                y.append(CLASSES_MAPPING[''])
            # if char was an arabic letter only.
            else:
                char_diac = ''
                if idx + 1 < len(line) and line[idx + 1] in DIACRITICS_LIST:
                    char_diac = line[idx + 1]
                    # A second diacritic may follow; accept the pair in whichever order is a known class
                    if idx + 2 < len(line) and line[idx + 2] in DIACRITICS_LIST and char_diac + line[idx + 2] in CLASSES_MAPPING:
                        char_diac += line[idx + 2]
                    elif idx + 2 < len(line) and line[idx + 2] in DIACRITICS_LIST and line[idx + 2] + char_diac in CLASSES_MAPPING: # shadda + fatha is equivalent to fatha + shadda
                        char_diac = line[idx + 2] + char_diac
                y.append(CLASSES_MAPPING[char_diac])

        # Every input position must have exactly one label
        assert(len(x) == len(y))

        x.append(CHARACTERS_MAPPING['<EOS>'])
        y.append(CLASSES_MAPPING['<EOS>'])

        x = x[:max_seq_len]
        y = y[:max_seq_len]

        # Padding
        pad_len = max_seq_len - len(x)
        x.extend([CHARACTERS_MAPPING['<PAD>']] * pad_len)  # Pad with '<PAD>' token
        y.extend([CLASSES_MAPPING['<PAD>']] * pad_len)  # Pad with '<PAD>' token

        y = to_one_hot(y, len(CLASSES_MAPPING))

        X.append(x)
        Y.append(y)

    return X, Y

In [22]:
class DataGenerator(Sequence):
    """
    Data generator that encodes the text lines lazily and feeds them to the model in batches
    Attributes:
    lines: List of strings
    batch_size: Integer
    max_seq_len: Integer or None; when None the module-level max_seq_len is used

    Methods:
    __len__: Returns the number of batches
    __getitem__: Returns the batch at the given index

    Output:
    X: Array of mapped strings without diacritics and with <SOS> and <EOS> tokens
    Y: Array of one hot encoded vectors for each character in X
    """

    def __init__(self, lines, batch_size, max_seq_len=None):
        # Newer Keras versions expect Sequence/PyDataset subclasses to run the base
        # constructor; on older versions this call is a harmless no-op.
        super().__init__()
        self.lines = lines
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len

    def __len__(self):
        # The last batch may be smaller than batch_size, hence the ceiling
        return int(np.ceil(len(self.lines) / float(self.batch_size)))

    def __getitem__(self, idx):
        # Fall back to the notebook-level constant when no explicit length was given
        seq_len = self.max_seq_len if self.max_seq_len is not None else max_seq_len

        lines = self.lines[idx * self.batch_size:(idx + 1) * self.batch_size]
        X_batch, Y_batch = map_data(lines, seq_len)

        X = np.asarray(X_batch)
        Y = np.asarray(Y_batch)

        return X, Y

# 6 - Model Architecture

In [24]:
# One integer character id per position, for sequences of exactly max_seq_len positions
inputs = Input(shape=(max_seq_len,))

# Learn a 1024-dimensional vector for each entry of CHARACTERS_MAPPING
embeddings = Embedding(input_dim=len(CHARACTERS_MAPPING),
                        output_dim=1024,
                        embeddings_initializer=GlorotNormal(seed=961))(inputs)

# Width-3 convolution over neighbouring characters; 'same' padding keeps the sequence length
conv1 = Conv1D(filters=512, kernel_size=3, activation='relu', padding='same')(embeddings)
norm1 = LayerNormalization()(conv1)


# First BLSTM layer: 256 units per direction. The residual Add below needs the BLSTM
# output to have the same feature size as norm1 (512), which holds when the two
# directions are concatenated (presumably Bidirectional's default merge mode).
blstm1 = Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer=GlorotNormal(seed=961)))(norm1)
dropout1 = Dropout(0.6)(blstm1)
res1 = Add()([norm1, dropout1])
norm2 = LayerNormalization()(res1)

# Second BLSTM layer, again wrapped in a residual connection (no normalization afterwards)
blstm2 = Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer=GlorotNormal(seed=961)))(norm2)
dropout2 = Dropout(0.6)(blstm2)
res2 = Add()([norm2, dropout2])


# Dense layers, applied independently at every sequence position
dense2 = TimeDistributed(Dense(units=512, activation='relu', kernel_initializer=GlorotNormal(seed=961)))(res2)
dense3 = TimeDistributed(Dense(units=512, activation='relu', kernel_initializer=GlorotNormal(seed=961)))(dense2)

# Output layer: a softmax over the entries of CLASSES_MAPPING for every position
output = TimeDistributed(Dense(units=len(CLASSES_MAPPING), activation='softmax', kernel_initializer=GlorotNormal(seed=961)))(dense3)

model = Model(inputs, output)

# Categorical cross-entropy pairs with one-hot targets; Adam is used with its default settings.
# NOTE(review): no masking or sample weighting is configured in this cell, so padded
# positions appear to contribute to the loss - confirm this is intended.
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [25]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 400)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 400, 1024)    93184       ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 400, 512)     1573376     ['embedding[0][0]']              
                                                                                                  
 layer_normalization (LayerNorm  (None, 400, 512)    1024        ['conv1d[0][0]']                 
 alization)                                                                                   

# 7 - Training

In [26]:
def fit_model(model, epochs, batch_size, train_split, val_split):
    """
    Fit the model on the training data using the given number of epochs and batch size using DataGenerator objects
    The lines are shuffled once before training; the caller's lists are left untouched

    Input: model: Model
    epochs: Integer
    batch_size: Integer
    train_split: List of strings
    val_split: List of strings
    Output: The value returned by model.fit (the training history)
    """

    # Shuffle copies instead of the arguments: shuffling in place would silently reorder
    # the notebook-level lists and make re-running the training cell non-idempotent.
    train_lines = list(train_split)
    val_lines = list(val_split)
    random.shuffle(train_lines)
    random.shuffle(val_lines)

    training_generator = DataGenerator(train_lines, batch_size)
    val_generator = DataGenerator(val_lines, batch_size)

    # Hand the history back so the loss curves can be inspected after training
    return model.fit(training_generator, validation_data=val_generator, epochs=epochs)

In [27]:
# tf.config.list_physical_devices is the stable API; the `experimental` alias is legacy
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [None]:
# Train for 5 epochs with batches of 128 lines
fit_model(model, epochs=5, batch_size=128, train_split=train_split, val_split=val_split)

# 8 - Evaluation

In [29]:
def predict(line, model, max_seq_len):
    """
    Predict the diacritics for the given line using the model
    Diacritics already present in the input are dropped and replaced by the predicted ones
    Characters beyond the model window (max_seq_len positions including <SOS>) are copied
    through without diacritics

    Input: line: String
    model: Model
    max_seq_len: Sequence length the model was built with
    Output: String with diacritics
    """

    X, _ = map_data([line], max_seq_len)
    # Explicit (1, max_seq_len) array; take the single sequence of the batch
    predictions = model.predict(np.asarray(X))[0]
    # Drop the <SOS> position so that row k belongs to the k-th encoded character
    predictions = predictions[1:]

    special_tokens = ('<SOS>', '<EOS>', '<PAD>')

    diacritized_line = ''
    position = 0  # row of `predictions` that belongs to the current character
    for char in line:
        # Diacritics are not encoded as inputs, so they own no prediction row;
        # keeping them would also duplicate the predicted marks.
        if char in DIACRITICS_LIST:
            continue

        diacritized_line += char

        # Characters without an id never enter X, so they consume no prediction row
        if char not in CHARACTERS_MAPPING:
            continue

        # Guard against lines longer than the model window (truncated by map_data)
        if char in ARABIC_LETTERS_LIST and position < len(predictions):
            max_idx = np.argmax(predictions[position])
            diacritic = REV_CLASSES_MAPPING[max_idx]
            # Never write the literal special-token names into the output text
            if diacritic not in special_tokens:
                diacritized_line += diacritic

        position += 1

    return diacritized_line

In [31]:
# Load the trained model from its .h5 file; this rebinds `model` to the loaded model.
# Forward slashes avoid the invalid "\m" / "\s" escape sequences of the backslash form
# and match the other relative paths in this notebook.
model = tf.keras.models.load_model('../models/shape400withResiduals2.h5')

In [42]:
# Diacritize one undiacritized example sentence with the loaded model.
# Note: this rebinds `output`, which earlier referred to the model's output tensor.
output = predict("إن الذي ملأ اللغات محاسنا ... جعل الجمال وسره في الضاد.", model, 400)



In [43]:
# Display the diacritized sentence
output

'إِنَّ الَّذِي مَلَأَ اللُّغَاتِ مَحَاسِنًا ... جَعَلَ الجَمَالَ وَسِرَّهُ فِي الضَّادِ.'