# 1 - Packages

In [None]:
import time
import random
import numpy as np
import pickle as pkl
import tensorflow as tf

from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM, Bidirectional, TimeDistributed, Conv1D, Lambda, Concatenate, MultiHeadAttention, LayerNormalization, SpatialDropout1D, Add
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence
from tensorflow.keras.initializers import GlorotNormal
from tensorflow.keras.callbacks import ModelCheckpoint


# 2 - Constants

In [None]:
WITH_EXTRA_TRAIN = False
DATASET_PATH = 'Data'
CONSTANTS_PATH = 'helpers/constants'


def _load_constant(name):
    """Unpickle ``CONSTANTS_PATH/<name>.pickle`` and return the stored object.

    NOTE: unpickling can execute arbitrary code, so CONSTANTS_PATH must only
    contain files from a trusted source.
    """
    with open(CONSTANTS_PATH + '/' + name + '.pickle', 'rb') as file:
        return pkl.load(file)


ARABIC_LETTERS_LIST = _load_constant('ARABIC_LETTERS_LIST')
DIACRITICS_LIST = _load_constant('DIACRITICS_LIST')
# NOTE(review): both settings of WITH_EXTRA_TRAIN loaded this same "BIG"
# mapping, so the flag currently has no effect on the character vocabulary.
# Re-introduce a branch here only if a separate mapping file exists.
CHARACTERS_MAPPING = _load_constant('RNN_BIG_CHARACTERS_MAPPING')
CLASSES_MAPPING = _load_constant('RNN_CLASSES_MAPPING')
REV_CLASSES_MAPPING = _load_constant('RNN_REV_CLASSES_MAPPING')

# Inverse lookup: character id -> character.
REV_CHARACTERS_MAPPING = {value: key for key, value in CHARACTERS_MAPPING.items()}

# 3 - Load Data

In [None]:
# Read all training shards (tashkeela_train_001.txt .. tashkeela_train_279.txt)
# into one flat list of newline-terminated lines, in shard order.
train_raw = []

for shard_number in range(1, 280):
    filename = f"/tashkeela_train/tashkeela_train_{shard_number:03}.txt"
    with open(DATASET_PATH + filename, 'r', encoding='utf-8') as file:
        # Iterating the handle yields the same lines readlines() would return.
        train_raw.extend(file)

In [None]:
# Read the validation shards numbered 1..14 into one flat list of lines.
val_raw = []

for i in range(1, 15):
    with open(DATASET_PATH + f"/tashkeela_val/tashkeela_val_{i:03}.txt", 'r', encoding='utf-8') as file:
        val_raw += file.readlines()

In [None]:
# Raw line counts: training first, then validation.
for raw_lines in (train_raw, val_raw):
    print(len(raw_lines))

2790000
140000


# 4 - Helpers

In [None]:
def remove_diacritics(data_raw):
    """Return ``data_raw`` with every character listed in DIACRITICS_LIST deleted.

    Args:
        data_raw: the string to clean.

    Returns:
        A new string; characters not in DIACRITICS_LIST are kept in order.
    """
    diacritic_chars = ''.join(DIACRITICS_LIST)
    deletion_table = str.maketrans('', '', diacritic_chars)
    return data_raw.translate(deletion_table)

In [None]:
def to_one_hot(data, size):
    """Turn a sequence of class indices into a list of one-hot rows.

    Args:
        data: iterable of integer indices (list indexing rules apply, so a
            negative index counts from the end and an out-of-range index
            raises IndexError).
        size: width of every one-hot row.

    Returns:
        A list with one freshly built list of ``size`` ints per index.
    """
    def encode(index):
        row = [0] * size
        row[index] = 1
        return row

    return [encode(index) for index in data]

# 5 - Prepare Data

In [None]:
# Length budget shared by the data pipeline and the model:
#  - split_data keeps lines whose diacritic-free length is <= this value and
#    regroups the words of longer lines into chunks under it;
#  - map_data pads encoded sequences up to it, DataGenerator truncates/pads to it;
#  - the model's Input layers use it as the fixed sequence length.
max_seq_len = 200

In [None]:
def split_data(data_raw):
    """Break raw lines into examples whose diacritic-free length fits ``max_seq_len``.

    Each input line is split on '\n'. Pieces that are empty once diacritics
    and surrounding whitespace are removed are dropped; pieces that fit are
    kept (stripped); longer pieces are re-assembled word by word into chunks,
    starting a new chunk whenever adding the next word (plus one separator)
    would exceed ``max_seq_len``. A single word is never cut.

    Args:
        data_raw: iterable of text lines.

    Returns:
        List of stripped text chunks, in input order.
    """
    def visible_len(text):
        # Length after deleting diacritics and trimming surrounding whitespace.
        return len(remove_diacritics(text).strip())

    pieces = []

    for raw_line in data_raw:
        for segment in raw_line.split('\n'):
            segment_len = visible_len(segment)
            if segment_len == 0:
                continue

            if segment_len <= max_seq_len:
                pieces.append(segment.strip())
                continue

            # Too long: greedily pack whole words into chunks.
            current = ''
            for word in segment.split():
                current_len = visible_len(current)
                if current_len + visible_len(word) + 1 > max_seq_len:
                    if current_len > 0:
                        pieces.append(current.strip())
                    current = word
                elif current == '':
                    current = word
                else:
                    current = current + ' ' + word

            # Flush whatever is left in the last chunk.
            if visible_len(current) > 0:
                pieces.append(current.strip())

    return pieces

In [None]:
# Split the training lines first, then the validation lines.
train_split, val_split = split_data(train_raw), split_data(val_raw)

In [None]:
# Report how many examples each split produced.
for label, examples in (
    ('Training examples (split):', train_split),
    ('Validation examples (split):', val_split),
):
    print(label, len(examples))

Training examples (split): 2847305
Validation examples (split): 142865


In [None]:
def map_data(data_raw):
    """Encode diacritized lines as character ids and one-hot diacritic labels.

    Diacritic characters are removed from the input sequence and become the
    label of the Arabic letter they follow (up to two diacritics per letter).
    Every other character gets the "no diacritic" label ''. Each sequence is
    wrapped in <SOS>/<EOS> and right-padded with <PAD> up to ``max_seq_len``.

    Characters missing from CHARACTERS_MAPPING are reported and skipped for
    both the input and the label sequence, so the two stay aligned.

    Args:
        data_raw: iterable of diacritized text lines.

    Returns:
        (X, Y) where X[i] is a list of character ids and Y[i] is a list of
        one-hot rows of width len(CLASSES_MAPPING) with the same length as
        X[i]. Sequences already longer than ``max_seq_len`` are returned
        as-is (neither padded nor truncated).
    """
    X = list()
    Y = list()

    for line in data_raw:
        x = [CHARACTERS_MAPPING['<SOS>']]
        y = [CLASSES_MAPPING['<SOS>']]

        for idx, char in enumerate(line):
            # Diacritics are consumed as labels of the preceding letter below.
            if char in DIACRITICS_LIST:
                continue

            try:
                char_id = CHARACTERS_MAPPING[char]
            except KeyError:
                print(f"Error: Character '{char}' not found in CHARACTERS_MAPPING at index {idx} in line: {line}")
                # Skip the character entirely; appending only a label would
                # leave x and y with different lengths.
                continue
            x.append(char_id)

            # Non-Arabic characters never carry a diacritic.
            if char not in ARABIC_LETTERS_LIST:
                y.append(CLASSES_MAPPING[''])
            else:
                char_diac = ''
                if idx + 1 < len(line) and line[idx + 1] in DIACRITICS_LIST:
                    char_diac = line[idx + 1]
                    if idx + 2 < len(line) and line[idx + 2] in DIACRITICS_LIST and char_diac + line[idx + 2] in CLASSES_MAPPING:
                        char_diac += line[idx + 2]
                    elif idx + 2 < len(line) and line[idx + 2] in DIACRITICS_LIST and line[idx + 2] + char_diac in CLASSES_MAPPING:
                        # The two diacritics may appear in either order
                        # (e.g. shadda+fatha equals fatha+shadda); use the
                        # order that exists in CLASSES_MAPPING.
                        char_diac = line[idx + 2] + char_diac
                y.append(CLASSES_MAPPING[char_diac])

        assert(len(x) == len(y))

        x.append(CHARACTERS_MAPPING['<EOS>'])
        y.append(CLASSES_MAPPING['<EOS>'])

        # Right-pad to max_seq_len; a negative pad_len adds nothing.
        pad_len = max_seq_len - len(x)
        x.extend([CHARACTERS_MAPPING['<PAD>']] * pad_len)
        y.extend([CLASSES_MAPPING['<PAD>']] * pad_len)

        y = to_one_hot(y, len(CLASSES_MAPPING))

        X.append(x)
        Y.append(y)
    return X, Y

# 6 - Model Structure

In [None]:
class DataGenerator(Sequence):
    """Serves fixed-length encoded batches built from a list of text lines.

    Indexing with a batch number returns a 3-tuple
    ``(X, Y, lines_no_diac)``:
      * X: array of character ids, one row of ``max_seq_len`` ids per line;
      * Y: array of one-hot labels, ``max_seq_len`` rows per line;
      * lines_no_diac: the batch's lines with diacritics removed.

    NOTE(review): Keras treats a third element yielded by a Sequence as
    sample weights, so index this object directly rather than passing it to
    ``model.fit``.
    """

    def __init__(self, lines, batch_size):
        """Store the source lines and the number of lines per batch."""
        # Keras expects Sequence subclasses to initialise the base class.
        super().__init__()
        self.lines = lines
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches; the last batch may be smaller than batch_size."""
        return int(np.ceil(len(self.lines) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Encode batch ``idx`` and return ``(X, Y, lines_no_diac)``."""
        start = idx * self.batch_size
        lines = self.lines[start:start + self.batch_size]
        X_batch, Y_batch = map_data(lines)
        lines_no_diac = [remove_diacritics(line) for line in lines]

        # Force every id sequence to exactly max_seq_len entries.
        char_pad = CHARACTERS_MAPPING['<PAD>']
        X = list()
        for x in X_batch:
            x = list(x)[:max_seq_len]
            x.extend([char_pad] * (max_seq_len - len(x)))
            X.append(np.asarray(x))

        # Same for the labels; the one-hot <PAD> row is built once and copied.
        label_pad_row = to_one_hot([CLASSES_MAPPING['<PAD>']], len(CLASSES_MAPPING))[0]
        Y = list()
        for y in Y_batch:
            y = list(y)[:max_seq_len]
            y.extend(list(label_pad_row) for _ in range(max_seq_len - len(y)))
            Y.append(np.asarray(y))

        return np.asarray(X), np.asarray(Y), lines_no_diac

In [None]:
# Number of lines per generator batch.
batch_size = 128
# Serves (character ids, one-hot labels, diacritic-free text) per batch index.
training_generator = DataGenerator(train_split, batch_size)

# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
numpy_data = np.load('numpy_data_batch128_max200.npy', allow_pickle=True)
# Width of the word-embedding input, read from axis 2 of the first stored entry.
embedding_dim = numpy_data[0][1].shape[2]

In [None]:
# Two inputs per example, both of fixed length max_seq_len:
#   char_inputs      - one integer character id per position
#   word_embeddings  - one embedding_dim-wide vector per position
char_inputs = Input(shape=(max_seq_len,), name='char_inputs')
word_embeddings_input = Input(shape=(max_seq_len, embedding_dim), name='word_embeddings')

# Learn a 1024-wide embedding for every id in CHARACTERS_MAPPING, then join it
# with the supplied word embedding at each position.
char_embeddings = Embedding(input_dim=len(CHARACTERS_MAPPING), output_dim=1024, embeddings_initializer=GlorotNormal(seed=961))(char_inputs)
merged_embeddings = Concatenate()([char_embeddings, word_embeddings_input])

# Encoder stack: dense projection -> width-3 convolution ('same' padding keeps
# the sequence length) -> two bidirectional LSTMs, each followed by 50% dropout.
# return_sequences=True keeps one output per position throughout.
dense1 = Dense(units=256, activation='relu', kernel_initializer=GlorotNormal(seed=961))(merged_embeddings)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(dense1)
blstm1 = Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer=GlorotNormal(seed=961)))(conv1)
dropout1 = Dropout(0.5)(blstm1)
blstm2 = Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer=GlorotNormal(seed=961)))(dropout1)
dropout2 = Dropout(0.5)(blstm2)
# Per-position classifier head: two ReLU layers, then a softmax over the
# len(CLASSES_MAPPING) label classes.
dense2 = TimeDistributed(Dense(units=512, activation='relu', kernel_initializer=GlorotNormal(seed=961)))(dropout2)
dense3 = TimeDistributed(Dense(units=512, activation='relu', kernel_initializer=GlorotNormal(seed=961)))(dense2)
output = TimeDistributed(Dense(units=len(CLASSES_MAPPING), activation='softmax', kernel_initializer=GlorotNormal(seed=961)))(dense3)

# categorical_crossentropy matches the one-hot targets produced by map_data.
# No metrics are configured, so training reports the loss only.
model = Model(inputs=[char_inputs, word_embeddings_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [None]:
# Train for three passes over the data. For batch i the inputs come from the
# saved .npy files under Embeddings/ and the targets are the one-hot labels
# (element 1) of training_generator[i]; each fit call runs one epoch over that
# batch in mini-batches of 16.
num_batches = len(training_generator)
for _epoch in range(3):
    for i in range(num_batches):
        char_input_data = np.load(f"Embeddings/char_input_data_{i}.npy")
        aligned_word_embeddings = np.load(f"Embeddings/aligned_word_embeddings_{i}.npy")
        target_data = training_generator[i][1]
        model.fit(
            [char_input_data, aligned_word_embeddings],
            target_data,
            epochs=1,
            batch_size=16,
        )





In [None]:
# Write the trained model to 'modelembedd.h5' in the working directory
# (Keras selects the HDF5 format from the .h5 extension).
model.save('modelembedd.h5')