In [1]:
import time
import random
import numpy as np
import pickle as pkl
from tensorflow.keras.utils import Sequence

# Load Data

In [2]:
WITH_EXTRA_TRAIN = False
DATASET_PATH = 'Data'
CONSTANTS_PATH = 'helpers/constants'


def _load_constant(relative_path):
    """Unpickle and return the object stored at CONSTANTS_PATH + relative_path."""
    # pickle can execute arbitrary code while loading; only use trusted files.
    with open(CONSTANTS_PATH + relative_path, 'rb') as file:
        return pkl.load(file)


ARABIC_LETTERS_LIST = _load_constant('/ARABIC_LETTERS_LIST.pickle')
DIACRITICS_LIST = _load_constant('/DIACRITICS_LIST.pickle')

# NOTE(review): both settings of WITH_EXTRA_TRAIN load the same "BIG" mapping,
# so the flag currently has no effect here -- confirm whether one branch was
# meant to load a different file.
if WITH_EXTRA_TRAIN:
    CHARACTERS_MAPPING = _load_constant('/RNN_BIG_CHARACTERS_MAPPING.pickle')
else:
    CHARACTERS_MAPPING = _load_constant('/RNN_BIG_CHARACTERS_MAPPING.pickle')

CLASSES_MAPPING = _load_constant('/RNN_CLASSES_MAPPING.pickle')
REV_CLASSES_MAPPING = _load_constant('/RNN_REV_CLASSES_MAPPING.pickle')

In [3]:
DATASET_PATH = 'Data'

train_data_raw = []
valid_data_raw = []
test_data_raw = []

# (split name as it appears in the folder/file names, list that receives its lines)
_split_targets = (
    ('train', train_data_raw),
    ('val', valid_data_raw),
    ('test', test_data_raw),
)

for split_name, raw_lines in _split_targets:
    # range(1, 2) yields only 1, so just file number 001 of each split is read.
    for i in range(1, 2):
        filename = f"/tashkeela_{split_name}/tashkeela_{split_name}_{i:03}.txt"
        with open(DATASET_PATH + filename, 'r', encoding='utf-8') as file:
            raw_lines.extend(file.readlines())

In [4]:
def remove_diacritics(data_raw):
    """Return `data_raw` with every character listed in DIACRITICS_LIST deleted.

    Args:
        data_raw: the text (str) to strip.

    Returns:
        A new string; all other characters keep their order.
    """
    diacritic_chars = ''.join(DIACRITICS_LIST)
    # A translation table whose third argument lists the characters to drop.
    deletion_table = str.maketrans('', '', diacritic_chars)
    return data_raw.translate(deletion_table)

In [5]:
def to_one_hot(data, size):
    """One-hot encode a sequence of integer class indices.

    Args:
        data: iterable of integer indices.
        size: length of every one-hot row.

    Returns:
        A list with one row per element of `data`; each row is a list of
        `size` zeros with a single 1 at the element's index.

    Raises:
        IndexError: if an index does not address a position in a row of
            length `size`.
    """
    def _encode(class_index):
        row = [0] * size
        row[class_index] = 1
        return row

    return [_encode(class_index) for class_index in data]

In [6]:
# Length limit shared by the whole pipeline: split_data compares the
# diacritic-free length of each line against it, map_data pads up to it, and
# DataGenerator truncates/pads every sequence to exactly this many entries.
max_seq_len = 200

In [7]:
def split_data(data_raw):
    """Split raw lines into examples bounded by max_seq_len.

    Lengths are measured after removing diacritics and surrounding whitespace.
    Lines that fit are kept whole (stripped). Longer lines are re-assembled
    word by word into chunks, starting a new chunk whenever adding the next
    word (plus one separating space) would exceed max_seq_len. A single word
    that is itself longer than max_seq_len is still emitted as its own chunk.

    Args:
        data_raw: iterable of text lines (str).

    Returns:
        List of non-empty, stripped strings.
    """
    def _bare_len(text):
        # Length once diacritics and leading/trailing whitespace are gone.
        return len(remove_diacritics(text).strip())

    chunks = []

    for raw_line in data_raw:
        for segment in raw_line.split('\n'):
            segment_len = _bare_len(segment)

            # Nothing left after stripping: skip.
            if segment_len == 0:
                continue

            # Short enough: keep the segment as a single example.
            if segment_len <= max_seq_len:
                chunks.append(segment.strip())
                continue

            # Too long: greedily pack whole words into chunks.
            pending = ''
            for word in segment.split():
                if _bare_len(pending) + _bare_len(word) + 1 > max_seq_len:
                    if _bare_len(pending) > 0:
                        chunks.append(pending.strip())
                    pending = word
                elif pending == '':
                    pending = word
                else:
                    pending = pending + ' ' + word

            if _bare_len(pending) > 0:
                chunks.append(pending.strip())

    return chunks

In [8]:
# Turn the raw training and validation lines into length-bounded examples.
# test_data_raw is not split in this cell.
train_split = split_data(train_data_raw)
val_split = split_data(valid_data_raw)

In [9]:
# Report how many examples each split contains after split_data.
print('Training examples (split):', len(train_split))
print('Validation examples (split):', len(val_split))

Training examples (split): 10195
Validation examples (split): 10188


In [10]:
# How map_data works:
# X holds the text mapped to character indices, without diacritics and wrapped in <SOS> and <EOS> tokens.
# Y holds the one-hot encoded diacritic class for each position in X.
# So every character in X has exactly one corresponding diacritic entry in Y,
# which means len(x) == len(y) for every line (X and Y are aligned, not equal).

def map_data(data_raw):
    """Convert diacritized lines into index sequences and one-hot diacritic labels.

    For each line, every non-diacritic character is mapped through
    CHARACTERS_MAPPING and paired with the class (from CLASSES_MAPPING) of the
    one or two diacritics that directly follow it. Non-Arabic characters and
    letters without a diacritic get the '' class. Both sequences are wrapped
    in <SOS>/<EOS> and right-padded with <PAD> up to max_seq_len.

    A character missing from CHARACTERS_MAPPING is reported with `print` and
    skipped in both sequences so they stay aligned.

    Args:
        data_raw: iterable of diacritized text lines (str).

    Returns:
        (X, Y): X is a list of lists of character indices. Y is a list of
        lists of one-hot rows (each len(CLASSES_MAPPING) wide), one row per
        entry of the matching X sequence.

    Raises:
        KeyError: propagated from CLASSES_MAPPING when a diacritic
            (combination) has no class.
    """
    X = list()
    Y = list()

    for line in data_raw:
        x = [CHARACTERS_MAPPING['<SOS>']]
        y = [CLASSES_MAPPING['<SOS>']]

        for idx, char in enumerate(line):
            # Diacritics are never inputs; they are read as labels of the
            # preceding letter further down.
            if char in DIACRITICS_LIST:
                continue

            try:
                char_index = CHARACTERS_MAPPING[char]
            except KeyError:
                print(f"Error: Character '{char}' not found in CHARACTERS_MAPPING at index {idx} in line: {line}")
                # Skip the character in BOTH sequences. Appending only a label
                # would leave y one entry longer than x and trip the
                # alignment assertion below.
                continue
            x.append(char_index)

            # Anything that is not an Arabic letter carries no diacritic.
            if char not in ARABIC_LETTERS_LIST:
                y.append(CLASSES_MAPPING[''])
                continue

            char_diac = ''
            if idx + 1 < len(line) and line[idx + 1] in DIACRITICS_LIST:
                char_diac = line[idx + 1]
                if idx + 2 < len(line) and line[idx + 2] in DIACRITICS_LIST:
                    second_diac = line[idx + 2]
                    # A two-diacritic combination may be written in either
                    # order (e.g. shadda+fatha == fatha+shadda); use whichever
                    # order CLASSES_MAPPING knows.
                    if char_diac + second_diac in CLASSES_MAPPING:
                        char_diac += second_diac
                    elif second_diac + char_diac in CLASSES_MAPPING:
                        char_diac = second_diac + char_diac
            y.append(CLASSES_MAPPING[char_diac])

        assert len(x) == len(y)

        x.append(CHARACTERS_MAPPING['<EOS>'])
        y.append(CLASSES_MAPPING['<EOS>'])

        # Right-pad with '<PAD>'. When the sequence is already longer than
        # max_seq_len, pad_len is negative and list repetition yields [],
        # so no padding is added and the sequence keeps its extra length.
        pad_len = max_seq_len - len(x)
        x.extend([CHARACTERS_MAPPING['<PAD>']] * pad_len)
        y.extend([CLASSES_MAPPING['<PAD>']] * pad_len)

        y = to_one_hot(y, len(CLASSES_MAPPING))

        X.append(x)
        Y.append(y)

    return X, Y

# Load The Embedding Model

In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the SentenceTransformer model identified by 'intfloat/multilingual-e5-small'.
# generate_word_embeddings calls its `encode` method on the words of each sentence.
embedding_model = SentenceTransformer('intfloat/multilingual-e5-small')

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def generate_word_embeddings(sentences):
    """Embed every whitespace-separated word of every sentence.

    Args:
        sentences: iterable of strings.

    Returns:
        A list with one `(words, embeddings)` tuple per sentence, where
        `words` is `sentence.split()` and `embeddings` is whatever
        `embedding_model.encode(words)` returns for that word list.
    """
    def _embed_sentence(sentence):
        tokens = sentence.split()  # whitespace tokenization
        return tokens, embedding_model.encode(tokens)

    return [_embed_sentence(sentence) for sentence in sentences]

In [13]:
def align_embeddings_with_characters(sentences, word_embeddings_data, max_seq_len, embedding_dim, char_offset=0):
    """Spread word embeddings over the character positions of their words.

    For sentence `i`, every character position covered by a word receives that
    word's embedding; positions not covered by any word (whitespace, padding)
    stay all-zero. Word positions are looked up in the sentence itself, so the
    whitespace between words is accounted for.

    Args:
        sentences: sequence of strings, one per output row.
        word_embeddings_data: sequence aligned with `sentences`; entry `i` is
            a `(words, embeddings)` pair as produced by
            `generate_word_embeddings`.
        max_seq_len: number of character positions per sentence in the output.
        embedding_dim: size of one embedding vector.
        char_offset: number of positions to shift every word to the right,
            e.g. 1 when the matching character sequence starts with a single
            leading token such as <SOS>. Defaults to 0 (no shift).

    Returns:
        np.ndarray of shape (len(sentences), max_seq_len, embedding_dim).

    Raises:
        ValueError: if `char_offset` is negative.
    """
    if char_offset < 0:
        raise ValueError("char_offset must be non-negative")

    aligned_embeddings = np.zeros((len(sentences), max_seq_len, embedding_dim))

    for i, sentence in enumerate(sentences):
        words, embeddings = word_embeddings_data[i]
        cursor = 0  # index in `sentence` where the search for the next word starts
        for word, embedding in zip(words, embeddings):
            found_at = sentence.find(word, cursor)
            # If the word is not present in the sentence, fall back to
            # placing it directly after the previous word.
            word_start = found_at if found_at != -1 else cursor
            cursor = word_start + len(word)

            first = word_start + char_offset
            last = first + len(word)
            if last > max_seq_len:
                break  # this word (and everything after it) does not fit
            aligned_embeddings[i, first:last] = embedding

    return aligned_embeddings

In [14]:
class DataGenerator(Sequence):
    """Batch provider that turns diacritized lines into fixed-length arrays.

    Indexing the generator (`generator[idx]`) maps the lines of batch `idx`
    with `map_data` and truncates/pads every sequence to exactly
    `max_seq_len` entries.
    """

    def __init__(self, lines, batch_size, **kwargs):
        """Store the lines and the batch size.

        Args:
            lines: sequence of diacritized text lines.
            batch_size: number of lines per batch.
            **kwargs: forwarded unchanged to the `Sequence` base initializer.
        """
        # Keras 3's Sequence (PyDataset) expects subclasses to run the base
        # initializer; with no kwargs this is a no-op on older Keras versions.
        super().__init__(**kwargs)
        self.lines = lines
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches (the last one may be partial)."""
        return int(np.ceil(len(self.lines) / float(self.batch_size)))

    @staticmethod
    def _fit_to_length(sequence, pad_value, length):
        """Return `sequence` as a list cut or right-padded to exactly `length` items."""
        fitted = list(sequence)[:length]
        fitted.extend([pad_value] * (length - len(fitted)))
        return fitted

    def __getitem__(self, idx):
        """Build batch `idx`.

        Returns:
            (X, Y_batch, lines_no_diac):
            X is an array of character indices, (batch, max_seq_len);
            Y_batch is an array of one-hot diacritic classes,
            (batch, max_seq_len, len(CLASSES_MAPPING));
            lines_no_diac is the list of the batch's lines with diacritics
            removed.
        """
        start = idx * self.batch_size
        lines = self.lines[start:start + self.batch_size]
        X_batch, Y_batch = map_data(lines)
        lines_no_diac = [remove_diacritics(line) for line in lines]

        pad_char = CHARACTERS_MAPPING['<PAD>']
        # One-hot row for the '<PAD>' class, reused for every padded position.
        pad_class_row = to_one_hot([CLASSES_MAPPING['<PAD>']], len(CLASSES_MAPPING))[0]

        # map_data can return sequences longer than max_seq_len (long line
        # plus <SOS>/<EOS>), so every sequence is cut to max_seq_len here.
        X = np.asarray([
            np.asarray(self._fit_to_length(x, pad_char, max_seq_len))
            for x in X_batch
        ])
        Y_batch = np.asarray([
            np.asarray(self._fit_to_length(y, pad_class_row, max_seq_len))
            for y in Y_batch
        ])

        return X, Y_batch, lines_no_diac

# Creating the Embeddings

In [15]:
# Number of lines DataGenerator puts into each batch.
batch_size = 128

# Batches over the split training lines; training_generator[i] builds batch i on demand.
training_generator = DataGenerator(train_split, batch_size)

In [16]:
# Build batch 0 once: every training_generator[...] access re-runs map_data
# for the whole batch, so the result is kept and indexed instead.
first_batch = training_generator[0]

# Character index vectors of batch 0, shape (batch_size, max_seq_len).
char_input_data = first_batch[0]

# One (words, embeddings) pair per sentence of batch 0. For example, if the
# third sentence has 7 words then len(word_embeddings_data[2][0]) == 7 and
# word_embeddings_data[2][1] holds 7 embedding vectors (7 x embedding dim).
word_embeddings_data = generate_word_embeddings(first_batch[2])

In [18]:
# The embedding size is read from the first sentence's embedding matrix (second axis).
embedding_dim = word_embeddings_data[0][1].shape[1]
# Result shape: (batch_size, max_seq_len, embedding_dim), i.e. (128, 200, 384) here.
# Along the max_seq_len axis, each entry is the embedding of the word that the
# character at that position belongs to.
aligned_word_embeddings = align_embeddings_with_characters(training_generator[0][2], word_embeddings_data, max_seq_len, embedding_dim)

In [19]:
# np.array copies its input. Both values are already ndarrays here
# (DataGenerator.__getitem__ returns np.asarray(...) and the aligner fills an
# np.zeros array), so these calls only produce copies.
char_input_data = np.array(char_input_data)
aligned_word_embeddings = np.array(aligned_word_embeddings)

In [20]:
# Save the batch-0 arrays as .npy files (relative paths, so they are written
# to the current working directory).
np.save('char_input_data.npy', char_input_data)
np.save('aligned_word_embeddings.npy', aligned_word_embeddings)

In [75]:
# Simulate the loading step: read back the two .npy files written by the save cell.
char_input_data = np.load('char_input_data.npy')
aligned_word_embeddings = np.load('aligned_word_embeddings.npy')

In [54]:
# Generate the character indices and aligned word embeddings for every
# training batch and store them in one object array.
training_generator = DataGenerator(train_split, batch_size)

# Row b holds batch b: column 0 = character indices, column 1 = aligned embeddings.
numpy_data = np.zeros((len(training_generator), 2), dtype=object)

for batch in range(len(training_generator)):  # For each batch
    # Index the generator once per batch: every training_generator[batch]
    # access re-runs map_data for the whole batch.
    char_input_data, _batch_labels, batch_sentences = training_generator[batch]

    word_embeddings_data = generate_word_embeddings(batch_sentences)
    embedding_dim = word_embeddings_data[0][1].shape[1]
    aligned_word_embeddings = align_embeddings_with_characters(batch_sentences, word_embeddings_data, max_seq_len, embedding_dim)

    # Both values are fresh ndarrays for this batch, so they are stored as-is.
    numpy_data[batch, 0] = char_input_data
    numpy_data[batch, 1] = aligned_word_embeddings

# Save the generated data (an object array, so loading it needs allow_pickle=True).
np.save('numpy_data_batch128_max200.npy', numpy_data)


In [None]:
# Load data
# allow_pickle=True is needed because numpy_data was created with dtype=object.
# Unpickling can execute arbitrary code, so only load files you produced yourself.
numpy_data = np.load('numpy_data_batch128_max200.npy', allow_pickle=True)

In [59]:
# Take batch 0 back out of the loaded array: column 0 holds the character
# indices, column 1 the aligned word embeddings.
char_input_data = numpy_data[0][0]
aligned_word_embeddings = numpy_data[0][1]

(128, 200, 384)