In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import pickle
import regex as re
import nltk
import os

# Choose the CUDA async allocator before the first TensorFlow runtime call.
# The variable only has an effect if it is set before TensorFlow sets up its
# GPU allocator, so it must not come after clear_session().
# NOTE(review): exporting it before `import tensorflow` would be safer still.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

# Drop any Keras state left over from a previous run of this kernel.
tf.keras.backend.clear_session()

2023-12-10 04:47:30.305912: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-10 04:47:30.305988: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-10 04:47:30.341910: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-10 04:47:30.417140: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Locations of the three corpus splits. Despite the `_dir` suffix these are
# file paths: each one is passed straight to open() when the text is read.
train_dir = 'Dataset/train_clean.txt'
val_dir = 'Dataset/val_clean.txt'
test_dir = 'Dataset/test_clean.txt'

In [3]:
def _read_utf8(path):
    """Return the whole content of the file at `path`, decoded as UTF-8."""
    with open(path, 'r', encoding='utf8') as handle:
        return handle.read()


# Each split is held in memory as one long string.
train = _read_utf8(train_dir)
val = _read_utf8(val_dir)
test = _read_utf8(test_dir)

In [4]:
def remove_diacritics(text):
    """Return `text` with every character in the range U+064B..U+0652 removed.

    All other characters are kept, in their original order.
    """
    # Mapping a code point to None makes str.translate delete it.
    strip_table = dict.fromkeys(range(0x064B, 0x0652 + 1))
    return text.translate(strip_table)

In [5]:
def extract_data(text, no_diacritics_dumpfile, labels_dumpfile):
    """Separate `text` into bare characters and per-character diacritic labels.

    Characters in U+064B..U+0652 are treated as diacritics attached to the
    closest preceding character outside that range. Two results are pickled:

    * to `no_diacritics_dumpfile`: `text` with the diacritics removed (str);
    * to `labels_dumpfile`: an int16 array of shape (len(stripped text), 2)
      whose row i holds the code points of the first and second diacritic
      following stripped character i, or 0 where there is none.

    Diacritics that cannot be stored are ignored rather than written to a
    wrong row: a diacritic that appears before any base character (it used
    to overwrite the label of the last character through index -1), and a
    third or later diacritic on the same character (it used to raise
    IndexError).

    Parameters
    ----------
    text : str
        Diacritized input text.
    no_diacritics_dumpfile : path-like
        Destination of the pickled stripped text.
    labels_dumpfile : path-like
        Destination of the pickled label array.
    """
    no_diacritics = remove_diacritics(text)
    # Allocate int16 directly rather than creating a float64 array and casting.
    labels = np.zeros((len(no_diacritics), 2), dtype=np.int16)
    slots_per_char = labels.shape[1]

    base_index = -1   # row of the most recent non-diacritic character
    slot = 0          # next free label column for that character
    for char in text:
        code = ord(char)
        if 0x64B <= code <= 0x652:
            if base_index >= 0 and slot < slots_per_char:
                labels[base_index, slot] = code
            slot += 1
        else:
            base_index += 1
            slot = 0

    with open(no_diacritics_dumpfile, 'wb') as f:
        pickle.dump(no_diacritics, f)
    with open(labels_dumpfile, 'wb') as f:
        pickle.dump(labels, f)

In [6]:
# One-off preprocessing: uncomment and run once to create the pickled files that the next cell loads.
# extract_data(train, 'train_no_diacritics.txt', 'train_labels.txt')
# extract_data(val, 'val_no_diacritics.txt', 'val_labels.txt')
# extract_data(test, 'test_no_diacritics.txt', 'Dataset/test_labels.txt')

In [7]:
def _load_pickle(path):
    """Unpickle and return the single object stored in the file at `path`."""
    # Unpickling can execute arbitrary code: only load files produced locally
    # by extract_data.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Stripped text and label array for each split, as written by extract_data.
train_no_diacritics = _load_pickle('train_no_diacritics.txt')
train_labels = _load_pickle('train_labels.txt')
val_no_diacritics = _load_pickle('val_no_diacritics.txt')
val_labels = _load_pickle('val_labels.txt')
test_no_diacritics = _load_pickle('test_no_diacritics.txt')
test_labels = _load_pickle('Dataset/test_labels.txt')

In [8]:
def extract_sentences(text, labels, max_len):
    """Split `text` on '.' and pack each piece into fixed-width arrays.

    Parameters
    ----------
    text : str
        Text whose characters line up one-to-one with the rows of `labels`.
    labels : array of shape (len(text), 2)
        Per-character label pairs.
    max_len : int
        Row width: longer pieces are cut to `max_len` characters, shorter
        ones are right-padded with zeros.

    Returns
    -------
    sentences : int16 array, shape (n_pieces, max_len)
        Code point of every kept character.
    diacritics : int16 array, shape (n_pieces, max_len, 2)
        The matching rows of `labels`.
    """
    pieces = text.split('.')
    n_rows = len(pieces)

    sentences = np.zeros((n_rows, max_len), dtype=np.int16)
    diacritics = np.zeros((n_rows, max_len, 2), dtype=np.int16)

    offset = 0  # position in `text` / `labels` where the current piece starts
    for row, piece in enumerate(pieces):
        n_chars = len(piece)
        kept = min(max_len, n_chars)
        codes = np.array([ord(ch) for ch in piece], dtype=np.uint16)
        sentences[row, :kept] = codes[:kept]
        diacritics[row, :kept] = labels[offset:offset + n_chars][:kept]
        # The extra 1 steps over the '.' that split() removed.
        offset += n_chars + 1

    return sentences, diacritics

In [9]:
# Fixed sequence length: extract_sentences zero-pads or truncates every
# sentence to this many characters.
max_len = 500

# Padded character-code and diacritic-label arrays for each split.
train_sentences, train_diacritics = extract_sentences(train_no_diacritics, train_labels, max_len)
val_sentences, val_diacritics = extract_sentences(val_no_diacritics, val_labels, max_len)
test_sentences, test_diacritics = extract_sentences(test_no_diacritics, test_labels, max_len)

In [10]:
# Keep a reproducible random subset of at most `test_len` test sentences.
# NOTE: this cell rebinds test_sentences / test_diacritics, so running it a
# second time reshuffles the already reduced arrays.
test_len = 90000
np.random.seed(42)
indices = np.arange(len(test_sentences))
np.random.shuffle(indices)

# Indexing once with the truncated permutation picks the same rows as
# permuting everything and slicing afterwards.
test_sentences = test_sentences[indices[:test_len]]
test_diacritics = test_diacritics[indices[:test_len]]

In [11]:
# Map raw character codes (including the 0 used for padding) to contiguous
# integer ids. The vocabulary is learned from the training split only, so
# transform() raises ValueError if val/test contain a character never seen
# in training.
sentence_encoder = LabelEncoder()
X_train = sentence_encoder.fit_transform(train_sentences.ravel()).reshape(train_sentences.shape)
X_val = sentence_encoder.transform(val_sentences.ravel()).reshape(val_sentences.shape)
X_test = sentence_encoder.transform(test_sentences.ravel()).reshape(test_sentences.shape)

In [12]:
# Release the raw character arrays and stripped strings; the cells below work
# on the encoded X_* arrays instead.
del train_sentences, val_sentences, test_sentences
del train_no_diacritics, val_no_diacritics, test_no_diacritics

In [13]:
# Step 1: map every raw label value (a diacritic code point, or 0 for "no
# mark") to a small integer id. Both label slots share this one encoder.
label_encoder = LabelEncoder().fit(train_diacritics.flatten())
y_train = label_encoder.transform(train_diacritics.flatten()).reshape(train_diacritics.shape)
y_val = label_encoder.transform(val_diacritics.flatten()).reshape(val_diacritics.shape)
y_test = label_encoder.transform(test_diacritics.flatten()).reshape(test_diacritics.shape)

# Step 2: fuse the two slot ids of each character into one number, using them
# as the two digits of a base-9 value (first slot is the high digit).
# NOTE(review): 9 presumably equals len(label_encoder.classes_) (0 plus the
# eight marks U+064B..U+0652) - confirm. test() decodes with the same constant,
# so the two must be changed together.
y_train = 9 * y_train[:, :, 0] + y_train[:, :, 1]
y_val = 9 * y_val[:, :, 0] + y_val[:, :, 1]
y_test = 9 * y_test[:, :, 0] + y_test[:, :, 1]

# Step 3: re-index the fused values so that the combinations present in the
# training split become contiguous class ids.
label_encoder2 = LabelEncoder().fit(y_train.flatten())
y_train = label_encoder2.transform(y_train.flatten()).reshape(y_train.shape)
y_val = label_encoder2.transform(y_val.flatten()).reshape(y_val.shape)
y_test = label_encoder2.transform(y_test.flatten()).reshape(y_test.shape)

In [14]:
# Release the raw label arrays; the encoded y_* arrays replace them.
del train_diacritics, val_diacritics, test_diacritics

In [15]:
# Model definition and compile call, kept commented out for reference.
# NOTE(review): presumably this is the architecture stored in 'model2.h5' - confirm.
# The only active statement in this cell is the load_model call at the bottom.
# model = Sequential()

# model.add(Embedding(np.unique(X_train).shape[0], 25, input_length=max_len))

# model.add(Bidirectional(LSTM(256, return_sequences=True)))
# model.add(BatchNormalization())
# model.add(Dropout(rate=0.5))

# model.add(Bidirectional(LSTM(256, return_sequences=True)))
# model.add(BatchNormalization())
# model.add(Dropout(rate=0.5))

# model.add(TimeDistributed(Dense(512, activation='relu')))
# model.add(BatchNormalization())

# model.add(TimeDistributed(Dense(512, activation='relu')))
# model.add(BatchNormalization())

# model.add(TimeDistributed(Dense(np.unique(y_train).shape[0], activation='softmax')))

# model.summary()

# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Load the previously saved model instead of building and training a new one.
model = tf.keras.models.load_model('model2.h5')

2023-12-10 04:47:59.652315: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-10 04:47:59.759473: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-10 04:47:59.759535: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-10 04:47:59.760726: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-10 04:47:59.760800: I external/local_xla/xla/stream_executor

In [16]:
# history = model.fit(X_train, tf.keras.utils.to_categorical(y_train), epochs=15, batch_size=128, validation_data=(X_val, tf.keras.utils.to_categorical(y_val)))

In [17]:
# tf.keras.backend.clear_session()

In [18]:
# NOTE(review): while the training cell stays commented out, this writes back
# the model that was just loaded from the same path.
model.save('model2.h5')

  saving_api.save_model(


In [19]:
# Predict a class for every position of every test sentence.
predictions = model.predict(X_test)
predictions = np.argmax(predictions, axis=2)

# Accuracy over all positions. Sentences are zero-padded to max_len, so this
# figure also counts padding positions.
accuracy = np.sum(predictions == y_test) / (predictions.shape[0] * predictions.shape[1])
print('accuracy: ', accuracy)

# Accuracy over real characters only. Padding positions hold the id that
# sentence_encoder assigned to the raw value 0.
is_pad_class = sentence_encoder.classes_ == 0
if is_pad_class.any():
    pad_id = int(np.argmax(is_pad_class))
    real_positions = X_test != pad_id
    if real_positions.any():
        accuracy_no_padding = np.mean(predictions[real_positions] == y_test[real_positions])
        print('accuracy (padding excluded): ', accuracy_no_padding)

2023-12-10 04:48:04.381778: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904




In [None]:
def test(sentence_str):
    sentence = np.array([ord(char) for char in sentence_str], dtype=np.uint16)
    sentence = sentence_encoder.transform(sentence)
    if len(sentence) < max_len:
        sentence = np.pad(sentence, (0, max_len - len(sentence)), 'constant')
    elif len(sentence) > max_len:
        sentence = sentence[:max_len]

    sentence = sentence.reshape(1, max_len)

    pred = model.predict(sentence)
    pred = np.argmax(pred, axis=-1).flatten()
    pred = label_encoder2.inverse_transform(pred)
    pred1 = pred // 9
    pred2 = pred % 9
    pred1 = label_encoder.inverse_transform(pred1)
    pred2 = label_encoder.inverse_transform(pred2)

    sentence = ''
    for i in range(len(sentence_str)):
        sentence += sentence_str[i]
        if pred1[i] != 0:
            sentence += chr(pred1[i])
        if pred2[i]  != 0:
            sentence += chr(pred2[i])
            
    return sentence

In [None]:
# Round-trip check on one diacritized sentence: strip its diacritics, let the
# model restore them, and print the reference followed by the prediction.
sentence_test_str = 'وَمَنْ كَانَ مَعَهُ أَلْفٌ فَقَالَ هِيَ مُضَارَبَةٌ لِفُلَانٍ بِالنِّصْفِ وَقَدْ رَبِحَ أَلْفًا فَقَالَ فُلَانٌ هِيَ بِضَاعَةٌ فَالْقَوْلُ قَوْلُ رَبِّ الْمَالِ'
no_diac = remove_diacritics(sentence_test_str)
print(sentence_test_str)
print(test(no_diac))

وَمَنْ كَانَ مَعَهُ أَلْفٌ فَقَالَ هِيَ مُضَارَبَةٌ لِفُلَانٍ بِالنِّصْفِ وَقَدْ رَبِحَ أَلْفًا فَقَالَ فُلَانٌ هِيَ بِضَاعَةٌ فَالْقَوْلُ قَوْلُ رَبِّ الْمَالِ
وَمَنْ كَانَ مَعَهُ أَلْفٌ فَقَالَ هِيَ مُضَارَبَةٌ لِفُلَانٍ بِالنِّصْفِ وَقَدْ رَبِحَ أَلْفًا فَقَالَ فُلانٌ هِيَ بِضَاعَةٌ فَالْقَوْلُ قَوْلُ رَبِّ الْمَالِ
