In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import re
import sys
import os
sys.path.insert(0, os.path.abspath('../..'))
from preprocessing.preprocess import preprocess_data

In [2]:

dataset_path = '../../dataset'

# preprocess and save the data
preprocess_data(data_type='train', dataset_path=dataset_path)
preprocess_data(data_type='val', dataset_path=dataset_path)

# load data
with open(f'{dataset_path}/cleaned_train_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    # read all lines into a single string
    training_data = re.compile(r'[\n\r\t\s]').sub('', file.read())
with open(f'{dataset_path}/cleaned_val_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    # read all lines into a single string
    validation_data = re.compile(r'[\n\r\t\s]').sub('', file.read())
    
with open(f'{dataset_path}/cleaned_train_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    # read all lines into a single string
    training_data_with_diacritics = re.compile(r'[\n\r\t\s]').sub('', file.read())
    
with open(f'{dataset_path}/cleaned_val_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    # read all lines into a single string
    validation_data_with_diacritics = re.compile(r'[\n\r\t\s]').sub('', file.read())

print(len(training_data))
print(len(validation_data))
print(len(training_data_with_diacritics))
print(len(validation_data_with_diacritics))
# Tokenize the text into sequences at the character level
vocab = set(''.join(training_data + validation_data))

char_to_index = {char: idx + 1 for idx, char in enumerate(vocab)}
index_to_char = {idx + 1: char for idx, char in enumerate(vocab)}

print(char_to_index)
print(index_to_char)

8351479
421099
15637321
788621
{'د': 1, 'خ': 2, 'ذ': 3, 'ة': 4, 'ج': 5, '~': 6, 'ء': 7, 'ب': 8, 'ا': 9, 'ن': 10, 'ف': 11, 'ع': 12, 'س': 13, 'إ': 14, 'ر': 15, 'ئ': 16, 'ل': 17, 'ي': 18, 'ش': 19, 'ه': 20, 'أ': 21, 'ض': 22, 'ق': 23, 'ح': 24, 'ص': 25, 'و': 26, 'ت': 27, 'ث': 28, 'ك': 29, 'ز': 30, 'ط': 31, 'غ': 32, 'ى': 33, 'ؤ': 34, 'آ': 35, 'ظ': 36, 'م': 37}
{1: 'د', 2: 'خ', 3: 'ذ', 4: 'ة', 5: 'ج', 6: '~', 7: 'ء', 8: 'ب', 9: 'ا', 10: 'ن', 11: 'ف', 12: 'ع', 13: 'س', 14: 'إ', 15: 'ر', 16: 'ئ', 17: 'ل', 18: 'ي', 19: 'ش', 20: 'ه', 21: 'أ', 22: 'ض', 23: 'ق', 24: 'ح', 25: 'ص', 26: 'و', 27: 'ت', 28: 'ث', 29: 'ك', 30: 'ز', 31: 'ط', 32: 'غ', 33: 'ى', 34: 'ؤ', 35: 'آ', 36: 'ظ', 37: 'م'}


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); print(device)

cuda


In [3]:
# define the diacritics unicode and their corresponding labels classes indices
# note that index 0 is reserved for no diacritic
labels = {
    # no diacritic
    0: 0,
    # fath
    1614: 1,
    # damm
    1615: 2,
    # kasr
    1616: 3,
    # shadd
    1617: 4,
    # sukun
    1618: 5,
    # tanween bel fath
    1611: 6,
    # tanween bel damm
    1612: 7,
    # tanween bel kasr
    1613: 8,
    # shadd and fath
    (1617, 1614): 9,
    # shadd and damm
    (1617, 1615): 10,
    # shadd and kasr
    (1617, 1616): 11,
    # shadd and tanween bel fath
    (1617, 1611): 12,
    # shadd and tanween bel damm
    (1617, 1612): 13,
    # shadd and tanween bel kasr
    (1617, 1613): 14
}

indicies_to_labels = {
    # no diacritic
    0: 0,
    # fath
    1: 1614,
    # damm
    2: 1615,
    # kasr
    3: 1616,
    # shadd
    4: 1617,
    # sukun
    5: 1618,
    # tanween bel fath
    6: 1611,
    # tanween bel damm
    7: 1612,
    # tanween bel kasr
    8: 1613,
    # shadd and fath
    9: (1617, 1614),
    # shadd and damm
    10: (1617, 1615),
    # shadd and kasr
    11: (1617, 1616),
    # shadd and tanween bel fath
    12: (1617, 1611),
    # shadd and tanween bel damm
    13: (1617, 1612),
    # shadd and tanween bel kasr
    14: (1617, 1613)
}

print(labels)
print(indicies_to_labels)

{0: 0, 1614: 1, 1615: 2, 1616: 3, 1617: 4, 1618: 5, 1611: 6, 1612: 7, 1613: 8, (1617, 1614): 9, (1617, 1615): 10, (1617, 1616): 11, (1617, 1611): 12, (1617, 1612): 13, (1617, 1613): 14}
{0: 0, 1: 1614, 2: 1615, 3: 1616, 4: 1617, 5: 1618, 6: 1611, 7: 1612, 8: 1613, 9: (1617, 1614), 10: (1617, 1615), 11: (1617, 1616), 12: (1617, 1611), 13: (1617, 1612), 14: (1617, 1613)}


In [4]:
# build one array that holds all sequences of training data
training_data_sequences = [char_to_index[char] for char in training_data]
print(training_data_sequences[:10])
print(len(training_data_sequences))

# build one array that holds all sequences of validation data
validation_data_sequences = [char_to_index[char] for char in validation_data]
print(validation_data_sequences[:10])
print(len(validation_data_sequences))

fixed_sequence_length = 50

# Create fixed-length sequences
fixed_sequences = [training_data_sequences[i:i+fixed_sequence_length] for i in range(0, len(training_data_sequences), fixed_sequence_length)]

# Pad 0 to last sequence if it is less than fixed_sequence_length
if len(fixed_sequences[-1]) < fixed_sequence_length:
    fixed_sequences[-1] += [0] * (fixed_sequence_length - len(fixed_sequences[-1]))

training_data_sequences = torch.tensor(fixed_sequences)

fixed_sequence_length = 50

# Create fixed-length sequences
fixed_sequences = [validation_data_sequences[i:i+fixed_sequence_length] for i in range(0, len(validation_data_sequences), fixed_sequence_length)]

# Pad 0 to last sequence if it is less than fixed_sequence_length
if len(fixed_sequences[-1]) < fixed_sequence_length:
    fixed_sequences[-1] += [0] * (fixed_sequence_length - len(fixed_sequences[-1]))

validation_data_sequences = torch.tensor(fixed_sequences)

[23, 26, 17, 20, 21, 26, 23, 31, 12, 9]
8351479
[23, 26, 17, 20, 26, 17, 9, 27, 29, 15]
421099


In [5]:
training_data_labels = []
training_size = len(training_data_with_diacritics)
index = 0
while index < training_size:
    if ord(training_data_with_diacritics[index]) not in labels:
        # char is not a diacritic
        if (index + 1) < training_size and ord(training_data_with_diacritics[index + 1]) in labels:
            # char has a diacritic
            if ord(training_data_with_diacritics[index + 1]) == 1617:
                # char has a shadd diacritic
                if (index + 2) < training_size and ord(training_data_with_diacritics[index + 2]) in labels:
                    # char has a shadd and another diacritic
                    training_data_labels.append(labels[(1617, ord(training_data_with_diacritics[index + 2]))])
                    # skip next 2 diacritics chars
                    index += 3  # increment by 3 to skip two diacritic chars
                    continue
                else:
                    # char has a shadd and no other diacritic
                    training_data_labels.append(labels[1617])
                    # skip next diacritic char
                    index += 2
                    continue
            # char has a diacritic other than shadd
            training_data_labels.append(labels[ord(training_data_with_diacritics[index + 1])])
            # skip next diacritic char
            index += 2  # increment by 2 to skip one diacritic char
            continue
        else:
            # char has no diacritic
            training_data_labels.append(0)
    index += 1  # increment by 1 for normal iteration

print(len(training_data_labels))
print(training_data_labels[:10])

# Create fixed-length sequences
fixed_labels = [training_data_labels[i:i+fixed_sequence_length] for i in range(0, len(training_data_labels), fixed_sequence_length)]

# Pad 0 to last sequence if it is less than fixed_sequence_length
if len(fixed_labels[-1]) < fixed_sequence_length:
    fixed_labels[-1] += [0] * (fixed_sequence_length - len(fixed_labels[-1]))

training_data_labels = torch.tensor(fixed_labels)

8351479
[1, 5, 2, 2, 1, 5, 1, 1, 1, 0]


In [6]:
validation_data_labels = []
validation_size = len(validation_data_with_diacritics)
index = 0
while index < validation_size:
    if ord(validation_data_with_diacritics[index]) not in labels:
        # char is not a diacritic
        if (index + 1) < validation_size and ord(validation_data_with_diacritics[index + 1]) in labels:
            # char has a diacritic
            if ord(validation_data_with_diacritics[index + 1]) == 1617:
                # char has a shadd diacritic
                if (index + 2) < validation_size and ord(validation_data_with_diacritics[index + 2]) in labels:
                    # char has a shadd and another diacritic
                    validation_data_labels.append(labels[(1617, ord(validation_data_with_diacritics[index + 2]))])
                    # skip next 2 diacritics chars
                    index += 3  # increment by 3 to skip two diacritic chars
                    continue
                else:
                    # char has a shadd and no other diacritic
                    validation_data_labels.append(labels[1617])
                    # skip next diacritic char
                    index += 2
                    continue
            # char has a diacritic other than shadd
            validation_data_labels.append(labels[ord(validation_data_with_diacritics[index + 1])])
            # skip next diacritic char
            index += 2  # increment by 2 to skip one diacritic char
            continue
        else:
            # char has no diacritic
            validation_data_labels.append(0)
    index += 1  # increment by 1 for normal iteration

print(len(validation_data_labels))
print(validation_data_labels[:10])

# Create fixed-length sequences
fixed_labels = [validation_data_labels[i:i+fixed_sequence_length] for i in range(0, len(validation_data_labels), fixed_sequence_length)]

# Pad 0 to last sequence if it is less than fixed_sequence_length
if len(fixed_labels[-1]) < fixed_sequence_length:
    fixed_labels[-1] += [0] * (fixed_sequence_length - len(fixed_labels[-1]))

validation_data_labels = torch.tensor(fixed_labels)

421099
[1, 5, 2, 2, 1, 1, 0, 2, 5, 1]


In [7]:
training_dataset = TensorDataset(training_data_sequences, training_data_labels)

batch_size = 32
training_dataloader = DataLoader(training_dataset, batch_size=batch_size)

validation_dataset = TensorDataset(validation_data_sequences, validation_data_labels)

batch_size = 32
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size)

# LSTM

## LSTM Class

In [8]:
class CharLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size, drop_prob=0.5, num_layers=1):
        super(CharLSTM, self).__init__()
        # chars embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        
        # LSTM layers
        # batch_first: it means that the input tensor has its first dimension representing the batch size
        # TODO: BLSTM
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        
        # Drop out layer, how likely would it drop some neurons (assign zeros to them)
        self.dropout = nn.Dropout(drop_prob)
        
        # output layer
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x) # batch_size * seq_length * embedding_size
        lstm_out, _ = self.lstm(embedded) # batch_size * seq_length * hidden_size
        after_dropout = self.dropout(lstm_out) # batch_size * seq_length *  hidden_size
        output = self.output(after_dropout)  # batch_size * seq_length * output_size
        output_softmax = F.softmax(output, dim=1)  # Apply softmax to the output
        return output_softmax
    
num_layers = 2
vocab_size = len(char_to_index) + 1 # +1 for the 0 padding
embedding_size = 200
output_size = len(labels)
hidden_size = 256
drop_prob = 0.5
lr=0.001

model = CharLSTM(vocab_size, embedding_size,  hidden_size, output_size, drop_prob, num_layers)

print(model)

CharLSTM(
  (embedding): Embedding(38, 200)
  (lstm): LSTM(200, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (output): Linear(in_features=256, out_features=15, bias=True)
)


## Training part

In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
num_epochs = 20
for epoch in range(num_epochs):
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    for batch_sequences, batch_labels in training_dataloader:
        optimizer.zero_grad()
        outputs = model(batch_sequences).float() # batch_size * seq_length * output_size
        # convert batch_labels to one hot encoding
        batch_labels_one_hot = F.one_hot(batch_labels, num_classes=output_size).float() # batch_size * seq_length * output_size
        
        loss = criterion(outputs, batch_labels_one_hot)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    for validation_batch_sequences, validation_batch_labels in validation_dataloader:
        outputs = model(validation_batch_sequences).float() # batch_size * seq_length * output_size
        # Calculate accuracy
        predicted_labels = outputs.argmax(dim=2)  # Get the index with the maximum probability
        correct_predictions += (predicted_labels == validation_batch_labels).sum().item()
        total_predictions += validation_batch_labels.numel()
        
    accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}, Accuracy: {accuracy * 100:.2f}%')


KeyboardInterrupt: 

## Test LSTM

In [10]:
def lstm_predict(model, sentence):
    model.eval() # evaluation mode
    sentence = [char_to_index[char] for char in sentence]
    
    # Create fixed-length sequences
    fixed_sequences = [sentence[i:i+fixed_sequence_length] for i in range(0, len(sentence), fixed_sequence_length)]

    # Pad 0 to last sequence if it is less than fixed_sequence_length
    if len(fixed_sequences[-1]) < fixed_sequence_length:
        fixed_sequences[-1] += [0] * (fixed_sequence_length - len(fixed_sequences[-1]))

    sentence_sequences = torch.tensor(fixed_sequences).view(1, -1)  # Assuming batch size 1

    print(sentence_sequences.shape)
    outputs = model(sentence_sequences)
    print(outputs.shape)
    outputs = outputs.argmax(dim=2)
    print(outputs.shape)
    outputs = outputs.tolist()
    print(outputs)
    diacritics = []
    for output in outputs:
        for index in output:
            predicted_class = indicies_to_labels[index]
            if type(predicted_class) is tuple:
                diacritics.append(chr(predicted_class[0]) + chr(predicted_class[1]))
            elif predicted_class == 0:
                diacritics.append('')
            else:
                diacritics.append(chr(predicted_class))
    return diacritics[:len(sentence)]

In [11]:
test_sentence = 'قوله أو قطع الأول يده إلخ قال الزركشي'
test_sentence = re.compile(r'[\n+\r+\t+\s+]').sub('', test_sentence)
predicted_diacritics = lstm_predict(model, test_sentence)

diacritized_sentence = ''
for i in range(len(test_sentence)):
    diacritized_sentence += test_sentence[i] + predicted_diacritics[i]

print(diacritized_sentence)

torch.Size([1, 50])
torch.Size([1, 50, 15])
torch.Size([1, 50])
[[1, 5, 3, 2, 1, 5, 1, 3, 3, 0, 5, 1, 9, 3, 3, 3, 2, 3, 1, 1, 1, 0, 3, 0, 5, 9, 4, 3, 3, 10, 3, 1, 12, 7, 13, 12, 7, 1, 14, 8, 1, 14, 6, 1, 14, 6, 1, 13, 6, 1]]
قَوْلِهُأَوْقَطِعِالْأَوَّلِيِدِهُإِلَخَقَالِالْزَّرّكِشِيُّ
