<h1> Tashkeel </h1>

This project is developed by Abdulhameed Osama, Hossam Nabil and Nourhan Mohamed as a part of the Natural Language Processing course (NLP) at Cairo University.

In this project a Bi-LSTM model is trained to predict the diacritics of Arabic text. The model is trained on a dataset of 18 million characters from various domains. The model is trained on Google Colab.

We achieved an accuracy of 89.5% on the test set.

The model starts with a character embedding layer, followed by a Bi-LSTM layer, then a dense layer and finally a softmax layer. The model is trained using the Adam optimizer and the categorical crossentropy loss function.

<h3> Imports </h3>

In [21]:
import numpy as np
import pickle as pkl
import helper_file as hf
from tqdm import tqdm
import os
import re
from nltk.tokenize import sent_tokenize

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader



# Arabic Text Preprocessing

In [11]:
# Read the dataset
dataset = hf.read_file('dataset/train.txt')

val = hf.read_file('dataset/val.txt')

arabic_letters = hf.read_pickle('Delivery/arabic_letters.pickle')

diacritics = hf.read_pickle('Delivery/diacritics.pickle')

diacritics_to_id = hf.read_pickle('Delivery/diacritics2id.pickle')

In [12]:

# Clean the original dataset
cleaned_dataset_with_diacritics = hf.clean_dataset(dataset, remove_diacritics=False)
cleaned_dataset = hf.clean_dataset(dataset, remove_diacritics=True)
hf.write_file('Delivery/cleaned_dataset.txt', cleaned_dataset)
hf.write_file('Delivery/cleaned_dataset_with_diacritics.txt', cleaned_dataset_with_diacritics)


In [13]:
cleaned_val_with_diacritics = hf.clean_dataset(val, remove_diacritics=False)
cleaned_val = hf.clean_dataset(val, remove_diacritics=True)
hf.write_file('Delivery/cleaned_val.txt', cleaned_val)
hf.write_file('Delivery/cleaned_val_with_diacritics.txt', cleaned_val_with_diacritics)

In [16]:
def split_arabic_sentences(text):
   
    pattern = r'(?<=[.؟!,؛])'

    # Split the text into sentences based on the pattern
    sentences = re.split(pattern, text)
    # remove punctuations
    sentences = [re.sub(r'[.؟!،؛]', '', sent) for sent in sentences]
    sentences = [re.sub(r'\s+', ' ', sent) for sent in sentences]
    sentences = filter(lambda sentences: sentences.strip(), sentences)

    return sentences 


def append_to_file(file_path, content):
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(content.strip() + '\n')





In [17]:
sentences = split_arabic_sentences(cleaned_dataset)

# Check if the file exists and if it does, delete its content

if os.path.exists('Delivery/sentences.txt'):
    hf.write_file('Delivery/sentences.txt', '')


for s in sentences:
    append_to_file('Delivery/sentences.txt', s)



In [18]:
diacritized_sentences = split_arabic_sentences(cleaned_dataset_with_diacritics)

if os.path.exists('Delivery/diacritized_sentences.txt'):
    hf.write_file('Delivery/diacritized_sentences.txt', '')

for s in diacritized_sentences:
    append_to_file('Delivery/diacritized_sentences.txt', s)


In [19]:
val_sentences = split_arabic_sentences(cleaned_val)

if os.path.exists('Delivery/val_sentences.txt'):
    hf.write_file('Delivery/val_sentences.txt', '')

for s in val_sentences:
  append_to_file('Delivery/val_sentences.txt',s)

In [20]:
d_val = split_arabic_sentences(cleaned_val_with_diacritics)

if os.path.exists('Delivery/d_val.txt'):
    hf.write_file('Delivery/d_val.txt', '')

for s in d_val:
  append_to_file('Delivery/d_val.txt',s)

In [22]:



def get_diacritics(text,chars,diacritics2id):
    text = list(text)
    string = ''
    diacritics = []
    counter = 0 
    for char in text:
        if char == ' ':
            # print('space ////////////////////////////////////')
            # print(f'{string} appeneed to diacritics list space condition')
            diacritics.append(diacritics_to_id[string])
            diacritics.append(14)
            string = ''
            counter += 1
            continue
    
        if char not in chars:
            # print(f"diacritic {char}")
            string += char
            # print('concatenated string',string)
        else:
            # print('regular char ',char)
            if text[counter - 1] in chars:
                # print(f'{string} appeneed to diacritics list counter condition')
                diacritics.append(diacritics2id[string])
                string = ''
            elif string != '':
                # print(f'{string} appeneed to diacritics list normal condidition')
                diacritics.append(diacritics2id[string])
    
            string = ''

        counter += 1    

    return diacritics

def get_tashkeel(text,char,diacritics2id):
    diacritics = get_diacritics(text,char,diacritics2id)
    if text[-1] not in char:
        diacritics.append(diacritics2id[text[-1]])
    return diacritics

        



chars = sorted(arabic_letters) 
char_to_idx = {char: idx + 1 for idx, char in enumerate(chars)}  # Assigning 0 for padding
char_to_idx[' '] = 0
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
idx_to_char[0] = ' ' 



In [26]:
class TashkeelDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    max_size = max([len(i) for i in x]) # find the max length of the sentences 
    print('The max size is', max_size)  
    self.x = torch.tensor([i + [pad] * (max_size - len(i)) for i in x]) # pad the sentences with <PAD> token
    self.y = torch.tensor([i + [14] * (max_size - len(i)) for i in y]) # pad the labels with <PAD> token

  def __len__(self):
    return len(self.x)

  def __getitem__(self, idx):
    return (self.x[idx], self.y[idx])

In [24]:
gomal = hf.read_file('Delivery/sentences.txt')
gomal_tashkeel = hf.read_file('Delivery/diacritized_sentences.txt')

x = []
for line in gomal.splitlines():
    x.append([char_to_idx[char] for char in line])

y = []
for line in gomal_tashkeel.splitlines():
    y.append(get_tashkeel(line,chars,diacritics_to_id))




In [25]:
val_gomal = hf.read_file('Delivery/val_sentences.txt')
val_gomal_tashkeel = hf.read_file('Delivery/d_val.txt')

x_val = []
for line in val_gomal.splitlines():
    x_val.append([char_to_idx[char] for char in line])

y_val = []
for line in val_gomal_tashkeel.splitlines():
    y_val.append(get_tashkeel(line,chars,diacritics_to_id))

In [None]:
batch_size = 5
mini_sentences = x[0: 8]
mini_labels = y[0: 8]
mini_dataset = TashkeelDataset(mini_sentences, mini_labels, 40)
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=5)
dg = iter(dummy_dataloader)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0][:])

In [28]:
class Tashkeel(nn.Module):
  def __init__(self, vocab_size=37, embedding_dim=37, hidden_size=50, n_classes=15):
    """
    character level tashkeel model
    The constructor of our NER model
    Inputs:
    - vocab_size: the number of unique characters in the dataset
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(Tashkeel, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True,bidirectional=True)
    self.linear = nn.Linear(100, n_classes)

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """
    final_output = self.embedding(sentences)
    final_output, _ = self.lstm(final_output)
    final_output = self.linear(final_output)
    return final_output

In [31]:
model = Tashkeel()
print(model)

Tashkeel(
  (embedding): Embedding(37, 37)
  (lstm): LSTM(37, 50, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=100, out_features=15, bias=True)
)


In [32]:
def train(model, train_dataset, batch_size=10, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=5, shuffle=True)
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):
      train_label = train_label.to(device)
      train_input = train_input.to(device)
      output = model(train_input)
      
      batch_loss = criterion(output.view(-1, 15), train_label.view(-1))
      total_loss_train += batch_loss.item()

      acc = (output.argmax(dim=2) == train_label).sum().item()
      total_acc_train += acc
      optimizer.zero_grad()
      batch_loss.backward()
      optimizer.step()
      
    epoch_loss = total_loss_train / len(train_dataset)
    epoch_acc = total_acc_train / (len(train_dataset) * len(train_dataset[0][0]))
    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')


In [51]:
train_dataset = TashkeelDataset(x[0:1000], y[0:1000], 0)
val_dataset = TashkeelDataset(x_val[0:500], y_val[0:500], 0)


The max size is 2544
The max size is 1739


In [None]:
train(model, train_dataset)

In [None]:
def correct_evaluation(model,test_dataset,batch_size=5):
     # Create the test data loader
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    # GPU Configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    total_correct_predictions = 0
    total_predictions = 0
    letter_count = 0
    correct_letter_d = 0
    all_predictions = []
    all_test_inputs = []
    with torch.no_grad():
        for test_input, test_label in tqdm(test_dataloader):
            # Move the test input to the device
            test_label = test_label.to(device)
            # Move the test label to the device
            test_input = test_input.to(device)

            # Perform the forward pass
            output = model(test_input)

            # Get predicted labels
            predicted_labels = output.argmax(dim=2)
            # print(test_input.shape)
            for j,batch in enumerate(test_input):

              for i,char in enumerate(batch):
                # print(char.shape)
                if char.item() != 0:
                  letter_count += 1
                  if predicted_labels[j][i].item() == test_label[j][i].item():
                    correct_letter_d += 1

            all_test_inputs.append(test_input)
            all_predictions.append(predicted_labels)






            # Count correct predictions, excluding label 15
            # correct_predictions = ((predicted_labels == test_label) & (test_label != 15)).sum().item()
            # total_correct_predictions += correct_predictions

            # # Count total predictions, excluding instances where the ground truth label is 15
            # valid_predictions = (test_label != 15).sum().item()
            # total_predictions += valid_predictions
            # space_pred = (predicted_labels == 14).sum().item()
            # total_correct_predictions +=space_pred
            # valid_predictions = (test_label == 14).sum().item()
            # total_predictions += valid_predictions



    # Calculate the overall accuracy excluding label 15
    overall_accuracy = correct_letter_d / letter_count
    print(len(all_test_inputs),len(all_predictions))
    print(f'overall accuracy: {overall_accuracy}')
    return all_test_inputs,all_predictions


In [None]:
test_inputs,predictions = correct_evaluation(model,val_dataset)

In [38]:
def extract_data_from_id(test_input, predictions, id_to_diacritics, idx_to_char):
  for k,test in enumerate(test_input):
    for j,batch in enumerate(test):
        string = ''
        for i,char in enumerate(batch):
          # print(char.shape)
          # print("len ", len(predictions))
          # print("shape ", predictions[k].shape)
          # print("prediction ",predictions[k][j][i])
          string += (idx_to_char[test[j][i].item()] + id_to_diacritics[predictions[k][j][i].item()])
        print(string)
        break
    break

In [54]:
print("قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ")

id_to_diacritics = {}
for key, value in diacritics_to_id.items():
    id_to_diacritics[value] = key

# pklmodel = torch.load('Delivery/model.pickle')
# test_inputs,predictions = correct_evaluation(pklmodel,val_dataset)

extract_data_from_id(test_inputs, predictions, id_to_diacritics, idx_to_char)

قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ


100%|██████████| 100/100 [00:06<00:00, 16.54it/s]

100 100
overall accuracy: 0.7410149770416136
قَوْلُهُ وَلَا تُكْرْهُ ضِيَافَتُهُ                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        


