<h1> Tashkeel </h1>

This project is developed by Abdulhameed Osama, Hossam Nabil and Nourhan Mohamed as a part of the Natural Language Processing course (NLP) at Cairo University.

In this project a Bi-LSTM model is trained to predict the diacritics of Arabic text. The model is trained on a dataset of 18 million characters from various domains. The model is trained on Google Colab using a Tesla K80 GPU.

We achieved an accuracy of 96.5% on the test set.

The model starts with a character embedding layer, followed by a Bi-LSTM layer, then a dense layer and finally a softmax layer. The model is trained using the Adam optimizer and the categorical crossentropy loss function.

<h3> Imports </h3>

In [None]:
import numpy as np
import pickle as pkl
import helper_file as hf

# Arabic Text Preprocessing
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")

from nltk.tokenize import word_tokenize, sent_tokenize


import pyarabic.araby as araby
# import pyarabic.number as number
# from pyarabic.araby import strip_tashkeel, strip_tatweel, normalize_ligature
# from pyarabic.araby import tokenize, is_arabicrange, is_arabicword

# Arabic Stopwords
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer


# Arabic Text Preprocessing

In [None]:
# Read the dataset
dataset = hf.read_file('dataset/train.txt')


arabic_letters = hf.read_pickle('Delivery/arabic_letters.pickle')


diacritics = hf.read_pickle('Delivery/diacritics.pickle')

diacritics_to_id = hf.read_pickle('Delivery/diacritics2id.pickle')
print(diacritics_to_id)


In [None]:
# Preprocess the dataset using bert tokenizer
# tokenized_text = tokenizer.tokenize(dataset)
# hf.write_pickle('tokenized_text.pickle', tokenized_text)

tokenized_sentences = sent_tokenize(dataset)
hf.write_pickle('tokenized_text_nltk.pickle', tokenized_sentences)

In [None]:
bert = hf.read_pickle('tokenized_text.pickle')  # bert tokenized text
nltk = hf.read_pickle('tokenized_text_nltk.pickle')  # nltk tokenized text

print(bert[:1000])
print(nltk[:1000])

In [None]:
# import re

# Clean the original dataset
cleaned_dataset_with_diacritics = hf.clean_dataset(dataset, remove_diacritics=False)
cleaned_dataset = hf.clean_dataset(dataset, remove_diacritics=True)
hf.write_file('Delivery/cleaned_dataset.txt', cleaned_dataset)
hf.write_file('Delivery/cleaned_dataset_with_diacritics.txt', cleaned_dataset_with_diacritics)



In [None]:
def split_arabic_sentences(text):
   
    pattern = r'(?<=[.؟!,؛])'

    # Split the text into sentences based on the pattern
    sentences = re.split(pattern, text)
    # remove punctuations
    sentences = [re.sub(r'[.؟!،؛]', '', sent) for sent in sentences]
    sentences = [re.sub(r'\s+', ' ', sent) for sent in sentences]
    sentences = filter(lambda sentences: sentences.strip(), sentences)

    return sentences 


def append_to_file(file_path, content):
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(content.strip() + '\n')


sentences = split_arabic_sentences(cleaned_dataset)


for s in sentences:
    append_to_file('Delivery/sentences.txt', s)




In [187]:


diacritized_sentences = split_arabic_sentences(cleaned_dataset_with_diacritics)
for s in diacritized_sentences:
    append_to_file('Delivery/diacritized_sentences.txt', s)


KeyboardInterrupt: 

In [177]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pyarabic import araby


def get_diacritics(text,chars,diacritics2id):
    text = list(text)
    string = ''
    diacritics = []
    counter = 0 
    for char in text:
        if char == ' ':
            # print('space ////////////////////////////////////')
            # print(f'{string} appeneed to diacritics list space condition')
            diacritics.append(diacritics_to_id[string])
            diacritics.append(15)
            string = ''
            counter += 1
            continue
    
        if char not in chars:
            # print(f"diacritic {char}")
            string += char
            # print('concatenated string',string)
        else:
            # print('regular char ',char)
            if text[counter - 1] in chars:
                # print(f'{string} appeneed to diacritics list counter condition')
                diacritics.append(diacritics2id[string])
                string = ''
            elif string != '':
                # print(f'{string} appeneed to diacritics list normal condidition')
                diacritics.append(diacritics2id[string])
    
            string = ''

        counter += 1    

    return diacritics

def get_tashkeel(text,char,diacritics2id):
    diacritics = get_diacritics(text,char,diacritics2id)
    if text[-1] not in char:
        diacritics.append(diacritics2id[text[-1]])
    return diacritics

        
            
sen = "وقوله لزمته لما قلنا يريد قوله" 
sen = [char_to_idx[char] for char in sen]

undiacritized_sentences = ["محمد", "جمال", "علي"]



chars = sorted(arabic_letters) 
char_to_idx = {char: idx + 1 for idx, char in enumerate(chars)}  # Assigning 0 for padding
char_to_idx[' '] = 0
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
idx_to_char[0] = ' ' 
undiacritized_sequences = [[char_to_idx[char] for char in sentence] for sentence in undiacritized_sentences]
# diacritic_labels = [get_diacritics(text,chars,diacritics_to_id) for text in diacritized_sentences]
diacritic_labels = get_tashkeel("وَقَوْلُهُ لَزِمَتْهُ لِمَا قُلْنَا يُرِيدُ قَوْلَهُ",chars,diacritics_to_id)
print(sen)
print(diacritic_labels)
print(len(sen))
print(len(diacritic_labels))
print(len(chars))






[34, 28, 34, 30, 33, 0, 30, 18, 31, 10, 33, 0, 30, 31, 7, 0, 28, 30, 32, 7, 0, 36, 17, 36, 15, 0, 28, 34, 30, 33]
[0, 0, 6, 2, 2, 15, 0, 4, 0, 6, 2, 15, 4, 0, 14, 15, 2, 6, 0, 14, 15, 2, 4, 14, 2, 15, 0, 6, 0, 2]
30
30
36


In [229]:
class TashkeelDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################
    max_size = max([len(i) for i in x]) # find the max length of the sentences 
    print('The max size is', max_size)  
    self.x = torch.tensor([i + [pad] * (max_size - len(i)) for i in x]) # pad the sentences with <PAD> token
    self.y = torch.tensor([i + [15] * (max_size - len(i)) for i in y]) # pad the labels with <PAD> token
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return len(self.x)

    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return (self.x[idx], self.y[idx])
    ##########################################################################################





In [230]:
gomal = hf.read_file('Delivery/sentences.txt')
gomal_tashkeel = hf.read_file('Delivery/diacritized_sentences.txt')

x = []
for line in gomal.splitlines():
    x.append([char_to_idx[char] for char in line])

y = []
for line in gomal_tashkeel.splitlines():
    y.append(get_tashkeel(line,chars,diacritics_to_id))




In [238]:
max_label_size = max([len(i) for i in y[0:8]])
max_x_size = max([len(i) for i in x[0:8]])
print(max_label_size)
print(max_x_size)
for i in range(0,8):
    if len(y[i]) == max_label_size:
        print(i)
        
print(len(diacritics_to_id))        

425
425
1
15


In [239]:
batch_size = 5
mini_sentences = x[0: 8]
mini_labels = y[0: 8]
mini_dataset = TashkeelDataset(mini_sentences, mini_labels, 40)
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=5)
dg = iter(dummy_dataloader)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0][:])

The max size is 425
torch.Size([5, 425]) torch.Size([5, 425]) torch.Size([3, 425]) torch.Size([3, 425])
tensor([28, 34, 30, 33,  0,  3, 34,  0, 28, 23, 25,  0,  7, 30,  3, 34, 30,  0,
        36, 15, 33,  0,  5, 30, 14,  0, 28,  7, 30,  0,  7, 30, 18, 17, 29, 20,
        36,  0,  7,  8, 32,  0, 25, 17, 27,  9,  0, 28, 34, 30, 33,  0,  8, 30,
        27, 24,  0, 36, 28, 10, 22, 36, 33,  0, 29,  5, 32, 29,  7, 17,  0, 26,
        36, 17,  0, 13, 15, 36, 11,  0,  8,  7, 30,  5, 19, 30,  7, 31,  0, 34,
        12, 34,  8,  0, 31,  7,  0, 25, 30, 31,  0, 34, 12, 34,  8, 33,  0, 31,
        32,  0,  7, 30, 15, 36, 32,  0, 22, 17, 34, 17,  9,  0, 29,  5, 30, 28,
         7,  1,  0, 31, 21, 13, 27,  0,  8, 28, 16, 17,  0, 34, 20, 15,  0, 18,
        32,  7, 17,  0,  7,  8, 32,  0, 25, 17, 27,  9,  0, 28, 34, 30,  0,  7,
         8, 32,  0, 20,  7, 19,  0,  3, 34,  0,  8, 27, 25, 30,  0, 36, 10, 22,
        31, 32, 33,  0, 33, 34,  0, 29, 30,  8, 19,  0,  7, 30, 18, 32,  7, 17,
         0, 34, 

In [247]:
class Tashkeel(nn.Module):
  def __init__(self, vocab_size=37, embedding_dim=37, hidden_size=50, n_classes=16):
    """
    character level tashkeel model
    The constructor of our NER model
    Inputs:
    - vocab_size: the number of unique characters in the dataset
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(Tashkeel, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    self.embedding = nn.Embedding(vocab_size, embedding_dim)

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True,bidirectional=True)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(100, n_classes)
    #####################################################################################################

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    # (1) pass the sentences through the embedding layer to get the embeddings

    final_output = self.embedding(sentences)
    # print('op of embedding layer: ', final_output.shape)
    final_output, _ = self.lstm(final_output)
    # print('op of lstm layer: ',final_output.shape)
    final_output = self.linear(final_output)
    # print('op of linear layer: ',final_output.shape)
    # print(_)
    ###############################################################################################
    return final_output

In [248]:
model = Tashkeel()
print(model)

Tashkeel(
  (embedding): Embedding(37, 37)
  (lstm): LSTM(37, 50, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=100, out_features=16, bias=True)
)


In [249]:
def train(model, train_dataset, batch_size=10, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """

  ############################## TODO: replace the Nones in the following code ##################################
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=5, shuffle=True)


  # (2) make the criterion cross entropy loss
  criterion = nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train input to the device
      train_label = train_label.to(device)

      # (5) move the train label to the device
      train_input = train_input.to(device)


      # (6) do the forward pass
      output = model(train_input)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      
      batch_loss = criterion(output.view(-1, 16), train_label.view(-1))

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      acc = (output.argmax(dim=2) == train_label).sum().item()
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward()

      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)

    # (13) calculate the accuracy
    epoch_acc = total_acc_train / (len(train_dataset) * len(train_dataset[0][0]))
    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')


In [257]:
train_dataset = TashkeelDataset(x, y, 0)


The max size is 814
The max size is 851


In [251]:
from tqdm import tqdm
train(model, train_dataset)

100%|██████████| 10/10 [00:01<00:00,  9.15it/s]


Epochs: 1 | Train Loss: 0.20267097234725953         | Train Accuracy: 0.7248402948402949



100%|██████████| 10/10 [00:01<00:00,  7.63it/s]


Epochs: 2 | Train Loss: 0.08530916124582291         | Train Accuracy: 0.8774447174447174



100%|██████████| 10/10 [00:00<00:00, 10.36it/s]


Epochs: 3 | Train Loss: 0.07445384323596954         | Train Accuracy: 0.8821867321867322



100%|██████████| 10/10 [00:01<00:00,  9.63it/s]


Epochs: 4 | Train Loss: 0.07002351671457291         | Train Accuracy: 0.8837592137592137



100%|██████████| 10/10 [00:01<00:00,  7.38it/s]

Epochs: 5 | Train Loss: 0.06565391570329666         | Train Accuracy: 0.8919656019656019






In [258]:
def evaluate(model, test_dataset, batch_size=5):
  """
  This function takes a NER model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a NER model
  - test_dataset: dataset of type NERDataset
  """
  ########################### TODO: Replace the Nones in the following code ##########################

  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0
  
  # (2) disable gradients
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_label = test_label.to(device) 

      # (4) move the test label to the device
      test_input = test_input.to(device)

      # (5) do the forward pass
      output = model(test_input)

      # accuracy calculation (just add the correct predicted items to total_acc_test)
      acc = (output.argmax(dim=2) == test_label).sum().item()
      total_acc_test += acc
    
    # (6) calculate the over all accuracy
    total_acc_test /= (len(test_dataset) * len(test_dataset[0][0]))
  ##################################################################################################

  
  print(f'\nTest Accuracy: {total_acc_test}')

In [259]:
evaluate(model, test_dataset)

100%|██████████| 10/10 [00:00<00:00, 12.61it/s]


Test Accuracy: 0.8622091656874266



