<h1> Tashkeel </h1>

This project is developed by Abdulhameed Osama, Hossam Nabil and Nourhan Mohamed as a part of the Natural Language Processing course (NLP) at Cairo University.

In this project a Bi-LSTM model is trained to predict the diacritics of Arabic text. The model is trained on a dataset of 18 million characters from various domains. The model is trained on Google Colab.

We achieved an accuracy of 89.5% on the test set.

The model starts with a character embedding layer, followed by a Bi-LSTM layer, then a dense layer and finally a softmax layer. The model is trained using the Adam optimizer and the categorical crossentropy loss function.

<h3> Imports </h3>

In [21]:
import numpy as np
import pickle as pkl
import helper_file as hf
from tqdm import tqdm
import os
import re
from nltk.tokenize import sent_tokenize

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader



# Arabic Text Preprocessing

In [527]:
# Read the dataset
dataset = hf.read_file('dataset/train.txt')

val = hf.read_file('dataset/val.txt')

arabic_letters = hf.read_pickle('Delivery/arabic_letters.pickle')

diacritics = hf.read_pickle('Delivery/diacritics.pickle')

diacritics_to_id = hf.read_pickle('Delivery/diacritics2id.pickle')



In [389]:
id_to_diacritics = {}
for key, value in diacritics_to_id.items():
    id_to_diacritics[value] = key

In [528]:

# Clean the original dataset
cleaned_dataset_with_diacritics = hf.clean_dataset(dataset, remove_diacritics=False)
cleaned_dataset = hf.clean_dataset(dataset, remove_diacritics=True)
hf.write_file('Delivery/cleaned_dataset.txt', cleaned_dataset)
hf.write_file('Delivery/cleaned_dataset_with_diacritics.txt', cleaned_dataset_with_diacritics)


In [529]:
cleaned_val_with_diacritics = hf.clean_dataset(val, remove_diacritics=False)
cleaned_val = hf.clean_dataset(val, remove_diacritics=True)
hf.write_file('Delivery/cleaned_val.txt', cleaned_val)
hf.write_file('Delivery/cleaned_val_with_diacritics.txt', cleaned_val_with_diacritics)

In [16]:
def split_arabic_sentences(text):
   
    pattern = r'(?<=[.؟!,؛])'

    # Split the text into sentences based on the pattern
    sentences = re.split(pattern, text)
    # remove punctuations
    sentences = [re.sub(r'[.؟!،؛]', '', sent) for sent in sentences]
    sentences = [re.sub(r'\s+', ' ', sent) for sent in sentences]
    sentences = filter(lambda sentences: sentences.strip(), sentences)

    return sentences 


def append_to_file(file_path, content):
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(content.strip() + '\n')






In [421]:
import os
sentences = split_arabic_sentences(cleaned_dataset)

# Check if the file exists and if it does, delete its content

if os.path.exists('Delivery/sentences.txt'):
    hf.write_file('Delivery/sentences.txt', '')


for s in sentences:
    append_to_file('Delivery/sentences.txt', s)



In [18]:
diacritized_sentences = split_arabic_sentences(cleaned_dataset_with_diacritics)

if os.path.exists('Delivery/diacritized_sentences.txt'):
    hf.write_file('Delivery/diacritized_sentences.txt', '')

for s in diacritized_sentences:
    append_to_file('Delivery/diacritized_sentences.txt', s)


In [530]:
val_sentences = split_arabic_sentences(cleaned_val)

if os.path.exists('Delivery/val_sentences.txt'):
    hf.write_file('Delivery/val_sentences.txt', '')

for s in val_sentences:
  append_to_file('Delivery/val_sentences.txt',s)

In [531]:
d_val = split_arabic_sentences(cleaned_val_with_diacritics)

if os.path.exists('Delivery/d_val.txt'):
    hf.write_file('Delivery/d_val.txt', '')

for s in d_val:
  append_to_file('Delivery/d_val.txt',s)

In [309]:



def get_diacritics(text,chars,diacritics2id):
    text = list(text)
    string = ''
    diacritics = []
    counter = 0 
    for char in text:
        if char == ' ':
            # print('space ////////////////////////////////////')
            # print(f'{string} appeneed to diacritics list space condition')
            diacritics.append(diacritics_to_id[string])
            diacritics.append(14)
            string = ''
            counter += 1
            continue
    
        if char not in chars:
            # print(f"diacritic {char}")
            string += char
            # print('concatenated string',string)
        else:
            # print('regular char ',char)
            if text[counter - 1] in chars:
                # print(f'{string} appeneed to diacritics list counter condition')
                diacritics.append(diacritics2id[string])
                string = ''
            elif string != '':
                # print(f'{string} appeneed to diacritics list normal condidition')
                diacritics.append(diacritics2id[string])
    
            string = ''

        counter += 1    

    return diacritics

def get_tashkeel(text,char,diacritics2id):
    diacritics = get_diacritics(text,char,diacritics2id)
    if text[-1] not in char:
        diacritics.append(diacritics2id[text[-1]])
    return diacritics

        



chars = sorted(arabic_letters) 
char_to_idx = {char: idx + 1 for idx, char in enumerate(chars)}  # Assigning 0 for padding
char_to_idx[' '] = 0
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
idx_to_char[0] = ' ' 



In [310]:
class TashkeelDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    max_size = max([len(i) for i in x]) # find the max length of the sentences 
    print('The max size is', max_size)  
    self.x = torch.tensor([i + [pad] * (max_size - len(i)) for i in x]) # pad the sentences with <PAD> token
    self.y = torch.tensor([i + [14] * (max_size - len(i)) for i in y]) # pad the labels with <PAD> token

  def __len__(self):
    return len(self.x)

  def __getitem__(self, idx):
    return (self.x[idx], self.y[idx])

In [311]:
gomal = hf.read_file('Delivery/sentences.txt')
gomal_tashkeel = hf.read_file('Delivery/diacritized_sentences.txt')

x = []
for line in gomal.splitlines():
    x.append([char_to_idx[char] for char in line])

y = []
for line in gomal_tashkeel.splitlines():
    y.append(get_tashkeel(line,chars,diacritics_to_id))




In [331]:
val_gomal = hf.read_file('Delivery/val_sentences.txt')
val_gomal_tashkeel = hf.read_file('Delivery/d_val.txt')

x_val = []
for line in val_gomal.splitlines():
    x_val.append([char_to_idx[char] for char in line])

y_val = []
for line in val_gomal_tashkeel.splitlines():
    y_val.append(get_tashkeel(line,chars,diacritics_to_id))

In [27]:
batch_size = 5
mini_sentences = x[0: 8]
mini_labels = y[0: 8]
mini_dataset = TashkeelDataset(mini_sentences, mini_labels, 40)
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=5)
dg = iter(dummy_dataloader)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0][:])


The max size is 425
torch.Size([5, 425]) torch.Size([5, 425]) torch.Size([3, 425]) torch.Size([3, 425])
tensor([28, 34, 30, 33,  0,  3, 34,  0, 28, 23, 25,  0,  7, 30,  3, 34, 30,  0,
        36, 15, 33,  0,  5, 30, 14,  0, 28,  7, 30,  0,  7, 30, 18, 17, 29, 20,
        36,  0,  7,  8, 32,  0, 25, 17, 27,  9,  0, 28, 34, 30, 33,  0,  8, 30,
        27, 24,  0, 36, 28, 10, 22, 36, 33,  0, 29,  5, 32, 29,  7, 17,  0, 26,
        36, 17,  0, 13, 15, 36, 11,  0,  8,  7, 30,  5, 19, 30,  7, 31,  0, 34,
        12, 34,  8,  0, 31,  7,  0, 25, 30, 31,  0, 34, 12, 34,  8, 33,  0, 31,
        32,  0,  7, 30, 15, 36, 32,  0, 22, 17, 34, 17,  9,  0, 29,  5, 30, 28,
         7,  1,  0, 31, 21, 13, 27,  0,  8, 28, 16, 17,  0, 34, 20, 15,  0, 18,
        32,  7, 17,  0,  7,  8, 32,  0, 25, 17, 27,  9,  0, 28, 34, 30,  0,  7,
         8, 32,  0, 20,  7, 19,  0,  3, 34,  0,  8, 27, 25, 30,  0, 36, 10, 22,
        31, 32, 33,  0, 33, 34,  0, 29, 30,  8, 19,  0,  7, 30, 18, 32,  7, 17,
         0, 34, 

In [312]:
class Tashkeel(nn.Module):
  def __init__(self, vocab_size=37, embedding_dim=37, hidden_size=50, n_classes=15):
    """
    character level tashkeel model
    The constructor of our NER model
    Inputs:
    - vocab_size: the number of unique characters in the dataset
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(Tashkeel, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True,bidirectional=True) 
    self.linear = nn.Linear(100, n_classes)

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """
    final_output = self.embedding(sentences)
    final_output, _ = self.lstm(final_output)
    final_output = self.linear(final_output)
    return final_output

In [31]:
model = Tashkeel()
print(model)

Tashkeel(
  (embedding): Embedding(37, 37)
  (lstm): LSTM(37, 50, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=100, out_features=15, bias=True)
)


In [32]:
def train(model, train_dataset, batch_size=10, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=5, shuffle=True)
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):
      train_label = train_label.to(device)
      train_input = train_input.to(device)
      output = model(train_input)
      
      batch_loss = criterion(output.view(-1, 15), train_label.view(-1))
      total_loss_train += batch_loss.item()

      acc = (output.argmax(dim=2) == train_label).sum().item()
      total_acc_train += acc
      optimizer.zero_grad()
      batch_loss.backward()
      optimizer.step()
      
    epoch_loss = total_loss_train / len(train_dataset)
    epoch_acc = total_acc_train / (len(train_dataset) * len(train_dataset[0][0]))
    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')


In [333]:
train_dataset = TashkeelDataset(x[0:1000], y[0:1000], 0)
val_dataset = TashkeelDataset(x_val[0:500], y_val[0:500], 0)


The max size is 2544
The max size is 1739


In [349]:
train(model, train_dataset)

100%|██████████| 200/200 [01:12<00:00,  2.77it/s]


Epochs: 1 | Train Loss: 0.015868429139256478         | Train Accuracy: 0.9731387578616352



100%|██████████| 200/200 [01:11<00:00,  2.79it/s]


Epochs: 2 | Train Loss: 0.013220682865008711         | Train Accuracy: 0.9773474842767296



100%|██████████| 200/200 [01:06<00:00,  3.01it/s]


Epochs: 3 | Train Loss: 0.012505192011594772         | Train Accuracy: 0.9786859276729559



100%|██████████| 200/200 [00:52<00:00,  3.83it/s]


Epochs: 4 | Train Loss: 0.011913397298194468         | Train Accuracy: 0.9795365566037736



100%|██████████| 200/200 [01:01<00:00,  3.27it/s]

Epochs: 5 | Train Loss: 0.011786960931494832         | Train Accuracy: 0.9798172169811321






In [351]:
def correct_evaluation(model,test_dataset,batch_size=5):
     # Create the test data loader
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    # GPU Configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    total_correct_predictions = 0
    total_predictions = 0
    letter_count = 0
    correct_letter_d = 0
    all_predictions = []
    all_test_inputs = []
    with torch.no_grad():
        for test_input, test_label in tqdm(test_dataloader):
            # Move the test input to the device
            test_label = test_label.to(device)
            # Move the test label to the device
            test_input = test_input.to(device)

            # Perform the forward pass
            output = model(test_input)

            # Get predicted labels
            predicted_labels = output.argmax(dim=2)
            # print(test_input.shape)
            for j,batch in enumerate(test_input):

              for i,char in enumerate(batch):
                # print(char.shape)
                if char.item() != 0:
                  letter_count += 1
                  if predicted_labels[j][i].item() == test_label[j][i].item():
                    correct_letter_d += 1

            all_test_inputs.append(test_input)
            all_predictions.append(predicted_labels)






            # Count correct predictions, excluding label 15
            # correct_predictions = ((predicted_labels == test_label) & (test_label != 15)).sum().item()
            # total_correct_predictions += correct_predictions

            # # Count total predictions, excluding instances where the ground truth label is 15
            # valid_predictions = (test_label != 15).sum().item()
            # total_predictions += valid_predictions
            # space_pred = (predicted_labels == 14).sum().item()
            # total_correct_predictions +=space_pred
            # valid_predictions = (test_label == 14).sum().item()
            # total_predictions += valid_predictions



    # Calculate the overall accuracy excluding label 15
    overall_accuracy = correct_letter_d / letter_count
    print(len(all_test_inputs),len(all_predictions))
    print(f'overall accuracy: {overall_accuracy}')
    return all_test_inputs,all_predictions


In [None]:
test_inputs,predictions = correct_evaluation(model,val_dataset)

In [38]:
def extract_data_from_id(test_input, predictions, id_to_diacritics, idx_to_char):
  for k,test in enumerate(test_input):
    for j,batch in enumerate(test):
        string = ''
        for i,char in enumerate(batch):
          # print(char.shape)
          # print("len ", len(predictions))
          # print("shape ", predictions[k].shape)
          # print("prediction ",predictions[k][j][i])
          string += (idx_to_char[test[j][i].item()] + id_to_diacritics[predictions[k][j][i].item()])
        print(string)
        break
    break

In [54]:
print("قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ")

id_to_diacritics = {}
for key, value in diacritics_to_id.items():
    id_to_diacritics[value] = key

# pklmodel = torch.load('Delivery/model.pickle')
# test_inputs,predictions = correct_evaluation(pklmodel,val_dataset)

extract_data_from_id(test_inputs, predictions, id_to_diacritics, idx_to_char)

قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ


100%|██████████| 100/100 [00:06<00:00, 16.54it/s]

100 100
overall accuracy: 0.7410149770416136
قَوْلُهُ وَلَا تُكْرْهُ ضِيَافَتُهُ                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        




# Positional Feature


In [276]:
def extract_positional_features(text):
    # Character Index
    char_index = {char: idx for idx, char in enumerate(text)}
    
    # Relative Position
    rel_positions = {char: (idx / len(text)) for idx, char in enumerate(text)}
    print(rel_positions)

    return char_index, rel_positions

# rel_char_index, rel_positions = extract_positional_features(sample_text)

In [281]:
class Pf_Dataset(torch.utils.data.Dataset):

  def __init__(self,x):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    max_size = max([len(i) for i in x]) # find the max length of the sentences 
    print('The max size is', max_size)  
    self.x = torch.tensor([i + [pad] * (max_size - len(i)) for i in x]) # pad the sentences with <PAD> token

  def __len__(self):
    return len(self.x)

  def __getitem__(self, idx):
    return self.x[idx]

In [278]:
gomal = hf.read_file('Delivery/sentences.txt')
gomal_tashkeel = hf.read_file('Delivery/diacritized_sentences.txt')

x_pos = []
for line in gomal.splitlines():
    _ , rel_pos  = extract_positional_features(line)
    x_pos.append([rel_pos[char] for char in line])




In [336]:
pf_dataset = Pf_Dataset(x_pos[0:1000], 0)

The max size is 2544


In [None]:
batch_size = 5
mini_sentences = x_pos[0: 8]
mini_dataset = Pf_Dataset(mini_sentences,0)
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=5)
dg = iter(dummy_dataloader)
X1= next(dg)
X2 = next(dg)
print( X1.shape, X2.shape)
print(X1[0][:])

In [326]:
class Tashkeel_PF(nn.Module):
  def __init__(self, vocab_size=37, embedding_dim=37, hidden_size=50, n_classes=15):
    """
    character level tashkeel model
    The constructor of our NER model
    Inputs:
    - vocab_size: the number of unique characters in the dataset
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(Tashkeel_PF, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    
    self.lstm = nn.LSTM(embedding_dim + 1, hidden_size, batch_first=True,bidirectional=True) 
    self.linear = nn.Linear(2 * hidden_size, n_classes)

  def forward(self, sentences,positional_features):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """
    embedding = self.embedding(sentences)
    positional_features = positional_features.unsqueeze(-1)
    combined = torch.cat((embedding,positional_features),dim=2)
    final_output , _ = self.lstm(combined)
    final_output = self.linear(final_output)
    return final_output

In [327]:
model_PF = Tashkeel_PF()
print(model_PF)

Tashkeel_PF(
  (embedding): Embedding(37, 37)
  (lstm): LSTM(38, 50, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=100, out_features=15, bias=True)
)


In [328]:
def train_PF(model, train_dataset,positiona_features, batch_size=10, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=5, shuffle=False)
  pos_dataloader = torch.utils.data.DataLoader(positiona_features, batch_size=5, shuffle=False)
  p_iter = iter(pos_dataloader)  

  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):
      try:
        pf = next(p_iter)
      except StopIteration:
        p_iter = iter(pos_dataloader)  # Reset the iterator if it reaches the end
        pf = next(p_iter) 
        
      train_label = train_label.to(device)
      train_input = train_input.to(device)
      output = model(train_input,pf)
      
      batch_loss = criterion(output.view(-1, 15), train_label.view(-1))
      total_loss_train += batch_loss.item()

      acc = (output.argmax(dim=2) == train_label).sum().item()
      total_acc_train += acc
      optimizer.zero_grad()
      batch_loss.backward()
      optimizer.step()
      
    epoch_loss = total_loss_train / len(train_dataset)
    epoch_acc = total_acc_train / (len(train_dataset) * len(train_dataset[0][0]))
    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')


In [338]:
train_PF(model_PF, train_dataset,pf_dataset)

100%|██████████| 200/200 [01:16<00:00,  2.62it/s]


Epochs: 1 | Train Loss: 0.01637888864427805         | Train Accuracy: 0.9721603773584906



100%|██████████| 200/200 [01:16<00:00,  2.62it/s]


Epochs: 2 | Train Loss: 0.013647409402765334         | Train Accuracy: 0.9769889937106918



100%|██████████| 200/200 [01:26<00:00,  2.30it/s]


Epochs: 3 | Train Loss: 0.012667141629382967         | Train Accuracy: 0.9784606918238994



100%|██████████| 200/200 [01:13<00:00,  2.74it/s]


Epochs: 4 | Train Loss: 0.012123758212663233         | Train Accuracy: 0.9793219339622642



100%|██████████| 200/200 [00:54<00:00,  3.67it/s]

Epochs: 5 | Train Loss: 0.011732063310220838         | Train Accuracy: 0.9799229559748428






In [344]:
def correct_evaluation_pf(model,test_dataset,pf_dataset,batch_size=5):
     # Create the test data loader
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    p_test_dataloader = torch.utils.data.DataLoader(pf_dataset, batch_size=batch_size)

    # GPU Configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    total_correct_predictions = 0
    total_predictions = 0
    letter_count = 0
    correct_letter_d = 0
    all_predictions = []
    all_test_inputs = []
    p_iter = iter(p_test_dataloader)
    with torch.no_grad():
        for test_input, test_label in tqdm(test_dataloader):
            pf = next(p_iter)
            # Move the test input to the device
            test_label = test_label.to(device)
            # Move the test label to the device
            test_input = test_input.to(device)

            # Perform the forward pass
            output = model(test_input,pf)

            # Get predicted labels
            predicted_labels = output.argmax(dim=2)
            # print(test_input.shape)
            for j,batch in enumerate(test_input):

              for i,char in enumerate(batch):
                # print(char.shape)
                if char.item() != 0:
                  letter_count += 1
                  if predicted_labels[j][i].item() == test_label[j][i].item():
                    correct_letter_d += 1

            all_test_inputs.append(test_input)
            all_predictions.append(predicted_labels)






            # Count correct predictions, excluding label 15
            # correct_predictions = ((predicted_labels == test_label) & (test_label != 15)).sum().item()
            # total_correct_predictions += correct_predictions

            # # Count total predictions, excluding instances where the ground truth label is 15
            # valid_predictions = (test_label != 15).sum().item()
            # total_predictions += valid_predictions
            # space_pred = (predicted_labels == 14).sum().item()
            # total_correct_predictions +=space_pred
            # valid_predictions = (test_label == 14).sum().item()
            # total_predictions += valid_predictions



    # Calculate the overall accuracy excluding label 15
    overall_accuracy = correct_letter_d / letter_count
    print(len(all_test_inputs),len(all_predictions))
    print(f'overall accuracy: {overall_accuracy}')
    return all_test_inputs,all_predictions


In [346]:

x_pos_val = []
for line in val_gomal.splitlines():
    _ , rel_pos  = extract_positional_features(line)
    x_pos_val.append([rel_pos[char] for char in line])

In [347]:
val_pf_dataset = Pf_Dataset(x_pos_val[0:500], 0)

The max size is 1739


In [348]:
test_inputs,predictions = correct_evaluation_pf(model_PF,val_dataset,val_pf_dataset)

100%|██████████| 100/100 [00:16<00:00,  6.02it/s]

100 100
overall accuracy: 0.6964564752265788





# WORD2VEC


In [422]:
# Feature Extraction
from gensim.models.fasttext import FastText

# Load the pre-trained FastText model
fasttext_model = FastText.load('models/arabic_word2vec_model.bin')


In [451]:
sen = hf.read_file('Delivery/sentences.txt')
d_sen = hf.read_file('Delivery/diacritized_sentences.txt')

sen_t = [s.split() for s in sen.splitlines()]
d_sen_t = [s.split() for s in d_sen.splitlines()]


In [533]:
val_gomal = hf.read_file('Delivery/val_sentences.txt')
val_gomal_tashkeel = hf.read_file('Delivery/d_val.txt')

val_t = [s.split() for s in val_gomal.splitlines()]
d_val_t = [s.split() for s in val_gomal_tashkeel.splitlines()]


In [431]:

def get_word_embeddings(sen_t):
    word_embeddings = []
    for s in sen_t:
        for word in s:
            try:
                word_embeddings.append(fasttext_model.wv[word])
            except KeyError:
                word_embeddings.append(np.zeros(100))
    return word_embeddings

In [470]:
def get_char_to_word_mapping(sentences,word_embeddings,model):
    word_embedding_for_char = []
    for sen in sentences:
        # print(sen)
        s = []
        for word in sen:
            # print(word)
            embedding = model.wv[word]
            emdedding = embedding.tolist()
            # print(embedding)
            for char in word:
                # print(char)
                s.append(embedding)
                # print(embedding.shape)

        word_embedding_for_char.append(s)        


    return word_embedding_for_char


embeddings = get_char_to_word_mapping(sen_t , get_word_embeddings(sen_t),fasttext_model)

    
       



In [534]:
embeddings_val = get_char_to_word_mapping(val_t , get_word_embeddings(val_t),fasttext_model)    


In [None]:
print(type(embeddings[0][0]))

In [462]:

x_wv = []
for s in sen_t:
    sen = []
    for word in s:
        sen += [char_to_idx[char] for char in word]
    x_wv.append(sen)    



In [463]:
y_wv = []
for s in d_sen_t:
    sen = []
    for word in s:
        sen += get_tashkeel(word,chars,diacritics_to_id)
    y_wv.append(sen)    


In [535]:
x_wv_val = []
for s in val_t:
    sen = []
    for word in s:
        sen += [char_to_idx[char] for char in word]
    x_wv_val.append(sen)
    

In [536]:
y_wv_val = []
for s in d_val_t:
    sen = []
    for word in s:
        sen += get_tashkeel(word,chars,diacritics_to_id)
    y_wv_val.append(sen)

In [541]:
train_dataset_wv = TashkeelDataset(x_wv[0:5000], y_wv[0:5000], 0)


The max size is 2034


In [537]:
val_dataset_wv = TashkeelDataset(x_wv_val[0:500], y_wv_val[0:500], 0)

The max size is 1397


In [473]:
class W2V_Dataset(torch.utils.data.Dataset):

  def __init__(self,x):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    max_size = max([len(i) for i in x]) # find the max length of the sentences 
    print('The max size is', max_size)  
    self.x = torch.tensor([i + [np.zeros(100)] * (max_size - len(i)) for i in x]) # pad the sentences with <PAD> token

  def __len__(self):
    return len(self.x)

  def __getitem__(self, idx):
    return self.x[idx]

In [525]:
w2v_dataset = W2V_Dataset(embeddings[0:1000])

The max size is 2034


In [538]:
w2v_dataset_val = W2V_Dataset(embeddings_val[0:500])

The max size is 1397


In [476]:
batch_size = 5
mini_sentences = embeddings[0: 8]
mini_dataset = W2V_Dataset(mini_sentences)
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=5)
dg = iter(dummy_dataloader)
X1= next(dg)
X2 = next(dg)
print( X1.shape, X2.shape)
print(X1[0][:])

The max size is 331
torch.Size([5, 331, 100]) torch.Size([3, 331, 100])
tensor([[ 1.7722,  0.6402, -0.5912,  ...,  0.1362,  0.6359, -0.1419],
        [ 1.7722,  0.6402, -0.5912,  ...,  0.1362,  0.6359, -0.1419],
        [ 1.7722,  0.6402, -0.5912,  ...,  0.1362,  0.6359, -0.1419],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       dtype=torch.float64)


In [518]:
class Tashkeel_W2V(nn.Module):
  def __init__(self, vocab_size=37, embedding_dim=37, hidden_size=50, n_classes=15):
    """
    character level tashkeel model
    The constructor of our NER model
    Inputs:
    - vocab_size: the number of unique characters in the dataset
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(Tashkeel_W2V, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    
    self.lstm = nn.LSTM(embedding_dim + 100, hidden_size, batch_first=True,bidirectional=True) 
    self.linear = nn.Linear(2 * hidden_size, n_classes)

  def forward(self, sentences,w2v):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """
    embedding = self.embedding(sentences)
    w2v = w2v.to(embedding.dtype)
    # print("after sq",w2v.shape)
    combined = torch.cat((embedding,w2v),dim=2)
    final_output , _ = self.lstm(combined)
    final_output = self.linear(final_output)
    return final_output

In [519]:
model_w2v = Tashkeel_W2V()
print(model_w2v)

Tashkeel_W2V(
  (embedding): Embedding(37, 37)
  (lstm): LSTM(137, 50, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=100, out_features=15, bias=True)
)


In [522]:
def train_W2V(model, train_dataset,w2v_features, batch_size=10, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=5, shuffle=False)
  w2v_dataloader = torch.utils.data.DataLoader(w2v_features, batch_size=5, shuffle=False)
  p_iter = iter(w2v_dataloader)  

  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):
      try:
        pf = next(p_iter)
      except StopIteration:
        p_iter = iter(w2v_dataloader)  # Reset the iterator if it reaches the end
        pf = next(p_iter) 
        
      train_label = train_label.to(device)
      train_input = train_input.to(device)
      output = model(train_input,pf)
      
      batch_loss = criterion(output.view(-1, 15), train_label.view(-1))
      total_loss_train += batch_loss.item()

      acc = (output.argmax(dim=2) == train_label).sum().item()
      total_acc_train += acc
      optimizer.zero_grad()
      batch_loss.backward()
      optimizer.step()
      
    epoch_loss = total_loss_train / len(train_dataset)
    epoch_acc = total_acc_train / (len(train_dataset) * len(train_dataset[0][0]))
    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')


In [543]:
train_W2V(model_w2v, train_dataset_wv,w2v_dataset)

100%|██████████| 1000/1000 [06:18<00:00,  2.64it/s]


Epochs: 1 | Train Loss: 0.013128688661009073         | Train Accuracy: 0.9769163225172075



100%|██████████| 1000/1000 [06:36<00:00,  2.52it/s]


Epochs: 2 | Train Loss: 0.012160730931349098         | Train Accuracy: 0.9788111111111111



100%|██████████| 1000/1000 [06:49<00:00,  2.44it/s]


Epochs: 3 | Train Loss: 0.011781908652558922         | Train Accuracy: 0.9795470009832842



100%|██████████| 1000/1000 [05:04<00:00,  3.28it/s]


Epochs: 4 | Train Loss: 0.011533078382257372         | Train Accuracy: 0.98000668633235



100%|██████████| 1000/1000 [04:41<00:00,  3.55it/s]

Epochs: 5 | Train Loss: 0.011300006229896098         | Train Accuracy: 0.9804890855457227






In [539]:
def correct_evaluation_wv(model,test_dataset,wv_dataset,batch_size=5):
     # Create the test data loader
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    wv_test_dataloader = torch.utils.data.DataLoader(wv_dataset, batch_size=batch_size)

    # GPU Configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    total_correct_predictions = 0
    total_predictions = 0
    letter_count = 0
    correct_letter_d = 0
    all_predictions = []
    all_test_inputs = []
    p_iter = iter(wv_test_dataloader)
    with torch.no_grad():
        for test_input, test_label in tqdm(test_dataloader):
            pf = next(p_iter)
            # Move the test input to the device
            test_label = test_label.to(device)
            # Move the test label to the device
            test_input = test_input.to(device)

            # Perform the forward pass
            output = model(test_input,pf)

            # Get predicted labels
            predicted_labels = output.argmax(dim=2)
            # print(test_input.shape)
            for j,batch in enumerate(test_input):

              for i,char in enumerate(batch):
                # print(char.shape)
                if char.item() != 0:
                  letter_count += 1
                  if predicted_labels[j][i].item() == test_label[j][i].item():
                    correct_letter_d += 1

            all_test_inputs.append(test_input)
            all_predictions.append(predicted_labels)






            # Count correct predictions, excluding label 15
            # correct_predictions = ((predicted_labels == test_label) & (test_label != 15)).sum().item()
            # total_correct_predictions += correct_predictions

            # # Count total predictions, excluding instances where the ground truth label is 15
            # valid_predictions = (test_label != 15).sum().item()
            # total_predictions += valid_predictions
            # space_pred = (predicted_labels == 14).sum().item()
            # total_correct_predictions +=space_pred
            # valid_predictions = (test_label == 14).sum().item()
            # total_predictions += valid_predictions



    # Calculate the overall accuracy excluding label 15
    overall_accuracy = correct_letter_d / letter_count
    print(len(all_test_inputs),len(all_predictions))
    print(f'overall accuracy: {overall_accuracy}')
    return all_test_inputs,all_predictions


In [None]:
correct_evaluation_wv(model_w2v, val_dataset_wv, w2v_dataset_val)

# Test Set Evaluation



In [378]:
class Test_Dataset(torch.utils.data.Dataset):

  def __init__(self,x,pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
 
    max_size = max([len(i) for i in x]) # find the max length of the sentences 
    print('The max size is', max_size)
    self.x = torch.tensor([i + [pad] * (max_size - len(i)) for i in x]) # pad the sentences with <PAD> token

  def __len__(self):
    return len(self.x)

  def __getitem__(self, idx):
    return self.x[idx]

In [408]:
def test_evaluation(model,test_dataset,batch_size=5):
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    # GPU Configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    all_predictions = []
    all_test_inputs = []
    with torch.no_grad():
        for test_input in tqdm(test_dataloader):
            # print(test_input.shape)
            # Move the test input to the device
            test_input = test_input.to(device)

            # Perform the forward pass
            output = model(test_input)

            # Get predicted labels
            predicted_labels = output.argmax(dim=2)
            

            all_test_inputs.append(test_input)
            all_predictions.append(predicted_labels)

    return all_test_inputs,all_predictions

In [544]:
def test_evaluation_wv(model,test_dataset,wv_dataset,batch_size=5):
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    embeddings_dataloader = torch.utils.data.DataLoader(wv_dataset, batch_size=batch_size)
    p_iter = iter(embeddings_dataloader)
    # GPU Configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    all_predictions = []
    all_test_inputs = []
    with torch.no_grad():
        for test_input in tqdm(test_dataloader):
            pf = next(p_iter)
            # print(test_input.shape)
            # Move the test input to the device
            test_input = test_input.to(device)

            # Perform the forward pass
            output = model(test_input,pf)

            # Get predicted labels
            predicted_labels = output.argmax(dim=2)
            

            all_test_inputs.append(test_input)
            all_predictions.append(predicted_labels)

    return all_test_inputs,all_predictions

In [411]:
def extract_data_from_id_csv(test_input, predictions, id_to_diacritics, idx_to_char):
    id = 0  
    with open('Delivery/predictions.csv', 'w', encoding='utf-8') as file:
        file.write('ID,label\n')

    to_csv = []    

    for k,test in enumerate(test_input):
        for j,batch in enumerate(test):
            string = ''
            for i,char in enumerate(batch):
                if test[j][i].item() != 0:
                    to_csv_string = str(id) + ',' + str(predictions[k][j][i].item()) + '\n'
                    to_csv.append(to_csv_string)
                    id += 1
                # print(char.shape)
                # print("len ", len(predictions))
                # print("shape ", predictions[k].shape)
                # print("prediction ",predictions[k][j][i])
                # string += (idx_to_char[test[j][i].item()] + id_to_diacritics[predictions[k][j][i].item()])

                # output a csv file that has an iterator in first column and the diacritic in the second column
            # print(string)
     
    for item in to_csv:
        append_to_file('Delivery/predictions.csv', item)        


    return     
        


In [None]:
test_file = hf.read_file('Delivery/test_no_diacritics.txt')
cleaned_test = hf.clean_dataset(test_file,remove_diacritics=False)
test_sentences = split_arabic_sentences(cleaned_test)
if os.path.exists('Delivery/test_sentences.txt'):
    hf.write_file('Delivery/test_sentences.txt', '')

for s in test_sentences:
  append_to_file('Delivery/test_sentences.txt',s)



In [405]:
test_gomal = hf.read_file('Delivery/test_sentences.txt')

x_test = []
for line in test_gomal.splitlines():
    x_test.append([char_to_idx[char] for char in line])

print(x_test)

testeval_dataset = Test_Dataset(x_test, 0)

[[30, 36, 19, 0, 30, 30, 34, 29, 36, 30, 0, 8, 7, 30, 28, 8, 22, 0, 3, 32, 0, 36, 8, 17, 3, 0, 7, 30, 31, 15, 36, 32, 0, 3, 34, 0, 36, 33, 8, 0, 7, 30, 15, 36, 32, 0, 30, 33, 0, 3, 34, 0, 36, 3, 14, 16, 0, 17, 33, 32, 7, 0, 31, 32, 0, 7, 30, 31, 15, 36, 32, 0, 27, 36, 0, 31, 28, 7, 8, 30, 0, 7, 30, 15, 36, 32, 0, 3, 34, 0, 36, 28, 8, 30, 0, 5, 13, 7, 30, 10, 33, 0, 25, 30, 35, 0, 20, 14, 21, 0, 2, 14, 17, 0, 30, 29, 32, 0, 30, 33, 0, 3, 32, 0, 36, 3, 14, 16, 0, 29, 27, 36, 30, 7, 0, 30, 29, 32, 0, 30, 36, 19, 0, 30, 33, 0, 3, 32, 0, 36, 3, 14, 16, 0, 29, 27, 36, 30, 7, 0, 8, 20, 17, 23, 0, 8, 17, 7, 1, 9, 0, 7, 30, 3, 21, 36, 30, 0, 7, 32, 24, 17, 0, 7, 30, 31, 7, 15, 9, 0, 7, 30, 3, 32, 28, 17, 34, 36, 0, 7, 30, 23, 13, 23, 7, 34, 36, 0, 34, 21, 17, 9, 0, 7, 30, 27, 10, 7, 34, 35, 0, 7, 30, 8, 13, 17], [28, 34, 30, 33, 0, 34, 36, 28, 25, 0, 27, 36, 0, 8, 25, 22, 0, 7, 30, 32, 19, 14, 0, 8, 31, 32, 27, 25, 9, 0, 34, 31, 25, 36, 32, 0, 3, 36, 0, 3, 34, 21, 35, 0, 8, 31, 12, 31, 34, 25, 

In [545]:
test_t = [s.split() for s in test_gomal.splitlines()]

In [546]:
embeddings_test = get_char_to_word_mapping(test_t , get_word_embeddings(test_t),fasttext_model)

In [547]:
x_wv_test = []
for s in test_t:
    sen = []
    for word in s:
        sen += [char_to_idx[char] for char in word]
    x_wv_test.append(sen)

In [551]:
testeval_dataset_wv = Test_Dataset(x_wv_test, 0)

The max size is 1621


In [549]:
w2v_dataset_test = W2V_Dataset(embeddings_test)

The max size is 1621


In [552]:
test_inputs,predictions = test_evaluation_wv(model_w2v, testeval_dataset_wv, w2v_dataset_test)


100%|██████████| 541/541 [00:57<00:00,  9.38it/s]


In [554]:
torch.save(model_w2v, 'models/model_wv.pickle')

In [553]:
extract_data_from_id_csv(test_inputs, predictions, id_to_diacritics, idx_to_char)

'الظَّاهِرِالْأَوَّلِ                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [395]:
test_inputs,predictions = test_evaluation(model,testeval_dataset)
extract_data_from_id_csv(test_inputs, predictions, id_to_diacritics, idx_to_char)


100%|██████████| 1/1 [00:00<00:00, 90.88it/s]

torch.Size([1, 226])





'لَيْسَ لِلْوَكِيلِ بِالْقَبَضِ أَنْ يَبْرَأَ الْمَدَينَ أَوْ يَهْبُ الدَّيْنَ لَهُ أَوْ يَأْخُذُ رَهْنَا مِنْ الْمَدَينِ فِي مُقَابَلِ الدَّيْنِ أَوْ يَقْبَلُ إحَالَتُهُ عَلَى شَخْصٍ آخَرَ لَكِنْ لَهُ أَنْ يَأْخُذَ كَفِّيلًا لَكِنْ لَيْسَ لَهُ أَنْ يَأْخُذَ كَفِّيلًا بِشَرْطِ بِرَاءَةِ الْأَصِيلِ انْظَرَ الْمَادَةُ الْأَنْقُرَوي الطَّحْطَاوِيُّ وَصَرَةَ الْفَتَاوَى الْبَحْرْ'

In [410]:
import pickle


model_2 = torch.load('models/model3.pickle')
print(model_2)

Tashkeel(
  (embedding): Embedding(37, 37)
  (lstm): LSTM(37, 50, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=100, out_features=15, bias=True)
)


In [412]:
test_inputs,predictions = test_evaluation(model_2,testeval_dataset)
extract_data_from_id_csv(test_inputs, predictions, id_to_diacritics, idx_to_char)

  0%|          | 0/541 [00:00<?, ?it/s]

100%|██████████| 541/541 [00:41<00:00, 12.93it/s]


'الظَّاهِرُ الْأَوَّلِ                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [None]:
test_file = hf.read_file('Delivery/test_no_diacritics.txt')
cleaned_test = hf.clean_dataset(test_file,remove_diacritics=False)
test_sentences = split_arabic_sentences(cleaned_test)
if os.path.exists('Delivery/test_sentences.txt'):
    hf.write_file('Delivery/test_sentences.txt', '')

for s in test_sentences:
  append_to_file('Delivery/test_sentences.txt',s)
  
test_gomal = hf.read_file('Delivery/test_sentences.txt')

x_test = []
for line in test_gomal.splitlines():
    x_test.append([char_to_idx[char] for char in line])

print(x_test)

testeval_dataset = Test_Dataset(x_test, 0)  
model_demo = torch.load('models/model.pickle')

test_inputs,predictions = test_evaluation(model_demo,testeval_dataset)
extract_data_from_id_csv(test_inputs, predictions, id_to_diacritics, idx_to_char)