In [23]:
import os
import re
import torch
import helper_file as hf
from tqdm import tqdm
import torch.nn as nn

In [30]:
def split_arabic_sentences(text):
    """
    Split cleaned Arabic text into sentence-like chunks at punctuation marks.

    The text is cut immediately after every delimiter, the delimiters themselves
    are removed, runs of whitespace are collapsed to a single space, and chunks
    that are empty or whitespace-only are dropped.

    Inputs:
    - text: the string to split

    Returns:
    - a list of chunks in their original order. Chunks are not stripped, so a
      chunk may start or end with a single space.
    """
    # One character class drives both the split and the clean-up so the two
    # steps cannot drift apart (previously the split listed the Latin comma
    # while the clean-up listed the Arabic comma). Members:
    #   .  !  ,  U+060C Arabic comma  U+061B Arabic semicolon  U+061F Arabic question mark
    # Escapes are used because mixed left-to-right / right-to-left punctuation
    # is easy to misread in an editor.
    delimiters = '[.!,\u060c\u061b\u061f]'

    # The look-behind is zero-width, so every chunk keeps its trailing delimiter
    # until it is removed below.
    chunks = re.split('(?<=' + delimiters + ')', text)

    cleaned = []
    for chunk in chunks:
        chunk = re.sub(delimiters, '', chunk)   # remove punctuation
        chunk = re.sub(r'\s+', ' ', chunk)      # collapse whitespace runs
        if chunk.strip():
            cleaned.append(chunk)

    # A real list (rather than a one-shot filter iterator) can be iterated more
    # than once and supports len().
    return cleaned


def append_to_file(file_path, content):
    """
    Append `content` to a UTF-8 text file as a single line.

    Leading and trailing whitespace is stripped from `content` and one newline
    is added after it. The file is opened in append mode, so existing content
    is kept.
    """
    with open(file_path, mode='a', encoding='utf-8') as out:
        line = ''.join((content.strip(), '\n'))
        out.write(line)


# Lookup tables stored as pickles under Delivery/.
# NOTE(review): hf.read_pickle presumably unpickles these files; unpickling can
# execute arbitrary code, so only load pickles from a trusted source.
arabic_letters = hf.read_pickle('Delivery/arabic_letters.pickle')
diacritics = hf.read_pickle('Delivery/diacritics.pickle')
diacritics_to_id = hf.read_pickle('Delivery/diacritics2id.pickle')

# Character vocabulary: ids are handed out from 1 upwards in sorted order.
# Id 0 is given to the space character and is also the value used for padding.
chars = sorted(arabic_letters)
char_to_idx = dict(zip(chars, range(1, len(chars) + 1)))
char_to_idx[' '] = 0

# Reverse lookup: id -> character.
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
idx_to_char[0] = ' '


def extract_data_from_id_csv(test_input, predictions, id_to_diacritics, idx_to_char,
                             output_path='Delivery/predictions.csv'):
    """
    Write the predicted diacritic ids to a CSV file and print the diacritized text.

    Inputs:
    - test_input: sequence of 2-D integer tensors of character ids, one per batch
    - predictions: sequence of 2-D integer tensors of predicted diacritic ids,
      aligned element-for-element with `test_input`
    - id_to_diacritics: mapping from diacritic id to the diacritic string
    - idx_to_char: mapping from character id to the character
    - output_path: destination CSV file; it is overwritten on every call

    The CSV has the header 'ID,label' followed by one row per position whose
    character id is not 0. IDs count up from 0 across all batches. Positions
    holding id 0 get no row but are still part of the printed text.

    Returns:
    - None. One diacritized line is printed per input row.
    """
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write('ID,label\n')

        row_id = 0  # running CSV id; named so it does not shadow the builtin id()
        for input_batch, prediction_batch in zip(test_input, predictions):
            # tolist() converts a whole batch at once instead of calling
            # .item() on every single element.
            for char_ids, label_ids in zip(input_batch.tolist(), prediction_batch.tolist()):
                pieces = []
                for char_id, label_id in zip(char_ids, label_ids):
                    if char_id != 0:
                        # Rows are written while the file is still open. They
                        # used to be collected in a list that was never written,
                        # which left a header-only CSV.
                        file.write(str(row_id) + ',' + str(label_id) + '\n')
                        row_id += 1
                    pieces.append(idx_to_char[char_id] + id_to_diacritics[label_id])

                print(''.join(pieces))
 
        
# Reverse of diacritics_to_id: diacritic id -> diacritic string.
id_to_diacritics = {
    diacritic_id: diacritic
    for diacritic, diacritic_id in diacritics_to_id.items()
}



In [31]:
class Test_Dataset(torch.utils.data.Dataset):
  """
  Dataset of character-id sequences, right-padded to a common length.
  """

  def __init__(self,x,pad):
    """
    Build the padded tensor that backs the dataset.
    Inputs:
    - x: a list of lists where each inner list contains the character ids of one sentence
    - pad: the id used to right-pad every sentence to the length of the longest one
    """
    # default=0 keeps an empty `x` from raising "max() arg is an empty sequence"
    max_size = max((len(sentence) for sentence in x), default=0)
    print('The max size is', max_size)
    padded = [sentence + [pad] * (max_size - len(sentence)) for sentence in x]
    # The explicit dtype and reshape give a (num_sentences, max_size) integer
    # tensor even when there are no sentences; torch.tensor([]) on its own
    # would be a 1-D float tensor.
    self.x = torch.tensor(padded, dtype=torch.long).reshape(len(padded), max_size)

  def __len__(self):
    """Number of sentences in the dataset."""
    return len(self.x)

  def __getitem__(self, idx):
    """Return the padded id tensor of the sentence at position `idx`."""
    return self.x[idx]
  
  
class Tashkeel(nn.Module):
  def __init__(self, vocab_size=37, embedding_dim=37, hidden_size=50, n_classes=15):
    """
    Character-level tashkeel (diacritization) model:
    embedding -> bidirectional LSTM -> linear classifier applied at every position.
    Inputs:
    - vocab_size: the number of distinct character ids (rows of the embedding table)
    - embedding_dim: the embedding dimension
    - hidden_size: the hidden size of the LSTM, per direction
    - n_classes: the number of final classes (diacritic labels)
    """
    super(Tashkeel, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)

    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True,bidirectional=True)
    # A bidirectional LSTM concatenates the forward and backward states, so its
    # output has 2 * hidden_size features. Sizing this layer from embedding_dim
    # only worked when embedding_dim happened to equal hidden_size.
    self.linear = nn.Linear(hidden_size * 2, n_classes)

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: integer tensor of character ids, shape (batch_size, max_length)

    Returns:
    - final_output: tensor of unnormalized class scores, shape (batch_size, max_length, n_classes)
    """
    final_output = self.embedding(sentences)    # (batch, length, embedding_dim)
    final_output, _ = self.lstm(final_output)   # (batch, length, 2 * hidden_size)
    final_output = self.linear(final_output)    # (batch, length, n_classes)
    return final_output


def test_evaluation(model,test_dataset,batch_size=5):
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    # GPU Configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    all_predictions = []
    all_test_inputs = []
    with torch.no_grad():
        for test_input in tqdm(test_dataloader):
            # print(test_input.shape)
            # Move the test input to the device
            test_input = test_input.to(device)

            # Perform the forward pass
            output = model(test_input)

            # Get predicted labels
            predicted_labels = output.argmax(dim=2)
            

            all_test_inputs.append(test_input)
            all_predictions.append(predicted_labels)

    return all_test_inputs,all_predictions

In [32]:
# Live demo driver: read the undiacritized text, cut it into sentence-like
# chunks, encode the characters as ids, run the saved model and print the result.
test_file = hf.read_file('live_demo/test_no_diacritics.txt')
# NOTE(review): hf.clean_dataset is defined outside this notebook; its exact
# clean-up rules (and what remove_diacritics=False keeps) should be confirmed there.
cleaned_test = hf.clean_dataset(test_file,remove_diacritics=False)
test_sentences = split_arabic_sentences(cleaned_test)
# A sentences file left over from an earlier run is overwritten with an empty
# string first, because the loop below only ever appends.
# (presumably hf.write_file replaces the file content - TODO confirm)
if os.path.exists('live_demo/test_sentences.txt'):
    hf.write_file('live_demo/test_sentences.txt', '')

# One chunk per line; append_to_file strips each chunk before writing it.
for s in test_sentences:
  append_to_file('live_demo/test_sentences.txt',s)
  
# The chunks are read back from disk, so the lines encoded below are the
# stripped versions that were written to the file.
test_gomal = hf.read_file('live_demo/test_sentences.txt')

# Encode every line as a list of character ids.
# A character that is missing from char_to_idx raises KeyError here.
x_test = []
for line in test_gomal.splitlines():
    x_test.append([char_to_idx[char] for char in line])


# 0 is the padding id; char_to_idx also maps the space character to 0.
testeval_dataset = Test_Dataset(x_test, 0)  
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code -
# only load model files from a trusted source. Recent PyTorch releases may also
# default to weights_only=True, which would refuse a fully pickled model - TODO confirm.
model_demo = torch.load('models/model.pickle')

test_inputs,predictions = test_evaluation(model_demo,testeval_dataset)
extract_data_from_id_csv(test_inputs, predictions, id_to_diacritics, idx_to_char)

The max size is 572


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00, 113.12it/s]

لَيْسَ لِلْوَكِيلِ بِالْقَبْضِ أَنْ يَبْرَأَ الْمَدَينِ أَوْ يَهَبَ الدَّيْنِ لَهُ أَوْ يَأْخُذُ رَهُنًا مِنْ الْمَدَينِ فِي مُقَابَلِ الدَّيْنِ أَوْ يُقْبَلُ إحْالَتَهُ عَلَى شَخْصٍ آخَرَ لَكِنَّ لَهُ أَنْ يَأْخُذَ كَفِيلًا لَكِنْ لَيْسَ لَهُ أَنْ يَأْخُذَ كَفِيلًا بِشَرْطِ بَرِاءَةِ الْأَصِيلِ انْظُرْ الْمَادَّةِ الْأَنْقْرُوِيُّ الطَّحَطَاوِيُّ وَصَرَّةَ الْفَتَاوَى الْبَحْرِ                                                                                                                                                                                                                                                                                                                                                          
قَوْلُهُ وَيَقَعُ فِي بَعْضِ النَّسْخِ بِمَنْفَعَةٍ وَمُعَيَّنٌ أَيْ أَوْصَى بِمَجْمُوعِ شَيْئَينٍ بِمَنْفَعَةِ شَيْءٍ وَبِمُعَينٍ وَقَوْلُهُ وَلَيْسَ ذَلِكَ بِصَحِيحٍ كَأَنَّ عَدَمَ الصِّحَّةِ مِنْ جِهَةٍ أَنَّ هَذِهِ الْمَسْأَلَةَ فِيهَا نَصَّ بِهَذَا الْحُكْمِ الَّذِي 




وَانْظُرْ هَلْ مِنْ تَعْيِينِ الزَّمَانِ التَّقْيِيدِ بِشَهْرِ دُونَ تَعْيِينِهِ بِكَوْنِهِ رَبَيعًا مَثَلًا أَيْ بِالْإِشَارَةِ إلَيْهِ أَوْ لَيْسَ ذَلِكَ تَعْيِّينًا وَحِينَئِذٍ فَالْقِسْمَةُ غَيْرُ صَحِيحَةٍ وَالثَّانِي هُوَ مَا اخْتَارَهُ ابْنُ عَرَفَةَ وَاخْتَار شَيْخْنَا الْعَدْوِي أَنَّهُ تَعْيِينٌ قَوْلُهُ أَوْ نُونِ أَيْ مَضْمُومَةٌ فَهَمْزَةً وَيَجُوزُ قَلَبَ الْهَمْزَةِ يَاءً وَحِينَئِذٍ تَقْلِبُ ضمَّةُ النُّوْنِ الْوَاقِعَةَ قَبْلَهَا كَسْرَةٍ قَوْلْهُ لَا أَكْثَرَ أَيْ                                                                                                                                                                                                                                                                                         
لِأَنَّ الْمُدَّةَّ الَّتِي يَقَعُ الْقَبْضُ بَعْدَهَا هُنَا كَالْمُدَّةِ فِي الْإِجَارَةِ فَكَمَا لَا يَجُوزُ إجَارَةُ عَبْدْ مُعَيَّنٍ عَلَى أَنْ يَقْبَضَ بَعْدَ أَكْثَرَ مِنْ شَهْرِ لَا يَجُوزُ فِي الْمُهَايَأَةِ أَنْ يَسْتَعْمَل