In [71]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("./")
from collections import Counter
from utils import *
import pandas as pd
import torch 
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from tqdm import tqdm
import math

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read Files

In [72]:
# Index the corpus directory. Later cells use the result as a mapping from a
# folder path to a list of file paths (files[keys[0]][i] is passed to open()).
files = get_file_list('tashkeel-data/texts.txt/')

In [73]:
# Folder paths found by get_file_list; keys[0] (the dataset root) is the entry
# the following cells read their file list from.
keys = list(files.keys())
print("folder names: ")
keys

folder names: 


['tashkeel-data/texts.txt/',
 'tashkeel-data/texts.txt/msa',
 'tashkeel-data/texts.txt/msa/كتب حديثة',
 'tashkeel-data/texts.txt/msa/sulaity',
 'tashkeel-data/texts.txt/msa/aljazeera',
 'tashkeel-data/texts.txt/msa/al-kalema.org',
 'tashkeel-data/texts.txt/msa/manual',
 'tashkeel-data/texts.txt/msa/منوع',
 'tashkeel-data/texts.txt/msa/enfal.de']

# EDA

## Word Level EDA

In [74]:
# Frequency of every whitespace-separated token over all files listed under
# the dataset root entry (files[keys[0]]).
original_words = Counter()
for path in files[keys[0]]:
    # The corpus is Arabic text: decode explicitly (assumed UTF-8) instead of
    # depending on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily rather than materialising it with readlines().
        for line in f:
            original_words.update(line.strip().split())

KeyboardInterrupt: 

In [None]:
# Print the summary statistics (see output below) for the raw, uncleaned token counts.
get_word_statistics(original_words)

Total words: 74761989
Arabic words: 87.79097089030095%
Diacritics in AR words: 98.20337014154006%
No diacritics in AR words: 1.7966298584599303$
Non Arabic words: 0.12209029109699047


In [None]:
# The 50 most frequent raw tokens with their counts.
print("Most common words:")
original_words.most_common(50)

Most common words:


[('،', 2105298),
 ('فِي', 1779532),
 (')', 1271716),
 ('(', 1259799),
 (':', 1204024),
 ('مِنْ', 1088095),
 ('.', 1011062),
 ('عَلَى', 895029),
 ('لَا', 790846),
 ('أَوْ', 719862),
 ('قَالَ', 603025),
 ('عَنْ', 588331),
 ('مَا', 553010),
 ('عَلَيْهِ', 476417),
 ('قَوْلُهُ', 473619),
 ('أَيْ', 424830),
 ('أَنْ', 415670),
 ('كَانَ', 406927),
 ('لَمْ', 397072),
 ('أَنَّ', 361389),
 ('لَهُ', 351426),
 ('؛', 342197),
 ('ذَلِكَ', 340369),
 ('إذَا', 320632),
 ('وَلَا', 317265),
 ('اللَّهُ', 310251),
 ('بِهِ', 305624),
 ('إلَى', 301030),
 ('أَنَّهُ', 280791),
 ('بْنُ', 273902),
 ('وَإِنْ', 266372),
 ('لِأَنَّ', 258107),
 ('فِيهِ', 256790),
 ('وَهُوَ', 248018),
 ('اللَّهِ', 239760),
 ('هَذَا', 237286),
 ('كَمَا', 233270),
 ('بْنِ', 231669),
 ('ثُمَّ', 229850),
 ('إنْ', 211916),
 ('فَلَا', 197053),
 ('لَوْ', 196245),
 ('وَلَوْ', 194181),
 ('لِأَنَّهُ', 193841),
 ('إلَّا', 192622),
 ('حَدَّثَنَا', 184796),
 ('هُوَ', 169151),
 ('فَإِنْ', 167499),
 ('مَنْ', 161669),
 ('-', 161146)]

In [None]:
# most_common() without an argument returns every token ordered from most to
# least frequent, so the last 50 entries are the rarest tokens.
print("Least common words:")
original_words.most_common()[-50:]

Least common words:


[('مَحَامِدَهُ', 1),
 ('فَلَا(12/476)', 1),
 ('لِعَمُودِي', 1),
 ('طَلَّاقِهَا', 1),
 ('وَكِيدَةٍ', 1),
 ('،(12/486)', 1),
 ('رَجُلٌ(12/489)', 1),
 ('الرَّبَعَةُ', 1),
 ('،(12/490)', 1),
 ('فِيهِ(12/491)', 1),
 ('بِهَاشِمَةِ', 1),
 ('وَثُيُوبَةِ', 1),
 ('وَكَالْخَبَرِ', 1),
 ('ثَبَتَ(12/495)', 1),
 ('غَلَطِنَا', 1),
 ('لِلشَّافِعِيِّ(13/2)', 1),
 ('رِضَاعٌ', 1),
 ('بِالتَّسْدِيسِ', 1),
 ('فَكَرُجُوعِ', 1),
 ('فَيُتَوَجَّهَانِ', 1),
 ('فَيُهَدِّدُهُ', 1),
 ('فَيَدْهَشُ', 1),
 ('وَدَهِشْت', 1),
 ('وَحَيَّيْنِ', 1),
 ('فَكَوَلَدٍ', 1),
 ('وَحُرُمِهِ', 1),
 ('الْقَاضِي(13/47)', 1),
 ('الرِّوَايَاتِ(13/49)', 1),
 ('بِنَاقِصِهِ', 1),
 ('وَازِنُهُ', 1),
 ('فَلَوْ(13/58)', 1),
 ('الْمِقْدَارَانِ', 1),
 ('أَتَسَلَّمْهُ', 1),
 ('بِصَيْدٍ(13/70)', 1),
 ('بِجَوْزَةٍ', 1),
 ('وَيُسَفِّهُ', 1),
 ('فَكَالْجَرِّ', 1),
 ('كَحِلِّهِ', 1),
 ('فَقَفِيزِ', 1),
 ('،(13/82)', 1),
 ('فَهَلْ(13/84)', 1),
 ('الْمُسَرَّجَةِ', 1),
 ('مَسْطُورَةً(13/90)', 1),
 ('وَكَالْمِائَةِ', 1),
 ('وَقِرَابٍ', 1),
 ('وَبِتَعْد

In [None]:
# Character tally over the vocabulary: every distinct token contributes its
# characters once, regardless of how often the token occurs in the corpus.
characters = Counter()
for token in original_words:
    # Counter.update on a string adds one count per character in it.
    characters.update(token)

In [None]:
# Characters rejected by both utils predicates (is_arabic_char and is_harakah).
non_arabic_characters = [
    ch
    for ch in characters
    if not (is_arabic_char(ch) or is_harakah(ch))
]
print(f"# non_arabic_characters: {len(non_arabic_characters)}")
non_arabic_characters

# non_arabic_characters: 116


[':',
 'h',
 't',
 'p',
 '/',
 'w',
 '.',
 'a',
 'l',
 '-',
 'i',
 's',
 'm',
 'c',
 'o',
 '[',
 ']',
 '(',
 '1',
 ')',
 '،',
 '2',
 '{',
 '}',
 '3',
 '4',
 '؛',
 '5',
 '؟',
 '6',
 '7',
 '8',
 '9',
 '0',
 '~',
 ';',
 '«',
 '»',
 '’',
 'd',
 '=',
 ',',
 '"',
 '`',
 "'",
 '*',
 '+',
 'L',
 'e',
 '–',
 '!',
 '…',
 '\u200d',
 '<',
 'P',
 'F',
 'O',
 'N',
 'T',
 'S',
 'I',
 'Z',
 'E',
 'x',
 'A',
 'M',
 'Y',
 'r',
 'n',
 'b',
 'f',
 'u',
 'B',
 'R',
 'D',
 'G',
 'H',
 '#',
 'C',
 'K',
 'U',
 'y',
 'W',
 '>',
 '´',
 '_',
 'ے',
 '\ufeff',
 'X',
 'g',
 '?',
 '&',
 'q',
 '‘',
 '“',
 '•',
 'j',
 'v',
 'k',
 'V',
 'Q',
 '‰',
 '﴿',
 '﴾',
 '١',
 '٣',
 '٢',
 '٧',
 '٤',
 '٩',
 '٠',
 '٨',
 '٥',
 '٦',
 '\\',
 '\u200f']

In [None]:
# Distinct tokens that is_arabic_word rejects (the output shows punctuation,
# URLs and words fused with page markers such as "(1/2)").
non_arabic_words = [
    token
    for token in original_words
    if not is_arabic_word(token)
]
print(f"# non arabic words: {len(non_arabic_words)}")
non_arabic_words

# non arabic words: 472784


[':',
 'http://www.al-islam.com',
 '[',
 ']',
 '.',
 '(1/1)',
 '،',
 '(',
 ')',
 'بِالْكُتُبِ(1/2)',
 '{',
 '}',
 'فَيَصَدَّقُ(1/3)',
 'أَحَدِهِمَا(1/4)',
 '؛',
 'الِابْتِدَاءِ(1/5)',
 '؟',
 'الِاشْتِقَاقِ(1/6)',
 'إدْرَاكِ(1/7)',
 'الْأُسْتَاذِ(1/8)',
 'فَالتَّفَضُّلُ(1/9)',
 'وَهُوَ(1/10)',
 'عَلَى(1/11)',
 'تَقْدِيمَ(1/12)',
 'لِأَنَّهُ(1/13)',
 'فَيَمْتَنِعُ(1/14)',
 '.(1/15)',
 'ذَكَرْنَا(1/16)',
 '(1/17)',
 'جَدُّهُ(1/18)',
 'وَقَوْلِي(1/19)',
 'سُبْحَانَهُ(1/20)',
 '.(1/21)',
 'اللَّهُ(1/22)',
 'كَتَبَهُ(1/23)',
 'يَرْوِي(1/24)',
 'الْحَمْدِ(1/25)',
 'فِي(1/26)',
 'وَبِهَذَا(1/27)',
 '،(1/28)',
 '،(1/29)',
 '.(1/30)',
 '،(1/31)',
 'الْإِنْسَانَ(1/32)',
 '-',
 'أَيْ(1/33)',
 'فِي(1/34)',
 '.(1/35)',
 'الضَّمِيرَ(1/36)',
 'لَذَّةَ(1/37)',
 'وَالنُّقْصَانَ(1/38)',
 'فِيهِ(1/39)',
 '.(1/40)',
 '.(1/41)',
 '(1/42)',
 'قَالَ(1/43)',
 'وَهُوَ(1/44)',
 'فِي(1/45)',
 'مِنْ(1/46)',
 'وَمَا(1/47)',
 'يَأْتِي(1/48)',
 'بَعْدَ(1/49)',
 'الْجَوَابِ(1/50)',
 'إلَّا(1/51)',
 '.(1/52)',
 '(1/53)

In [None]:
# Re-count the vocabulary after cleaning: each raw token is passed through
# clean_words and its corpus count is credited to the first cleaned result.
sanitazied_words = Counter()
for word, count in original_words.items():
    valid_word = clean_words([word])  # remove non arabic characters
    if not valid_word:
        # Nothing left after cleaning, so the token is dropped.
        continue
    sanitazied_words[valid_word[0]] += count

get_word_statistics(sanitazied_words)

Total words: 66471156
Arabic words: 99.9983060321683%
Diacritics in AR words: 98.05408091436095%
No diacritics in AR words: 1.9459190856390467$
Non Arabic words: 1.6939678317013173e-05


In [None]:
# Cleaned tokens for which has_any_diacritics is false.
non_diac_words = [
    token
    for token in sanitazied_words
    if not has_any_diacritics(token)
]
print(f"# non disacritized words: {len(non_diac_words)}")
non_diac_words

# non disacritized words: 46422


['تم',
 'إعداد',
 'هذا',
 'الملف',
 'آليا',
 'بواسطة',
 'المكتبة',
 'الشاملة',
 'الكتاب',
 'الفواكه',
 'الدواني',
 'على',
 'رسالة',
 'ابن',
 'أبي',
 'زيد',
 'القيرواني',
 'مصدر',
 'موقع',
 'الإسلام',
 'مشكول',
 'ومرقم',
 'غير',
 'موافق',
 'للمطبوع',
 'ا',
 'هـ',
 'وللمقدورية',
 'ص',
 'تقايئه',
 'اجتزءوا',
 'تت',
 'الم',
 'حم',
 'المص',
 'ق',
 'عبق',
 'بالمفضولية',
 'و',
 'وعذيطة',
 'والعذيطة',
 'بالعبدلاوي',
 'البفت',
 'الـ',
 'وضربوب',
 'إنية',
 'بتبة',
 'كورايا',
 'أبج',
 'بهسترم',
 'عج',
 'مغازي',
 'الواقدي',
 'المؤلف',
 'أبو',
 'عبد',
 'الله',
 'محمد',
 'بن',
 'عمر',
 'واقد',
 'المتوفى',
 'وترقيمه',
 'المغازي',
 'للواقدي',
 'مقدمة',
 'إي',
 'يس',
 'إيه',
 'أ',
 'إ',
 'ذكر',
 'لمهلك',
 'أهلهالشعر',
 'لصوتيالشعر',
 'أمي',
 'لو',
 'أمرت',
 'بقتلهالشعر',
 'وذراريم',
 'رضي',
 'با',
 'لهم',
 'لعمرو',
 'ما',
 'فلا',
 'فانتظمها',
 'فيه',
 'ثم',
 'خرج',
 'به',
 'أو',
 'يا',
 'بي',
 'في',
 'عليه',
 'وسلم',
 'خيثمة',
 'فأقامني',
 'فإذا',
 'الذباب',
 'يطلع',
 'من',
 'ب',
 'معد',
 'بأمرهالشعر',

In [None]:
# Score every unordered pair of distinct cleaned words that same_sarf accepts
# with the sum of the two word counts.
# NOTE: this is a full pairwise scan of the vocabulary (quadratic in its size).
same_sarf_counter = Counter()
sanitazied_words1 = sanitazied_words.copy()
for w1 in sanitazied_words:
    # Remove w1 from the inner pool so each pair is visited only once.
    sanitazied_words1.pop(w1)
    for w2 in sanitazied_words1:
        if w1 == w2 or not same_sarf(w1, w2):
            continue
        # Skip pairs already recorded in either orientation.
        if same_sarf_counter[(w1, w2)] or same_sarf_counter[(w2, w1)]:
            continue
        same_sarf_counter[(w1, w2)] = sanitazied_words[w1] + sanitazied_words[w2]

KeyboardInterrupt: 

In [None]:
# The 50 lowest-scoring same_sarf pairs (tail of the descending-count ordering).
same_sarf_counter.most_common()[-50:]

In [None]:
# Among the 100 most frequent cleaned words, score every pair that same_irab
# accepts with the sum of the two word counts.
same_irab_counter = Counter()
# Rank the vocabulary once; the original re-ran most_common(100) on every
# iteration of the outer loop even though the result never changes.
top_words = [word for word, _ in sanitazied_words.most_common(100)]
for w1 in top_words:
    for w2 in top_words:
        # Record each unordered pair once: skip it if either orientation already has a score.
        if w1 != w2 and same_irab(w1, w2) and not same_irab_counter[(w1, w2)] and not same_irab_counter[(w2, w1)]:
            same_irab_counter[(w1, w2)] = sanitazied_words[w1] + sanitazied_words[w2]

In [None]:
# The 50 highest-scoring same_irab pairs.
same_irab_counter.most_common(50)

In [None]:
# The 50 lowest-scoring same_irab pairs (tail of the descending-count ordering).
same_irab_counter.most_common()[-50:]

## Sentence Level EDA

In [None]:
# Split every line of every corpus file on '.' and remember which file each
# piece came from. Blank lines and trailing periods yield empty strings, which
# are kept (they account for the 0-length sentences in the statistics below).
all_sentences = []
filename = []
for path in files[keys[0]]:
    file = path.split('/')[-1]
    # Decode explicitly (assumed UTF-8) rather than relying on the platform
    # default, and stream the file instead of loading it with readlines().
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            sentences = line.strip().split('.')
            filename.extend([file] * len(sentences))
            all_sentences.extend(sentences)

df = pd.DataFrame({'filename': filename, 'sentence': all_sentences})

In [None]:
import math

# Length of each sentence in whitespace-separated tokens.
sentence_lengths = [len(sentence.strip().split()) for sentence in all_sentences]

# Defaults mirror the starting values used when there is nothing to measure.
max_length = max(sentence_lengths, default=0)
min_length = min(sentence_lengths, default=math.inf)
mean_length = sum(sentence_lengths) / len(all_sentences)

print(f"max_length: {max_length}, min_length: {min_length}, mean_length: {mean_length}")

max_length: 1613, min_length: 0, mean_length: 24.218289044876695


# Data Preprocessing

In [None]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Wrap the first 8 corpus files as Document objects, one per file, tagged with
# the file's base name. Remove the [:8] slice to load every file. Slicing
# (rather than indexing 0..7) also copes with a corpus of fewer than 8 files.
docs = []
for path in files[keys[0]][:8]:
    # Decode explicitly (assumed UTF-8) rather than relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # readlines() keeps each trailing "\n", so the joined text has "\n " between lines.
    lines = ' '.join(lines)
    file = path.split('/')[-1]
    metadata = {"source": file}
    doc = Document(page_content=lines, metadata=metadata)
    docs.append(doc)

## Splitting

In [None]:
# 80/10/10 split sizes counted in whole documents.
n_docs = len(docs)
train_len = math.ceil(n_docs*0.8)
valid_len = math.floor(n_docs*0.1)
test_len = math.ceil(n_docs*0.1)

def custom_length(text: str) -> int:
    """Return the number of whitespace-separated tokens in ``text``.

    Used as the splitter's ``length_function`` so that chunk sizes are measured
    in words rather than characters.

    Args:
        text: The string to measure.

    Returns:
        The token count; 0 for an empty or whitespace-only string.
    """
    return len(text.split())

# Chunk the documents with a size limit of 2, measured by custom_length
# (i.e. in whitespace-separated words, not characters) and with no overlap.
# Separators are tried from coarsest (blank line) to finest (single space).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    length_function=custom_length,
    chunk_overlap=0,
    chunk_size=2,
)

# All documents are chunked together; the train/validation/test partition is
# taken over the resulting chunks afterwards.
splitted_docs = splitter.split_documents(docs)

In [None]:
# Sequential (unshuffled) 80/10/10 partition of the chunks; the test split
# takes whatever remains after the training and validation slices.
n_chunks = len(splitted_docs)
train_len = math.ceil(n_chunks*0.8)
valid_len = math.floor(n_chunks*0.1)
test_len = math.ceil(n_chunks*0.1)

valid_end = train_len + valid_len
training_docs = splitted_docs[:train_len]
validation_docs = splitted_docs[train_len:valid_end]
testing_docs = splitted_docs[valid_end:]

print(f"training_docs: {len(training_docs)}, validation_docs: {len(validation_docs)}, testing_docs: {len(testing_docs)}")

training_docs: 2409028, validation_docs: 301128, testing_docs: 301129


## Tokenization

In [None]:
from tokenizer import Tokenizer

# Character-level tokenizer whose table is built from the training chunks only.
tokenizer = Tokenizer(character_level=True)
tokenizer.build_tokenizer_table(training_docs)

100%|██████████| 2409028/2409028 [00:10<00:00, 219718.17it/s]


In [None]:
# Round-trip check on one chunk of the training set: separate it into plain
# text and harakat, re-apply the harakat, and compare with the original.
tt = training_docs[205].page_content
# Named plain_text rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
plain_text, target = tokenizer.get_pair(tt, encoded=False)
# Rebuild the diacritized text once and reuse it for both prints.
restored = shakkel(plain_text, target)
print("original: ",tt)
print("input: ",plain_text)
print("target: ",target)
print("output: ",restored)
print("original == output: ",tt == restored)

original:  قَلِيلُ الْبَرَكَةِ
input:  قليل البركة
target:  َ ِ <UNK> ُ <PAD> <UNK> ْ َ َ َ ِ
output:  قَلِيلُ الْبَرَكَةِ
original == output:  True


In [None]:
# Same round-trip as above, this time through the integer-id encoding:
# encode the pair, decode both id lists, then re-apply the harakat.
input_ids, target_ids = tokenizer.get_pair(tt, encoded=True)
print("input: ",input_ids)
print("target: ",target_ids)
text = tokenizer.decode(input_ids)
harkat = tokenizer.decode(target_ids, is_harakat=True)
print("output: ",shakkel(text, harkat))
print("original == output: ",tt == shakkel(text, harkat))

input:  [1, 46, 48, 54, 48, 0, 19, 48, 20, 29, 47, 21, 2]
target:  [1, 4, 6, 3, 5, 0, 3, 11, 4, 4, 4, 6, 2]
output:  قَلِيلُ الْبَرَكَةِ
original == output:  True


In [None]:
# One letter carrying two diacritic marks. The output below has a single id
# between the boundary ids on the input side but two on the target side, so a
# target sequence can be longer than its input sequence.
tokenizer.get_pair("يًّ", encoded=True)

([1, 54, 2], [1, 10, 7, 2])

# Training

## Data Preparation

In [None]:
def prepare_data(docs):
    """Encode documents into (input_ids, target_ids) pairs.

    Each document's ``page_content`` is passed to the module-level
    ``tokenizer.get_pair(..., encoded=True)``. Documents the tokenizer fails on
    are skipped (best effort) and the number skipped is reported at the end.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A tuple ``(pairs, max_input, max_output)``: the list of encoded pairs
        and the longest input / target length among them (both 0 when no
        document could be encoded).
    """
    max_input = 0
    max_output = 0
    pairs = []
    skipped = 0
    for doc in tqdm(docs):
        try:
            input_ids, target_ids = tokenizer.get_pair(doc.page_content, encoded=True)
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit still stop a long-running loop.
            skipped += 1
            continue
        pairs.append((input_ids, target_ids))
        max_input = max(max_input, len(input_ids))
        max_output = max(max_output, len(target_ids))
    if skipped:
        # Make dropped documents visible instead of discarding them silently.
        print(f"prepare_data: skipped {skipped} document(s) that could not be encoded")
    return pairs, max_input, max_output

## Data Loading

In [273]:
device = torch.device("cpu")


def get_dataloader(pairs, max_input, max_target, batch_size, truncate=True):
    """Pack encoded pairs into fixed-width tensors and wrap them in a shuffling DataLoader.

    Args:
        pairs: Sequence of ``(input_ids, target_ids)`` id lists.
        max_input: Width of the input tensor.
        max_target: Width of the target tensor.
        batch_size: Number of rows per batch.
        truncate: When True, sequences longer than the widths are cut to fit
            (inputs and targets independently). When False, an over-long
            sequence raises ``ValueError``.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of two int64 tensors on
        ``device``, sampled with ``RandomSampler``. Rows shorter than the
        width are right-padded with id 0.

    Raises:
        ValueError: If ``truncate`` is False and a sequence exceeds its width.
    """
    n = len(pairs)
    # Zero-filled, so unused trailing positions hold id 0. int64 from the
    # start avoids a second cast when building the LongTensors.
    input_ids = np.zeros((n, max_input), dtype=np.int64)
    target_ids = np.zeros((n, max_target), dtype=np.int64)

    # Wrap `pairs` itself (not the enumerate object) so tqdm knows the total.
    for idx, (inp_ids, tgt_ids) in enumerate(tqdm(pairs)):
        if truncate:
            inp_ids = inp_ids[:max_input]
            tgt_ids = tgt_ids[:max_target]
        elif len(inp_ids) > max_input or len(tgt_ids) > max_target:
            # Fail with a readable message instead of numpy's broadcast error.
            raise ValueError(
                f"pair {idx} has lengths ({len(inp_ids)}, {len(tgt_ids)}) exceeding "
                f"({max_input}, {max_target}) and truncate=False"
            )
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    data = TensorDataset(torch.LongTensor(input_ids).to(device),
                         torch.LongTensor(target_ids).to(device))

    sampler = RandomSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    return dataloader

In [278]:
# Encode only the first 10,000 chunks of each split. The maximum lengths are
# taken from the training subset; those of the validation subset are discarded.
train_pairs, max_input, max_target = prepare_data(training_docs[:10000])
validate_pairs, _, _ = prepare_data(validation_docs[:10000])

100%|██████████| 10000/10000 [00:00<00:00, 88248.74it/s]
100%|██████████| 10000/10000 [00:00<00:00, 103476.70it/s]


In [279]:
# Both values are the maxima over the encoded training pairs returned by prepare_data.
print(f"input token length: {max_input}, target output length: {max_target}")

input token length: 22, target output length: 25


In [280]:
# Batch size 32 for both loaders. The validation loader reuses the training
# maxima, so with get_dataloader's default truncate=True any longer validation
# sequence is cut to those widths.
train_dataloader = get_dataloader(train_pairs, max_input, max_target, 32)
validate_dataloader = get_dataloader(validate_pairs, max_input, max_target, 32)

10000it [00:00, 652282.04it/s]
10000it [00:00, 725532.61it/s]


In [309]:
from model import EncoderRNN, DecoderRNN
from train import main
from tokenizer import PAD_TOKEN

# One configuration dict shared by the encoder and decoder constructors.
params = {
    'input_size': tokenizer.n_tokens, # num of (character or word) tokens
    'hidden_size': 32, # size of embedding
    'output_size': tokenizer.output_size, # harakat size
    'n_layers': 2, 
    'dropout': 0.1,
    'max_length': max_target,  # max length of output sequence (longest training target)
    'device': device,
    'PAD_TOKEN': PAD_TOKEN,
}

encoder = EncoderRNN(params).to(device)
decoder = DecoderRNN(params).to(device)

# The positional 30 is presumably the epoch count (the log prints "epoch:N/30");
# print_every=5 matches the five-epoch spacing of the logged lines.
main(train_dataloader, validate_dataloader, encoder, decoder, 30, print_every=5)

time (time left): 0m 36s (- 3m 3s)
epoch:5/30
train loss:  0.767, validate loss: 0.570
time (time left): 1m 13s (- 2m 27s)
epoch:10/30
train loss:  0.613, validate loss: 0.536
time (time left): 1m 50s (- 1m 50s)
epoch:15/30
train loss:  0.571, validate loss: 0.491
time (time left): 2m 27s (- 1m 13s)
epoch:20/30
train loss:  0.511, validate loss: 0.443
time (time left): 3m 4s (- 0m 36s)
epoch:25/30
train loss:  0.472, validate loss: 0.417


In [296]:
from train import inference
from tokenizer import SOS_TOKEN, EOS_TOKEN

# Diacritize a single undiacritized word with the trained encoder/decoder.
input_text = "يوسف"
input_tensor = tokenizer.encode(input_text)
input_tensor = torch.LongTensor(input_tensor).reshape(1, -1).to(device) # shape (1, seq_len): a batch of one

output = inference(encoder, decoder, input_tensor)

for out in output:
    # Drop every SOS and EOS id from the prediction. This filters by value and
    # does not slice at the first EOS: ids after EOS (the trailing zeros in the
    # printed tensor) are kept and passed on to decode.
    print("output: ", out)
    out = out[out != SOS_TOKEN]
    out = out[out != EOS_TOKEN]
    harakat = tokenizer.decode(out.tolist(), is_harakat=True)
    shakkelled = shakkel(input_text, harakat)
    print("Model Tashkeel: ", shakkelled)
    print("original: ", input_text)
    # NOTE(review): this compares the diacritized output with the undiacritized
    # input, so it is False whenever any diacritic was added; it is not an
    # accuracy check against a reference diacritization.
    print("output == original: ", shakkelled == input_text)


output:  tensor([ 1,  4, 11,  4,  4,  4,  0,  4,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0])
Model Tashkeel:  يَوْسَفَ
original:  يوسف
output == original:  False
