In [12]:
# Read val.txt and collect every line that contains fewer than 2 of the
# basic Arabic letters listed in the literal below. Characters outside that
# literal (diacritics, hamza-carrying letter forms, digits, punctuation)
# are not counted.
# NOTE(review): the file is assumed to be UTF-8; the encoding is passed
# explicitly so the result does not depend on the platform default.
with open('val.txt', 'r', encoding='utf-8') as f:
    # `l` keeps its original name in case other cells read it.
    l = [line for line in f if sum(c in 'ابتثجحخدذرزسشصضطظعغفقكلمنهوي' for c in line) < 2]
    print(l)

['أَوْ( 7 / 54 )\n', 'أَيْ :( 3 / 81 )\n']


In [42]:

# Character inventories for Arabic text.
# Single diacritic marks ("haraqat"), one code point per entry.
HARAQAT = ["ْ", "ّ", "ٌ", "ٍ", "ِ", "ً", "َ", "ُ"]
# Punctuation marks, mixing Latin and Arabic forms.
PUNCTUATIONS = [".", "،", ":", "؛", "-", "؟"]
# Base letters; this variant also contains the space character.
ARAB_CHARS = "ىعظحرسيشضق ثلصطكآماإهزءأفؤغجئدةخوبذتن"
# Base letters without the space character.
ARAB_CHARS_NO_SPACE = "ىعظحرسيشضقثلصطكآماإهزءأفؤغجئدةخوبذتن"
# Letters (including space) followed by the punctuation marks, as one string.
ARAB_CHARS_PUNCTUATIONS = ARAB_CHARS + "".join(PUNCTUATIONS)
# List of the diacritic marks followed by every character of ARAB_CHARS.
VALID_ARABIC = HARAQAT + list(ARAB_CHARS)
# Maps each single diacritic mark to a display name. The names are padded
# with trailing spaces so that all values have the same width.
# extract_haraqat tests membership in this mapping to decide whether a
# character is a diacritic.
BASIC_HARAQAT = {
    "َ": "Fatha              ",
    "ً": "Fathatah           ",
    "ُ": "Damma              ",
    "ٌ": "Dammatan           ",
    "ِ": "Kasra              ",
    "ٍ": "Kasratan           ",
    "ْ": "Sukun              ",
    "ّ": "Shaddah            ",
}
# Every diacritic string accepted on a single character, mapped to a
# space-padded display name: the empty string (no diacritic), the single
# marks, and the two-mark combinations that include a shaddah.
# extract_stack validates candidate strings against these keys.
# NOTE(review): the two-mark keys are compared as exact strings, so the
# order of the code points inside each key is significant.
ALL_POSSIBLE_HARAQAT = {
    "": "No Diacritic       ",
    "َ": "Fatha              ",
    "ً": "Fathatah           ",
    "ُ": "Damma              ",
    "ٌ": "Dammatan           ",
    "ِ": "Kasra              ",
    "ٍ": "Kasratan           ",
    "ْ": "Sukun              ",
    "ّ": "Shaddah            ",
    "َّ": "Shaddah + Fatha    ",
    "ًّ": "Shaddah + Fathatah ",
    "ُّ": "Shaddah + Damma    ",
    "ٌّ": "Shaddah + Dammatan ",
    "ِّ": "Shaddah + Kasra    ",
    "ٍّ": "Shaddah + Kasratan ",
}
# Flat list of base characters: space, punctuation marks, then Arabic
# letters (including the hamza-carrying forms).
# NOTE(review): not referenced by the functions visible in this cell;
# presumably a vocabulary for encoding text elsewhere — confirm.
CHARACTERS = [
    " ",
    "-",
    ".",
    ":",
    "،",
    "؛",
    "؟",
    "ء",
    "آ",
    "أ",
    "ؤ",
    "إ",
    "ئ",
    "ا",
    "ب",
    "ة",
    "ت",
    "ث",
    "ج",
    "ح",
    "خ",
    "د",
    "ذ",
    "ر",
    "ز",
    "س",
    "ش",
    "ص",
    "ض",
    "ط",
    "ظ",
    "ع",
    "غ",
    "ف",
    "ق",
    "ك",
    "ل",
    "م",
    "ن",
    "ه",
    "و",
    "ى",
    "ي",
]

# Diacritic strings known to normalize_diacritic and extract_diacritics:
# the empty string (no diacritic), single marks, and two-mark combinations.
# extract_diacritics tests single characters for membership in this list,
# and normalize_diacritic tests joined mark sequences against it.
# NOTE(review): entries are compared as exact strings, so the order of the
# code points inside the two-mark entries is significant.
DIACRITICS = [
    "",
    "ً",
    "ٌ",
    "ٍ",
    "َ",
    "ُ",
    "ِ",
    "ّ",
    "ًّ",
    "ٌّ",
    "ٍّ",
    "َّ",
    "ُّ",
    "ِّ",
    "ْ",
]

def extract_stack(stack, correct_reversed: bool = True):
    """Drain ``stack`` and return the diacritic string it spells.

    The marks are popped one at a time, so ``stack`` is left empty and the
    first candidate spelling is in last-pushed-first order. If that candidate
    is not a key of ``ALL_POSSIBLE_HARAQAT``, the push-order spelling is also
    tried, but only when ``correct_reversed`` is true.

    Args:
        stack: mutable list of single diacritic characters; emptied in place.
        correct_reversed: when true, also accept the combination if only its
            opposite ordering is a known key (that known ordering is returned).

    Returns:
        The matching key of ``ALL_POSSIBLE_HARAQAT``.

    Raises:
        ValueError: if no accepted ordering is a known combination.
    """
    popped = []
    while len(stack) != 0:
        popped.append(stack.pop())

    lifo_order = "".join(popped)
    if lifo_order in ALL_POSSIBLE_HARAQAT:
        return lifo_order

    push_order = "".join(reversed(popped))
    if correct_reversed and push_order in ALL_POSSIBLE_HARAQAT:
        return push_order

    # Name each mark when it is known and fall back to its repr otherwise, so
    # an unexpected character still produces the intended ValueError instead
    # of a KeyError raised while building the message.
    names = "|".join(
        ALL_POSSIBLE_HARAQAT.get(mark, repr(mark)).strip() for mark in lifo_order
    )
    raise ValueError(
        "The character has the following haraqat which are not found in "
        f"all possible haraqat: {names}"
    )
def extract_haraqat(text: str, correct_reversed: bool = True):
    """Split ``text`` into base characters and the diacritics on each one.

    Characters that are keys of ``BASIC_HARAQAT`` are collected until the next
    base character and resolved with ``extract_stack``; each resolved string
    is attributed to the base character that precedes the marks.

    Args:
        text: the (possibly diacritized) input string.
        correct_reversed: forwarded to every ``extract_stack`` call, including
            the one for the last character.

    Returns:
        ``(text, txt_list, haraqat_list)``: the input unchanged, the base
        characters in order, and the diacritic string for each base character
        ("" when it carries none).

    Raises:
        ValueError: propagated from ``extract_stack`` for an unknown
            combination of marks.
    """
    # Empty or whitespace-only input: one entry per character with no
    # diacritics. Every character is reported as a plain space here.
    if len(text.strip()) == 0:
        return text, [" "] * len(text), [""] * len(text)

    stack = []
    haraqat_list = []
    txt_list = []
    for char in text:
        if char in BASIC_HARAQAT:
            # Diacritic: hold it until the next base character closes the run.
            stack.append(char)
            continue
        # Base character: the pending marks belong to the previous character.
        haraqat_list.append(extract_stack(stack, correct_reversed=correct_reversed))
        txt_list.append(char)
        stack = []

    # The first entry holds whatever came before the first base character and
    # belongs to no character, so it is discarded.
    if len(haraqat_list) > 0:
        del haraqat_list[0]
    # Marks after the last base character belong to it. The flag is forwarded
    # here too, so the last character follows the same rule as the others.
    haraqat_list.append(extract_stack(stack, correct_reversed=correct_reversed))

    return text, txt_list, haraqat_list

def normalize_diacritic(diacritics: list[str]):
    """Return the spelling of ``diacritics`` that is listed in ``DIACRITICS``.

    The marks are joined in the given order first; if that string is not
    listed, the reversed order is tried.

    Args:
        diacritics: single diacritic characters collected for one letter.

    Returns:
        The joined string (given or reversed order) found in ``DIACRITICS``.

    Raises:
        ValueError: if neither ordering is a listed diacritic string.
    """
    as_given = "".join(diacritics)
    flipped = "".join(reversed(diacritics))
    # The given order takes precedence over the reversed one.
    for candidate in (as_given, flipped):
        if candidate in DIACRITICS:
            return candidate
    raise ValueError(f"{diacritics} list not known diacritic")
    

def extract_diacritics(text: str):
    """Split ``text`` into base characters and the diacritics on each one.

    Characters listed in ``DIACRITICS`` are collected until the next base
    character and resolved with ``normalize_diacritic``; each resolved string
    is attributed to the base character that precedes the marks.

    Args:
        text: the (possibly diacritized) input string.

    Returns:
        ``(text, chars, diacritics)``: the input unchanged, the base
        characters in order, and the diacritic string for each base character
        ("" when it carries none). Empty input yields two empty lists.

    Raises:
        ValueError: propagated from ``normalize_diacritic`` for an unknown
            combination of marks.
    """
    # Empty input has no characters, so it must not report a diacritic slot
    # either. This keeps the lists aligned and matches extract_haraqat("").
    if not text:
        return text, [], []

    current_diacritics = []
    diacritics = []
    chars = []
    for char in text:
        if char in DIACRITICS:
            # Diacritic: hold it until the next base character closes the run.
            current_diacritics.append(char)
        else:
            # Base character: the pending marks belong to the previous one.
            diacritics.append(normalize_diacritic(current_diacritics))
            chars.append(char)
            current_diacritics = []

    # The first entry holds whatever came before the first base character and
    # belongs to no character, so it is discarded.
    if len(diacritics):
        del diacritics[0]

    # Marks after the last base character belong to it.
    diacritics.append(normalize_diacritic(current_diacritics))

    return text, chars, diacritics



In [49]:
# Cross-check: both extractors must give the same result on one fully
# diacritized sample sentence.
sample_text = " الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ "
txt1, txt_list1, haraqat_list1 = extract_haraqat(sample_text)
txt2, txt_list2, haraqat_list2 = extract_diacritics(sample_text)

# The returned text and the base-character lists must match.
assert txt1 == txt2
assert txt_list1 == txt_list2

# Show both diacritic lists before comparing them so a mismatch can be inspected.
print(haraqat_list1)
print(haraqat_list2)
assert haraqat_list1 == haraqat_list2

['', '', 'ْ', 'َ', 'ْ', 'ُ', '', 'ِ', 'َّ', 'ِ', '', 'َ', 'ِّ', '', '', 'ْ', 'َ', '', 'َ', 'ِ', '', 'َ', '']
['', '', 'ْ', 'َ', 'ْ', 'ُ', '', 'ِ', 'َّ', 'ِ', '', 'َ', 'ِّ', '', '', 'ْ', 'َ', '', 'َ', 'ِ', '', 'َ', '']
