# TEXT PROCESSING
## Converting Uthmani Quranic Script to IndoPak Script

In [1]:
#Nibtahil Nafees 21I-0330
#Aiman Karim 21I-0664
#BS-AI
#BS-CS

In [2]:
import pandas as pd
import numpy as np
import pyarabic.araby as araby
import pyarabic.number as number

In [8]:
    # Named Arabic characters and marks used when converting Uthmani text to
    # IndoPak text. Every value is a plain `str`.

    # Short-vowel, tanween and other marks.
    zabar = 'َ'
    zeyr = 'ﹺ'
    double_zabar = 'ً'
    double_zeyr = 'ٍ'
    dammatan = 'ٌ'
    damma = 'ُ'
    shadda = 'ّ'
    sakun = 'ْ'
    Allah = 'ﷲ'
    dagger_alif = 'ٰ'

    # Alif / hamza variants.
    hamza_tulwasal = 'ٱ'
    hamza_above = 'ٲ'
    hamza_below = 'ٳ'
    high_hamza = 'ٴ'
    dotless_khah = 'ۡ'
    alif = 'ا'

    # Lam-alif ligatures and yay.
    lam_with_alifhamza_above = 'ﻷ'
    lam_with_alifhamza_below = 'ﻹ'
    lam_with_alif = 'ﻻ'
    yay = 'ﻱ'

    # Letter-plus-vowel combinations and a sample word in each script.
    # These assignments previously ended with a trailing comma, which made
    # each of them a 1-tuple instead of a string.
    hamza_abovee = 'ٲَ'
    hamza_beloww = 'ٳِ'
    word_uthmani = 'إِيَّاكَ'
    word_indopak = 'اِیَّاكَ'

In [10]:
import os
import xml.etree.ElementTree as ET

# Load the character dictionary from an XML file

def load_dictionary_from_xml(file_path):
    """Read a key/value dictionary from an XML file.

    The root element is expected to contain ``<entry>`` children, each with a
    ``<key>`` and a ``<value>`` child. Surrounding whitespace is stripped from
    both texts.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        dict mapping each stripped key text to its stripped value text.
        An empty ``<key/>`` or ``<value/>`` element yields an empty string;
        an entry that lacks either child element is skipped.

    Raises:
        OSError / xml.etree.ElementTree.ParseError: propagated from
        ``ET.parse`` when the file cannot be read or is not well-formed XML.
    """
    dictionary = {}
    root = ET.parse(file_path).getroot()
    for entry in root.findall('entry'):
        # findtext() returns None when the child element is absent and ''
        # when it is present but empty, so neither case reaches .strip()
        # with a None (which used to raise AttributeError).
        key = entry.findtext('key')
        value = entry.findtext('value')
        if key is None or value is None:
            continue
        dictionary[key.strip()] = value.strip()
    return dictionary

def _uthmani_to_indopak(text, arabic_dict):
    """Apply the Uthmani -> IndoPak character rewrites to ``text``.

    Characters are looked up by name in ``arabic_dict``; each lookup falls
    back to a built-in default when the name is missing.
    """
    dotless_khah = arabic_dict.get('dotless_khah', "\u06E1")
    dagger_alif = arabic_dict.get('dagger_alif', "\u0670")
    shadda = arabic_dict.get('shadda', "\u0651")
    # Previously read from a notebook-level global; now resolved like every
    # other character so the function does not depend on another cell.
    sakun = arabic_dict.get('sakun', "\u0652")
    hamza_abovee = arabic_dict.get('hamza_abovee', 'ٲَ')
    hamza_beloww = arabic_dict.get('hamza_beloww', 'ٳِ')
    lam_with_alifhamza_above = arabic_dict.get('lam_with_alifhamza_above', 'ﻷ')
    lam_with_alifhamza_below = arabic_dict.get('lam_with_alifhamza_below', 'ﻹ')
    alif = arabic_dict.get('alif', 'ا')
    # These two used '' as fallback: a '' key can never match a character,
    # and a '' replacement silently deleted the lam-alif ligatures.
    hamza_tulwasal = arabic_dict.get('hamza_tulwasal', 'ٱ')
    lam_with_alif = arabic_dict.get('lam_with_alif', 'ﻻ')

    # Step 1: append the dotless-khah mark to every "ي" that directly follows
    # a shadda or a dagger alif.
    pieces = []
    for i, char in enumerate(text):
        follows_mark = i > 0 and (
            text[i - 1] == shadda
            or (text[i - 1] == dagger_alif and char != shadda)
        )
        if follows_mark and char == "ي":
            pieces.append(char + dotless_khah)
        else:
            pieces.append(char)
    modified_text = "".join(pieces)

    # Step 2: fixed string replacements.
    # Alif carrying a hamza (below / above) becomes a bare alif.
    modified_text = modified_text.replace('إ', 'ا')
    modified_text = modified_text.replace('أ' , 'ا')
    # NOTE(review): the replacement starts with a space, so a prefixed form
    # would be split into two words -- confirm this is intended.
    modified_text = modified_text.replace('ٱللَّهِ',' اللهِ')
    # Dagger alif is swapped for the character on the right (presumably the
    # isolated fatha form -- confirm against the reference text).
    modified_text = modified_text.replace('ٰ','ﹶ')

    # Step 3: per-character mapping from Uthmani to IndoPak.
    # NOTE(review): the lookup is one code point at a time, so a key longer
    # than one code point (the hamza_abovee / hamza_beloww defaults appear to
    # be letter + vowel mark) can never match. Left unchanged on purpose.
    mapping = {
        hamza_tulwasal: alif,
        sakun: dotless_khah,
        hamza_abovee: alif,
        hamza_beloww: alif,
        lam_with_alifhamza_above: lam_with_alif,
        lam_with_alifhamza_below: lam_with_alif,
    }
    return "".join(mapping.get(char, char) for char in modified_text)


def modify_surahs(input_file, output_file, dictionary_file):
    """Convert a Uthmani-script text file to IndoPak script.

    Args:
        input_file: Path of the UTF-8 text file to convert.
        output_file: Path the converted text is written to (UTF-8). Its
            parent directory is created when it does not exist.
        dictionary_file: Path of the XML character dictionary, loaded with
            ``load_dictionary_from_xml``.

    Returns:
        None. The converted text is written to ``output_file``.
    """
    arabic_dict = load_dictionary_from_xml(dictionary_file)

    # Reading the input text file
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    indo_script = _uthmani_to_indopak(text, arabic_dict)

    # dirname() is '' for a bare file name, and os.makedirs('') raises,
    # so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(indo_script)


# Run the conversion. The two absolute paths are specific to the author's
# machine; adjust them before re-running the notebook elsewhere.
dictionary_file = 'arabic_dict.xml'
input_file = r'C:\Users\user\Desktop\Iknex_internship\quran-uthmani-correct.txt'
output_file = r'C:\Users\user\Desktop\Iknex_internship\Output\output.txt'
modify_surahs(
    input_file=input_file,
    output_file=output_file,
    dictionary_file=dictionary_file,
)


## CODE FOR COMPARING WITH GROUND TRUTH

In [12]:
import difflib

def compare_text_files(file1_path, file2_path, output_file_path):
    """Write a word-level diff of two UTF-8 text files to a report file.

    Both files are split on whitespace and compared with ``difflib.ndiff``.
    Each ndiff entry starting with ``'- '`` (word only in the first file) or
    ``'+ '`` (word only in the second file) becomes one report line of the
    form ``Difference at position <i>: <entry>``. ``<i>`` is the entry's
    index in the ndiff output, not a word index in either file.

    Args:
        file1_path: Path of the first text file.
        file2_path: Path of the second text file.
        output_file_path: Path of the report to write. Its parent directory
            is created when it does not exist.

    Returns:
        ``"Differences written to the output file."`` on success, or
        ``"One or both files not found."`` when an input file is missing.
    """
    # Only the reads are guarded: a missing output directory used to be
    # reported as a missing input file.
    try:
        with open(file1_path, 'r', encoding='utf-8') as file1:
            words1 = file1.read().split()
        with open(file2_path, 'r', encoding='utf-8') as file2:
            words2 = file2.read().split()
    except FileNotFoundError:
        return "One or both files not found."

    # Extracting differences
    differences = [
        (position, entry)
        for position, entry in enumerate(difflib.ndiff(words1, words2))
        if entry.startswith(('- ', '+ '))
    ]

    # dirname() is '' for a bare file name; nothing to create in that case.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Writing the differences to the report file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for position, entry in differences:
            output_file.write(f"Difference at position {position}: {entry}\n")

    return "Differences written to the output file."

# Compare the converter output (file 1) with indopak_text.txt (file 2,
# presumably the reference IndoPak text) and write the report next to them.
# The absolute paths are specific to the author's machine.
output_file_path = r"C:\Users\user\Desktop\Iknex_internship\Output\comparison.txt"
file2_path = r"C:\Users\user\Desktop\Iknex_internship\Output\indopak_text.txt"
file1_path = r"C:\Users\user\Desktop\Iknex_internship\Output\output.txt"
result = compare_text_files(
    file1_path=file1_path,
    file2_path=file2_path,
    output_file_path=output_file_path,
)
print(result)


Differences written to the output file.
