In [None]:
import json
def analyze_unicode_chars(json_path, ignore_tags=True):
    """
    Collect every unique character found in the 'text' fields of a JSON file.

    The file must contain a top-level JSON object. Each of its values that is
    itself an object with a string 'text' entry contributes its characters;
    values that are not objects, that lack 'text', or whose 'text' is not a
    string are skipped.

    Args:
        json_path (str): Path to the JSON file
        ignore_tags (bool): If True, strip HTML-style tags before collecting
            characters. A paired tag (``<a>...</a>``) is removed together
            with its contents, including contents that span several lines;
            a lone or self-closing tag is removed on its own.

    Returns:
        list[tuple[str, str]]: (character, "U+XXXX") pairs sorted by code
        point. The same pairs are also printed, one per line.
    """
    import re

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # DOTALL lets '.' cross newlines, so a paired tag whose contents span
    # several lines is removed as a whole instead of leaving its contents
    # behind to be counted.
    tag_pattern = re.compile(r'<[^>]*>.*?</[^>]*>|<[^>]*/?>', re.DOTALL)

    char_unicode_pairs = set()

    for value in data.values():
        if not isinstance(value, dict):
            continue

        text = value.get('text')
        if not isinstance(text, str):
            # Missing, null or non-string 'text' has no characters to inspect
            # (and would make the regex substitution raise TypeError).
            continue

        if ignore_tags:
            text = tag_pattern.sub('', text)

        char_unicode_pairs.update(
            (char, f"U+{ord(char):04X}") for char in text
        )

    # Sort by code point so the output is stable between runs.
    sorted_pairs = sorted(char_unicode_pairs, key=lambda pair: ord(pair[0]))

    print(f"Found {len(sorted_pairs)} unique character-unicode pairs:")
    for char, unicode_code in sorted_pairs:
        print(f"'{char}' -> {unicode_code}")

    return sorted_pairs

# Scan resources/Quran.json with ignore_tags left at its default (True).
# As the cell's last expression, the returned list is also displayed.
analyze_unicode_chars("resources/Quran.json")

In [None]:
import json
def analyze_chars_by_key(json_path, key_value):
    """
    Print each character of one entry's 'text' field next to its code point.

    The JSON file is loaded, the entry stored under ``key_value`` is looked
    up, and every character of its 'text' value is printed on its own line
    as ``<char>  U+XXXX``. If the key is absent, or the entry is not an
    object carrying a 'text' field, a one-line message is printed instead.

    Args:
        json_path (str): Path to the JSON file
        key_value (str): The key to look up in the JSON

    Returns:
        None
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    if key_value in records:
        entry = records[key_value]
        if isinstance(entry, dict) and 'text' in entry:
            # One output line per character, in the order stored in the file.
            for symbol in entry['text']:
                code_point = f"U+{ord(symbol):04X}"
                print(f"{symbol}  {code_point}")
        else:
            print(f"Key '{key_value}' does not contain a 'text' field")
    else:
        print(f"Key '{key_value}' not found in JSON file")

# Example usage: print every character of the 'text' stored under key "1:1:1"
# in resources/Quran.json, each followed by its code point.
analyze_chars_by_key("resources/Quran.json", "1:1:1")

ب  U+0628
ِ  U+0650
س  U+0633
ْ  U+0652
م  U+0645
ِ  U+0650


In [4]:
from core.helpers import display_verses_with_codepoints
# Query resources/Quran.json for the code point given as "0x06DF".
# NOTE(review): presumably this lists the verses containing that character —
# confirm against core.helpers.
display_verses_with_codepoints(["0x06DF"], db_path="resources/Quran.json")

In [None]:
from core.phonemizer import Phonemizer

pm = Phonemizer()
# NOTE(review): looks like a "chapter:verse - chapter:verse" range — confirm
# the accepted reference syntax in core.phonemizer.
ref = "44:43 - 44:44"

# First pass: "verse" is the only stop type requested. Print the source text,
# then the phonemes with no separator inside a word and a space between words.
res = pm.phonemize(ref, stops=["verse"])
print(res.text)
print(res.phonemes_str(phoneme_sep="", word_sep=" ", verse_sep=""))

# Second pass: same reference with an empty stops list, printed with the same
# separators so the two phoneme strings can be compared line against line.
res = pm.phonemize(ref, stops=[])
print(res.phonemes_str(phoneme_sep="", word_sep=" ", verse_sep=""))

In [None]:
from core.phonemizer import Phonemizer

ref = "7:145"
# Phonemize the single reference with "verse" as the only stop type and
# render the result through the result object's show_table().
res = Phonemizer().phonemize(ref, stops=["verse"])
res.show_table()

In [1]:
from core.phonemizer import Phonemizer

ref = "68:6"
# debug=True is forwarded to phonemize(); what it enables is defined in
# core.phonemizer, not in this notebook.
res = Phonemizer().phonemize(ref, stops=["verse"], debug=True)
print(res.text)
# Bare last expression so the notebook displays the nested phoneme lists.
# NOTE(review): the output saved with this cell contains a 'vowel?' entry in
# the first word — worth checking whether that is an unresolved placeholder.
res.phonemes

بِأَييِّكُمُ ٱلْمَفْتُونُ


[['b', 'i', 'ʔ', 'a', 'vowel?', 'jj', 'i', 'k', 'u', 'm', 'u'],
 ['l', 'm', 'a', 'f', 't', 'u:', 'n']]

In [1]:
from core.helpers import phonemize_and_save

# Reference handed to phonemize_and_save; the run saved with this cell wrote
# its output to <output_dir>/1-114.txt.
s = "1-114"
phonemize_and_save(
    s,  # already a str, so wrapping it in an f-string adds nothing
    stops=[
        "verse", "preferred_stop", "compulsory_stop",
        # "optional_stop", "preferred_continue"
    ],
    output_dir="out/phonemized_refactor1_vqm"
    # output_dir="out/phonemized_refactor2_vqm"
)

Phonemized output saved to: out/phonemized_refactor1_vqm/1-114.txt


In [3]:
from core.helpers import compare_files

# Diff the phonemized_v1 output against the phonemized_refactor1_vqm output,
# asking compare_files to ignore whitespace. The paths are plain string
# literals: they contain no placeholders, so no f-prefix is needed.
compare_files(
    "out/phonemized_v1/1-114.txt",
    "out/phonemized_refactor1_vqm/1-114.txt",
    # "out/phonemized_refactor2_vqm/1-114.txt",
    ignore_whitespace=True,
)

Files differ in 5983 lines: 1-114.txt vs 1-114.txt

Verse 2:3 (line 47):
Difference at line 48:
File1 (1-114.txt):
      47: 2:3
>>>   48: ٱلَّذِينَ               ['ʔ', 'a', 'l', 'a', 'ð', 'i:', 'n', 'a']
      49: يُؤۡمِنُونَ              ['j', 'u', 'ʔ', 'm', 'i', 'n', 'u:', 'n', 'a']
File2 (1-114.txt):
      47: 2:3
>>>   48: ٱلَّذِينَ               ['ʔ', 'a', 'll', 'a', 'ð', 'i:', 'n', 'a']
      49: يُؤْمِنُونَ              ['j', 'u', 'ʔ', 'm', 'i', 'n', 'u:', 'n', 'a']


Verse 2:4 (line 56):
Difference at line 66:
File1 (1-114.txt):
      65: قَبۡلِكَ                ['q', 'a', 'b', 'Q', 'l', 'i', 'k', 'a']
>>>   66: وَبِٱلۡأَخِرَةِ            ['w', 'a', 'b', 'i', 'l', 'ʔ', 'a', 'x', 'i', 'r', 'a', 't', 'i']
      67: هُمۡ                  ['h', 'u', 'm']
File2 (1-114.txt):
      65: قَبْلِكَ                ['q', 'a', 'b', 'Q', 'l', 'i', 'k', 'a']
>>>   66: وَبِٱلْـَٔاخِرَةِ           ['w', 'a', 'b', 'i', 'l', 'ʔ', 'a:', 'x', 'i', 'r', 'a', 't', 'i']
      67: هُمْ                 

False