In [4]:
import pandas as pd
import numpy as np
import ast
import csv
import os
import json
import sys
from PyQt5.QtGui import QFont, QFontDatabase, QImage, QPainter, QColor, QFontMetrics
from PyQt5.QtCore import Qt
from PyQt5.QtWidgets import QApplication

In [12]:
# Latin transliteration character -> Mongolian Unicode character.
# Kept at module level so the table is built once instead of on every call
# (the converter runs once per dataset row).
_LATIN_TO_MONGOLIAN = {
    '-': '\u202F',  # NARROW NO-BREAK SPACE
    '!': '\u180D',  # MONGOLIAN FREE VARIATION SELECTOR THREE
    '*': '\u200D',  # ZERO WIDTH JOINER
    '^': '\u180B',  # MONGOLIAN FREE VARIATION SELECTOR ONE
    '_': '\u180E',  # MONGOLIAN VOWEL SEPARATOR
    '~': '\u180C',  # MONGOLIAN FREE VARIATION SELECTOR TWO
    'a': '\u1820',  # MONGOLIAN LETTER A
    'b': '\u182A',  # MONGOLIAN LETTER BA
    'c': '\u1834',  # MONGOLIAN LETTER CHA
    'd': '\u1833',  # MONGOLIAN LETTER DA
    'e': '\u1821',  # MONGOLIAN LETTER E
    'E': '\u1827',  # MONGOLIAN LETTER EE
    'f': '\u1839',  # MONGOLIAN LETTER FA
    'g': '\u182D',  # MONGOLIAN LETTER GA
    'G': '\u1829',  # MONGOLIAN LETTER ANG
    'h': '\u182C',  # MONGOLIAN LETTER QA
    'H': '\u183E',  # MONGOLIAN LETTER HAA
    'i': '\u1822',  # MONGOLIAN LETTER I
    'j': '\u1835',  # MONGOLIAN LETTER JA
    'k': '\u183A',  # MONGOLIAN LETTER KA
    'K': '\u183B',  # MONGOLIAN LETTER KHA
    'l': '\u182F',  # MONGOLIAN LETTER LA
    'L': '\u1840',  # MONGOLIAN LETTER LHA
    'm': '\u182E',  # MONGOLIAN LETTER MA
    'n': '\u1828',  # MONGOLIAN LETTER NA
    'o': '\u1823',  # MONGOLIAN LETTER O
    'p': '\u182B',  # MONGOLIAN LETTER PA
    'q': '\u1820',  # Fallback to MONGOLIAN LETTER A (same output as 'a')
    'r': '\u1837',  # MONGOLIAN LETTER RA
    'R': '\u183F',  # MONGOLIAN LETTER ZRA
    's': '\u1830',  # MONGOLIAN LETTER SA
    'S': '\u1842',  # MONGOLIAN LETTER CHI
    't': '\u1832',  # MONGOLIAN LETTER TA
    'u': '\u1826',  # MONGOLIAN LETTER UE
    'v': '\u1825',  # MONGOLIAN LETTER OE
    'w': '\u1824',  # MONGOLIAN LETTER U
    'W': '\u1838',  # MONGOLIAN LETTER WA
    'x': '\u1831',  # MONGOLIAN LETTER SHA
    'X': '\u1841',  # MONGOLIAN LETTER ZHI
    'y': '\u1836',  # MONGOLIAN LETTER YA
    'z': '\u183C',  # MONGOLIAN LETTER TSA
    'Z': '\u183D',  # MONGOLIAN LETTER ZA
}


def latin_to_mongolian_unicode(latin_text):
    """
    Convert Latin transliteration characters to Mongolian Unicode characters.

    Each character of ``latin_text`` is looked up in ``_LATIN_TO_MONGOLIAN``;
    characters without an entry (digits, spaces, other punctuation, ...) are
    copied to the output unchanged.

    Args:
        latin_text: String (or any iterable of single-character strings) in
            the Latin transliteration.

    Returns:
        str: The converted text. An empty input yields an empty string.
    """
    return ''.join(_LATIN_TO_MONGOLIAN.get(char, char) for char in latin_text)

# Font file path -> family name already registered with Qt, so that a font
# file is added to the application font database once instead of once per image.
_FONT_FAMILY_CACHE = {}


def _load_font_family(font):
    """Return the first family name provided by font file ``font``, or None if it cannot be loaded."""
    family = _FONT_FAMILY_CACHE.get(font)
    if family is None:
        font_id = QFontDatabase.addApplicationFont(font)
        families = QFontDatabase.applicationFontFamilies(font_id)
        if not families:
            return None  # failures are not cached, so a later call can retry
        family = families[0]  # Use first available font family
        _FONT_FAMILY_CACHE[font] = family
    return family


def generate_image(path, text, font="../fonts/CMDASHITSEDEN5.91.TTF", font_size=40):
    """
    Render ``text`` in black on a white background and save it as a PNG file.

    The image is sized from the font metrics: the text's horizontal advance
    plus three paddings wide, and the font height plus three paddings tall.
    The text baseline is drawn at x = padding, y = font height.

    Args:
        path: Destination file path of the PNG image.
        text: Text to render.
        font: Path of the font file to load.
        font_size: Point size passed to ``QFont``.

    Returns:
        None. Font-loading and save failures are reported with ``print``.

    Raises:
        RuntimeError: If no QApplication instance exists yet. Qt's font
            database needs a live application object; checking here gives a
            catchable error instead of a failure inside Qt.
    """
    if QApplication.instance() is None:
        raise RuntimeError(
            "generate_image requires a QApplication; create one first, "
            "e.g. app = QApplication(sys.argv)"
        )

    font_family = _load_font_family(font)
    if font_family is None:
        print(f"Error: Failed to load font from {font}")
        return  # Exit early if font loading fails

    font_obj = QFont(font_family, font_size)

    # Measure text size
    font_metrics = QFontMetrics(font_obj)
    text_width = font_metrics.horizontalAdvance(text)
    text_height = font_metrics.height()

    # Add padding
    padding = 20
    image_width = text_width + padding * 3
    image_height = text_height + padding * 3

    # Create image
    image = QImage(image_width, image_height, QImage.Format_ARGB32)
    image.fill(QColor(255, 255, 255))  # White background

    # Draw text
    painter = QPainter(image)
    painter.setFont(font_obj)
    painter.setPen(QColor(0, 0, 0))  # Black text
    painter.drawText(padding, text_height, text)
    painter.end()

    # Save the image; QImage.save reports failure through its return value
    if not image.save(path, "PNG"):
        print(f"Error: Failed to save image to {path}")
def process_dataset(file_path, output_dir, font_path, json_dir='../../mongolian_json', max_images=None):
    """
    Render one image per unique label of a CSV label file and write a JSON index.

    Every row with at least five columns contributes its first column as a
    Latin-transliterated label. The label is converted with
    ``latin_to_mongolian_unicode``; labels whose converted text was already
    seen are skipped. Each remaining label is rendered with ``generate_image``
    to ``<output_dir>/<n>.png`` (n counts from 1) and recorded as
    ``{"image_path": ..., "label": <converted text>}``.

    Args:
        file_path: Path of the UTF-8 CSV label file.
        output_dir: Directory receiving the PNG images (created if missing).
        font_path: Font file forwarded to ``generate_image``.
        json_dir: Directory receiving ``mongolian_labels.json`` (created if
            missing). Defaults to the previously hard-coded location.
        max_images: Optional upper bound on the number of images generated;
            ``None`` processes the whole file.

    Returns:
        list[dict]: The recorded entries, or ``[]`` when the input file is
        missing or any error occurs while processing (the error is printed).
    """
    os.makedirs(output_dir, exist_ok=True)

    if not os.path.exists(file_path):
        print(f"Error: File not found: {file_path}")
        return []

    try:
        unique_labels = set()  # Converted labels that already have an image
        json_data = []

        # newline="" lets the csv module do its own line-ending handling
        with open(file_path, "r", encoding="utf-8", newline="") as file:
            for row in csv.reader(file):
                if max_images is not None and len(json_data) >= max_images:
                    break  # Requested number of images reached

                if len(row) < 5:
                    print(f"Skipping row due to insufficient columns: {row}")
                    continue  # Skip invalid rows

                label = row[0]  # Assuming the first column is 'label'
                mongolian_text = latin_to_mongolian_unicode(label)

                if mongolian_text in unique_labels:
                    print(f"Skipping duplicate label: {label}")
                    continue  # Skip duplicate labels

                unique_labels.add(mongolian_text)  # Mark label as used

                # Sequential numbering: the next image number is one past
                # the number of entries recorded so far.
                image_path = os.path.join(output_dir, f"{len(json_data) + 1}.png")
                generate_image(image_path, mongolian_text, font=font_path)

                json_data.append({"image_path": image_path, "label": mongolian_text})

        # Debugging: Check if json_data is populated
        print(f"Generated {len(json_data)} entries for the JSON file.")

        # Save JSON file after processing all data
        if json_data:  # Ensure there is data to write
            # Create the target directory first; otherwise a missing directory
            # makes the write fail after all images were already rendered.
            os.makedirs(json_dir, exist_ok=True)
            json_path = os.path.join(json_dir, "mongolian_labels.json")
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(json_data, f, ensure_ascii=False, indent=4)
            print(f"JSON file created: {json_path}")
        else:
            print("No data to write to JSON file.")

        return json_data

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty result.
        print(f"Error processing dataset: {e}")
        return []

In [6]:
# Qt's font database and text painting need a live application object, so it
# has to exist before the first generate_image() call. Reusing an existing
# instance keeps this cell safe to re-run in the same kernel.
app = QApplication.instance() or QApplication(sys.argv)

# Smoke test: render one sample phrase with the default font.
generate_image("tmp.png", "ᠮᠣᠨᠭᠭᠣᠯ ᠤᠯᠰ")

In [14]:

# Dataset run: inputs are plain constants, the pipeline call and the preview
# printing stay inside one try block so any failure is reported, not raised.
file_path = "../../dataset_preprocessing/MOLHW_preprocess_unicode/MOLHW_preprocess_unicode.txt"
output_dir = "../../dataset"
font_path = "../fonts/CMDASHITSEDEN5.91.TTF"

try:
    json_data = process_dataset(file_path, output_dir, font_path=font_path)

    # Show at most the first three entries; nothing is printed for an empty result.
    preview = json_data[:3] if json_data else []
    if preview:
        print("First 3 entries in JSON data:")
    for entry in preview:
        print(entry)
except Exception as e:
    print(f"Error in main execution: {e}")

Skipping duplicate label: ab
Skipping duplicate label: ab
Skipping duplicate label: ab
Skipping duplicate label: aba
Skipping duplicate label: aba
Skipping duplicate label: aba
Skipping duplicate label: ab
Skipping duplicate label: abaci
Skipping duplicate label: abaciba
Skipping duplicate label: abaciba
Skipping duplicate label: abaciba
Skipping duplicate label: abaciba
Skipping duplicate label: abaciba
Skipping duplicate label: abaciba
Skipping duplicate label: abaciba
Skipping duplicate label: abaciba
Skipping duplicate label: abaciba
Skipping duplicate label: abacibai
Skipping duplicate label: abacibai
Skipping duplicate label: abacibai
Skipping duplicate label: abacibai
Skipping duplicate label: abacibai
Skipping duplicate label: abacibai
Skipping duplicate label: abacibai
Skipping duplicate label: abacibal
Skipping duplicate label: abacibal
Skipping duplicate label: abacibasw
Skipping duplicate label: abacibasw
Skipping duplicate label: abacicihabal
Skipping duplicate label: abac

In [8]:
# Forward slashes work on every platform; the earlier backslash form relied on
# the invalid escape sequence "\M" and only resolved on Windows.
file_path = "../../dataset_preprocessing/MOLHW_unicode/MOLHW_unicode.txt"

# Read the file and collect the first column (the word) of every row
with open(file_path, "r", encoding="utf-8", newline="") as file:
    # csv.reader yields [] for blank lines; skip them so row[0] cannot raise IndexError
    data = [row[0] for row in csv.reader(file) if row]

unique_words = set(data)

print(len(unique_words))
        

40605


In [9]:
# Render a small preview: the first five unique words in sorted order.
# Iterating the set directly would pick five words in an order that can change
# between kernel sessions (string hashing is randomized per process), so the
# selection is made deterministic with sorted().
# NOTE(review): relies on `output_dir` assigned in the dataset-processing cell,
# and writes 1.png..5.png into it, which replaces same-named images already
# generated there - confirm this is intended.
json_data = []
for image_number, word in enumerate(sorted(unique_words)[:5], start=1):
    image_path = os.path.join(output_dir, f"{image_number}.png")
    generate_image(image_path, word)
    json_data.append({"image_path": image_path, "label": word})