In [2]:
# Cell 1: Imports and Global Configurations
import sys
import os
import random
import re
import json
from PyQt5.QtWidgets import QApplication
from PyQt5.QtGui import QFont, QPixmap, QPainter, QColor, QFontDatabase, QTransform, QImage, QFontMetrics
from PyQt5.QtCore import Qt, QRectF, QPointF

# --- COMMON CONFIGURATIONS ---
ALL_FONTS_RELATIVE_PATHS = [
    '../fonts/Bichimel.ttf', '../fonts/mnglwritingotf.ttf',
    '../fonts/UnicodeFontForShare/UnicodeFONT MUNKH/MunkhGalUNICODE/Menk Hawang.ttf',
    '../fonts/UnicodeFontForShare/VertNote/Mongol Bichimel.ttf', '../fonts/Chagan.ttf',
    '../fonts/Garchag.ttf', '../fonts/Syurga.ttf', '../fonts/Urga.ttf',
    '../fonts/UnicodeFontForShare/Microsoft/monbaiti 501.ttf',
    '../fonts/UnicodeFontForShare/UnicodeFONT MUNKH/MunkhGalUNICODE/Menk Qagan.ttf',
    '../fonts/UnicodeFontForShare/UnicodeFONT MUNKH/MunkhGalUNICODE/Menk Sonin.ttf',
    '../fonts/UnicodeFontForShare/VertNote/Mongol Garchig.ttf',
    '../fonts/MONGOLIANSYSTEM0.97.TTF', '../fonts/NotoSansMongolian-Regular.ttf',
    '../fonts/UnicodeFontForShare/VertNote/Mongol Web.ttf', '../fonts/mngltitleotf.ttf',
    '../fonts/UnicodeFontForShare/UnicodeFONT MUNKH/MunkhGalUNICODE/Menk Garqig.ttf',
]

TEXT_FILE_RELATIVE_PATH = '../adiya/web_scrap/scraped_data/president.txt'

OUTPUT_BASE_DIR_RELATIVE_PATH = '../../second_dataset_word_groups' # Гаралтын хавтасны нэр
JSON_FILENAME = "labels_word_groups.json"
IMAGES_SUBDIR_NAME = "images"
JSON_IMAGE_PATH_PREFIX_IN_JSON_FILE = "images/"

BASE_ROTATION_ANGLE_CONFIG = 90
RANDOM_TILT_FULL_RANGE = (-5, 5) # Нэмэлт налууны бүтэн хязгаар
PADDING_AMOUNT_CONFIG = 25
MIN_SCALING_FACTOR_CONFIG = 1.1
MAX_SCALING_FACTOR_CONFIG = 1.8
BASE_FONT_SIZE_CONFIG = 100
TEXT_COLOR_CONFIG = QColor("black")
BACKGROUND_COLOR_CONFIG = QColor("white")

# Үг нийлүүлэх магадлал (нийлбэр нь 100% байх албагүй, random.choices ашиглана)
WORD_GROUP_PROBABILITIES = {
    1: 0.60, # 1 үгтэй байх магадлал 30%
    2: 0.20, # 2 үгтэй байх магадлал 60%
    3: 0.20  # 3 үгтэй байх магадлал 10%
}
# Эсвэл шууд жагсаалт ба жингээр:
WORD_GROUP_SIZES = [1, 2, 3]
WORD_GROUP_WEIGHTS = [60, 20, 20] # Нийлбэр 100

NOTEBOOK_CWD = os.getcwd()
print(f"Jupyter Notebook Current Working Directory (CWD): {NOTEBOOK_CWD}")
resolved_output_base_dir = os.path.abspath(os.path.join(NOTEBOOK_CWD, OUTPUT_BASE_DIR_RELATIVE_PATH))
print(f"Target Output Base Directory: {resolved_output_base_dir}")
print(f"  - JSON file: {os.path.join(resolved_output_base_dir, JSON_FILENAME)}")
print(f"  - Images subdir: {os.path.join(resolved_output_base_dir, IMAGES_SUBDIR_NAME)}")
print("-" * 70)

Jupyter Notebook Current Working Directory (CWD): c:\Users\domogdog\Desktop\bigproject\Mongol-tuulgatnii-hoyr-erhem-project-OCR-\munguu
Target Output Base Directory: c:\Users\domogdog\Desktop\bigproject\second_dataset_word_groups
  - JSON file: c:\Users\domogdog\Desktop\bigproject\second_dataset_word_groups\labels_word_groups.json
  - Images subdir: c:\Users\domogdog\Desktop\bigproject\second_dataset_word_groups\images
----------------------------------------------------------------------


In [3]:
# Cell 2: Stage 1 - Function to Generate Metadata JSON with Word Grouping and Execution
def generate_metadata_json_word_groups():
    print("\n" + "-" * 30 + "\nSTAGE 1: GENERATING METADATA JSON (Word Grouping)\n" + "-" * 30)

    available_font_relative_paths = []
    # ... (фонт шалгах хэсэг өмнөхтэй ижил) ...
    print("Info (Metadata Gen): Checking font files...")
    for f_path in ALL_FONTS_RELATIVE_PATHS:
        abs_font_path_check = os.path.abspath(os.path.join(NOTEBOOK_CWD, f_path))
        if os.path.exists(abs_font_path_check): available_font_relative_paths.append(f_path)
        else: print(f"  Warning (Metadata Gen): Font file not found: '{abs_font_path_check}' (from '{f_path}')")
    if not available_font_relative_paths:
        print("Error (Metadata Gen): No valid fonts. Exiting Stage 1."); return False
    print(f"Info (Metadata Gen): Using {len(available_font_relative_paths)} valid font(s).")

    text_file_abs_path = os.path.abspath(os.path.join(NOTEBOOK_CWD, TEXT_FILE_RELATIVE_PATH))
    full_text_content = ""
    # ... (текст файл унших хэсэг өмнөхтэй ижил) ...
    try:
        with open(text_file_abs_path, 'r', encoding='utf-8') as f: full_text_content = f.read()
        # full_text_content = full_text_content[0:100]
        # print(full_text_content)
    except FileNotFoundError: print(f"Error (Metadata Gen): Text file not found: '{text_file_abs_path}'. Exiting Stage 1."); return False
    if not full_text_content: print(f"Info (Metadata Gen): No text content. Exiting Stage 1."); return False
    
    all_words_from_file = full_text_content.split(' ')
    if not all_words_from_file:
        print("Info (Metadata Gen): No words in text. Exiting Stage 1."); return False
    print(f"Info (Metadata Gen): Found {len(all_words_from_file)} total words in the text file.")

    all_image_params_for_json = []
    image_global_index = 0 
    current_word_list_index = 0 # Файл дахь үгүүдийн индексийг хянах
    
    print(f"Info (Metadata Gen): Preparing metadata by grouping words...")

    while current_word_list_index < len(all_words_from_file):
        # Хэдэн үг нийлүүлэхээ санамсаргүйгээр сонгох
        num_words_to_group = random.choices(WORD_GROUP_SIZES, weights=WORD_GROUP_WEIGHTS, k=1)[0]
        
        # Үлдэгдэл үгнээс хэтрэхгүй байхыг шалгах
        if current_word_list_index + num_words_to_group > len(all_words_from_file):
            num_words_to_group = len(all_words_from_file) - current_word_list_index
        
        if num_words_to_group <= 0: # Хэрэв үг үлдээгүй бол давталтаас гарна
            break
            
        # Сонгосон тооны үгийг нэгтгэх
        word_chunk_list = all_words_from_file[current_word_list_index : current_word_list_index + num_words_to_group]
        text_label_for_image = " ".join(word_chunk_list)

        if not text_label_for_image.strip():
            current_word_list_index += num_words_to_group
            continue

        # Энэ үгийн бүлэгт зориулж нэг зураг үүсгэх параметрүүдийг тодорхойлно
        font_to_use = random.choice(available_font_relative_paths)
        scaling_factor = random.uniform(MIN_SCALING_FACTOR_CONFIG, MAX_SCALING_FACTOR_CONFIG)
        # Нэмэлт налууг RANDOM_TILT_FULL_RANGE-аас санамсаргүйгээр сонгоно
        # Энэ нь 0 (налуугүй), эерэг, сөрөг байж болно.
        # Хэрэв та заавал 3 өөр хувилбар (0, -X, +Y) үүсгэхийг хүсвэл өмнөх шиг тус тусад нь хийнэ.
        # Одоогийн байдлаар нэг л удаа, бүрэн санамсаргүй налуутай үүсгэнэ.
        random_tilt_value = random.uniform(RANDOM_TILT_FULL_RANGE[0], RANDOM_TILT_FULL_RANGE[1])
        # Маш бага налууг 0 болгож болно (хэрэв хүсвэл)
        if abs(random_tilt_value) < 1.0: # Жишээ нь 1 градусаас бага бол тэгш гэж үзье
            random_tilt_value = 0
            
        filename_for_image = f"{image_global_index}.png"
        
        all_image_params_for_json.append({
            "label": text_label_for_image,
            "fonts": [os.path.basename(font_to_use)], 
            "path": JSON_IMAGE_PATH_PREFIX_IN_JSON_FILE + filename_for_image, 
            "font_to_use_relpath": font_to_use, 
            "base_rotation": BASE_ROTATION_ANGLE_CONFIG,
            "random_tilt": round(random_tilt_value, 3), 
            "scaling_factor": round(scaling_factor, 3)
        })
        image_global_index += 1
        current_word_list_index += num_words_to_group # Дараагийн үгийн бүлэг рүү шилжих

        if image_global_index % 50 == 0: # 50 зураг төлөвлөх тутамд мэдээлэл
            print(f"   (Metadata Gen) Prepared metadata for {image_global_index} images. Processed up to word index {current_word_list_index-1}...")

    print(f"Finished metadata preparation. Total {image_global_index} image entries planned for JSON file.")

    output_base_dir_abs = os.path.abspath(os.path.join(NOTEBOOK_CWD, OUTPUT_BASE_DIR_RELATIVE_PATH))
    if not os.path.exists(output_base_dir_abs):
        try: os.makedirs(output_base_dir_abs)
        except OSError as e: print(f"Error (Metadata Gen): Could not create base output dir '{output_base_dir_abs}': {e}"); return False
            
    json_output_file_abs_path = os.path.join(output_base_dir_abs, JSON_FILENAME)
    try:
        with open(json_output_file_abs_path, 'w', encoding='utf-8') as f_json:
            json.dump(all_image_params_for_json, f_json, ensure_ascii=False, indent=2)
        print(f"Info (Metadata Gen): Successfully saved image generation parameters to '{json_output_file_abs_path}'")
        return True
    except IOError as e:
        print(f"Error (Metadata Gen): Could not save JSON file '{json_output_file_abs_path}': {e}")
        return False

# --- Execute Stage 1 ---
print("--- Running Stage 1: Metadata Generation (Word Grouping) ---")
stage1_success_flag = generate_metadata_json_word_groups()
# print(f"STAGE 1 EXECUTION FINISHED. Success: {stage1_success_flag}")
# print("-" * 70)

--- Running Stage 1: Metadata Generation (Word Grouping) ---

------------------------------
STAGE 1: GENERATING METADATA JSON (Word Grouping)
------------------------------
Info (Metadata Gen): Checking font files...
Info (Metadata Gen): Using 17 valid font(s).
Info (Metadata Gen): Found 303143 total words in the text file.
Info (Metadata Gen): Preparing metadata by grouping words...
   (Metadata Gen) Prepared metadata for 50 images. Processed up to word index 91...
   (Metadata Gen) Prepared metadata for 100 images. Processed up to word index 168...
   (Metadata Gen) Prepared metadata for 150 images. Processed up to word index 255...
   (Metadata Gen) Prepared metadata for 200 images. Processed up to word index 337...
   (Metadata Gen) Prepared metadata for 250 images. Processed up to word index 420...
   (Metadata Gen) Prepared metadata for 300 images. Processed up to word index 507...
   (Metadata Gen) Prepared metadata for 350 images. Processed up to word index 589...
   (Metadata

In [4]:
# Cell 3: Stage 2 - Functions to Generate Actual Images from JSON and Execution
# generate_actual_image_from_json_params_notebook_v3 функц (өмнөхтэйгөө ижил, өөрчлөлтгүй)
# Энэ функц нь NOTEBOOK_CWD-г ашиглан фонтын замыг зөв олох ёстой.
# Товчлохгүйгээр дахин хуулъя:
def generate_actual_image_from_json_params_notebook_v3(
    app_instance, text_to_render, font_relative_path_from_cwd, 
    output_image_full_path, base_angle_param, random_tilt_param,
    scaling_factor_param
):
    if not text_to_render.strip(): return False
    actual_font_path = os.path.abspath(os.path.join(NOTEBOOK_CWD, font_relative_path_from_cwd))
    if not os.path.exists(actual_font_path):
        print(f"Error (Image Gen): Font file not found at '{actual_font_path}'. Skipping.")
        return False
    font_id = QFontDatabase.addApplicationFont(actual_font_path)
    if font_id == -1: font_families = ["Arial"]
    else: font_families = QFontDatabase.applicationFontFamilies(font_id)
    if not font_families: font = QFont()
    else: font = QFont(font_families[0], BASE_FONT_SIZE_CONFIG)
    fm = QFontMetrics(font)
    text_rect_precise = fm.boundingRect(text_to_render)
    temp_horizontal_pixmap_width = text_rect_precise.width()
    temp_horizontal_pixmap_height = text_rect_precise.height()
    if temp_horizontal_pixmap_width <= 0 or temp_horizontal_pixmap_height <= 0: return False
    temp_horizontal_pixmap = QPixmap(int(temp_horizontal_pixmap_width), int(temp_horizontal_pixmap_height))
    temp_horizontal_pixmap.fill(BACKGROUND_COLOR_CONFIG)
    painter_temp = QPainter(temp_horizontal_pixmap)
    painter_temp.setRenderHint(QPainter.Antialiasing); painter_temp.setFont(font); painter_temp.setPen(TEXT_COLOR_CONFIG)
    painter_temp.drawText(QPointF(-text_rect_precise.x(), -text_rect_precise.y()), text_to_render); painter_temp.end()
    combined_transform_for_bounding_rect = QTransform(); combined_transform_for_bounding_rect.rotate(base_angle_param); combined_transform_for_bounding_rect.rotate(random_tilt_param)
    rotated_text_rect = combined_transform_for_bounding_rect.mapRect(QRectF(0, 0, temp_horizontal_pixmap.width(), temp_horizontal_pixmap.height()))
    final_content_width = int(rotated_text_rect.width()); final_content_height = int(rotated_text_rect.height())
    padded_final_width = final_content_width + PADDING_AMOUNT_CONFIG * 2; padded_final_height = final_content_height + PADDING_AMOUNT_CONFIG * 2
    if padded_final_width <=0 or padded_final_height <=0: return False
    final_oriented_pixmap = QPixmap(padded_final_width, padded_final_height); final_oriented_pixmap.fill(BACKGROUND_COLOR_CONFIG)
    painter_final_oriented = QPainter(final_oriented_pixmap)
    painter_final_oriented.setRenderHint(QPainter.Antialiasing); painter_final_oriented.setRenderHint(QPainter.SmoothPixmapTransform)
    painter_final_oriented.translate(padded_final_width / 2, padded_final_height / 2); painter_final_oriented.rotate(base_angle_param); painter_final_oriented.rotate(random_tilt_param)
    painter_final_oriented.drawPixmap(int(-temp_horizontal_pixmap.width() / 2), int(-temp_horizontal_pixmap.height() / 2), temp_horizontal_pixmap); painter_final_oriented.end()
    final_image_to_save = final_oriented_pixmap.toImage(); current_scaling_factor = scaling_factor_param
    scaled_image_to_save = final_image_to_save
    if not (current_scaling_factor < 1.001 and current_scaling_factor > 0.999):
        if not (current_scaling_factor < 0.1):
            target_width = int(padded_final_width / current_scaling_factor); target_height = int(padded_final_height / current_scaling_factor)
            if not (target_width < 1 or target_height < 1):
                scaled_image_to_save = final_image_to_save.scaled(target_width, target_height, Qt.KeepAspectRatio, Qt.SmoothTransformation)
    output_image_dir = os.path.dirname(output_image_full_path)
    if not os.path.exists(output_image_dir):
        try: os.makedirs(output_image_dir)
        except OSError: pass 
    if scaled_image_to_save.save(output_image_full_path, "PNG"): return True
    else: return False

def process_json_and_generate_actual_images_word_groups(): # Renamed
    print("\n" + "-" * 30 + "\nSTAGE 2: GENERATING IMAGES FROM JSON (Word Grouping)\n" + "-" * 30)
    
    app = QApplication.instance()
    if app is None:
        app = QApplication(sys.argv if hasattr(sys, 'argv') else [])

    base_dir_for_io_abs = os.path.abspath(os.path.join(NOTEBOOK_CWD, OUTPUT_BASE_DIR_RELATIVE_PATH))
    json_file_to_process_abs = os.path.join(base_dir_for_io_abs, JSON_FILENAME)

    if not os.path.exists(json_file_to_process_abs):
        print(f"Error (Image Gen): JSON file not found at '{json_file_to_process_abs}'. Exiting Stage 2.")
        return False

    image_generation_tasks = []
    try:
        with open(json_file_to_process_abs, 'r', encoding='utf-8') as f_json:
            image_generation_tasks = json.load(f_json)
    except Exception as e:
        print(f"Error (Image Gen): Loading JSON file '{json_file_to_process_abs}': {e}. Exiting Stage 2.")
        return False

    if not image_generation_tasks:
        print("Info (Image Gen): No tasks in JSON. Exiting Stage 2.")
        return False
    
    print(f"Info (Image Gen): Base I/O directory: '{base_dir_for_io_abs}'")
    
    generated_count = 0
    failed_count = 0
    total_tasks = len(image_generation_tasks)
    print(f"Info (Image Gen): Starting to generate {total_tasks} images...")

    for i, task in enumerate(image_generation_tasks):
        label = task.get("label")
        font_rel_path = task.get("font_to_use_relpath") 
        relative_image_path_from_json = task.get("path") 
        
        base_rotation = task.get("base_rotation", BASE_ROTATION_ANGLE_CONFIG) 
        random_tilt = task.get("random_tilt", 0)
        scaling_factor = task.get("scaling_factor", 1.0) 

        if not (label and font_rel_path and relative_image_path_from_json):
            print(f"Warning (Image Gen): Skipping task {i+1} due to missing JSON data: {task}")
            failed_count +=1
            continue
            
        target_image_full_path = os.path.abspath(os.path.join(base_dir_for_io_abs, relative_image_path_from_json))
        
        if generate_actual_image_from_json_params_notebook_v3(
            app, label, 
            font_rel_path, 
            target_image_full_path,
            base_rotation, random_tilt, scaling_factor
        ):
            generated_count += 1
        else:
            failed_count += 1

        if (i + 1) % 50 == 0 or (i + 1) == total_tasks:
            print(f"   (Image Gen) Processed {i + 1}/{total_tasks} tasks. Generated: {generated_count}, Failed: {failed_count}")
            
    print(f"Finished image generation. Successfully generated: {generated_count}, Failed: {failed_count}.")
    return True

# --- Execute Stage 1 (Run this cell after defining functions in Cell 2) ---
# print("--- Running Stage 1: Metadata Generation (Word Grouping) ---")
# stage1_success_flag = generate_metadata_json_word_groups()
# print(f"STAGE 1 EXECUTION FINISHED. Success: {stage1_success_flag}")
# print("-" * 70)


print("--- Running Stage 2: Image Generation from JSON (Word Grouping) ---")
# Ensure stage1_success_flag is available from Cell 2's execution or set it manually
if 'stage1_success_flag' in locals() and stage1_success_flag: 
   stage2_success_flag = process_json_and_generate_actual_images_word_groups()
   print(f"STAGE 2 EXECUTION FINISHED. Success: {stage2_success_flag}")
elif 'stage1_success_flag' not in locals():
   print("Warning: Stage 1 success flag not found. Assuming success and proceeding with Stage 2.")
   stage2_success_flag = process_json_and_generate_actual_images_word_groups()
   print(f"STAGE 2 EXECUTION FINISHED. Success: {stage2_success_flag}")
else:
   print("Skipping Stage 2 because Stage 1 did not complete successfully.")
print("-" * 70)

--- Running Stage 2: Image Generation from JSON (Word Grouping) ---

------------------------------
STAGE 2: GENERATING IMAGES FROM JSON (Word Grouping)
------------------------------
Info (Image Gen): Base I/O directory: 'c:\Users\domogdog\Desktop\bigproject\second_dataset_word_groups'
Info (Image Gen): Starting to generate 187543 images...
   (Image Gen) Processed 50/187543 tasks. Generated: 50, Failed: 0
   (Image Gen) Processed 100/187543 tasks. Generated: 100, Failed: 0
   (Image Gen) Processed 150/187543 tasks. Generated: 150, Failed: 0
   (Image Gen) Processed 200/187543 tasks. Generated: 200, Failed: 0
   (Image Gen) Processed 250/187543 tasks. Generated: 250, Failed: 0
   (Image Gen) Processed 300/187543 tasks. Generated: 300, Failed: 0
   (Image Gen) Processed 350/187543 tasks. Generated: 350, Failed: 0
   (Image Gen) Processed 400/187543 tasks. Generated: 400, Failed: 0
   (Image Gen) Processed 450/187543 tasks. Generated: 450, Failed: 0
   (Image Gen) Processed 500/187543 t

In [61]:
# Cell 1: Imports and Global Configurations
import sys
import os
import random
import re
import json
from PyQt5.QtWidgets import QApplication
from PyQt5.QtGui import QFont, QPixmap, QPainter, QColor, QFontDatabase, QTransform, QImage, QFontMetrics
from PyQt5.QtCore import Qt, QRectF, QPointF

# --- COMMON CONFIGURATIONS ---
ALL_FONTS_RELATIVE_PATHS = [
     '../fonts/Bichimel.ttf', '../fonts/mnglwritingotf.ttf',
    '../fonts/UnicodeFontForShare/UnicodeFONT MUNKH/MunkhGalUNICODE/Menk Hawang.ttf',
    '../fonts/UnicodeFontForShare/VertNote/Mongol Bichimel.ttf', '../fonts/Chagan.ttf',
    '../fonts/Garchag.ttf', '../fonts/Syurga.ttf', '../fonts/Urga.ttf',
    '../fonts/UnicodeFontForShare/Microsoft/monbaiti 501.ttf',
    '../fonts/UnicodeFontForShare/UnicodeFONT MUNKH/MunkhGalUNICODE/Menk Qagan.ttf',
    '../fonts/UnicodeFontForShare/UnicodeFONT MUNKH/MunkhGalUNICODE/Menk Sonin.ttf',
    '../fonts/UnicodeFontForShare/VertNote/Mongol Garchig.ttf',
    '../fonts/MONGOLIANSYSTEM0.97.TTF', '../fonts/NotoSansMongolian-Regular.ttf',
    '../fonts/UnicodeFontForShare/VertNote/Mongol Web.ttf', '../fonts/mngltitleotf.ttf',
    '../fonts/UnicodeFontForShare/UnicodeFONT MUNKH/MunkhGalUNICODE/Menk Garqig.ttf',
]

# НЭМЖ боловсруулах текст файлын зам
NEW_TEXT_FILE_TO_PROCESS_RELATIVE_PATH = '../adiya/web_scrap/scraped_data/president.txt' # Энэ замыг зөв тохируулна уу

# ОДОО БАЙГАА, ДЭЭР НЬ НЭМЖ БИЧИХ JSON ФАЙЛ БОЛОН IMAGES ХАВТАС БАЙГАА ҮНДСЭН ХАВТАС
# Энэ нь таны өмнө newsmn.txt-г боловсруулж үүсгэсэн хавтас байна.
EXISTING_OUTPUT_BASE_DIR_RELATIVE_PATH = '../../n_dataset_word_groups' # Таны өмнөх гаралтын хавтасны нэр
EXISTING_JSON_FILENAME = "labels_word_groups.json" # Таны өмнөх JSON файлын нэр
IMAGES_SUBDIR_NAME_IN_BASE_DIR = "images" # Энэ хавтас EXISTING_OUTPUT_BASE_DIR_RELATIVE_PATH дотор байгаа
JSON_IMAGE_PATH_PREFIX_IN_JSON_FILE = "images/" # JSON доторх path-д орох угтвар

BASE_ROTATION_ANGLE_CONFIG = 90
RANDOM_TILT_FULL_RANGE = (-15, 15)
PADDING_AMOUNT_CONFIG = 25
MIN_SCALING_FACTOR_CONFIG = 1.1
MAX_SCALING_FACTOR_CONFIG = 1.8
BASE_FONT_SIZE_CONFIG = 100
TEXT_COLOR_CONFIG = QColor("black")
BACKGROUND_COLOR_CONFIG = QColor("white")

WORD_GROUP_SIZES = [1, 2, 3]
WORD_GROUP_WEIGHTS = [30, 60, 10]

# Үргэлжлүүлэн эхлэх глобал индекс
STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND = 84430

NOTEBOOK_CWD = os.getcwd()
print(f"Jupyter Notebook Current Working Directory (CWD): {NOTEBOOK_CWD}")
resolved_existing_output_base_dir = os.path.abspath(os.path.join(NOTEBOOK_CWD, EXISTING_OUTPUT_BASE_DIR_RELATIVE_PATH))
print(f"Existing Output Base Directory (to append to): {resolved_existing_output_base_dir}")
print(f"  - Existing JSON file: {os.path.join(resolved_existing_output_base_dir, EXISTING_JSON_FILENAME)}")
print(f"  - Images will be added to: {os.path.join(resolved_existing_output_base_dir, IMAGES_SUBDIR_NAME_IN_BASE_DIR)}")
print(f"New entries will start with global image index: {STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND}")
print("-" * 70)

Jupyter Notebook Current Working Directory (CWD): c:\Users\domogdog\Desktop\bigproject\Mongol-tuulgatnii-hoyr-erhem-project-OCR-\munguu
Existing Output Base Directory (to append to): c:\Users\domogdog\Desktop\bigproject\n_dataset_word_groups
  - Existing JSON file: c:\Users\domogdog\Desktop\bigproject\n_dataset_word_groups\labels_word_groups.json
  - Images will be added to: c:\Users\domogdog\Desktop\bigproject\n_dataset_word_groups\images
New entries will start with global image index: 84430
----------------------------------------------------------------------


In [62]:
# Cell 2: Stage 1 - Function to Generate/Append Metadata JSON and Execution
def generate_and_append_metadata_json():
    print("\n" + "-" * 30 + "\nSTAGE 1: GENERATING AND APPENDING METADATA JSON\n" + "-" * 30)

    available_font_relative_paths = []
    print("Info (Metadata Gen): Checking font files...")
    for f_path in ALL_FONTS_RELATIVE_PATHS:
        abs_font_path_check = os.path.abspath(os.path.join(NOTEBOOK_CWD, f_path))
        if os.path.exists(abs_font_path_check): available_font_relative_paths.append(f_path)
        else: print(f"  Warning (Metadata Gen): Font file not found: '{abs_font_path_check}' (from '{f_path}')")
    if not available_font_relative_paths:
        print("Error (Metadata Gen): No valid fonts. Exiting Stage 1."); return False
    print(f"Info (Metadata Gen): Using {len(available_font_relative_paths)} valid font(s).")

    # Одоо байгаа JSON файл болон images хавтас байрлах үндсэн хавтасны абсолют зам
    existing_output_base_dir_abs = os.path.abspath(os.path.join(NOTEBOOK_CWD, EXISTING_OUTPUT_BASE_DIR_RELATIVE_PATH))
    
    # Одоо байгаа JSON файлын бүтэн зам
    json_file_to_update_abs_path = os.path.join(existing_output_base_dir_abs, EXISTING_JSON_FILENAME)
    
    all_image_params_for_json = [] # Эцсийн жагсаалт (өмнөх + шинэ)
    
    # Хэрэв JSON файл аль хэдийн байгаа бол уншина
    if os.path.exists(json_file_to_update_abs_path):
        print(f"Info (Metadata Gen): Existing JSON file found at '{json_file_to_update_abs_path}'. Loading entries.")
        try:
            with open(json_file_to_update_abs_path, 'r', encoding='utf-8') as f_json_existing:
                all_image_params_for_json = json.load(f_json_existing)
            print(f"Info (Metadata Gen): Loaded {len(all_image_params_for_json)} existing entries.")
        except Exception as e_load:
            print(f"Warning (Metadata Gen): Could not load or parse existing JSON file '{json_file_to_update_abs_path}': {e_load}. \n"
                  f"Proceeding as if creating a new JSON, starting index from {STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND}.")
            all_image_params_for_json = [] # Алдаа гарвал хоосноос эхэлж, шинээр үүсгэнэ
    else:
        print(f"Info (Metadata Gen): No existing JSON file found at '{json_file_to_update_abs_path}'. A new file will be created.")
        # Хэрэв файл байхгүй бол үндсэн хавтас байгаа эсэхийг шалгаж, үүсгэнэ
        if not os.path.exists(existing_output_base_dir_abs):
            try:
                os.makedirs(existing_output_base_dir_abs)
                print(f"Info (Metadata Gen): Created base output directory: '{existing_output_base_dir_abs}'")
            except OSError as e:
                print(f"Error (Metadata Gen): Could not create base output directory '{existing_output_base_dir_abs}': {e}")
                return False


    # Шинэ мэдээлэл нэмэхэд ашиглах глобал индекс
    # Хэрэв STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND нь 0-ээс их бол түүнийг ашиглана,
    # үгүй бол одоо байгаа JSON-ийн хамгийн их индекс + 1-ээс эхэлнэ.
    # Энэ логикийг илүү нарийн болгож болно (жишээ нь, JSON-оос хамгийн их индексийг унших).
    # Одоогийн байдлаар STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND-г шууд ашиглана.
    image_global_index = STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND
    print(f"Info (Metadata Gen): New entries will start with image index: {image_global_index}")

    # Шинэ текст файлаас уншиж, мэдээлэл нэмэх
    new_text_file_abs_path = os.path.abspath(os.path.join(NOTEBOOK_CWD, NEW_TEXT_FILE_TO_PROCESS_RELATIVE_PATH))
    print(f"\nProcessing new text file: '{new_text_file_abs_path}' to append data.")
    full_text_content_new = ""
    try:
        with open(new_text_file_abs_path, 'r', encoding='utf-8') as f_new:
            full_text_content_new = f_new.read().strip()
    except FileNotFoundError:
        print(f"Error (Metadata Gen): New text file '{new_text_file_abs_path}' not found. No new data will be added.")
        # Хэрэв шинэ файл байхгүй бол зүгээр л одоо байгаа JSON-г хадгалаад дуусгаж болно, эсвэл алдаа өгч зогсооно.
        # Энд зүгээр л байгаагаа хадгалаад дуусгая.
        if all_image_params_for_json: # Хэрэв өмнө нь юм уншсан бол
             pass # Дараа нь хадгалах хэсэг рүү очно
        else: # Өмнө нь ч юм байгаагүй, шинэ файл ч байхгүй бол
            print("Error (Metadata Gen): No existing JSON and no new text file to process. Exiting.")
            return False

    if full_text_content_new:
        new_words = full_text_content_new.split(' ')
        if not new_words:
            print(f"Info (Metadata Gen): No words found in the new text file '{new_text_file_abs_path}'.")
        else:
            print(f"Info (Metadata Gen): Found {len(new_words)} words/segments in '{NEW_TEXT_FILE_TO_PROCESS_RELATIVE_PATH}'. Adding them to JSON.")
            current_word_list_index_for_new_file = 0
            new_entries_count = 0
            while current_word_list_index_for_new_file < len(new_words):
                num_words_to_group = random.choices(WORD_GROUP_SIZES, weights=WORD_GROUP_WEIGHTS, k=1)[0]
                if current_word_list_index_for_new_file + num_words_to_group > len(new_words):
                    num_words_to_group = len(new_words) - current_word_list_index_for_new_file
                if num_words_to_group <= 0: break
                
                word_chunk_list = new_words[current_word_list_index_for_new_file : current_word_list_index_for_new_file + num_words_to_group]
                text_label_for_image = " ".join(word_chunk_list)

                if not text_label_for_image.strip():
                    current_word_list_index_for_new_file += num_words_to_group
                    continue

                font_to_use = random.choice(available_font_relative_paths)
                scaling_factor = random.uniform(MIN_SCALING_FACTOR_CONFIG, MAX_SCALING_FACTOR_CONFIG)
                random_tilt_value = random.uniform(RANDOM_TILT_FULL_RANGE[0], RANDOM_TILT_FULL_RANGE[1])
                if abs(random_tilt_value) < 1.0: random_tilt_value = 0
                
                filename_for_image = f"{image_global_index}.png" # Үргэлжилсэн индекс
                
                all_image_params_for_json.append({
                    "label": text_label_for_image,
                    "fonts": [os.path.basename(font_to_use)], 
                    "path": JSON_IMAGE_PATH_PREFIX_IN_JSON_FILE + filename_for_image, 
                    "font_to_use_relpath": font_to_use, 
                    "base_rotation": BASE_ROTATION_ANGLE_CONFIG,
                    "random_tilt": round(random_tilt_value, 3), 
                    "scaling_factor": round(scaling_factor, 3)
                })
                image_global_index += 1
                new_entries_count += 1
                current_word_list_index_for_new_file += num_words_to_group

                if new_entries_count % 100 == 0:
                    print(f"   (Metadata Gen) Added {new_entries_count} new entries from '{NEW_TEXT_FILE_TO_PROCESS_RELATIVE_PATH}'. Current global index: {image_global_index-1}")
            print(f"Info (Metadata Gen): Added {new_entries_count} new entries from '{NEW_TEXT_FILE_TO_PROCESS_RELATIVE_PATH}'.")
    else:
        print(f"Info (Metadata Gen): New text file '{NEW_TEXT_FILE_TO_PROCESS_RELATIVE_PATH}' was empty or not found. No new entries added from it.")

    total_final_entries = len(all_image_params_for_json)
    print(f"Finished processing. Total {total_final_entries} image entries will be saved to JSON file.")
            
    try:
        # JSON файлыг дарж бичнэ (өмнөх болон шинэ мэдээлэлтэйгээр)
        with open(json_file_to_update_abs_path, 'w', encoding='utf-8') as f_json:
            json.dump(all_image_params_for_json, f_json, ensure_ascii=False, indent=2)
        print(f"Info (Metadata Gen): Successfully updated JSON file at '{json_file_to_update_abs_path}'")
        return True
    except IOError as e:
        print(f"Error (Metadata Gen): Could not save updated JSON file '{json_file_to_update_abs_path}': {e}")
        return False


print("--- Running Stage 1: Generate/Append Metadata JSON ---")
stage1_success_flag = generate_and_append_metadata_json()
print(f"STAGE 1 EXECUTION FINISHED. Success: {stage1_success_flag}")
print("-" * 70)

--- Running Stage 1: Generate/Append Metadata JSON ---

------------------------------
STAGE 1: GENERATING AND APPENDING METADATA JSON
------------------------------
Info (Metadata Gen): Checking font files...
Info (Metadata Gen): Using 17 valid font(s).
Info (Metadata Gen): Existing JSON file found at 'c:\Users\domogdog\Desktop\bigproject\n_dataset_word_groups\labels_word_groups.json'. Loading entries.
Info (Metadata Gen): Loaded 84431 existing entries.
Info (Metadata Gen): New entries will start with image index: 84430

Processing new text file: 'c:\Users\domogdog\Desktop\bigproject\Mongol-tuulgatnii-hoyr-erhem-project-OCR-\adiya\web_scrap\scraped_data\president.txt' to append data.
Info (Metadata Gen): Found 303141 words/segments in '../adiya/web_scrap/scraped_data/president.txt'. Adding them to JSON.
   (Metadata Gen) Added 100 new entries from '../adiya/web_scrap/scraped_data/president.txt'. Current global index: 84529
   (Metadata Gen) Added 200 new entries from '../adiya/web_scr

In [None]:
# Cell 3: Stage 2 - Functions to Generate Actual Images from JSON and Execution

# generate_actual_image_from_json_params_notebook_v3 функц (өмнөхтэйгөө ижил, өөрчлөлтгүй)
def generate_actual_image_from_json_params_notebook_v3(
    app_instance, text_to_render, font_relative_path_from_cwd, 
    output_image_full_path, base_angle_param, random_tilt_param,
    scaling_factor_param
):
    # ... (Энэ функцийн бүтэн код өмнөх хариултад байгаа, энд товчлохгүйгээр хуулж болно) ...
    # ... (Яг өмнөх хариултын Cell 3-т байсан generate_actual_image_from_json_params_notebook_v3 функцийг энд ашиглана) ...
    if not text_to_render.strip(): return False
    actual_font_path = os.path.abspath(os.path.join(NOTEBOOK_CWD, font_relative_path_from_cwd))
    if not os.path.exists(actual_font_path):
        print(f"Error (Image Gen): Font file not found at '{actual_font_path}'. Skipping image: {os.path.basename(output_image_full_path)}")
        return False
    font_id = QFontDatabase.addApplicationFont(actual_font_path)
    if font_id == -1: font_families = ["Arial"]
    else: font_families = QFontDatabase.applicationFontFamilies(font_id)
    if not font_families: font = QFont()
    else: font = QFont(font_families[0], BASE_FONT_SIZE_CONFIG) # Use common config
    fm = QFontMetrics(font)
    text_rect_precise = fm.boundingRect(text_to_render)
    temp_horizontal_pixmap_width = text_rect_precise.width()
    temp_horizontal_pixmap_height = text_rect_precise.height()
    if temp_horizontal_pixmap_width <= 0 or temp_horizontal_pixmap_height <= 0: return False
    temp_horizontal_pixmap = QPixmap(int(temp_horizontal_pixmap_width), int(temp_horizontal_pixmap_height))
    temp_horizontal_pixmap.fill(BACKGROUND_COLOR_CONFIG) # Use common config
    painter_temp = QPainter(temp_horizontal_pixmap)
    painter_temp.setRenderHint(QPainter.Antialiasing); painter_temp.setFont(font); painter_temp.setPen(TEXT_COLOR_CONFIG) # Use common config
    painter_temp.drawText(QPointF(-text_rect_precise.x(), -text_rect_precise.y()), text_to_render); painter_temp.end()
    combined_transform_for_bounding_rect = QTransform(); combined_transform_for_bounding_rect.rotate(base_angle_param); combined_transform_for_bounding_rect.rotate(random_tilt_param)
    rotated_text_rect = combined_transform_for_bounding_rect.mapRect(QRectF(0, 0, temp_horizontal_pixmap.width(), temp_horizontal_pixmap.height()))
    final_content_width = int(rotated_text_rect.width()); final_content_height = int(rotated_text_rect.height())
    padded_final_width = final_content_width + PADDING_AMOUNT_CONFIG * 2; padded_final_height = final_content_height + PADDING_AMOUNT_CONFIG * 2 # Use common config
    if padded_final_width <=0 or padded_final_height <=0: return False
    final_oriented_pixmap = QPixmap(padded_final_width, padded_final_height); final_oriented_pixmap.fill(BACKGROUND_COLOR_CONFIG) # Use common config
    painter_final_oriented = QPainter(final_oriented_pixmap)
    painter_final_oriented.setRenderHint(QPainter.Antialiasing); painter_final_oriented.setRenderHint(QPainter.SmoothPixmapTransform)
    painter_final_oriented.translate(padded_final_width / 2, padded_final_height / 2); painter_final_oriented.rotate(base_angle_param); painter_final_oriented.rotate(random_tilt_param)
    painter_final_oriented.drawPixmap(int(-temp_horizontal_pixmap.width() / 2), int(-temp_horizontal_pixmap.height() / 2), temp_horizontal_pixmap); painter_final_oriented.end()
    final_image_to_save = final_oriented_pixmap.toImage(); current_scaling_factor = scaling_factor_param
    scaled_image_to_save = final_image_to_save
    if not (current_scaling_factor < 1.001 and current_scaling_factor > 0.999):
        if not (current_scaling_factor < 0.1):
            target_width = int(padded_final_width / current_scaling_factor); target_height = int(padded_final_height / current_scaling_factor)
            if not (target_width < 1 or target_height < 1):
                scaled_image_to_save = final_image_to_save.scaled(target_width, target_height, Qt.KeepAspectRatio, Qt.SmoothTransformation)
    output_image_dir = os.path.dirname(output_image_full_path)
    if not os.path.exists(output_image_dir):
        try: os.makedirs(output_image_dir)
        except OSError: pass 
    if scaled_image_to_save.save(output_image_full_path, "PNG"): return True
    else: return False


def process_json_and_generate_actual_images_for_append(): # Функцийн нэрийг өөрчилье
    print("\n" + "-" * 30 + "\nSTAGE 2: GENERATING IMAGES FROM UPDATED JSON\n" + "-" * 30)
    
    app = QApplication.instance()
    if app is None:
        app = QApplication(sys.argv if hasattr(sys, 'argv') else [])

    # JSON файл болон images хавтас байрлах үндсэн хавтасны абсолют зам
    # Энэ нь Үе шат 1-ийн EXISTING_OUTPUT_BASE_DIR_RELATIVE_PATH-тай ижил байх ёстой
    base_dir_for_io_abs = os.path.abspath(os.path.join(NOTEBOOK_CWD, EXISTING_OUTPUT_BASE_DIR_RELATIVE_PATH))
    
    # JSON файлын зам (Үе шат 1-ийн EXISTING_JSON_FILENAME-тэй ижил)
    json_file_to_process_abs = os.path.join(base_dir_for_io_abs, EXISTING_JSON_FILENAME)

    if not os.path.exists(json_file_to_process_abs):
        print(f"Error (Image Gen): JSON file not found at '{json_file_to_process_abs}'. Exiting Stage 2.")
        return False

    image_generation_tasks = []
    try:
        with open(json_file_to_process_abs, 'r', encoding='utf-8') as f_json:
            image_generation_tasks = json.load(f_json)
    except Exception as e:
        print(f"Error (Image Gen): Loading JSON file '{json_file_to_process_abs}': {e}. Exiting Stage 2.")
        return False

    if not image_generation_tasks:
        print("Info (Image Gen): No tasks in JSON. Exiting Stage 2.")
        return False
    
    print(f"Info (Image Gen): Base I/O directory: '{base_dir_for_io_abs}'")
    print(f"Info (Image Gen): Loaded {len(image_generation_tasks)} total tasks from JSON.")
    
    generated_count_this_run = 0
    failed_count_this_run = 0
    
    # Зөвхөн STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND-аас эхэлсэн шинэ зургуудыг үүсгэх
    # Эсвэл бүгдийг нь дахин үүсгэж болно (одоогийн байдлаар бүгдийг нь хийнэ)
    # Хэрэв зөвхөн шинийг үүсгэх бол task-уудыг шүүнэ:
    tasks_to_run_this_time = []
    for task in image_generation_tasks:
        path_in_json = task.get("path", "")
        if path_in_json:
            fname_in_json = os.path.basename(path_in_json)
            try:
                idx_in_fname = int(os.path.splitext(fname_in_json)[0])
                if idx_in_fname >= STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND:
                    tasks_to_run_this_time.append(task)
            except ValueError:
                print(f"Warning (Image Gen): Could not parse index from filename '{fname_in_json}' in path '{path_in_json}'. Including task anyway.")
                tasks_to_run_this_time.append(task) # Индекс гаргаж чадахгүй бол ямартай ч оруулъя
        else:
            tasks_to_run_this_time.append(task) # Path байхгүй бол (буруу JSON) ямартай ч оролдъё
    
    # Хэрэв өмнө нь STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND-ээс бага индексээр ажиллаж байсан бол
    # tasks_to_run_this_time хоосон байж магадгүй. Бүх task-г ажиллуулах нь илүү найдвартай.
    # Эсвэл хэрэглэгч үнэхээр зөвхөн шинээр нэмэгдсэнийг хүсэж байгаа бол дээрх шүүлтүүр зөв.
    # Одоогийн байдлаар бүх task-г ажиллуулъя, хэрэв зураг байгаа бол дарж бичнэ.
    # tasks_to_run_this_time = image_generation_tasks # Бүгдийг ажиллуулах бол

    if not tasks_to_run_this_time and image_generation_tasks: # Хэрэв шүүлтүүр хоосон үр дүн өгсөн ч, анхны task байсан бол
        print(f"Info (Image Gen): No tasks to process starting from index {STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND} or JSON structure issue. Processing all tasks instead.")
        tasks_to_run_this_time = image_generation_tasks
    elif not tasks_to_run_this_time:
        print(f"Info (Image Gen): No tasks to process (either empty JSON or filtered out by STARTING_IMAGE_GLOBAL_INDEX_FOR_APPEND).")
        return True # Юу ч хийхгүй, гэхдээ амжилттай

    total_tasks_to_run_now = len(tasks_to_run_this_time)
    print(f"Info (Image Gen): Will attempt to generate/regenerate {total_tasks_to_run_now} images...")

    for i, task in enumerate(tasks_to_run_this_time):
        label = task.get("label")
        font_rel_path = task.get("font_to_use_relpath") 
        relative_image_path_from_json = task.get("path") 
        
        base_rotation = task.get("base_rotation", BASE_ROTATION_ANGLE_CONFIG) 
        random_tilt = task.get("random_tilt", 0)
        scaling_factor = task.get("scaling_factor", 1.0) 

        if not (label and font_rel_path and relative_image_path_from_json):
            print(f"Warning (Image Gen): Skipping task {i+1} (original index might be different) due to missing data: {task}")
            failed_count_this_run +=1
            continue
            
        target_image_full_path = os.path.abspath(os.path.join(base_dir_for_io_abs, relative_image_path_from_json))
        
        if generate_actual_image_from_json_params_notebook_v3(
            app, label, 
            font_rel_path, 
            target_image_full_path,
            base_rotation, random_tilt, scaling_factor
        ):
            generated_count_this_run += 1
        else:
            failed_count_this_run += 1

        if (i + 1) % 50 == 0 or (i + 1) == total_tasks_to_run_now:
            print(f"   (Image Gen) Processed {i + 1}/{total_tasks_to_run_now} tasks for this run. Generated: {generated_count_this_run}, Failed: {failed_count_this_run}")
            
    print(f"Finished image generation for this run. Generated: {generated_count_this_run}, Failed: {failed_count_this_run}.")
    return True

# --- Execute Stage 1 (Run this cell after defining functions in Cell 2) ---
# print("--- Running Stage 1: Generate/Append Metadata JSON ---")
# stage1_success_flag = generate_and_append_metadata_json() # Функцийн нэр
# print(f"STAGE 1 EXECUTION FINISHED. Success: {stage1_success_flag}")
# print("-" * 70)

print("--- Running Stage 2: Image Generation from Updated JSON ---")
if 'stage1_success_flag' in locals() and stage1_success_flag: 
   stage2_success_flag = process_json_and_generate_actual_images_for_append() # Функцийн нэр
   print(f"STAGE 2 EXECUTION FINISHED. Success: {stage2_success_flag}")
elif 'stage1_success_flag' not in locals():
   print("Warning: Stage 1 success flag not found. Assuming success and proceeding with Stage 2.")
   stage2_success_flag = process_json_and_generate_actual_images_for_append() # Функцийн нэр
   print(f"STAGE 2 EXECUTION FINISHED. Success: {stage2_success_flag}")
else:
   print("Skipping Stage 2 because Stage 1 did not complete successfully.")
print("-" * 70)

--- Running Stage 2: Image Generation from Updated JSON ---

------------------------------
STAGE 2: GENERATING IMAGES FROM UPDATED JSON
------------------------------
Info (Image Gen): Base I/O directory: 'c:\Users\domogdog\Desktop\bigproject\n_dataset_word_groups'
Info (Image Gen): Loaded 251861 total tasks from JSON.
Info (Image Gen): Will attempt to generate/regenerate 167431 images...
   (Image Gen) Processed 50/167431 tasks for this run. Generated: 50, Failed: 0
   (Image Gen) Processed 100/167431 tasks for this run. Generated: 100, Failed: 0
   (Image Gen) Processed 150/167431 tasks for this run. Generated: 150, Failed: 0
   (Image Gen) Processed 200/167431 tasks for this run. Generated: 200, Failed: 0
   (Image Gen) Processed 250/167431 tasks for this run. Generated: 250, Failed: 0
   (Image Gen) Processed 300/167431 tasks for this run. Generated: 300, Failed: 0
   (Image Gen) Processed 350/167431 tasks for this run. Generated: 350, Failed: 0
   (Image Gen) Processed 400/167431

In [3]:
TEXT_FILE_RELATIVE_PATH = '../adiya/web_scrap/scraped_data/president.txt'

# Файлыг нээж бүх текстийг унших
with open(TEXT_FILE_RELATIVE_PATH, 'r', encoding='utf-8') as file:
    text = file.read()

# Текстийг үг болгож хуваах (хөндий зайгаар тусгаарлана)
words = text.split()

# Үгүүдийн тоог хэвлэх
print(f"Нийт үгийн тоо: {len(words)}")


Нийт үгийн тоо: 423016


In [10]:
from collections import Counter
import re

TEXT_FILE_RELATIVE_PATH = '../adiya/web_scrap/scraped_data/newsmn.txt'

with open(TEXT_FILE_RELATIVE_PATH, 'r', encoding='utf-8') as file:
    text = file.read()

# Текстийг бүх жижиг үсэг болгох
text_lower = text.lower()

# Зөвхөн үсэг болон тоо бүхий үгсийг салгах (тэмдэгтүүдийг хасах)
words = re.findall(r'\b\w+\b', text_lower)

# Үгийн давтамжийг тоолох
word_counts = Counter(words)

# Давтамж 1-ээс дээш (жишээ: 1 удаа гарсан үгсийг өвөрмөц гэж үзэх)
special_words = [word for word, count in word_counts.items() if count == 1]

print(f"Текст дотор нийт {len(words)} үг байгаагаас давтамж 1-тай өвөрмөц үгсийн тоо: {len(special_words)}")
print("Жишээ өвөрмөц үгс:")
print(special_words[:20])  # эхний 20 өвөрмөц үгийг харуулах


Текст дотор нийт 203312 үг байгаагаас давтамж 1-тай өвөрмөц үгсийн тоо: 4986
Жишээ өвөрмөц үгс:
['ᠪᠸᠯᠬᠢ', 'ᠦᠢᠯᠡᠳᠪᠤᠷᠢᠯᠡᠵᠦ', 'ᠰᠡᠷᠭᠡᠢᠯᠡᠬᠦ', 'ᠠᠨᠤᠳᠠᠷᠢ', 'ᠲᠠᠢᠰᠢᠷᠢ', 'ᠬᠣᠯᠪᠣᠯᠲᠠ', 'ᠡᠩᠬᠡᠴᠡᠴᠡᠭ', 'ᠵᠠᠢᠯᠠᠰᠢ', 'ᠠᠰᠠᠷᠠᠮᠵᠢᠯᠠᠬᠤ', 'ᠠᠮᠵᠢᠬᠤ', 'ᠨᠠᠢᠷᠠᠭᠤᠯᠤᠭᠴᠢ', 'ᠠᠮᠤᠷᠰᠠᠢᠬᠠᠨ', 'ᠳᠠᠷᠤᠭᠠᠯᠠᠵᠤ', 'ᠪᠠᠢᠭᠤᠯᠤᠭᠴᠢ', 'ᠺᠲᠷᠢᠨ', 'ᠷᠧᠢ', 'ᠦᠷᠭᠦᠯᠵᠢᠯᠡᠪᠡᠯ', 'ᠬᠡᠷᠡᠭᠵᠢᠭᠦᠯᠦᠭᠴᠢᠳ', 'ᠮᠢᠺᠷᠣᠹᠣᠨ', 'ᠲᠣᠳᠣᠷᠬᠠᠢᠯᠠᠨ']
