In [6]:
import json
import requests
from bs4 import BeautifulSoup
import time


BASE_URL = "http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews="  
OUTPUT_FILE = "scraped_data.json"

start_index = 50
end_index = 2560

scraped_data = []

for index in range(start_index, end_index + 1):
    url = f"{BASE_URL}{index}"
    print(f"Scraping: {url}")
    
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        div_element = soup.find("div", class_="news_text_t transform")
        extracted_text = div_element.get_text(strip=True) if div_element else ""
        
        scraped_data.append({"index": index, "text": extracted_text})
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
    time.sleep(1)

# Save data to JSON file
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, ensure_ascii=False, indent=4)

print(f"Scraping complete. Data saved to {OUTPUT_FILE}")



Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=50
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=51
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=52
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=53
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=54
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=55
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=56
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=57
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=58
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=59
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=60
Scraping: http://khumuunbichig.montsame.mn/index.php?command=newsall&readnews=61
Scraping: http://khumuunbich

In [8]:
with open("scraped_data.json") as f:
    data = json.load(f)

In [9]:
import json
from collections import Counter

# Load the scraped JSON data
with open("scraped_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Initialize character and word counters
char_counter = Counter()
word_counter = Counter()
total_word_count = 0

# Process each extracted text
for entry in data:
    text = entry["text"]
    
    # Count characters
    char_counter.update(text)
    
    # Count words (splitting by whitespace)
    words = text.split()
    word_counter.update(words)
    total_word_count += len(words)

# Get unique counts
unique_chars = len(char_counter)
unique_words = len(word_counter)

# Print results
print(f"Total unique characters: {unique_chars}")
print(f"Character frequencies: {dict(char_counter)}")
print(f"Total words: {total_word_count}")
print(f"Total unique words: {unique_words}")


Total unique characters: 97
Character frequencies: {'b': 59283, 'u': 27597, 'l': 120067, 'i': 251125, 't': 41194, '\xa0': 132848, 'o': 304666, 'a': 176760, ' ': 440195, 'D': 64548, 'g': 61292, 'e': 436514, '&': 39070, '1': 8140, '7': 1647, 'd': 100118, '*': 36064, 'J': 19522, 'r': 87437, 'O': 84099, '<': 13866, 'f': 107136, 's': 44569, '>': 13830, 'k': 74344, 'E': 103977, 'm': 31602, 'K': 36875, 'M': 18282, 'H': 22940, 'B': 32473, 'S': 28857, '%': 8477, 'N': 20932, 'h': 36135, '_': 11439, 'Y': 12430, 'c': 38491, 'I': 69844, 'C': 9189, ',': 41053, 'j': 37355, '0': 9018, '^': 9470, '(': 13674, '.': 18711, '\r': 34193, '\n': 34192, '\t': 3652, 'y': 15552, '9': 1992, '3': 2748, 'Z': 1869, 'A': 10550, '8': 1680, '6': 1784, 'G': 9568, '5': 2623, ')': 13245, 'n': 19306, '2': 8541, '4': 2396, 'w': 9527, 'W': 572, ']': 582, '-': 8997, '=': 1940, 'L': 1708, ':': 238, '"': 241, '$': 2399, 'T': 20726, '[': 1347, 'q': 784, 'z': 432, '|': 1424, '?': 742, '~': 764, 'P': 1226, '!': 469, '`': 49, 'p': 

In [None]:

from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont
import os

# Load the WOFF font
font_path = "CMsHureedp.woff"
font_path = "cmsulaanbaatardp-webfont.woff"


def extract_and_save_glyph_images(woff_file, output_dir):
    # Load the WOFF file using TTFont
    font = TTFont(woff_file)
    
    # Get the 'cmap' table which contains the Unicode to Glyph ID mappings
    cmap = font['cmap']
    
    # Loop through the cmap encodings and extract Unicode to Glyph ID mapping
    unicode_glyph_mapping = {}
    for table in cmap.tables:
        for char_code, glyph_id in table.cmap.items():
            unicode_glyph_mapping[char_code] = glyph_id
    
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Print and save the glyph images
    print(f"Total number of glyphs: {len(unicode_glyph_mapping)}")
    print("Rendering glyphs as images...")
    
    for unicode_code, glyph_id in unicode_glyph_mapping.items():
        # Get the standard character from the Unicode code point
        standard_char = chr(unicode_code)
        
        # Create an image to render the glyph
        image = Image.new('RGB', (50, 50), color=(255, 255, 255))  # White background
        draw = ImageDraw.Draw(image)
        
        # Load the font to render the character
        try:
            font_path = font.reader.file.name  # Path to the font file
            image_font = ImageFont.truetype(font_path, 40)  # Adjust font size as needed
        except Exception as e:
            print(f"Error loading font: {e}")
            continue
        # Draw the Unicode character on the image
        draw.text((5, 5), standard_char, font=image_font, fill=(0, 0, 0))  # Black text
        
        # Save the image with the glyph's ID as the filename
        image_filename = os.path.join(output_dir, f"glyph_{glyph_id}_U+{unicode_code:04X}.png")
        image.save(image_filename)
        print(f"Saved: {image_filename}")
        return unicode_glyph_mapping

# Example usage?
woff_file = font_path
output_dir = "glyph_images"  # Directory where the images will be saved
my_map = extract_and_save_glyph_images(woff_file, output_dir)



In [None]:
from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont
import os

def extract_and_render_glyphs(woff_file, output_image, glyphs_per_row=5, font_size=40, padding=20, box_size=80):
    # Load the WOFF font
    font = TTFont(woff_file)
    cmap = font['cmap']
    unicode_glyph_mapping = {}
    
    for table in cmap.tables:
        for char_code, glyph_id in table.cmap.items():
            unicode_glyph_mapping[char_code] = glyph_id
    
    glyph_count = len(unicode_glyph_mapping)
    rows = (glyph_count + glyphs_per_row - 1) // glyphs_per_row  # Calculate total rows
    img_width = glyphs_per_row * box_size + padding
    img_height = rows * box_size + padding
    
    # Create the output image
    image = Image.new('RGB', (img_width, img_height), color=(255, 255, 255))
    draw = ImageDraw.Draw(image)
    
    try:
        image_font = ImageFont.truetype(woff_file, font_size)
    except Exception as e:
        print(f"Error loading font: {e}")
        return
    
    # Render glyphs
    x, y = padding, padding
    print(len(unicode_glyph_mapping))
    for idx, (unicode_code, glyph_id) in enumerate(unicode_glyph_mapping.items()):
        char = chr(unicode_code)
        box_coords = [x, y, x + box_size, y + box_size]
        draw.rectangle(box_coords, outline='black', width=2)
        text_x = x + (box_size - font_size) // 2
        text_y = y + (box_size - font_size) // 2
        draw.text((text_x, text_y), char, font=image_font, fill='black')
        
        x += box_size
        if (idx + 1) % glyphs_per_row == 0:
            x = padding
            y += box_size
    
    image.save(output_image)
    print(f"Saved glyph overview: {output_image}")

# Example usage
# woff_file = "khuree_font/cmsulaanbaatardp-webfont.woff"
woff_file = "khuree_font/CMSUB.TTF"
output_image = "glyph_overview1.png"
extract_and_render_glyphs(woff_file, output_image)


124
Saved glyph overview: glyph_overview1.png


In [57]:
from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont

def visualize_chars_in_font(woff_file, output_image, chars, glyphs_per_row=5, font_size=40, padding=20, box_size=80):
    # Load the font
    font = TTFont(woff_file)
    cmap = font['cmap']
    unicode_glyph_mapping = {}
    
    for table in cmap.tables:
        for char_code, glyph_id in table.cmap.items():
            unicode_glyph_mapping[char_code] = glyph_id

    # Create the output image
    rows = (len(chars) + glyphs_per_row - 1) // glyphs_per_row
    img_width = glyphs_per_row * box_size + padding
    img_height = rows * box_size + padding

    image = Image.new('RGB', (img_width, img_height), color=(255, 255, 255))
    draw = ImageDraw.Draw(image)

    # Load font for rendering
    try:
        image_font = ImageFont.truetype(woff_file, font_size)
    except Exception as e:
        print(f"Error loading font: {e}")
        return

    # Render each character
    x, y = padding, padding
    for idx, char in enumerate(chars):
        # Draw box around each character
        box_coords = [x, y, x + box_size, y + box_size]
        draw.rectangle(box_coords, outline='black', width=2)
        
        # Check if the character is supported
        unicode_code = ord(char)
        if unicode_code not in unicode_glyph_mapping:
            print(f"Character '{char}', unicode {unicode_code} not supported in font")
        # Center the character in the box
        text_x = x + (box_size - font_size) // 2
        text_y = y + (box_size - font_size) // 2
        draw.text((text_x, text_y), char, font=image_font, fill='black')
        
        
        # Move to next box
        x += box_size
        if (idx + 1) % glyphs_per_row == 0:
            x = padding
            y += box_size

    # Save the image
    image.save(output_image)
    print(f"Saved glyph overview: {output_image}")


# Example usage
woff_file = "khuree_font/CMSUB.TTF"
# woff_file = "khuree_font/cmsulaanbaatardp-webfont.woff"
output_image = "glyph_visualization_char.png"
chars = unique_chars

visualize_chars_in_font(woff_file, output_image, chars)


Saved glyph overview: glyph_visualization_char.png


In [61]:
vocab = ""
for char in unique_chars:
    print(char)
    vocab += char
vocab

B
2
=
q
Q
9
8
{
#
k
[
}
c
M
P
1
h
$
s
p
]
a
,
)
y
o
L
w
e
A
%
D
E
:
m
|
i
b
+
j
t
G
F
<
S
O
.
/
7
@
n
~
Z
`
Y
!
>
f
6
z
4
C
R
-
I
N
T
"
u
U
l
^
d
K
_
g
5
0
*
&
\
(
J
?
;
3
r
H
W


'B2=qQ98{#k[}cMP1h$sp]a,)yoLweA%DE:m|ib+jtGF<SO./7@n~Z`Y!>f6z4CR-INT"uUl^dK_g50*&\\(J?;3rHW'

In [56]:
image = Image.new('RGB', (300, 50), color=(255, 255, 255))  # White background
draw = ImageDraw.Draw(image)
font = TTFont("khuree_font/CMSUB.TTF")
# Load the font to render the character
try:
    font_path = font.reader.file.name  # Path to the font file
    image_font = ImageFont.truetype(font_path, 40)  # Adjust font size as needed
except Exception as e:
    print(f"Error loading font: {e}")
# Draw the Unicode character on the image
draw.text((5, 5), f"m o e h g e &", font=image_font, fill=(0, 0, 0))  # Black text


image.save("test.png")

In [6]:
from PIL import Image, ImageDraw, ImageFont
import freetype

# Function to render text using the WOFF font
def render_text_to_image(text, font_path, output_image_path):
    # Load the WOFF font with FreeType
    face = freetype.Face(font_path)
    
    # Set font size (this may require adjustments depending on the font)
    face.set_char_size(48*64)  # Set size in points (48 points here)
    
    # Calculate the size of the text in the image
    width, height, baseline = 0, 0, 0
    for char in text:
        face.load_char(char)
        glyph = face.glyph
        width += glyph.advance.x >> 6  # advance.x is in 64th units of pixels
        height = max(height, glyph.bitmap_top)
        baseline = max(baseline, glyph.bitmap_top)

    # Create a blank image
    img = Image.new("RGB", (width, height + baseline), "black")
    draw = ImageDraw.Draw(img)
    
    # Position the text
    x = 0
    for char in text:
        # Load the character from the font
        face.load_char(char)
        glyph = face.glyph
        
        # Render the glyph bitmap to the image
        bitmap = glyph.bitmap
        bitmap_width, bitmap_height = bitmap.width, bitmap.rows
        
        if bitmap_width == 0 or bitmap_height == 0:
            continue

        # Create a PIL image from the bitmap data
        glyph_image = Image.new("L", (bitmap_width, bitmap_height))
        glyph_image.putdata(bitmap.buffer)
        
        # Paste the glyph image onto the main image at the correct position
        img.paste(glyph_image, (x + glyph.bitmap_left, baseline - glyph.bitmap_top))
        
        # Move to the next character's position
        x += glyph.advance.x >> 6  # Move the cursor by the glyph's advance width
    
    # Save the result as an image
    img.save(output_image_path)
    # img.show()  # Optional: Display the image

# Path to the WOFF font file
woff_font_path = "CMsHureedp.woff"
text_to_render = "multi_v7_1"

# Path to save the generated image
output_image_path = "test.png"

# Render the text to image
render_text_to_image(text_to_render, woff_font_path, output_image_path)


In [50]:
import json
from collections import Counter

# Load the scraped JSON data
with open("cleaned_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Initialize character and word counters
char_counter = Counter()
word_counter = Counter()
total_word_count = 0

# Process each extracted text
for entry in data:
    text = entry["text"]
    
    # Count characters
    char_counter.update(text)
    
    # Count words (splitting by whitespace)
    words = text.split()
    word_counter.update(words)
    total_word_count += len(words)

# Get unique counts
unique_chars = len(char_counter)
unique_words = len(word_counter)

# Print results
print(f"Total unique characters: {unique_chars}")
print(f"Character frequencies: {dict(char_counter)}")
print(f"Total words: {total_word_count}")
print(f"Total unique words: {unique_words}")


Total unique characters: 94
Character frequencies: {'b': 59283, 'u': 27597, 'l': 120067, 'i': 251125, 't': 41194, '\xa0': 132848, 'o': 304666, 'a': 176760, ' ': 440195, 'D': 64548, 'g': 61292, 'e': 436514, '&': 39070, '1': 8140, '7': 1647, 'd': 100118, '*': 36064, 'J': 19522, 'r': 87437, 'O': 84099, '<': 13866, 'f': 107136, 's': 44569, '>': 13830, 'k': 74344, 'E': 103977, 'm': 31602, 'K': 36875, 'M': 18282, 'H': 22940, 'B': 32473, 'S': 28857, '%': 8477, 'N': 20932, 'h': 36135, '_': 11439, 'Y': 12430, 'c': 38491, 'I': 69844, 'C': 9189, ',': 41053, 'j': 37355, '0': 9018, '^': 9470, '(': 13674, '.': 18711, '\r': 34193, '\n': 34192, '\t': 3652, 'y': 15553, '9': 1992, '3': 2748, 'Z': 1869, 'A': 10550, '8': 1680, '6': 1784, 'G': 9568, '5': 2623, ')': 13245, 'n': 19306, '2': 8541, '4': 2396, 'w': 9527, 'W': 572, ']': 582, '-': 8997, '=': 1940, 'L': 1708, ':': 238, '"': 241, '$': 2399, 'T': 20726, '[': 1347, 'q': 784, 'z': 432, '|': 1424, '?': 742, '~': 764, 'P': 1226, '!': 469, '`': 49, 'p': 

In [51]:
unique_chars = set()

for word, count in word_counter.items():
    # Only consider the word if it doesn't contain unrenderable characters
    for char in word:
        unique_chars.add(char)

In [53]:
len(unique_chars)

89

In [47]:
import json

# Special characters handling
# replace_with_space = "\u00A0"  # Non-breaking space (NBSP)
remove_chars = [
    "\u200C",  # Zero-width non-joiner (ZWNJ)
    "\u00AD",  # Soft hyphen (SHY)
]

# Load the scraped JSON data
with open("scraped_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Clean the data
for entry in data:
    text = entry["text"]
    
    # Replace non-breaking space with regular space
    # text = text.replace(replace_with_space, " ")

    # Remove unwanted characters
    for char in remove_chars:
        text = text.replace(char, "")

    entry["text"] = text

# Save the cleaned data
with open("cleaned_data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("Cleaning complete. Saved as 'cleaned_data.json'.")

Cleaning complete. Saved as 'cleaned_data.json'.


In [None]:
from PIL import Image, ImageDraw, ImageFont
khuree_font_path = "khuree_font/CMSUB.TTF"

def khuree2img(text, output_path="test.png", padding=20):
    try:
        # Load the font
        image_font = ImageFont.truetype(khuree_font_path, 40)  
    except Exception as e:
        print(f"Error loading font: {e}")
        return

    dummy_img = Image.new("RGB", (1, 1))
    draw = ImageDraw.Draw(dummy_img)
    text_width, text_height = draw.textbbox((0, 0), text, font=image_font)[2:]

    img_width = text_width + 2 * padding
    img_height = text_height + 2 * padding

    # Create image
    image = Image.new("RGB", (img_width, img_height), color=(255, 255, 255))
    draw = ImageDraw.Draw(image)

    # Calculate text position (centered)
    text_x = (img_width - text_width) // 2
    text_y = (img_height - text_height) // 2

    # Draw text
    draw.text((text_x, text_y), text, font=image_font, fill=(0, 0, 0))  

    # Save the image
    image.save(output_path)

# Example usage
khuree2img("Moehgo& fca", "test.png")


In [60]:
import uuid
data=[]
i=0
for word in word_counter:
    if i%100==99:
        print(f"{i+1}/{len(word_counter)}")
    image_path = f"../../dataset/khumuun_bichig/{i}.png"
    data.append({
                    "image_path": image_path,
                    "label": word,
                })
    i+=1
    # khuree2img(word, image_path)
json_path ="../../dataset/khuree_data.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print(f"Saved metadata to {json_path}")

100/29352
200/29352
300/29352
400/29352
500/29352
600/29352
700/29352
800/29352
900/29352
1000/29352
1100/29352
1200/29352
1300/29352
1400/29352
1500/29352
1600/29352
1700/29352
1800/29352
1900/29352
2000/29352
2100/29352
2200/29352
2300/29352
2400/29352
2500/29352
2600/29352
2700/29352
2800/29352
2900/29352
3000/29352
3100/29352
3200/29352
3300/29352
3400/29352
3500/29352
3600/29352
3700/29352
3800/29352
3900/29352
4000/29352
4100/29352
4200/29352
4300/29352
4400/29352
4500/29352
4600/29352
4700/29352
4800/29352
4900/29352
5000/29352
5100/29352
5200/29352
5300/29352
5400/29352
5500/29352
5600/29352
5700/29352
5800/29352
5900/29352
6000/29352
6100/29352
6200/29352
6300/29352
6400/29352
6500/29352
6600/29352
6700/29352
6800/29352
6900/29352
7000/29352
7100/29352
7200/29352
7300/29352
7400/29352
7500/29352
7600/29352
7700/29352
7800/29352
7900/29352
8000/29352
8100/29352
8200/29352
8300/29352
8400/29352
8500/29352
8600/29352
8700/29352
8800/29352
8900/29352
9000/29352
9100/29352
9200/293

In [13]:
char_counter

Counter({' ': 440195,
         'e': 436514,
         'o': 304666,
         'i': 251125,
         'a': 176760,
         '\xa0': 132848,
         'l': 120067,
         'f': 107136,
         'E': 103977,
         'd': 100118,
         'r': 87437,
         'O': 84099,
         'k': 74344,
         'I': 69844,
         'D': 64548,
         'g': 61292,
         'b': 59283,
         's': 44569,
         't': 41194,
         ',': 41053,
         '&': 39070,
         'c': 38491,
         'j': 37355,
         'K': 36875,
         'h': 36135,
         '*': 36064,
         '\r': 34193,
         '\n': 34192,
         'B': 32473,
         'm': 31602,
         'S': 28857,
         'u': 27597,
         'H': 22940,
         'N': 20932,
         'T': 20726,
         'J': 19522,
         'n': 19306,
         '.': 18711,
         'M': 18282,
         'y': 15552,
         '<': 13866,
         '>': 13830,
         '(': 13674,
         'U': 13342,
         ')': 13245,
         'Y': 12430,
         '_': 11439