In [11]:
import os
import random
import numpy as np
import pandas as pd
from PIL import Image, ImageFont, ImageDraw
from uuid import uuid4
import shutil
from pathlib import Path
import re
import unicodedata

# Create data

In [2]:
def render_myanmar_text( 
        text, 
        font_dir="font_myanmar", 
        font_size=42, 
        output_path="output.png", 
        padding=20 
    ): 
    font_files = [os.path.join(font_dir, f) for f in os.listdir(font_dir) if f.lower().endswith((".ttf", ".otf"))] 
    if not font_files: raise ValueError("Không tìm thấy font trong thư mục!") 
    chosen_font = random.choice(font_files) 
    font = ImageFont.truetype(chosen_font, font_size) 
    temp_img = Image.new("RGB", (1, 1)) 
    draw = ImageDraw.Draw(temp_img) 
    try: 
        bbox = draw.textbbox((0, 0), text, font=font) 
        text_w = bbox[2] - bbox[0] 
        text_h = bbox[3] - bbox[1] 
    except AttributeError: 
        text_w, text_h = font.getsize(text) 
    img = Image.new("RGB", (text_w + padding * 2, text_h + padding * 2), "white") 
    draw = ImageDraw.Draw(img) 
    draw.text((padding, padding), text, font=font, fill="black") 
    img.save(output_path) 
    # print("✔ Render xong với font:", chosen_font) 
    return output_path

In [3]:
render_myanmar_text("နီပေါငလျင် - လူ ၁၀၀ ကျော်သေဆုံး ၁၀၀ ကျော်ဒဏ်ရာရ", font_dir="../font_myanmar", output_path="sample.png")

'sample.png'

In [4]:
def append_to_txt(file_path, text):
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(text + "\n")

In [5]:
text_df_dir = "../text_data"
list_paths = [f for f in os.listdir(text_df_dir) if f.endswith(".csv")]
list_paths

['bbc_burmese_news.csv',
 'Microbiology.csv',
 'articles.csv',
 'rakhine_proverbs.csv',
 'Agriculture.csv',
 'Mpox Myanmar.csv']

In [6]:
df1 = pd.read_csv(f"{text_df_dir}/{list_paths[0]}")
df2 = pd.read_csv(f"{text_df_dir}/{list_paths[1]}")
df3 = pd.read_csv(f"{text_df_dir}/{list_paths[2]}")
df4 = pd.read_csv(f"{text_df_dir}/{list_paths[3]}")
df5 = pd.read_csv(f"{text_df_dir}/{list_paths[4]}")
df6 = pd.read_csv(f"{text_df_dir}/{list_paths[5]}")
df=None
df1 = df1[["title", "description"]]
df2 = df2[["Instruction", "Output"]]
df3 = df3[["headline"]]
df5 = df5[["Instruction", "Output"]]
df6 = df6[["Question", "Answer"]]
df = list(df1['title'])+list(df1['description'])+list(df2["Instruction"])+list(df2["Output"])+list(df3['headline'])+list(df4['proverbs'])+list(df5['Instruction'])+list(df5['Output'])+list(df6['Question'])+list(df6['Answer'])
df = pd.DataFrame(df, columns=["text"])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39963 entries, 0 to 39962
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    39963 non-null  object
dtypes: object(1)
memory usage: 312.3+ KB


In [8]:
df_clean = df.drop_duplicates()
mask = [True if len(f)<=80 else False for f in df_clean["text"]]
df_fix_len = df_clean[mask]

In [9]:
df_fix_len.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24871 entries, 0 to 39960
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    24871 non-null  object
dtypes: object(1)
memory usage: 388.6+ KB


In [10]:
df_fix_len.head()

Unnamed: 0,text
0,ဌာနချုပ် လိုင်ဇာအနီး စစ်ကောင်စီ အင်အားတိုးချဲ့...
2,ဟက်ဇ်ဘိုလာတပ် ဘယ်လောက် အင်အားကြီးလဲ၊ အစ္စရေးနဲ...
3,အီလွန်မတ်စ်ခ်ကို အေအိုင်အကြောင်း ယူကေဝန်ကြီးခ...
4,နီပေါငလျင် - လူ ၁၀၀ ကျော်သေဆုံး ၁၀၀ ကျော်ဒဏ်ရာရ
5,မှိုအဆိပ်သင့်မှု - သံသယရှိခံရသူအမျိုးသမီးကိုလူ...


In [17]:
def clean_text(text):
    text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
    text = text.replace("\t", " ").replace("\n", " ").replace("\r", " ")
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text
def clean_text_strict(text):
    allowed_punct = "၊။-–—:/()[]{}'\""
    result = []
    for ch in text:
        if unicodedata.category(ch)[0] == "C":
            continue
        if ch.isspace():
            result.append(" ")
            continue
        if "\u1000" <= ch <= "\u109F":
            result.append(ch)
            continue
        if ch.isalnum():
            result.append(ch)
            continue
        if ch in allowed_punct:
            result.append(ch)
            continue    
    text = "".join(result)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [24]:
text_data = df_fix_len['text']
text_data = [clean_text_strict(text) for text in text_data]
df_fix_len['text'] = text_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fix_len['text'] = text_data


In [25]:
df_fix_len

Unnamed: 0,text
0,ဌာနချုပ် လိုင်ဇာအနီး စစ်ကောင်စီ အင်အားတိုးချဲ့...
2,ဟက်ဇ်ဘိုလာတပ် ဘယ်လောက် အင်အားကြီးလဲ၊ အစ္စရေးနဲ...
3,အီလွန်မတ်စ်ခ်ကို အေအိုင်အကြောင်း ယူကေဝန်ကြီးချ...
4,နီပေါငလျင် - လူ ၁၀၀ ကျော်သေဆုံး ၁၀၀ ကျော်ဒဏ်ရာရ
5,မှိုအဆိပ်သင့်မှု - သံသယရှိခံရသူအမျိုးသမီးကိုလူ...
...,...
39948,ဦးစားပေးရောဂါပိုးတစ်ခုအဖြစ် နှစ်ပေါင်းများစွာက...
39949,မဟုတ်ပါ။ ပုံမှန်ဖြစ်ပေါ်နေကျ ကူးစက်ပြန့်ပွားမှ...
39957,မျောက်ကျောက် (Monkeypox) ဗိုင်းရပ်စ်ပိုးကြောင့...
39959,အာဖရိကအလယ်ပိုင်းနှင့် အနောက်ပိုင်းဒေသတွေမှာ အတ...


In [26]:
n = len(df_fix_len)
split = int(n*0.7)
df_train = df_fix_len[:split]
df_val = df_fix_len[split:]

In [27]:
def create_data(data_frame, val_mode=False):
    imgs = []
    img_dir = Path("train")
    path = Path("data/train")
    if val_mode:
        path = Path("data/val")
        img_dir = Path("val")
    data = data_frame["text"]
    for label in data:
        img_name = f"{uuid4()}.png"
        imgs.append(img_dir/img_name)
        render_myanmar_text(label, font_dir="../font_myanmar", output_path=path/img_name)
    data_frame["label"] = imgs
    return data_frame

In [30]:
df_train = create_data(df_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame["label"] = imgs


In [31]:
df_val = create_data(df_val, val_mode=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame["label"] = imgs


In [32]:
df_train.to_csv("data/train.csv", index=False)
df_val.to_csv("data/val.csv", index=False)

In [28]:
def create_data_label_txt(file_path, data_frame):
    with open(file_path, "w", encoding="utf-8") as f:
        for _, row in data_frame.iterrows():
            path = row["label"]
            text = row["text"]
            f.write(f"{path}\t{text}\n")
    print("✔ Done! File đã được tạo:", file_path)

In [33]:
create_data_label_txt("data/train.txt", df_train)
create_data_label_txt("data/val.txt", df_val)

✔ Done! File đã được tạo: data/train.txt
✔ Done! File đã được tạo: data/val.txt
