In [1]:
import os
import re

import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_text(text, language="en"):
    """
    Clean a single English or Burmese string.

    For "en", every character that is not an ASCII letter, ASCII digit,
    whitespace, or apostrophe is deleted. For "my", ASCII letters and digits
    are deleted first, then everything outside the Myanmar block
    (U+1000-U+109F), whitespace, and the ၊ / ။ punctuation marks is deleted.
    In both cases runs of whitespace collapse to one space and the result is
    trimmed.

    Args:
        text (str): Input text.
        language (str): "en" for English, "my" for Burmese.

    Returns:
        str: The cleaned text. Non-string or blank input yields "". For any
        other language code the input is returned unchanged.
    """
    # Anything that is not a string, or is only whitespace, becomes "".
    if not isinstance(text, str):
        return ""
    if not text.strip():
        return ""

    if language == "en":
        # Characters are deleted, not replaced by a space, so "a-b" becomes "ab".
        kept = re.sub(r"[^a-zA-Z0-9\s']", "", text)
        return re.sub(r"\s+", " ", kept).strip()

    if language == "my":
        without_latin = re.sub(r"[a-zA-Z0-9]", "", text)
        myanmar_only = re.sub(r"[^\u1000-\u109F\s၊။]", "", without_latin)
        return re.sub(r"\s+", " ", myanmar_only).strip()

    # Unrecognised language code: hand the text back untouched.
    return text

In [3]:
# Load dataset from Hugging Face
# Fetches the "simbolo-ai/hola" dataset by its Hub identifier; the result is
# indexed by split name (later cells read its "train" split).
dataset = load_dataset("simbolo-ai/hola")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 110000/110000 [00:01<00:00, 106351.60 examples/s]


In [4]:
# Convert to Pandas DataFrame
# Use the dataset's own conversion instead of letting the DataFrame
# constructor iterate the split one row dict at a time.
df = dataset["train"].to_pandas()

In [6]:
# Preview the first rows to check the column layout before filtering on "label".
df.head()

Unnamed: 0,text,label
0,Human evolution is the evolutionary process wi...,en
1,This process involved the gradual development ...,en
2,The study of human evolution involves several ...,en
3,Primates diverged from other mammals about 85 ...,en
4,Primates produced successive clades leading to...,en


In [7]:
# Split the frame by language tag: one subset for English ("en") rows and one
# for Burmese ("my") rows, each renumbered from 0 with the old index discarded.
df_en = df.loc[df["label"] == "en"].reset_index(drop=True)
df_my = df.loc[df["label"] == "my"].reset_index(drop=True)

In [10]:
# en and my alignment
# Rows are paired purely by position: the i-th English text sits next to the
# i-th Burmese text.
# NOTE(review): nothing here checks that row i of each subset is actually a
# translation pair — confirm against the dataset's documentation.
if len(df_en) != len(df_my):
    # Fail early with an explicit message rather than the generic
    # array-length error the DataFrame constructor would raise.
    raise ValueError(
        f"Cannot pair rows by position: {len(df_en)} English texts "
        f"vs {len(df_my)} Burmese texts"
    )

df_aligned = pd.DataFrame({
    "en": df_en["text"].values,
    "my": df_my["text"].values
})

In [11]:
# Clean each column with its language-specific rules. Keyword arguments given
# to Series.apply are forwarded to preprocess_text for every element.
df_aligned["en"] = df_aligned["en"].apply(preprocess_text, language="en")
df_aligned["my"] = df_aligned["my"].apply(preprocess_text, language="my")

In [13]:
# Save the cleaned dataset
# Create the output directory first: to_csv does not create missing parent
# directories, so a fresh checkout without data/ would fail here.
os.makedirs("data", exist_ok=True)
df_aligned.to_csv("data/processed_main_data.csv", index=False)
print("Saved cleaned dataset: processed_main_data.csv")

Saved cleaned dataset: processed_main_data.csv


In [14]:
# Split into train (80%), validation (10%), and test (10%)
# Step 1: set aside 20% of the rows as a temporary hold-out; the rest is train.
train, temp = train_test_split(df_aligned, test_size=0.2, random_state=42)
# Step 2: halve the hold-out into validation and test. The fixed random_state
# on both calls keeps the split reproducible across runs.
val, test = train_test_split(temp, test_size=0.5, random_state=42)

In [16]:
# Save the splits
# Make sure the output directory exists; to_csv will not create it.
os.makedirs("data", exist_ok=True)
train.to_csv("data/en_my_train_80.csv", index=False)
val.to_csv("data/en_my_val_10.csv", index=False)
test.to_csv("data/en_my_test_10.csv", index=False)

print("Saved train dataset: en_my_train_80.csv")
print("Saved validation dataset: en_my_val_10.csv")
print("Saved test dataset: en_my_test_10.csv")

Saved train dataset: en_my_train_80.csv
Saved validation dataset: en_my_val_10.csv
Saved test dataset: en_my_test_10.csv
