In [32]:
import os
import re

import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [33]:
def preprocess_text(text, language="en"):
    """
    Clean a single English or Burmese sentence.

    For "en", every character other than ASCII letters, ASCII digits,
    whitespace and apostrophes is removed. For "my", ASCII letters and
    digits are removed first, then everything outside the U+1000-U+109F
    range, whitespace, and the ၊ / ။ punctuation marks. In both cases runs
    of whitespace are collapsed to one space and the ends are trimmed.

    Args:
        text (str): Input text.
        language (str): "en" for English, "my" for Burmese. Any other value
            leaves a non-blank string untouched.

    Returns:
        str: The cleaned text, or "" when `text` is not a string or is
        blank.
    """
    # Guard clause: non-strings and whitespace-only strings collapse to "".
    if not isinstance(text, str):
        return ""
    if text.strip() == "":
        return ""

    # Patterns whose matches are deleted, applied in order for each language.
    if language == "en":
        removal_patterns = (r"[^a-zA-Z0-9\s']",)
    elif language == "my":
        removal_patterns = (
            r"[a-zA-Z0-9]",
            r"[^\u1000-\u109F\s၊။]",
        )
    else:
        # Unrecognised language codes pass through without any cleaning.
        return text

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)

    # Deleting characters can leave gaps; squeeze them into single spaces.
    squeezed = re.sub(r"\s+", " ", cleaned)
    return squeezed.strip()

In [34]:
# Load the "akhtet/myanmar-xnli" dataset from Hugging Face.
# Later cells index the result by split name ("train", "validation", "test").
dataset = load_dataset("akhtet/myanmar-xnli")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 392702/392702 [00:01<00:00, 239764.74 examples/s]
Generating validation split: 100%|██████████| 2490/2490 [00:00<00:00, 135996.05 examples/s]
Generating test split: 100%|██████████| 5010/5010 [00:00<00:00, 192700.97 examples/s]


In [36]:
# Convert the train split to a pandas DataFrame so it can be inspected
# in the following cell.
df = pd.DataFrame(dataset["train"])

In [37]:
# Preview the first rows of the train split to check the available columns.
df.head()

Unnamed: 0,genre,label,sentence1_en,sentence2_en,sentence1_my,sentence2_my
0,government,neutral,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,သဘောတရားအရ ခရင်မ်စိမ်ခြင်းတွင် အခြေခံအတိုင်းအတ...,ထုတ်ကုန်နှင့် ပထဝီဝင်အနေအထားသည် ခရင်မ် skimmin...
1,telephone,entailment,you know during the season and i guess at at y...,You lose the things to the following level if ...,ရာသီအတွင်း မင်းသိတယ်၊ မင်းရဲ့အဆင့်ကို ငါ ခန့်မ...,လူတွေပြန်ခေါ်ရင် အောက်ပါအဆင့်အထိ ဆုံးရှုံးသွား...
2,fiction,entailment,One of our number will carry out your instruct...,A member of my team will execute your orders w...,ကျွန်ုပ်တို့၏နံပါတ်တစ်ခုသည် သင့်ညွှန်ကြားချက်မ...,ကျွန်ုပ်၏အဖွဲ့သားတစ်ဦးသည် သင်၏အမိန့်စာများကို ...
3,fiction,entailment,How do you know? All this is their information...,This information belongs to them.,သင်ဘယ်လိုသိသလဲ? ဒါတွေအားလုံးဟာ သူတို့ရဲ့ အချက်...,ဒီအချက်အလက်က သူတို့ပိုင်တယ်။
4,telephone,neutral,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,ဟုတ်တယ် ငါမင်းကိုပြောပြမယ် ဒီတင်းနစ်ဖိနပ်တချို...,တင်းနစ်ဖိနပ်များသည် ဈေးနှုန်းအမျိုးမျိုးရှိသည်။


In [44]:
# Build one cleaned English-Burmese parallel CSV per dataset split.
MAX_ROWS_PER_SPLIT = 50000  # upper bound per split; smaller splits are kept whole
OUTPUT_DIR = "data"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first write.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for split in ["train", "validation", "test"]:
    # Loop-local name: rebinding `df` here would silently replace the train
    # frame that earlier cells created and displayed.
    split_df = pd.DataFrame(dataset[split])

    # Sample whole rows so every English sentence stays paired with its
    # Burmese counterpart; the fixed seed keeps the sample reproducible.
    sample_size = min(MAX_ROWS_PER_SPLIT, len(split_df))
    df_sampled = split_df.sample(n=sample_size, random_state=42).reset_index(drop=True)

    # Keep only the sentence pair and give the columns short names.
    df_aligned = df_sampled[["sentence1_en", "sentence1_my"]].rename(
        columns={"sentence1_en": "en", "sentence1_my": "my"}
    )

    # Apply the language-specific cleaning to each side.
    df_aligned["en"] = df_aligned["en"].apply(lambda x: preprocess_text(x, language="en"))
    df_aligned["my"] = df_aligned["my"].apply(lambda x: preprocess_text(x, language="my"))

    # Both columns live in the same frame, so their lengths can never differ.
    # The way a pair actually breaks is when cleaning empties one side
    # (preprocess_text returns "" for blank or non-string input and may strip
    # every character). Drop those rows so only complete pairs are saved.
    has_both_sides = (df_aligned["en"] != "") & (df_aligned["my"] != "")
    df_aligned = df_aligned[has_both_sides].reset_index(drop=True)

    # Save processed data and report the number of rows actually written.
    output_path = f"{OUTPUT_DIR}/en_my_{split}.csv"
    df_aligned.to_csv(output_path, index=False)
    print(f"Saved {split} dataset with {len(df_aligned)} rows to {output_path}")

Saved train dataset with 50000 rows to data/en_my_train.csv
Saved validation dataset with 2490 rows to data/en_my_validation.csv
Saved test dataset with 5010 rows to data/en_my_test.csv


In [None]:
# # Set the sample size to 50,000 rows
# sample_size = 50000
# df_sampled = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

In [22]:
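# # Filter only 'en' and 'my' texts while preserving order.
# # NOTE: in the df.head() output above, `label` holds NLI classes (e.g. neutral,
# # entailment), not column names, so the two filters below would select no rows.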
# df_en = df_sampled[df_sampled["label"] == "sentence1_en"].reset_index(drop=True)
# df_my = df_sampled[df_sampled["label"] == "sentence1_my"].reset_index(drop=True)

In [23]:
# # Ensure correct alignment by keeping the same index range
# min_len = min(len(df_en), len(df_my))
# df_aligned = pd.DataFrame({
#     "en": df_en["text"].iloc[:min_len].values,  # Ensure equal pairs
#     "my": df_my["text"].iloc[:min_len].values
# })

In [24]:
# # Apply preprocessing
# df_aligned["en"] = df_aligned["en"].apply(lambda x: preprocess_text(x, language="en"))
# df_aligned["my"] = df_aligned["my"].apply(lambda x: preprocess_text(x, language="my"))

In [25]:
# # Save the cleaned dataset
# df_aligned.to_csv("data/processed_main_data.csv", index=False)
# print("Saved cleaned dataset: processed_main_data.csv")

Saved cleaned dataset: processed_main_data.csv


In [26]:
# # Shuffle the rows reproducibly (whole rows move together, so en/my pairs stay aligned)
# df_aligned = df_aligned.sample(frac=1, random_state=42).reset_index(drop=True)

In [27]:
# # Split into train (80%), validation (10%), and test (10%)
# train, temp = train_test_split(df_aligned, test_size=0.2, random_state=42)
# val, test = train_test_split(temp, test_size=0.5, random_state=42)

In [28]:
# # Save the splits
# train.to_csv("data/en_my_train_80.csv", index=False)
# val.to_csv("data/en_my_val_10.csv", index=False)
# test.to_csv("data/en_my_test_10.csv", index=False)

# print("Saved train dataset: en_my_train_80.csv")
# print("Saved validation dataset: en_my_val_10.csv")
# print("Saved test dataset: en_my_test_10.csv")

Saved train dataset: en_my_train_80.csv
Saved validation dataset: en_my_val_10.csv
Saved test dataset: en_my_test_10.csv
