# In [7]:
from mmdt_tokenizer.core import MyanmarTokenizer
import pandas as pd
from pathlib import Path

# Shared tokenizer instance, passed explicitly to the direct test calls at the
# bottom of this file.
tokenizer = MyanmarTokenizer()

def test_syllable_tokenize_basic(tokenizer):
    """Smoke-test ``syllable_tokenize`` on a single short string.

    Verifies that the call returns a list and that at least one item of the
    first element of that list contains either "မင်္" or "ဂ".
    """
    sample = "မင်္ဂလာပါ"
    result = tokenizer.syllable_tokenize(sample)
    assert isinstance(result, list)

    # Scan the first entry and stop at the first item holding an expected piece.
    found_expected_piece = False
    for piece in result[0]:
        if "မင်္" in piece or "ဂ" in piece:
            found_expected_piece = True
            break
    assert found_expected_piece

def test_syllable_tokenize_save_csv(tokenizer: MyanmarTokenizer, tmp_path):
    """Tests the CSV saving feature exposed by the main tokenizer.

    Tokenizes two sample sentences with ``conll_style=True``, lets the
    tokenizer write the result to ``syllable_core_test.csv`` inside
    ``tmp_path``, then reads the file back and checks its row count.

    Args:
        tokenizer: Tokenizer instance under test.
        tmp_path: Directory to write the CSV into. May be a ``str`` or a
            ``pathlib.Path`` (such as pytest's ``tmp_path`` fixture).
    """
    SAMPLE_TEXT = ["မင်္ဂလာပါ မြန်မာစာ", "တနေ့တော့"]
    EXPECTED_SYLLABLES = ["မင်္ဂ", "လာ", "ပါ", "မြန်", "မာ", "စာ", "တ", "နေ့", "တော့"]

    # Build the path with pathlib: `tmp_path + "/..."` raises TypeError when
    # tmp_path is a Path object, while Path(...) accepts both str and Path.
    csv_path = Path(tmp_path) / "syllable_core_test.csv"

    # Call the main tokenizer method with the save_csv argument
    tokenizer.syllable_tokenize(SAMPLE_TEXT, save_csv=str(csv_path), conll_style=True)
    df = pd.read_csv(csv_path)

    # NOTE(review): the one extra row beyond the syllable count presumably
    # comes from the CoNLL-style layout (e.g. a sentence separator) - confirm.
    assert df.shape[0] == len(EXPECTED_SYLLABLES) + 1
    
def test_syllable_tokenize_csv_input(tokenizer: MyanmarTokenizer, tmp_path):
    """Tests the CSV loading/saving feature exposed by the main tokenizer.

    Loads ``test_data.csv`` from ``tmp_path`` into a DataFrame, tokenizes its
    ``original_sentence`` column with ``conll_style=False`` and checks that
    the tokenizer wrote ``result_syllable_bd.csv`` into the same directory.

    Args:
        tokenizer: Tokenizer instance under test.
        tmp_path: Directory holding ``test_data.csv`` and receiving the
            output CSV. May be a ``str`` or a ``pathlib.Path``.
    """
    # Build paths with pathlib: `tmp_path + "/..."` raises TypeError when
    # tmp_path is a Path object, while Path(...) accepts both str and Path.
    base_dir = Path(tmp_path)
    csv_input_path = base_dir / "test_data.csv"
    csv_output_path = base_dir / "result_syllable_bd.csv"

    # NOTE(review): test_data.csv must already exist in tmp_path; nothing in
    # this test creates it - confirm how the fixture data is provided.
    df = pd.read_csv(csv_input_path)

    # Tokenize the DataFrame column and let the tokenizer save the result.
    tokenizer.syllable_tokenize(
        df,
        column="original_sentence",
        save_csv=str(csv_output_path),
        conll_style=False,
    )
    assert csv_output_path.exists()
    

# Run the tests directly instead of through a test runner. '../data' is passed
# as tmp_path, so it is resolved relative to the current working directory and
# must already contain test_data.csv for the last call.
test_syllable_tokenize_basic(tokenizer)
test_syllable_tokenize_save_csv(tokenizer, '../data')
test_syllable_tokenize_csv_input(tokenizer, '../data')

