In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings

# Apply seaborn's default plot styling for the rest of the notebook.
sns.set()
# Default figure size as (width, height).
rcParams['figure.figsize'] = (20,10)
# Never truncate the column list when displaying wide DataFrames.
pd.options.display.max_columns = None
# NOTE(review): this silences *every* warning for the whole session, which
# would also hide numeric warnings such as the one np.log emits for a zero
# probability — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
import json
import re
import unicodedata

In [2]:
# Plain ASCII letter -> every accented lower-case Vietnamese letter that collapses onto it.
_VN_ACCENT_GROUPS = {
    'a': 'áàảãạăắằẳẵặâấầẩẫậ',
    'e': 'éèẻẽẹêếềểễệ',
    'o': 'óòỏõọôốồổỗộơớờởỡợ',
    'i': 'íìỉĩị',
    'u': 'úùủũụưứừửữự',
    'y': 'ýỳỷỹỵ',
    'd': 'đ',
}


def _build_vn_accent_table():
    """Build the ``str.translate`` table for lower- and upper-case accented letters."""
    table = {}
    for plain, accented in _VN_ACCENT_GROUPS.items():
        # Normalise the literals as well, so the table stays correct even if
        # this file is ever saved with decomposed (NFD) characters.
        for ch in unicodedata.normalize('NFC', accented):
            table[ord(ch)] = plain
            upper = ch.upper()
            if len(upper) == 1:
                table[ord(upper)] = plain.upper()
    return table


_VN_ACCENT_TABLE = _build_vn_accent_table()


def remove_vn_accent(word):
    """Strip Vietnamese diacritics from ``word``.

    Every accented Vietnamese vowel is replaced by its base letter
    (a, e, i, o, u, y) and ``đ`` by ``d``. Upper-case letters are handled the
    same way and keep their case (``Đ`` -> ``D``). All other characters are
    left untouched.

    Args:
        word: Text to process; may contain several words.

    Returns:
        The text with Vietnamese diacritics removed.
    """
    # Compose first: decomposed input (base letter + combining marks) would
    # otherwise slip past the per-character table unchanged.
    composed = unicodedata.normalize('NFC', word)
    return composed.translate(_VN_ACCENT_TABLE)

In [4]:
# Build the syllable lookup: accent-free form -> list of accented syllables (saved as JSON)

In [6]:
def create_json_syllables(path='./data/syllables.txt'):
    """Build the accent-free -> accented syllable lookup from a word list.

    The file is read one syllable per line. Each syllable is grouped under
    its accent-stripped form as produced by ``remove_vn_accent``.

    Args:
        path: Location of the syllable list. Defaults to the notebook's
            ``./data/syllables.txt``.

    Returns:
        dict mapping each accent-free syllable to the list of accented
        syllables from the file that reduce to it, in file order.
    """
    syllables = {}
    # Explicit UTF-8: the platform default encoding is not guaranteed to be
    # able to decode Vietnamese text.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            if not word:
                # Blank lines would otherwise register a bogus '' syllable.
                continue
            syllables.setdefault(remove_vn_accent(word), []).append(word)
    return syllables
# syllables = create_json_syllables()
# json.dump(syllables, open('./data/syllables.json', 'w'))

In [14]:
# Load the cached accent-free -> accented syllable lookup; the context
# manager closes the file handle as soon as parsing is done.
with open('./data/syllables.json', 'r', encoding='utf-8') as f:
    syllables = json.load(f)


In [41]:
def prob_model():
    """Placeholder scoring model.

    Returns:
        A number drawn uniformly at random between 0 and 1, standing in for
        a real language-model probability.
    """
    lower_bound, upper_bound = 0, 1
    random_probability = np.random.uniform(lower_bound, upper_bound)
    return random_probability

In [67]:
def beam_search(syllables:dict[str,list[str]], sentence:list[str], beam_size:int, prob_fn=None):
    """Restore accents on ``sentence`` with a beam search over candidate syllables.

    Args:
        syllables: Lookup from accent-free syllable to its accented candidates.
        sentence: Accent-free syllables, in order.
        beam_size: Number of partial sentences kept after each step (>= 1).
        prob_fn: Optional scorer ``prob_fn(prefix, word) -> probability``,
            where ``prefix`` is the list of syllables chosen so far. When
            omitted, the notebook-level ``prob_model()`` is used, which
            ignores context.

    Returns:
        The highest-scoring accented sentence, syllables joined by spaces
        (an empty string for an empty ``sentence``).

    Raises:
        ValueError: If ``beam_size`` is smaller than 1.
    """
    if beam_size < 1:
        raise ValueError('beam_size must be at least 1')

    if prob_fn is None:
        def prob_fn(prefix, word):
            # Fall back to the context-free placeholder model.
            return prob_model()

    # Start from a single empty hypothesis; seeding the beam with several
    # identical empty ones only repeats the same expansions.
    beams: list[tuple[tuple[str, ...], float]] = [((), 0.0)]

    for removed_accent_word in sentence:
        # Unknown (or candidate-less) syllables are passed through unchanged
        # rather than aborting the whole sentence with a KeyError.
        candidates = syllables.get(removed_accent_word) or [removed_accent_word]

        # Keyed by the hypothesis itself so duplicate candidates collapse.
        scored: dict[tuple[str, ...], float] = {}
        for prefix, log_prob in beams:
            for word in candidates:
                # Sum log-probabilities instead of multiplying probabilities
                # to avoid underflow on long sentences.
                scored[prefix + (word,)] = log_prob + np.log(prob_fn(list(prefix), word))

        beams = sorted(scored.items(), key=lambda item: item[1], reverse=True)[:beam_size]

    return ' '.join(beams[0][0])

In [79]:
# Demo: restore accents for "thanh pho ha noi" with a beam width of 5.
# The result differs from run to run because prob_model() scores candidates at random.
beam_search(syllables, ['thanh', 'pho', 'ha', 'noi'], 5)

'thạnh phợ há nôi'

In [69]:
# if len(result_sentences) == 0: # use unigram model
#     for removed_accent_word in sentence:
#         temp_probability:dict[str,float] = {}
#         for word in syllables[removed_accent_word]:
#             temp_probability[word] = unigram()
#         top_k = sorted(temp_probability.items(), key=lambda x: x[1], reverse=True)[:beam_size]
#         result_sentences.append([word] for word, _ in top_k)
# elif len(result_sentences) == 1: # use bigram model
#     for result_sentence in result_sentences:
#         for removed_accent_word in sentence:
#             temp_probability:dict[str,float] = {}
#             for word in syllables[removed_accent_word]:
#                 temp_probability[word] = bigram()
#             top_k = sorted(temp_probability.items(), key=lambda x: x[1], reverse=True)[:beam_size]
#             result_sentences.append([result_sentence + [word] for word, _ in top_k])
# elif len(result_sentences) >= 2: # use trigram model

In [62]:
# NOTE(review): `log` is not imported in any visible cell, so this fails with
# NameError on a fresh kernel. The complex-valued output suggests it came from
# cmath — confirm, or switch to np.log to match beam_search.
log(0.5)

(-0.6931471805599453+0j)