**Практический пример #4** \\
Доклад: Структурированная нормализация текста с использованием недетерминированных FST \\
Автор: Владимир Марков ([@markovvn1](https://t.me/markovvn1))

In [None]:
!pip install pynini==2.1.6

Collecting pynini==2.1.6
  Downloading pynini-2.1.6-cp310-cp310-manylinux_2_28_x86_64.whl (154.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynini
Successfully installed pynini-2.1.6


In [None]:
import pynini

In [None]:
# Generated (bracketed) symbols that delimit a token inside the text stream:
#   TOKEN_BEGIN <type> TOKEN_SEP <value> TOKEN_END
TOKEN_BEGIN = "[token_begin]"
TOKEN_SEP = "[token_sep]"
TOKEN_END = "[token_end]"

# Acceptor for exactly one symbol out of "[1]" .. "[255]".
# NOTE(review): Pynini reads a bracketed integer as a raw label value, so this
# is presumably "any single non-zero byte" (0 being epsilon) - confirm in the docs.
ALPHABIT = pynini.union(*(f"[{label}]" for label in range(1, 256))).optimize()

# Any sequence of plain symbols and complete, well-formed tokens.
_complete_token = TOKEN_BEGIN + pynini.closure(ALPHABIT) + TOKEN_SEP + pynini.closure(ALPHABIT) + TOKEN_END
TOKEN_STAR = (ALPHABIT | _complete_token).closure()

class TokensFst:
  """Builders for FSTs that insert, display and strip typed tokens.

  A token is spelled ``TOKEN_BEGIN + <type> + TOKEN_SEP + <value> + TOKEN_END``.
  The tokenizing rules are compiled over ``TOKEN_STAR``.
  NOTE(review): using ``TOKEN_STAR`` as sigma-star presumably keeps rules from
  firing inside an already created token - confirm against cdrewrite semantics.
  """

  @staticmethod
  def _build_minutes():
    """Return an acceptor for two-digit minutes ``00``..``59``."""
    return pynini.union(*"012345") + pynini.union(*"0123456789")

  @staticmethod
  def _wrap_token(token_type, value_fst):
    """Return a transducer that matches ``value_fst`` and inserts the token
    markers plus ``token_type`` around it."""
    opening = pynini.cross("", TOKEN_BEGIN + token_type + TOKEN_SEP)
    closing = pynini.cross("", TOKEN_END)
    return opening + value_fst + closing

  @staticmethod
  def _build_time24():
    """Return an optimized acceptor for ``HH:MM`` with hours ``00``..``23``."""
    hours = pynini.union(*[f"{i:02d}" for i in range(24)])
    res = hours + ":" + TokensFst._build_minutes()
    return res.optimize()

  @staticmethod
  def _build_time12():
    """Return an optimized acceptor for 12-hour times such as ``11:05 PM``.

    Accepted form: hours ``01``..``12``, ``:``, minutes ``00``..``59``, an
    optional single space, then one of ``am``/``AM``/``pm``/``PM``.
    """
    # A 12-hour clock has hours 01..12: "12" must be accepted and "00" must not,
    # which also matches the hours build_convert_time_12_to_24 knows how to map.
    hours = pynini.union(*[f"{i:02d}" for i in range(1, 13)])
    meridiem = pynini.union("am", "AM", "pm", "PM")
    res = hours + ":" + TokensFst._build_minutes() + pynini.union(" ", "") + meridiem
    return res.optimize()

  @staticmethod
  def _build_int():
    """Return an optimized acceptor for one or more decimal digits."""
    return pynini.union(*"0123456789").closure(1).optimize()

  @classmethod
  def build_create_tokens(cls):
    """Return the tokenizer: ``time12``, then ``time24``, then ``int`` rules.

    Time rules fire only after a space or at the beginning of the string and
    before a space, comma, period or the end of the string. The int rule fires
    only when the digit run is not adjacent to another digit.
    """
    time_left = pynini.union(" ", "[BOS]")
    time_right = pynini.union(*" ,.", "[EOS]")

    time12_cdr = pynini.cdrewrite(cls._wrap_token("time12", cls._build_time12()), time_left, time_right, TOKEN_STAR)
    time24_cdr = pynini.cdrewrite(cls._wrap_token("time24", cls._build_time24()), time_left, time_right, TOKEN_STAR)

    # Requiring a non-digit on both sides makes the rule take the whole digit run.
    all_except_digits = ALPHABIT - pynini.union(*"0123456789")
    int_cdr = pynini.cdrewrite(
      cls._wrap_token("int", cls._build_int()),
      all_except_digits | "[BOS]",
      all_except_digits | "[EOS]",
      TOKEN_STAR,
    )
    # time12 runs first so that "11:11 PM" is not claimed by the time24 rule.
    return time12_cdr @ time24_cdr @ int_cdr

  @staticmethod
  def build_token_view():
    """Return a rule that renders token markers as ``<``, ``|`` and ``>``."""
    tokens = pynini.cross(TOKEN_BEGIN, "<") | pynini.cross(TOKEN_SEP, "|") | pynini.cross(TOKEN_END, ">")
    return pynini.cdrewrite(tokens, "", "", pynini.closure(ALPHABIT | TOKEN_BEGIN | TOKEN_SEP | TOKEN_END))

  @staticmethod
  def build_token_rm():
    """Return a rule that replaces every token by its bare value.

    The opening marker, the type and the separator are deleted, the value is
    copied through, and the closing marker is deleted.
    """
    convert_tokens = pynini.cross(TOKEN_BEGIN + pynini.closure(ALPHABIT) + TOKEN_SEP, "") + pynini.closure(ALPHABIT) + pynini.cross(TOKEN_END, "")
    return pynini.cdrewrite(convert_tokens, "", "", TOKEN_STAR)

In [None]:
class AnalysisFst:
  """Builders for FSTs that rewrite the content of existing tokens."""

  @staticmethod
  def build_convert_time_12_to_24():
    """Return a rule that turns every ``time12`` token into a ``time24`` token.

    Hours ``01``..``12`` are remapped (``12`` AM -> ``00``, ``12`` PM -> ``12``,
    other PM hours get 12 added), minutes are copied, and the AM/PM suffix is
    deleted. The suffix may be upper or lower case and may or may not be
    preceded by a single space, matching what the time12 tokenizer accepts.
    """
    convert_AM_hours = pynini.union(*[pynini.cross(f"{i:02d}", f"{i % 12:02d}") for i in range(1, 13)])
    convert_PM_hours = pynini.union(*[pynini.cross(f"{i:02d}", f"{i % 12 + 12:02d}") for i in range(1, 13)])
    any_minutes = pynini.union(*"012345") + pynini.union(*"0123456789")

    # Accept every spelling the tokenizer lets through; handling only " AM"/" PM"
    # would leave tokens such as "11:11pm" tagged time12 but never converted.
    optional_space = pynini.union(" ", "")
    drop_AM = pynini.cross(optional_space + pynini.union("AM", "am"), "")
    drop_PM = pynini.cross(optional_space + pynini.union("PM", "pm"), "")

    convert_time = (convert_AM_hours + ":" + any_minutes + drop_AM) | (convert_PM_hours + ":" + any_minutes + drop_PM)
    convert_token = pynini.cross("time12", "time24") + TOKEN_SEP + convert_time
    return pynini.cdrewrite(TOKEN_BEGIN + convert_token + TOKEN_END, "", "", TOKEN_STAR)

In [None]:
class NormalizeFst:
  """Builders for FSTs that verbalize tokens, retagging them as ``text``."""

  @staticmethod
  def build_normalize_time():
    """Return a rule that verbalizes ``time24`` tokens.

    Only the value ``23:11`` is implemented, as a demonstration.
    """
    convert_time = pynini.cross("23:11", "двадцать три часа одиннадцать минут")
    convert_token = pynini.cross("time24", "text") + TOKEN_SEP + convert_time
    return pynini.cdrewrite(TOKEN_BEGIN + convert_token + TOKEN_END, "", "", TOKEN_STAR)

  @staticmethod
  def build_normalize_int():
    """Return a rule that verbalizes ``int`` tokens.

    Only ``10``, ``11`` and ``12`` are implemented, as a demonstration.
    """
    # To verbalize arbitrary numbers one can use pynini.reverse, which lets the
    # rules "read" the number from its last digit. That makes it easy to tell
    # which place a digit occupies and to convert it accordingly. The rules are
    # therefore written on reversed strings and the result is reversed back.
    words_by_number = {
      "10": "десять",
      "11": "одиннадцать",  # was misspelled "одинадцать" in the emitted text
      "12": "двенадцать",
    }
    reversed_rules = pynini.union(*[
      pynini.cross(number[::-1], pynini.reverse(words))
      for number, words in words_by_number.items()
    ])
    convert_int = pynini.reverse(reversed_rules)
    convert_token = pynini.cross("int", "text") + TOKEN_SEP + convert_int
    return pynini.cdrewrite(TOKEN_BEGIN + convert_token + TOKEN_END, "", "", TOKEN_STAR)

In [None]:
# Compile each stage once.
create_tokens = TokensFst.build_create_tokens()
token_view = TokensFst.build_token_view()
token_rm = TokensFst.build_token_rm()
convert_time = AnalysisFst.build_convert_time_12_to_24()
norm_int = NormalizeFst.build_normalize_int()
norm_time = NormalizeFst.build_normalize_time()

# Shared prefix: tokenize -> 12h to 24h -> verbalize ints -> verbalize times.
tagged = create_tokens @ convert_time @ norm_int @ norm_time
# Full normalizer: same stages, then strip the token markup.
fst = tagged @ token_rm

sample = "В 11:11 PM придет:11 гостей"
# Final normalized text.
print((sample @ fst).string())
# Same result with the tokens kept visible as <type|value>.
print((sample @ tagged @ token_view).string())

В двадцать три часа одиннадцать минут придет:одинадцать гостей
В <text|двадцать три часа одиннадцать минут> придет:<text|одинадцать> гостей
