In [3]:
%pip install sudachipy sudachidict_core

Defaulting to user installation because normal site-packages is not writeable
Collecting sudachipy
  Downloading SudachiPy-0.6.9-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting sudachidict_core
  Downloading SudachiDict_core-20241021-py3-none-any.whl.metadata (2.5 kB)
Downloading SudachiPy-0.6.9-cp311-cp311-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.4 MB 1.9 MB/s eta 0:00:01
   --------- ------------------------------ 0.3/1.4 MB 3.9 MB/s eta 0:00:01
   ------------------ --------------------- 0.6/1.4 MB 5.0 MB/s eta 0:00:01
   ---------------------------- ----------- 1.0/1.4 MB 5.5 MB/s eta 0:00:01
   ------------------------------------- -- 1.3/1.4 MB 5.8 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 5.8 MB/s eta 0:00:00
Downloading SudachiDict_core-20241021-py3-none-any.whl (72.1 MB)
   ---------------------------------------- 0.0/72.1 MB ? eta -:--:--
  



In [1]:
import sudachipy
import numpy as np


In [2]:
text_file_n = 126
id_word_list = []
word_id_dict = {}

name_list = []
name_list2 = []
# <NULL>は遷移単語候補の終わりや、前の単語が存在しない場合の仮の単語を表す(「<NULL><NULL><SOS>今日……」のように)
# <SOS>は文章の始まりを表す
# <EOS>は文章の終わりを表す
markov_order = 2

word_transition_dict = {}

sudachi_dict = sudachipy.Dictionary()
tokenizer = sudachi_dict.create()


In [35]:
def read_corpus_text(corpus_text: str, i):
    if corpus_text == '\n':
        return
    
    colon_index = corpus_text.find('：')

    name = corpus_text[:colon_index]
    save_name(name)

    main_text = corpus_text[colon_index + 1:]
    read_main_text(main_text)

def save_name(name: str):
    if name[:3] in ['勝利時']:
        name = name[3:]

    if not name in name_list:
        name_list.append(name)

def read_main_text(main_text: str):
    morpheme_list = tokenizer.tokenize(main_text)
    
    recent_word_list = []   
    recent_word_list = reset_recent_word_list()

    for morpheme in morpheme_list:
        word = morpheme.surface()

        save_word(word)
        save_word_transition(word, recent_word_list)

        recent_word_list = update_recent_word_list(word, recent_word_list)
    
    word = '<EOS>'
    save_word(word)
    save_word_transition(word, recent_word_list)

def reset_recent_word_list():
    recent_word_list = [0 for i in range(markov_order - 1)]
    recent_word_list.append(word_id_dict['<SOS>'])

    return recent_word_list

def update_recent_word_list(word, recent_word_list):
    recent_word_list[:-1] = recent_word_list[1:]
    recent_word_list[-1] = word_id_dict[word]

    return recent_word_list

def save_word_transition(word, recent_word_list):
    index = tuple(recent_word_list)

    if not index in word_transition_dict.keys():
        word_transition_dict[index] = []
    
    word_id = word_id_dict[word]

    if not word_id in word_transition_dict[index]:
        word_transition_dict[index].append(word_id)

def save_word(word):
    if not word in id_word_list:
        word_id_dict[word] = len(id_word_list)
        id_word_list.append(word)


In [36]:
def write_word_transition_dict():
    text_list = []

    for (index, word_id_list) in word_transition_dict.items():
        text = ','.join((str(i) for i in (*index, *word_id_list))) + '\n'
        text_list.append(text)
    
    with open('./word_transition_dict.csv', 'w') as f:
        f.writelines(text_list)

def write_word_list():
    text_list = []

    for (word, id) in word_id_dict.items():
        text = ','.join((str(id), word)) + '\n'
        text_list.append(text)
    
    with open('./word_list.csv', 'w') as f:
        f.writelines(text_list)


In [25]:
for word in ['<NULL>', '<SOS>', '<EOS>']:
    save_word(word)



In [37]:
text_file_n = 12

for i in range(text_file_n):
    with open(f'./dialogues/dialogue_{i}.txt') as f:
        corpus_text_list = f.readlines()
        for corpus_text in corpus_text_list:
            read_corpus_text(corpus_text, i)



In [38]:
write_word_list()
write_word_transition_dict()


In [39]:
word_transition_dict

{(): [],
 (0, 1): [3,
  9,
  33,
  35,
  47,
  51,
  54,
  58,
  55,
  63,
  69,
  20,
  77,
  81,
  85,
  89,
  99,
  104,
  61,
  111,
  60,
  116,
  128,
  129,
  132,
  144,
  147,
  150,
  156,
  158,
  162,
  165,
  170,
  173,
  108,
  182,
  184,
  185,
  188,
  191,
  192,
  194,
  201,
  203,
  43,
  213,
  25,
  10,
  196,
  229,
  83,
  27,
  197,
  238,
  244,
  171,
  224,
  252,
  256,
  34,
  260,
  263,
  265,
  217,
  287,
  288,
  290,
  297,
  300,
  303,
  304,
  153,
  313,
  314,
  315,
  318,
  21,
  321,
  323,
  324,
  327,
  331,
  334,
  337,
  343,
  344,
  101,
  283,
  352,
  302,
  366,
  367,
  239,
  56,
  86,
  372,
  374,
  305,
  381,
  382,
  384,
  389,
  393,
  396,
  397,
  14,
  399,
  93,
  76,
  409,
  411,
  417,
  423,
  72,
  143,
  436,
  359,
  438,
  447,
  449,
  355,
  455,
  459,
  460,
  462,
  154,
  88,
  119,
  433,
  486,
  361,
  326,
  491,
  498,
  499,
  4,
  505,
  506,
  508,
  230,
  518,
  519,
  441,
  526,
  218,
  534