# 한잔 FastText 테스트

In [1]:
!pip install soynlp
!pip install FastText
from tqdm import tqdm


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting soynlp
  Downloading soynlp-0.0.493-py3-none-any.whl (416 kB)
[K     |████████████████████████████████| 416 kB 23.7 MB/s 
Installing collected packages: soynlp
Successfully installed soynlp-0.0.493
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting FastText
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 6.2 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.1-py3-none-any.whl (216 kB)
Building wheels for collected packages: FastText
  Building wheel for FastText (setup.py) ... [?25l[?25hdone
  Created wheel for FastText: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3155764 sha256=5c852347d3a056797d3c52ea4bfd00408dec9b9a293d50eb4c28760c6af7bff9
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345ba

## 자소 분리

In [39]:
def process_jamo(tokenized_corpus_fname, output_fname):
    toatal_lines = sum(1 for line in open(tokenized_corpus_fname, 'r', encoding='utf-8-sig'))

    with open(tokenized_corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:

        for _, line in tqdm(enumerate(f1), total=toatal_lines):
            sentence = line.replace('\n', '').strip()
            processed_sentence = jamo_sentence(sentence)
            f2.writelines(processed_sentence + '\n')


In [40]:
tokenized_corpus_fname = './drink_list.txt'
output_fname = './drink_output.txt'
process_jamo(tokenized_corpus_fname, output_fname)

100%|██████████| 6/6 [00:00<00:00, 2819.07it/s]


## 자모 문장 분석

In [41]:
import re
from soynlp.hangle import compose, decompose, character_is_korean

doublespace_pattern = re.compile('\s+')


def jamo_sentence(sent):
    def transform(char):
        if char == ' ':
            return char
        cjj = decompose(char)
        if len(cjj) == 1:
            return cjj
        cjj_ = ''.join(c if c != ' ' else '-' for c in cjj)
        return cjj_

    sent_ = []
    for char in sent:
        if character_is_korean(char):
            sent_.append(transform(char))
        else:
            sent_.append(char)
    sent_ = doublespace_pattern.sub(' ', ''.join(sent_))
    return sent_

def jamo_to_word(jamo):
    jamo_list, idx = [], 0
    while idx < len(jamo):
        if not character_is_korean(jamo[idx]):
            jamo_list.append(jamo[idx])
            idx += 1
        else:
            jamo_list.append(jamo[idx:idx + 3])
            idx += 3
    word = ""
    for jamo_char in jamo_list:
        if len(jamo_char) == 1:
            word += jamo_char
        elif jamo_char[2] == "-":
            word += compose(jamo_char[0], jamo_char[1], " ")
        else:
            word += compose(jamo_char[0], jamo_char[1], jamo_char[2])
    return word

  ## fasttext 스킵그램 모델

In [45]:
from gensim.models import FastText
from tqdm import tqdm
import logging

corpus_fname = './drink_output.txt'
model_fname = './fasttext'

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

print('corpus 생성')
corpus = [sent.strip().split(" ") for sent in tqdm(open(corpus_fname, 'r', encoding='utf-8-sig').readlines())]
print("학습 중")
model = FastText(size=100, workers=4, sg=1, iter=2, word_ngrams=5)
model.build_vocab(corpus_file=corpus_fname)
model.train(total_examples=len(corpus), epoch=10)

model.save(model_fname)

print(f"학습 소요 시간 : {model.total_train_time}")
# https://projector.tensorflow.org/ 에서 시각화 하기 위해 따로 저장
model.wv.save_word2vec_format(model_fname + "_vis")
print('완료')

corpus 생성


100%|██████████| 6/6 [00:00<00:00, 44462.59it/s]


학습 중


RuntimeError: ignored

## 유사도 분석

In [7]:
from gensim.models import FastText

def transform(list):
    return [(jamo_to_word(w), r) for (w, r) in list]


# 모델을 로딩하여 가장 유사한 단어를 출력
loaded_model = FastText.load('./fasttext')
print(loaded_model.wv.vectors.shape)

print(transform(loaded_model.wv.most_similar(jamo_sentence('처음처럼'), topn=5)))
print(transform(loaded_model.wv.most_similar(jamo_sentence('예전처럼'), topn=5)))
print(transform(loaded_model.wv.most_similar(jamo_sentence('말펙'), topn=5)))

(0, 100)


TypeError: ignored