In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import operator
import glove
from scipy.spatial.distance import cosine
import numpy as np
import konlpy

In [2]:
twitter=konlpy.tag.Twitter()

In [3]:
class GloVe():
    def __init__(self,d=300,alpha=0.75,x_max=300.0,epoch = 250):
        self.vectorizer = CountVectorizer(tokenizer=twitter.morphs)
        self.d=d
        self.alpha=alpha
        self.x_max=x_max
        self.epoch = epoch
    def Make_glove(self, sentence):
        X = self.vectorizer.fit_transform(sentence)
        Xc = X.T * X
        Xc.setdiag(0)
        result = Xc.toarray()
        self.dic = {}
        for idx1, word1 in enumerate(result):
            tmpdic = {}
            for idx2, word2 in enumerate(word1):
                if word2 > 0:
                    tmpdic[idx2] = word2
            self.dic[idx1] = tmpdic
        self.vocab = sorted(self.vectorizer.vocabulary_.items(), key=operator.itemgetter(1))
        self.vocab = [word[0] for word in self.vocab]
        model = glove.Glove(self.dic, d=self.d, alpha=self.alpha, x_max=self.x_max)
        
        for epoch in range(self.epoch):
            err = model.train(batch_size=200, workers=4)
            print("epoch %d, error %.9f" % (epoch, err), flush=True)
        self.wordvectors = model.W
        return self.wordvectors, self.vocab, self.dic
    
    def most_similar(self,word, vocab, vecs, topn=10):
        query = vecs[vocab.index(word)]
        result = []
        for idx, vec in enumerate(vecs):
            if idx is not vocab.index(word):
                result.append((vocab[idx],1-cosine(query,vec)))
        result = sorted(result,key=lambda x: x[1],reverse=True)
        return result[:topn]
    
    def Save_glove(self,glove,vocab,file_name='./glove.npz'):
        np.savez(file_name,E = glove,row=vocab)

In [4]:
glove_book = GloVe()

In [5]:
import json
with open("book_list.txt", 'r') as f:
    book_list = [line.rstrip('\n') for line in f]
print(book_list)

with open('book_json.txt', 'r') as f:
    book_json=(json.load(f))

['꿀벌 마야의 모험(한글).txt', '눈의 여왕(한글).txt', '로빈훗의 모험(한글).txt', '미운오리새끼(한글).txt', '백조의 호수(한글).txt', '빨간머리 앤(한글).txt', '성냥팔이 소녀(한글).txt', '소공녀(한글).txt', '소공자(한글).txt', '알라딘과 요술램프(한글).txt', '알리바바와 40인의 도적(한글).txt', '엄마 찾아 삼만리(한글).txt', '엄지공주(한글).txt', '임금님 귀는 당나귀(한글).txt', '장화신은 고양이(한글).txt', '잭과 콩나무(한글).txt', '행복한 왕자(한글).txt']


In [62]:
glove_data,vocab,_ = glove_book.Make_glove(book_json[book_list[0]])

epoch 0, error 0.000183952
epoch 1, error 0.000178112
epoch 2, error 0.000172815
epoch 3, error 0.000167976
epoch 4, error 0.000163537
epoch 5, error 0.000159448
epoch 6, error 0.000155663
epoch 7, error 0.000152143
epoch 8, error 0.000148858
epoch 9, error 0.000145787
epoch 10, error 0.000142895
epoch 11, error 0.000140185
epoch 12, error 0.000137631
epoch 13, error 0.000135219
epoch 14, error 0.000132947
epoch 15, error 0.000130784
epoch 16, error 0.000128743
epoch 17, error 0.000126803
epoch 18, error 0.000124966
epoch 19, error 0.000123216
epoch 20, error 0.000121554
epoch 21, error 0.000119974
epoch 22, error 0.000118471
epoch 23, error 0.000117033
epoch 24, error 0.000115661
epoch 25, error 0.000114360
epoch 26, error 0.000113112
epoch 27, error 0.000111920
epoch 28, error 0.000110783
epoch 29, error 0.000109698
epoch 30, error 0.000108657
epoch 31, error 0.000107662
epoch 32, error 0.000106707
epoch 33, error 0.000105803
epoch 34, error 0.000104928
epoch 35, error 0.000104083
ep

In [63]:
glove_book.most_similar(word='언니', vocab=vocab,vecs=glove_data, topn=5)

[('언니', 1.0000000000000002),
 ('혼자', 0.23981371894629044),
 ('꿀을', 0.21280043921795944),
 ('사실을', 0.17687381328653218),
 ('들었어요', 0.16803423017839503)]

In [64]:
glove_book.Save_glove(glove_data,vocab)