In [2]:
import os

import gensim

In [12]:
# Show the installed gensim version; later cells use the gensim 4.0.1 API names.
gensim.__version__

'4.0.1'

# Step 1
훈련 데이터 (말뭉치) 준비하기:   
주어진 데이터로 Word2Vec 모델 학습

In [3]:
# Class responsible for feeding the corpus file to the model
class TextIterator(object):
    """Re-iterable stream of whitespace-tokenized lines read from a text file.

    Every call to ``iter()`` reopens the file, so one instance can be iterated
    several times (for example once to build a vocabulary and again for each
    training pass).

    Args:
        fname: Path of the corpus file; each line is treated as one sentence.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding, exactly as before.
    """

    def __init__(self, fname, encoding=None):
        self.fname = fname
        self.encoding = encoding

    def __iter__(self):
        # The context manager guarantees the handle is closed once the
        # generator is exhausted, closed early, or aborted by an exception.
        with open(self.fname, encoding=self.encoding) as corpus:
            for line in corpus:
                # str.split() with no argument splits on any whitespace run
                # and yields [] for blank lines.
                yield line.split()

In [19]:
# Load the training data: wrap the corpus file in a TextIterator
# (lines are read and tokenized only when the object is iterated)
filename = './data/newskor.txt'
sentences = TextIterator(filename)

# Step 2
Word2Vec 모델 훈련시키기

# Step 3
훈련이 끝난 모델을 불러오기

In [5]:
# hyper-parameters
TRAIN = True # mode flag (True: train / False: load a saved model)
SIZE = 300 # dimensionality of the embedding vectors
WINDOW = 5 # context window: relate a word to its 5 surrounding words
SG = 1 # 1: skip-gram / anything else: CBOW
MIN_COUNT = 10 # words occurring fewer times than this are ignored
WORKERS = 20 # threads

In [20]:
# Train a new model or load a previously saved one
def train_word2vec(train, model_path='./save_files/newskor.model', epochs=5):
    """Return a Word2Vec model, either freshly trained or loaded from disk.

    Args:
        train: Truthy to train on the module-level ``sentences`` corpus and
            save the result; falsy to load the model stored at ``model_path``.
        model_path: File the model is saved to / loaded from.
        epochs: Number of passes over the corpus during training.

    Returns:
        The trained or loaded ``gensim.models.Word2Vec`` instance.
    """
    if not train:
        # Load the previously saved model
        return gensim.models.Word2Vec.load(model_path)

    # Create the output directory *before* training: otherwise a missing
    # directory would only surface in model.save(), after the expensive
    # training has finished and before the model is returned.
    save_dir = os.path.dirname(model_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # gensim 4.0.1 keyword names
    model = gensim.models.Word2Vec(
        vector_size=SIZE, window=WINDOW, sg=SG,
        min_count=MIN_COUNT, workers=WORKERS
    )
    # Build the vocabulary (word -> index table) from the corpus
    model.build_vocab(sentences)
    # Start training; total_examples is the number of sentences
    model.train(sentences, total_examples=model.corpus_count, epochs=epochs)
    # Persist the trained model
    model.save(model_path)
    return model

In [21]:
# Train (TRAIN truthy) or load (TRAIN falsy) the model used by the following cells
model = train_word2vec(TRAIN)

In [60]:
# model.corpus_count

430720

In [22]:
# Inspect the type of the object that holds the word vectors
print(type(model.wv))

<class 'gensim.models.keyedvectors.KeyedVectors'>


In [23]:
# show the vocabulary
# print(model.wv.vocab)  # older spelling, kept for reference
# NOTE(review): this prints every vocabulary entry; slicing (e.g. [:100]) would keep the output short
print(model.wv.index_to_key)

['하', '이', '.', '는', '을', 'ㄴ', '다', '의', '에', '를', '은', '어', '있', '고', '으로', '가', '였', 'ㄹ', '되', ',', '에서', '었', ')', '(', '로', '것', '도', '등', '과', '들', '지', '와', '여', '일', '기', '·', 'ㄴ다', '적', '수', '아', '%', '게', '원', '년', '2', '았', '3', '1', '다고', '“', '”', '월', '위하', '대하', '말하', '시장', '면', '업체', '따르', '하고', '않', '만', '까지', '‘', '’', '밝히', '명', '및', '부터', '다가', '미국', '며', '이라고', '4', '개', '대', '나', '오', '대표', '국내', '한국', '5', '다는', '던', '서비스', '습니다', '개발', '계획', '인', '주', '통하', '제품', '없', '또', '화', '정부', '면서', '최근', '한', '으며', '6', '지난해', '중', '그', '사업', '올해', '중국', '보이', '크', '받', '일본', '이번', '10', '보다', '내', '기업', '경우', '보', '대통령', '같', '-', '관련', '기술', '에게', '지만', '때문', '전', '전망', 'LG', '라고', '7', '달러', '현재', '예정', '지나', '어서', 'ㄴ다는', '특히', '이상', '8', '북한', '사', '시스템', '문제', '김', '삼성전자', '겠', '늘', '간', '나서', '관계자', '잇', '9', '아니', '라는', '용', '인터넷', '높', '라', '내년', '솔루션', '위', '데', '회사', '시키', '장비', 'ㅁ', '씨', '업계', '이날', '때', '다면서', '두', '세계', '도록', '예상', '추진', '게임', 'PC', '그러나', '말

In [24]:
# Look up the learned embedding of one word and report its dimensionality
word = '버스'
embedding = model.wv[word]
print(embedding)
print('size of vector: ', len(embedding))

[-1.035671    0.47437614  0.17578264  0.21114135  0.28532004 -0.21645187
 -0.11969339 -0.25866488  0.05896854 -0.04923951  0.3597209   0.38133186
 -0.00812934  0.21491832  0.14384598 -0.01674394 -0.11356556 -0.14080124
 -0.02060484 -0.08545088 -0.16076039 -0.21722218 -0.10472752 -0.27132094
  0.33943757 -0.46941307 -0.31572875  0.42769772 -0.12540789 -0.15096343
 -0.04650512 -0.06047432 -0.19878888  0.5069182   0.10734709 -0.13395289
 -0.00288289 -0.18555228  0.05162125  0.38320273  0.28170422  0.13233086
  0.13138261 -0.2352454   0.3651275  -0.01976934 -0.25788006 -0.01114927
  0.24317372 -0.16166775  0.36819476  0.35691395  0.11084783  0.39397803
  0.18633391  0.31877163 -0.10156225  0.2019092   0.01135934  0.16301599
  0.08479711  0.00684411 -0.17012142  0.1333794  -0.12080101  0.18379413
  0.05114222 -0.10004972 -0.66074413 -0.04568807 -0.1504878  -0.00674057
 -0.09479905  0.04554502  0.16238856  0.4021653  -0.5235755  -0.24702998
 -0.08035028 -0.13925883  0.17153572 -0.13866934  0

# Step 4
두 단어 간 유사도 출력하기

In [45]:
# Example inputs: word1 = '한국', word2 = '북한'
print("Caculate the similarity between word1 and word2")
word1 = input("word1: ")
word2 = input("word2: ")

# Vocabulary lookup used for the membership checks below
# (gensim 4.0.1 spelling; 3.8.0 used model.wv.vocab.keys())
vocab = model.wv.key_to_index

# Report every query word that is not in the vocabulary
unknown_words = [w for w in (word1, word2) if w not in vocab]
for unknown in unknown_words:
    print('The word ' + unknown + ' is not in the vocabulary')

# Only compute the score when both words are known
no_problem = not unknown_words
if no_problem:
    # Cosine similarity, i.e. based on the dot product of the two vectors
    similarity = model.wv.similarity(word1, word2)
    print('The similarity between ' + word1 + ' and ' + word2 + ' : ', similarity)

Caculate the similarity between word1 and word2
word1: 한국
word2: 북한
The similarity between 한국 and 북한 :  0.29495502


# Step 5
주어진 단어들 중 그 평균값에서 가장 먼 한 단어 출력하기

In [52]:
# Example input: '소프트웨어 네트워크 프로그램 가방'
print("Find one mismatched word among the given words")
text = input("Text(Words): ")
words = text.split()

# Vocabulary lookup used for the membership checks below
# (gensim 4.0.1 spelling; 3.8.0 used model.wv.vocab.keys())
vocab = model.wv.key_to_index

# Report every unknown word (once each, in input order) instead of stopping
# at the first one, matching the behaviour of the other query cells.
missing_words = list(dict.fromkeys(w for w in words if w not in vocab))
for missing in missing_words:
    print('The word ' + missing + ' is not in the vocabulary')

# doesnt_match() raises ValueError on an empty list, so blank input must be
# rejected here as well.
no_problem = bool(words) and not missing_words

if not words:
    print('No words were given')
elif no_problem:
    # The odd one out is chosen from dot-product based similarities
    mismatched = model.wv.doesnt_match(words)
    print('the mismatch word among ' + text + ' is', mismatched)

Find one mismatched word among the given words
Text(Words): 아버지 어머니 형 누나 한국
the mismatch word among 아버지 어머니 형 누나 한국 is 형


# Step 6
어떤 단어들과 가장 가까운, 그러면서 동시에 다른 어떤 단어들로부터는 멀리 떨어진 단어 N개 출력하기

In [59]:
print("Find the top-N words that are most similar to the given ‘positive’ words and are most different from the given ‘negative’ words")
word = input("Word: ")

# Vocabulary lookup used for the membership check below
# (gensim 4.0.1 spelling; 3.8.0 used model.wv.vocab.keys())
vocab = model.wv.key_to_index

# The flag records whether the query word is known to the model
no_problem = word in vocab
if no_problem:
    # Nearest neighbours of the single query word
    print(model.wv.most_similar(positive=[word]))
else:
    print('The word ' + word + ' is not in the vocabulary')

Find the top-N words that are most similar to the given ‘positive’ words and are most different from the given ‘negative’ words
Word: 한국
[('우리나라', 0.5445699095726013), ('BMC', 0.49306520819664), ('SGI', 0.49253514409065247), ('국내', 0.48649993538856506), ('KPCA', 0.480002760887146), ('휴렛팩커드', 0.4716898202896118), ('정영수', 0.4686005413532257), ('ICA', 0.4683288037776947), ('KISTI', 0.46359169483184814), ('IBM', 0.45915600657463074)]


In [70]:
# Example inputs: word_a = '한국', word_b = '아시아', word_c = '유럽'
print('Find the top-N words that are most similar to the given ‘positive’ words and are most different from the given ‘negative’ words w.r.t the result of [ a - b + c ]')
word_a = input("a: ")
word_b = input("b: ")
word_c = input("c: ")

# Vocabulary lookup used for the membership checks below
# (gensim 4.0.1 spelling; 3.8.0 used model.wv.vocab.keys())
vocab = model.wv.key_to_index

# One loop replaces the three copy-pasted checks; every unknown word is
# still reported, in a, b, c order.
unknown_words = [w for w in (word_a, word_b, word_c) if w not in vocab]
for unknown in unknown_words:
    print('the word ' + unknown + ' is not in the vocabulary')

no_problem = not unknown_words
if no_problem:
    # a - b + c: a and c pull the result towards them, b pushes it away
    mostsimilar = model.wv.most_similar(positive=[word_a, word_c], negative=[word_b], topn=5)
    # Show up to three candidates; slicing avoids an IndexError when the
    # model returns fewer than three neighbours.
    top_words = [candidate for candidate, _score in mostsimilar[:3]]
    print('most similar word of ' + word_a + ' - ' + word_b + ' + ' + word_c + ' is', *top_words)

Find the top-N words that are most similar to the given ‘positive’ words and are most different from the given ‘negative’ words w.r.t the result of [ a - b + c ]
a: 한국
b: 아시아
c: 유럽
most similar word of 한국 - 아시아 + 유럽 is 유럽시장 국내 우리나라
