In [None]:
import gensim
from pprint import pprint

In [None]:
# Toy corpus: three short sentences, kept under their own name so the raw
# strings stay available after tokenization.
raw_sentences = ['He is a boy', 'He is a man', 'She is a girl']

# Lowercase each sentence, trim surrounding whitespace, and split it into word tokens.
sentences = [text.lower().strip().split() for text in raw_sentences]

# Display the tokenized corpus.
sentences

[['he', 'is', 'a', 'boy'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'girl']]

###### Tagging Sentences
###### 하나의 paragraph에 대해서 하나의 sentence를 설정.

In [None]:
# Preview the integer index that enumerate() pairs with each tokenized sentence.
for idx, tokens in enumerate(sentences):
    print(idx, tokens)

0 ['he', 'is', 'a', 'boy']
1 ['he', 'is', 'a', 'man']
2 ['she', 'is', 'a', 'girl']


In [None]:
# Wrap each tokenized sentence in a TaggedDocument whose single tag is the
# sentence's position in `sentences`.
tagged_documents = [
    gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])
    for doc_id, tokens in enumerate(sentences)
]

# Doc2Vec initialisation and training are stochastic. Fix the seed and restrict
# training to one worker thread so a fresh "Restart & Run All" reproduces the
# same vectors. NOTE: gensim's documentation also requires a fixed
# PYTHONHASHSEED for reproducibility across interpreter launches.
doc2vec_model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=1,
    seed=42,
    workers=1,
)

In [None]:
# Echo the model object so the cell output shows its repr.
doc2vec_model

<gensim.models.doc2vec.Doc2Vec at 0x7f1d08a29690>

###### Building word2vec vocab
###### doc2vec은 word2vec에 근거하고 있다.

In [None]:
# Build the model's vocabulary from the tagged documents, ahead of the train() call below.
doc2vec_model.build_vocab(tagged_documents)

In [None]:
# Fit the model on the tagged corpus for 300 passes.
n_documents = len(tagged_documents)
doc2vec_model.train(
    tagged_documents,
    total_examples=n_documents,
    epochs=300,
)

In [None]:
# Infer a vector for a new document.
# infer_vector() is given a list of string tokens, not the raw string.
print("---- Document Vector ----")
new_document = 'he is a man'
# Tokenize exactly like the training sentences (lowercase, trim, split on any
# whitespace run). Splitting on a literal " " would emit empty-string tokens
# for repeated or leading/trailing spaces.
new_doc_tokens = new_document.lower().strip().split()
new_doc_vector = doc2vec_model.infer_vector(new_doc_tokens)
# Show only the first five components to keep the output short.
print(f"Document, '{new_document}' to vector {new_doc_vector[:5]}")

---- Document Vector ----
Document, 'he is a man' to vector [-0.01515365  0.0103257   0.01213598  0.02197052 -0.00239013]


In [None]:
# Word-level similarity through the model's word vectors (`wv`):
# compare the inferred document vector against the word vectors and
# list the closest words with their similarity scores.
print("---- Word similarity ----")
nearest_words = doc2vec_model.wv.similar_by_vector(new_doc_vector)
pprint(nearest_words)

---- Word similarity ----
[('he', 0.2349940985441208),
 ('girl', 0.1688927561044693),
 ('man', 0.09743630886077881),
 ('is', 0.014727441594004631),
 ('a', 0.005051769316196442),
 ('boy', -0.11151957511901855),
 ('she', -0.11516711115837097)]


In [None]:
# Using doc2vec similarity
# Rank every trained document vector by its similarity to the inferred vector.
print('---- Document similarity ----')
# gensim 4 renamed `docvecs` to `dv` and deprecated the old attribute.
# Prefer `dv` when it exists and fall back to `docvecs` on older releases.
doc_vectors = doc2vec_model.dv if hasattr(doc2vec_model, 'dv') else doc2vec_model.docvecs
# topn=len(doc_vectors) asks for a score for every stored document, not just the top few.
doc_sim_list = doc_vectors.most_similar(positive=[new_doc_vector], topn=len(doc_vectors))
for doc_id, sim in doc_sim_list:
    print(f"Document {doc_id} - similarity: {sim:.5f}")

# Document ids refer to the training sentences:
# Document 1:He is a man
# Document 0:He is a boy
# Document 2:She is a girl


---- Document similarity ----
Document 1 - similarity: 0.85703
Document 0 - similarity: 0.72945
Document 2 - similarity: 0.71735
