<a href="https://colab.research.google.com/github/KRiver28/TIL/blob/master/5_1_doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# https://radimrehurek.com/gensim/models/doc2vec.html
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument



In [2]:
common_texts


[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [3]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
documents



[TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]),
 TaggedDocument(words=['survey', 'user', 'computer', 'system', 'response', 'time'], tags=[1]),
 TaggedDocument(words=['eps', 'user', 'interface', 'system'], tags=[2]),
 TaggedDocument(words=['system', 'human', 'system', 'eps'], tags=[3]),
 TaggedDocument(words=['user', 'response', 'time'], tags=[4]),
 TaggedDocument(words=['trees'], tags=[5]),
 TaggedDocument(words=['graph', 'trees'], tags=[6]),
 TaggedDocument(words=['graph', 'minors', 'trees'], tags=[7]),
 TaggedDocument(words=['graph', 'minors', 'survey'], tags=[8])]

In [4]:
model = Doc2Vec(documents, vector_size=5, window=3, min_count=1)



In [5]:
word2idx = {w:obj.index for w, obj in model.wv.vocab.items()}
idx2word = {v:k for k, v in word2idx.items()}

print("사전 크기 =", len(word2idx))
print(word2idx)

사전 크기 = 12
{'human': 4, 'interface': 5, 'computer': 6, 'survey': 7, 'user': 1, 'system': 0, 'response': 8, 'time': 9, 'eps': 10, 'trees': 2, 'graph': 3, 'minors': 11}


In [6]:
# The number of trained document tags is available from:
len(model.docvecs)

9

In [7]:
model.docvecs[[8][0]]

array([-0.01538636,  0.00664686,  0.02287913, -0.01051788,  0.04889192],
      dtype=float32)

In [8]:
for text, tags in documents:
    print(model.docvecs[tags[0]])


[ 0.05088975  0.03661568  0.07556903  0.08941567 -0.02334388]
[-0.07182071  0.02274394  0.00966447 -0.0008566   0.07141146]
[ 0.00430666  0.09910301  0.05073719 -0.08839872 -0.09612329]
[-0.01957331 -0.0653591   0.05502439  0.00275736  0.07872102]
[ 0.00991887 -0.0253971   0.00652011  0.02257582  0.07185292]
[-0.08665517  0.07207931 -0.03090863  0.01776069  0.03163863]
[ 0.0807455  -0.02895734  0.01609362 -0.09247477 -0.02901707]
[-0.06008804  0.04940773 -0.09253345  0.00667203 -0.0008773 ]
[-0.01538636  0.00664686  0.02287913 -0.01051788  0.04889192]


In [9]:
new_doc = model.infer_vector(["system", "response"])
new_doc

array([-0.08220916, -0.05461131, -0.05633409, -0.07846064, -0.07803524],
      dtype=float32)

In [10]:
model.docvecs.most_similar([new_doc], topn=3)

new_doc = model.infer_vector(['survey', 'user', 'computer', 'system', 'response', 'time'])
model.docvecs.most_similar([new_doc], topn=3)

[(0, 0.5894572138786316), (3, 0.43782782554626465), (6, 0.3842448592185974)]