In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
from nltk.tokenize import word_tokenize

In [6]:
sentences = ["I love machine learning. Its awesome.",
        "I love coding in python",
        "I love building chatbots",
        "they chat amagingly well"]

In [7]:
tagged_data = []

for i, d in enumerate(sentences):
    tagged_data.append(TaggedDocument(words=word_tokenize(d.lower()), tags=[str(i)]))

In [8]:
tagged_data

[TaggedDocument(words=['i', 'love', 'machine', 'learning', '.', 'its', 'awesome', '.'], tags=['0']),
 TaggedDocument(words=['i', 'love', 'coding', 'in', 'python'], tags=['1']),
 TaggedDocument(words=['i', 'love', 'building', 'chatbots'], tags=['2']),
 TaggedDocument(words=['they', 'chat', 'amagingly', 'well'], tags=['3'])]

### Training

In [9]:
max_epochs = 100 # 학습 100번
vec_size = 20 # number of vector dimension (벡터 차원 개수)
alpha = 0.025 # 초기 learning rate

In [10]:
model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1) 
# dm = 1 (PV-DM, distributed memory, 0이면, PV-DBOW, distributed bag of words)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print(f"iteration {epoch}")
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    # learning rate 감소
    model.alpha -= 0.0002 # 0.0002씩 감소 시키면서 학습
    # learning rate 고정 
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model.saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

#### test_data를 토큰화하여 vector를 만드는 과정

In [11]:
model = Doc2Vec.load("d2v.model")

test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)

print("V1_infer", v1)

V1_infer [-0.02625987 -0.00292833  0.02067595 -0.00482254  0.0167065  -0.02450037
  0.00549205  0.00129894 -0.02022499  0.0021243  -0.01183919  0.01597496
  0.00086636 -0.01064102 -0.02188106 -0.0089807  -0.00907736  0.0061656
  0.0121739  -0.00782044]


In [22]:
# 0번 문장 : I love machine learning. Its awesome.
similar_doc = model.docvecs.most_similar('0')

for index, similarity in similar_doc:
    index = int(index)
    print(f"{sentences[index]} --> {similarity}")

I love building chatbots --> 0.912226676940918
I love machine learning. Its awesome. --> 0.7618046998977661
they chat amagingly well --> 0.7178292274475098


  similar_doc = model.docvecs.most_similar('1')


In [26]:
# 1번 문장 : I love coding in python
similar_doc = model.docvecs.most_similar('1')

for index, similarity in similar_doc:
    index = int(index)
    print(f"{sentences[index]} --> {similarity}")

I love building chatbots --> 0.912226676940918
I love machine learning. Its awesome. --> 0.7618046998977661
they chat amagingly well --> 0.7178292274475098


  similar_doc = model.docvecs.most_similar('1')


In [24]:
# 2번 문장 : I love building chatbots
similar_doc = model.docvecs.most_similar('2')

for index, similarity in similar_doc:
    index = int(index)
    print(f"{sentences[index]} --> {similarity}")

I love coding in python --> 0.9122265577316284
they chat amagingly well --> 0.7344692349433899
I love machine learning. Its awesome. --> 0.7150116562843323


  similar_doc = model.docvecs.most_similar('2')


In [25]:
# 3번 문장 : they chat amagingly well
similar_doc = model.docvecs.most_similar('3')

for index, similarity in similar_doc:
    index = int(index)
    print(f"{sentences[index]} --> {similarity}")

I love building chatbots --> 0.7344693541526794
I love coding in python --> 0.7178292870521545
I love machine learning. Its awesome. --> 0.46119457483291626


  similar_doc = model.docvecs.most_similar('3')
