In [1]:
import sys
import gensim
import random
from gensim.models.doc2vec import Doc2Vec, FAST_VERSION, TaggedDocument
import numpy as np
import re
from scipy.spatial.distance import cdist


'''
    Doc2Vec documentation: https://radimrehurek.com/gensim/models/doc2vec.html
    Doc2Vec blog post about installing a fast C compiler to make this soooooo much faster: http://rare-technologies.com/word2vec-in-python-part-two-optimizing/
        - This was a bit of a headache for me, so let me know if there are problems.
        - With these toy examples the speed is not a factor however, so it probably can be disregarded.
'''
# Regex that strips trailing punctuation from a whitespace-delimited token.
# Every character is listed explicitly and the hyphen is escaped: written as
# `\.-_` inside a character class it forms a *range* from '.' to '_', which
# also swallows trailing digits (e.g. "route66" -> "route") and never matches
# the hyphen itself.
patt = r'[,.\-_?!;:]+$'

# Stop right away unless gensim reports a usable optimized (Cython) backend;
# FAST_VERSION is the same value imported from gensim.models.doc2vec above.
assert FAST_VERSION > -1, "this will be painfully slow otherwise"

In [2]:

# Toy training corpus: one sentence per document.
input_docs = [
    "Hey there, this is a test of the national broadcast system.",
    "Dogs are very cute animals.",
    "Didn't you get a super cute pet the other day?",
    "This is a nightmare, the world is on fire.",
    "How many times do I have to tell you?",
]

# Sentences that are not part of the training corpus, used later as queries.
test_docs = [
    "My bird makes for a great pet.",
]


In [3]:
# Wrap each training sentence in a TaggedDocument, the data structure the model
# takes.  Words are the lowercased, whitespace-split tokens with `patt` removed
# from their end; the single tag is the sentence's position in input_docs,
# rendered as a string.
labeled_docs = [
    TaggedDocument(
        words=[re.sub(patt, '', token) for token in sentence.lower().split()],
        tags=[str(doc_id)],
    )
    for doc_id, sentence in enumerate(input_docs)
]

In [5]:
# Build the model from the corpus in its original order.
model = Doc2Vec(labeled_docs, size=100, window=10, min_count=1, workers=4) #Plenty more parameters to mess with.

# Run ten additional passes, presenting the documents in a fresh random order
# each time.  The shuffle is applied to a copy: shuffling labeled_docs itself
# would leave the list reordered, so re-running this cell would hand the
# constructor above a differently ordered corpus than the one built earlier.
# NOTE(review): newer gensim releases expect an explicit `epochs` argument to
# train(); confirm against the installed version before upgrading.
epoch_docs = list(labeled_docs)
for _ in range(10):
    random.shuffle(epoch_docs)
    model.train(epoch_docs, total_examples=len(epoch_docs))

In [7]:
#See how the word similarity stuff works: compare 'cute' against two other words
for other_word in ('pet', 'fire'):
    print(model.similarity('cute', other_word))

0.449660611432
-0.823191990483


In [11]:
#See which sentences in the input docs are closest to the query

# Row i holds the learned vector of input_docs[i].  Vectors are fetched by tag
# (the str(i) given to each TaggedDocument) instead of by integer position, so
# the rows stay aligned with input_docs no matter in which order the model
# first encountered the documents.  Stacking the vectors also removes the need
# to repeat the model's vector size here.
docs_mat = np.array([model.docvecs[str(ix_)] for ix_ in range(len(input_docs))])

for doc in test_docs:
    # Tokenize the query the same way the training sentences were tokenized.
    vec = model.infer_vector([re.sub(patt, '', x) for x in doc.lower().split()])
    # Cosine distance between the query and every doc in input_docs
    # (result has shape (1, len(input_docs)); smaller means closer).
    res = cdist(np.reshape(vec, (1, vec.size)), docs_mat, 'cosine')
    sorted_ix = np.argsort(res[0])

    sorted_res = res[0, sorted_ix]
    print("query: %s" % doc)
    # Show the three nearest documents, or fewer if the corpus is smaller.
    for i in range(min(3, len(sorted_ix))):
        print("#%d) %s" %(i, input_docs[sorted_ix[i]]))
        print("\tscore:%f" %(sorted_res[i]))

query: My bird makes for a great pet.
#0) Hey there, this is a test of the national broadcast system.
	score:0.192573
#1) Didn't you get a super cute pet the other day?
	score:0.524513
#2) How many times do I have to tell you?
	score:0.588661
