In [4]:
import pickle5 as pickle
import random
import numpy as np

# Fixed seed for Python's RNG so the shuffle below -- and the train/test split
# that is sliced from the shuffled list -- is identical on every
# Restart & Run All.
SEED = 42
random.seed(SEED)

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file produced by a trusted source.
with open('../tokens', 'rb') as pickle_file:
    tokenized_sent = pickle.load(pickle_file)

# In-place shuffle; deterministic because of the seed above.
random.shuffle(tokenized_sent)

def cosine(u, v):
    """Return the cosine similarity between two vectors.

    Args:
        u: Array-like of numbers (intended for 1-D vectors).
        v: Array-like of numbers with the same length as ``u``.

    Returns:
        ``dot(u, v) / (norm(u) * norm(v))``. When either vector has zero
        norm the cosine is undefined; ``0.0`` is returned instead of
        dividing by zero (which would yield nan and a RuntimeWarning).
    """
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        # At least one vector is all zeros: no direction to compare.
        return 0.0
    return np.dot(u, v) / denominator

# Number of (already shuffled) records held out for testing. Kept in one
# constant so the two slices below cannot drift apart.
TEST_SIZE = 10000

# The first TEST_SIZE records form the test set; everything after is training data.
test_tokens = tokenized_sent[:TEST_SIZE]
train_tokens = tokenized_sent[TEST_SIZE:]

In [5]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Training corpus: one TaggedDocument per record, built from element 2 of the
# record and tagged with the record's position in train_tokens.
tagged_train_data = [
    TaggedDocument(record[2], [doc_tag])
    for doc_tag, record in enumerate(train_tokens)
]
# Held-out corpus: only element 2 of each record is kept (no tags).
tagged_test_data = [record[2] for record in test_tokens]

In [6]:
## Train doc2vec model
# Parameters passed explicitly:
#   vector_size = Dimensionality of the feature vectors.
#   min_count   = Ignores all words with total frequency lower than this.
#   epochs      = Number of training passes over the corpus.
# Parameters left at the gensim defaults (not passed here):
#   window      = The maximum distance between the current and predicted word within a sentence.
#   alpha       = The initial learning rate.
# (These notes are `#` comments rather than a bare string literal so the cell
# does not echo the text back as its output.)
model = Doc2Vec(tagged_train_data, vector_size = 50, min_count = 2, epochs = 30)

In [7]:
# Sanity check: embed the first held-out document with the trained model and
# display the stored document vectors closest to it.
test_doc = tagged_test_data[0]
test_doc_vector = model.infer_vector(test_doc)
model.dv.most_similar([test_doc_vector])

[(19637, 0.6353837251663208),
 (30930, 0.6341956853866577),
 (5478, 0.6312306523323059),
 (19140, 0.6294926404953003),
 (39313, 0.6060676574707031),
 (29072, 0.6038985848426819),
 (45961, 0.6007195115089417),
 (21924, 0.5802558064460754),
 (16527, 0.5774394869804382),
 (33303, 0.571479082107544)]

In [8]:
# Draw one held-out document at random and embed it with the trained model.
doc_id = random.randrange(len(tagged_test_data))
query_words = tagged_test_data[doc_id]
inferred_vector = model.infer_vector(query_words)
# topn=len(model.dv) requests one (tag, similarity) entry per stored document vector.
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Print the query text, then the first, middle and last entries of the ranking
# together with the words of the matching training document.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(query_words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
ranks_to_show = (('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1))
for rank_label, rank in ranks_to_show:
    hit = sims[rank]
    hit_words = tagged_train_data[hit[0]].words
    print(u'%s %s: «%s»\n' % (rank_label, hit, ' '.join(hit_words)))

Test Document (4628): «in this paper an improved design is presented to achieve a compact reconfigurable wideband antenna based on liquid crystal lc for millimeter wave mmw 5g networks the proposed design consists of two stacked patch antenna using aperture coupled feeding the lc is an anisotropic dielectric material used as substrate and it has a variable permittivity that can be controlled by a biasing voltage the dimensions of the designed stacked patch antenna are in mm the proposed antenna is suitable for radar satellite communications and for 5g applications such as theatres shopping malls convention centers and stadiums it operates at millimeter wave from to ghz with a bandwidth of ghz and a maximum antenna gain of db is achieved»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (22237, 0.8091264963150024): «a wideband hybrid dielectric antenna fitting into a low cost multilayer aip concept is proposed for 5g cellular handsets applications in this 

In [10]:
import numpy as np
import pandas as pd
# Infer one embedding per record and export it next to the record's class.
#
# The embedding and the class label are collected separately. Appending the
# label onto the float vector with np.append forces both into a single dtype
# (floats become strings for a string label, or the label becomes a float for
# a numeric one), so the label is attached as its own column instead.
results = []
class_labels = []
for token in tokenized_sent:
    class_labels.append(token[1])
    results.append(model.infer_vector(token[2]))

# One row per record; columns 0..k-1 hold the embedding components.
df = pd.DataFrame(results)
# The label goes in the next integer-named column (k), so the CSV keeps the
# same header and column order as before.
df[df.shape[1]] = class_labels

# NOTE(review): 'doc2vet' looks like a typo for 'doc2vec'. The path is left
# unchanged because other code may read this exact file name.
df.to_csv('../dataframes/doc2vet_embedding.csv', index=False, encoding='utf-8')