In [4]:
import pandas as pd
from nltk.corpus import stopwords
import re
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import utils
from stop_words import get_stop_words

In [12]:
# Load the pre-cleaned dataset. Later cells read its 'text' and 'DALTIX_ID'
# columns; 'text' is split on whitespace, so it is presumably already
# normalised free text -- TODO confirm against the cleaning step.
df = pd.read_csv("dataset_cleaned.csv")

In [25]:
# Training corpus: one TaggedDocument per row, holding the whitespace-split
# text and a single tag (the row's DALTIX_ID).
# The two columns are iterated directly instead of via DataFrame.iterrows(),
# which builds a full Series for every row just to read two fields.
documents = [
    TaggedDocument(text.split(), [daltix_id])
    for text, daltix_id in zip(df['text'], df['DALTIX_ID'])
]

In [26]:
# Plain (tokens, DALTIX_ID) pairs, consumed by the inference loop further down.
# The two columns are iterated directly instead of via DataFrame.iterrows(),
# which builds a full Series for every row just to read two fields.
docs = [
    (text.split(), daltix_id)
    for text, daltix_id in zip(df['text'], df['DALTIX_ID'])
]

In [27]:
# Configure the Doc2Vec model, then scan the tagged corpus once to build the
# vocabulary. No training happens in this cell.
model = Doc2Vec(
    dm=0,               # 0 selects PV-DBOW (distributed bag of words) per the gensim docs
    vector_size=300,    # dimensionality of each document vector
    negative=5,         # negative sampling with 5 noise words
    window=10,          # context window size
    min_count=2,        # ignore words seen fewer than 2 times in the corpus
    workers=6,          # worker threads used for training
    alpha=0.025,        # initial learning rate
    min_alpha=0.00025,  # learning rate reached at the end of training
)
model.build_vocab(documents)

In [28]:
%%time
for epoch in tqdm(range(30)):
    model.train(utils.shuffle(documents), total_examples=len(documents), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [04:53<00:00,  9.95s/it]


Wall time: 4min 53s


In [29]:
# Precompute the L2-normalised vectors used by similarity queries.
# replace=False keeps the raw vectors as well -- per the gensim docs this is
# what allows further training / inference afterwards.
model.init_sims(replace=False)

In [30]:
# Persist the trained model to disk.
model.save("doc2vec_model.d2v")
# To reuse the saved model in a later session instead of retraining, run:
# model = Doc2Vec.load("doc2vec_model.d2v")

In [31]:
%%time
results = []
count = 0
for doc in tqdm(docs):
    infer_vector_ = model.infer_vector(doc[0], steps=20)
    similar = model.docvecs.most_similar([infer_vector_], topn = 2)
    if similar[0][0] == doc[1]:
        count += 1
        results.append((doc[1], similar[1][0], similar[1][1]))
    else:
        results.append((doc[1], similar[0][0], similar[0][1]))
print(count)

100%|██████████████████████████████████████████████████████████████████████████| 101781/101781 [18:32<00:00, 81.42it/s]


9246
Wall time: 18min 32s


In [32]:
# One row per source document: its own id, the id of the closest match that is
# not the document itself, and the similarity score reported for that match.
doc2vec_df = pd.DataFrame(
    data=results,
    columns=['daltix_id_1', 'daltix_id_2', 'similarity'],
)

In [33]:
# Write the match table to CSV; index=False omits the DataFrame's row index.
doc2vec_df.to_csv("doc2vec_df.csv", index=False)