<a href="https://colab.research.google.com/github/Krishishah7/nlp-doc2vec-embeddings/blob/main/doc2vec_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install gensim nltk

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [2]:
documents = [
    "Natural language processing is interesting",
    "Machine learning powers modern AI",
    "Deep learning improves NLP performance",
    "Python is widely used in data science",
    "AI and NLP are closely related fields"
]

In [5]:
tagged_docs = [
    TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)])
    for i, doc in enumerate(documents)
]

In [6]:
model = Doc2Vec(
    vector_size=50,
    window=2,
    min_count=1,
    workers=4,
    epochs=100
)

model.build_vocab(tagged_docs)
model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.epochs)

In [7]:
doc_vectors = [model.dv[str(i)] for i in range(len(documents))]

print(doc_vectors[0][:10])  # first 10 values

[-0.01361717 -0.01378536 -0.02050054  0.01684039  0.00735306  0.00150181
 -0.01989088 -0.00939226 -0.02238794  0.00517341]


In [8]:
new_doc = "NLP and machine learning"
new_vector = model.infer_vector(word_tokenize(new_doc.lower()))

print(new_vector[:10])

[-0.01594007 -0.00661278  0.00869833 -0.00943646  0.00084472 -0.00472357
  0.0049426   0.00827775  0.0041286  -0.00155006]


In [9]:
similar_docs = model.dv.most_similar([new_vector])

print("Most similar documents:")
for doc_id, score in similar_docs:
    print(documents[int(doc_id)], "→", score)

Most similar documents:
Machine learning powers modern AI → 0.3361947536468506
Deep learning improves NLP performance → 0.33059605956077576
Natural language processing is interesting → 0.23701074719429016
AI and NLP are closely related fields → 0.18708263337612152
Python is widely used in data science → 0.1220303624868393
