In [1]:
#!pip install gensim

In [2]:
from gensim.models import Word2Vec

In [3]:
from nltk.tokenize import word_tokenize
import nltk

In [4]:
# Download NLTK data (if not already downloaded)
#nltk.download('all')

In [5]:
# Sample corpus
sentences = [
    "NLP helps computers understand and interpret human language",
    "NLP tasks include text classification, machine translation, and sentiment analysis",
    "NLP models are trained on large datasets of text to learn patterns and relationships.",
    "NLP applications are used in various fields, such as customer service, healthcare, and marketing."
]

In [6]:
# Tokenize the sentences
tokenized_sentences = [word_tokenize(i.lower()) for i in sentences]
tokenized_sentences

[['nlp',
  'helps',
  'computers',
  'understand',
  'and',
  'interpret',
  'human',
  'language'],
 ['nlp',
  'tasks',
  'include',
  'text',
  'classification',
  ',',
  'machine',
  'translation',
  ',',
  'and',
  'sentiment',
  'analysis'],
 ['nlp',
  'models',
  'are',
  'trained',
  'on',
  'large',
  'datasets',
  'of',
  'text',
  'to',
  'learn',
  'patterns',
  'and',
  'relationships',
  '.'],
 ['nlp',
  'applications',
  'are',
  'used',
  'in',
  'various',
  'fields',
  ',',
  'such',
  'as',
  'customer',
  'service',
  ',',
  'healthcare',
  ',',
  'and',
  'marketing',
  '.']]

In [7]:
len(tokenized_sentences)

4

In [8]:
tokenized_sentences[1]

['nlp',
 'tasks',
 'include',
 'text',
 'classification',
 ',',
 'machine',
 'translation',
 ',',
 'and',
 'sentiment',
 'analysis']

In [9]:
# Define the Word2Vec model using skip gram
model = Word2Vec(sentences=tokenized_sentences, 
                 vector_size=100, 
                 window=5, 
                 min_count=1, 
                 workers=4,
                 sg=0,compute_loss=True)

1. vector_size=100 defines the number of dimensions for each word's embedding vector.
2. window=5 specifies how many surrounding words the model will consider when training.
3. min_count=1 includes all words in the vocabulary, even those appearing only once.
4. workers=4,  The training will be faster because 4 cores can work simultaneously, processing multiple words and contexts in parallel.
5. sg=0: CBOW (Continuous Bag of Words), sg=1: Skip-Gram (default)

In [10]:
# Train the model
model.train(tokenized_sentences, total_examples=len(tokenized_sentences), epochs=10)

(101, 530)

In [11]:
# Find similar words
similar_words = model.wv.most_similar('nlp', topn=5)
similar_words

[('used', 0.1896437555551529),
 ('fields', 0.1893627792596817),
 ('human', 0.16177138686180115),
 ('helps', 0.16105271875858307),
 ('language', 0.13932116329669952)]

In [12]:
print("Top 5 words similar to 'nlp':")
for similar_word, score in similar_words:
    print(f"{similar_word}: {score:.4f}")

Top 5 words similar to 'nlp':
used: 0.1896
fields: 0.1894
human: 0.1618
helps: 0.1611
language: 0.1393


In [13]:
# Find the nearest word (the most similar word)
nearest_word = similar_words[0][0]  # The first word in the list is the nearest
print(f"\nNearest word to 'nlp': {nearest_word}")


Nearest word to 'nlp': used
