In [2]:
!pip install gensim




In [10]:
# Import necessary libraries
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK tokenizer data that word_tokenize needs (a no-op if already present).
# Older NLTK releases read the 'punkt' resource; recent releases look up 'punkt_tab'
# instead and raise LookupError without it, so both are fetched.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Toy corpus: one sentence per line, split into a list of sentence strings.
# The line order is significant (it is the order the model will see the sentences in).
sentences = """\
I love machine learning and natural language processing.
Word2Vec is a useful algorithm in NLP.
We can use Word2Vec to capture semantic meaning of words.
Deep learning models often rely on word embeddings.
The king and queen are part of the royal family.
The man and woman are important to the history of humanity.
Machine learning and AI are revolutionizing the world.
""".splitlines()



In [15]:
# Step 1: Preprocess the data (lower-case each sentence, then split it into tokens)
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Step 2: Train the Word2Vec model
#   vector_size=100 -> dimensionality of each word vector
#   window=5        -> context window size
#   min_count=1     -> keep every word (only words with frequency < 1 would be ignored)
#   seed=42 with workers=1 -> repeatable training; multi-threaded training is not
#                     deterministic. NOTE: reproducibility across interpreter launches
#                     may additionally require setting PYTHONHASHSEED (see gensim docs).
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    seed=42,
    workers=1,
)

# Step 3: Access the vocabulary of the model
print("Vocabulary:\n", model.wv.index_to_key)

# Step 4: Finding similar words (top 3 neighbours of 'machine' with their similarity scores)
similar_words = model.wv.most_similar('machine', topn=3)
print("\nMost similar words to 'machine':\n", similar_words)

Vocabulary:
 ['.', 'the', 'and', 'of', 'learning', 'are', 'to', 'machine', 'word2vec', 'algorithm', 'use', 'can', 'we', 'nlp', 'in', 'is', 'useful', 'a', 'semantic', 'processing', 'language', 'natural', 'love', 'capture', 'world', 'meaning', 'revolutionizing', 'ai', 'humanity', 'history', 'important', 'woman', 'man', 'family', 'royal', 'part', 'queen', 'king', 'embeddings', 'word', 'on', 'rely', 'often', 'models', 'deep', 'words', 'i']

Most similar words to 'machine':
 [('processing', 0.31946805119514465), ('king', 0.2044612169265747), ('useful', 0.1748146265745163)]


In [16]:
# Step 5: Word vector arithmetic -- ask which word lies closest to king - man + woman
# (the textbook answer is 'queen'; a corpus this tiny may well return something else).
added_terms = ['king', 'woman']
subtracted_terms = ['man']
result = model.wv.most_similar(
    positive=added_terms,
    negative=subtracted_terms,
    topn=1,
)
print("\nResult of 'king' - 'man' + 'woman' is closest to:\n", result)

# Step 6: Persist the trained model to disk so it can be reused later
model.save("word2vec_example.model")

# The saved model can be restored with:
# loaded_model = Word2Vec.load("word2vec_example.model")


Result of 'king' - 'man' + 'woman' is closest to:
 [('love', 0.18109464645385742)]


In [18]:
# vector('king') - vector('man') + vector('woman') ≈ vector('queen')


### To get meaningful analogies, either increase the corpus size or load a pre-trained Word2Vec model in gensim:


In [20]:
from pathlib import Path

import gensim.downloader
from gensim.models import KeyedVectors

# Load Google's pre-trained Word2Vec vectors (300 dimensions, trained on Google News).
# load_word2vec_format only reads a file that is already on disk, so:
#   - if the binary file is present locally, load it directly;
#   - otherwise fetch the same vectors through gensim's downloader, which needs an
#     internet connection and a large (more than 1 GB) download that gensim caches.
pretrained_path = Path('GoogleNews-vectors-negative300.bin')
if pretrained_path.exists():
    pretrained_model = KeyedVectors.load_word2vec_format(str(pretrained_path), binary=True)
else:
    pretrained_model = gensim.downloader.load('word2vec-google-news-300')

# Perform the same operation (king - man + woman ≈ queen)
result = pretrained_model.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)
print("\nResult of 'king' - 'man' + 'woman' is closest to:\n", result)
