# Importing Pretrained Model

In [2]:
# Import the Gensim downloader module
# Gensim is a popular NLP library used for working with word embeddings, topic modeling, etc.
import gensim.downloader as api

# Load a pre-trained Word2Vec model trained on the Google News dataset.
# "word2vec-google-news-300" means:
#   - It's the Word2Vec model released by Google.
#   - Trained on ~100 billion words from Google News.
#   - Each word is represented by a 300-dimensional vector.
# The 'api.load()' function automatically downloads and loads the model if not already cached.
# NOTE(review): the first run presumably triggers a large download, so this cell
# can be slow on a fresh machine — confirm the size before running on a metered connection.
# The object stored in `model` is indexed by word and queried for similarities in the cells below.
model = api.load("word2vec-google-news-300")

# Example of Word as a Vector

In [3]:
# Alias the loaded embeddings under a more descriptive name.
word_vectors = model

# Look up the embedding of a single word and show its raw values.
computer_vec = word_vectors['computer']  # example lookup: the vector for 'computer'
print(computer_vec)

[ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777344e-02  1.88476562e-01
  5.51757812e-02  5.02929

In [4]:
# Check the dimensionality of a single embedding, using the word 'cat'.
cat_vec = word_vectors['cat']
print(cat_vec.shape)

(300,)


# Similar Words

### King + Woman - Man = ?

In [5]:
# Word analogy: king - man + woman ≈ ?
#   positive -> words whose vectors pull the target towards them ('king', 'woman')
#   negative -> words whose vectors push the target away ('man')
# Conceptually the query vector is vec('king') + vec('woman') - vec('man');
# the call then ranks vocabulary words by cosine similarity to that query
# (the input words themselves are left out) and keeps the 10 best matches.
analogy_matches = word_vectors.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
    topn=10,
)
print(analogy_matches)

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.518113374710083), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]


The list above shows the top 10 candidate answers. Of these, "queen" has the highest score; note that each score is a cosine similarity to the target vector, not a probability.

In [6]:
# Print the similarity score for several word pairs, one score per line,
# in the same order as the pairs are listed here.
word_pairs = [
    ('woman', 'man'),
    ('king', 'queen'),
    ('uncle', 'aunt'),
    ('boy', 'girl'),
    ('nephew', 'niece'),
    ('paper', 'water'),  # unrelated pair, included for contrast
]

for word_a, word_b in word_pairs:
    print(word_vectors.similarity(word_a, word_b))


0.76640123
0.6510957
0.7643474
0.8543272
0.7594367
0.11408084


In the results above, word pairs that are related to each other have a high similarity score. The last pair, "paper" and "water", is unrelated, so its similarity score is very low compared with the other pairs we tested.

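The similarity between two word vectors is computed here as their cosine similarity, i.e. the cosine of the angle between the two vectors: values close to 1 mean the vectors point in nearly the same direction (related words), while values close to 0 mean the words are largely unrelated.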

### Finding words similar to a given word

In [7]:
# Nearest neighbours of "tower": rank the vocabulary by cosine similarity
# to the vector for "tower" and keep only the 5 closest entries.
tower_neighbours = word_vectors.most_similar("tower", topn=5)
print(tower_neighbours)

[('towers', 0.8531750440597534), ('skyscraper', 0.6417425870895386), ('Tower', 0.639177143573761), ('spire', 0.594687819480896), ('responded_Understood_Atlasjet', 0.5931612253189087)]
