In [11]:
import gensim.downloader as api

# Load the pretrained "word2vec-google-news-300" embeddings through gensim's
# downloader API; every later cell reads them from `model`.
model = api.load(name="word2vec-google-news-300")
print("Google News Word2Vec model loaded successfully.")


Google News Word2Vec model loaded successfully.


In [12]:
# Two quick sanity checks on the loaded embeddings: the nearest neighbours of
# 'king', and the analogy "man is to king as woman is to ?".
if not model:
    print("Model not loaded. Cannot perform examples.")
else:
    # Example 1: words most similar to 'king'.
    # Only the lookup sits inside the try; a KeyError there means the word is
    # missing from the vocabulary.
    try:
        similar_words = model.most_similar('king')
    except KeyError:
        print("'king' not in vocabulary.")
    else:
        print("Words similar to 'king':")
        for word, similarity in similar_words:
            print(f"{word}: {similarity:.4f}")

    # Example 2: the single best match for 'woman' + 'king' - 'man'.
    try:
        result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
    except KeyError:
        print("One or more words not in vocabulary for analogy task.")
    else:
        print("\n'woman' + 'king' - 'man' is:")
        for word, similarity in result:
            print(f"{word}: {similarity:.4f}")

Words similar to 'king':
kings: 0.7138
queen: 0.6511
monarch: 0.6413
crown_prince: 0.6204
prince: 0.6160
sultan: 0.5865
ruler: 0.5798
princes: 0.5647
Prince_Paras: 0.5433
throne: 0.5422

'woman' + 'king' - 'man' is:
queen: 0.7118


In [13]:
import numpy as np

# Select a subset of the vocabulary (and its vectors) for visualization.
# You can adjust the number of words you want to visualize.
num_words_to_plot = 500

# Take the first `num_words_to_plot` keys in the model's own vocabulary order.
# max(0, ...) keeps a negative setting meaning "no words" rather than letting
# the slice count from the end of the vocabulary.
words = list(model.index_to_key[:max(0, num_words_to_plot)])

# Look up each selected word's vector and stack them into one NumPy array with
# one row per word. `vectors` is bound once, directly to the array, so the
# name never refers to a plain Python list.
vectors = np.array([model[word] for word in words])

print(f"Prepared data for {len(words)} words.")

Prepared data for 500 words.


In [14]:
from sklearn.decomposition import PCA

# Reduce dimensionality to 3 components so the embeddings can be drawn in 3D.
# The component count lives in one place so the PCA call and the message
# below cannot drift apart.
n_pca_components = 3

# random_state pins the projection: scikit-learn's default solver selection
# may switch to a randomized SVD when the input grows (e.g. after raising the
# number of words), and that solver is only reproducible with a fixed seed.
pca = PCA(n_components=n_pca_components, random_state=42)
reduced_vectors = pca.fit_transform(vectors)

print(f"Reduced dimensionality of {len(reduced_vectors)} vectors to {n_pca_components} dimensions.")

Reduced dimensionality of 500 vectors to 3 dimensions.


In [15]:
import plotly.express as px
import pandas as pd

# Create a DataFrame for easier plotting with Plotly: one row per word holding
# its three PCA coordinates plus the word itself. Built in a single expression
# so re-running the cell never mutates an already-created frame.
plot_df = pd.DataFrame(reduced_vectors, columns=['x', 'y', 'z']).assign(word=words)

# Create the 3D scatter plot.
# hover_name shows each word in the hover tooltip only. The previous
# `text='word'` drew a permanent label on every marker, which contradicted the
# "on hover" intent and buried the points under overlapping text.
fig = px.scatter_3d(
    plot_df,
    x='x',
    y='y',
    z='z',
    hover_name='word',
    title='3D Visualization of Word Embeddings (PCA)',
)

# Name the axes after the PCA components they represent.
fig.update_layout(
    scene=dict(
        xaxis_title='PCA Component 1',
        yaxis_title='PCA Component 2',
        zaxis_title='PCA Component 3',
    )
)

fig.show()