In [2]:
%matplotlib widget
import plotly.graph_objects as go
import numpy as np

# Word2Vec vectors

In [3]:
import gensim.downloader as api
import logging
import os
import gensim

# Set up logging so gensim's download/load progress is visible in the cell output.
logging.basicConfig(level=logging.INFO)

# Name of the pretrained model as known to gensim's downloader.
model_name = 'word2vec-google-news-300'

# Single source of truth for the local cache file. The same path is used both
# to detect an existing copy and to save a fresh download; if the two differ,
# the cache is never hit and the model is re-fetched on every run.
model_path = f"models/{model_name}.model"

# Ensure `wv` is always defined, whichever branch runs below.
wv = None

if os.path.isfile(model_path):
    print("Model already exists, loading from file...")
    # Load the model from the local cache file
    wv = gensim.models.KeyedVectors.load(model_path)
else:
    # Attempt to obtain the model through gensim's downloader
    try:
        print(f"\nAttempting to download '{model_name}' using gensim downloader...")
        wv = api.load(model_name)
    except Exception as e:
        print(f"\nFailed to download or load using gensim downloader: {e}")
        wv = None  # Ensure wv is defined even if loading fails
    else:
        print("\nModel downloaded/loaded successfully!")
        print(f"It is now cached in: {api.BASE_DIR}")

        # Saving is best-effort and handled separately from the download:
        # a failed save must not throw away a model that loaded fine.
        try:
            # The target directory may not exist on a fresh checkout.
            os.makedirs(os.path.dirname(model_path), exist_ok=True)
            print(f"Saving the Word2Vec model to '{model_path}'...")
            wv.save(model_path)
            print("Model saved successfully!")
        except Exception as e:
            print(f"Could not save the Word2Vec model to '{model_path}': {e}")

# Check if the model was loaded successfully. Compare against None explicitly
# rather than relying on the truthiness of the vectors object.
if wv is not None:
    print("Word2Vec model is ready to use.")
else:
    print("Word2Vec model could not be loaded.")

INFO:gensim.utils:loading KeyedVectors object from models/word2vec-google-news-300.model


Model already exists, loading from file...


INFO:gensim.utils:loading vectors from models/word2vec-google-news-300.model.vectors.npy with mmap=None
INFO:gensim.utils:KeyedVectors lifecycle event {'fname': 'models/word2vec-google-news-300.model', 'datetime': '2025-05-08T10:24:05.304756', 'gensim': '4.3.3', 'python': '3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'loaded'}


Word2Vec model is ready to use.


In [22]:
# Word whose neighbourhood in the embedding space is analysed.
center_word = 'model'

# The 300 nearest neighbours of the centre word, as (word, similarity) pairs.
similar_words = wv.most_similar(center_word, topn=300)
words = [word for word, score in similar_words]

# One row per neighbour word, one column per embedding dimension.
word_vectors = wv[words]

# For every embedding dimension: absolute difference between the centre word
# and each neighbour, summed over all neighbours.
distances = np.abs(word_vectors - wv[center_word]).sum(axis=0)

# Dimensions ordered from smallest to largest summed distance.
dimension_order = np.argsort(distances)

# Find the 3 shortest distances
shortest_indices = dimension_order[:3]

# Find the 3 longest distances
longest_indices = dimension_order[-3:]

# 3 random dimensions (seeded so a re-run picks the same dimensions)
random_indices = np.random.default_rng(42).choice(len(distances), size=3, replace=False)


# Find the dimensions where the words are most clustered.
# distances_wfw[i, j, d] = |words[i] - words[j]| in dimension d.
distances_wfw = np.abs(word_vectors[:, None, :] - word_vectors[None, :, :])

# Number of nearest neighbours (per word, per dimension) used for the score.
n_closest = 10

# Sort each word's distances to all other words within every dimension.
# Position 0 is the zero distance of a word to itself, so it is skipped and
# the next `n_closest` entries are kept.
closest_words = np.sort(distances_wfw, axis=1)[:, 1:n_closest + 1, :]

# One score per dimension: total nearest-neighbour distance over all words.
# A small score means the words sit close together along that dimension.
# (Taking argmin of the per-dimension sum would always give 0, because the sum
# is a scalar; the sum itself is the quantity to rank.)
cluster_scores = closest_words.sum(axis=(0, 1))

# The 3 dimensions with the tightest clustering.
cluster_indices = np.argsort(cluster_scores)[:3]
    

In [None]:
import webbrowser
from pathlib import Path

# Embedding dimensions to use as the x/y/z axes of the plot.
indices = cluster_indices

# Look the neighbour vectors up once, then pick the three chosen dimensions.
word_vectors = wv[words]
vectors = [word_vectors[:, indices[0]], word_vectors[:, indices[1]], word_vectors[:, indices[2]]]

# Marker colour: similarity of each word to the centre word.
colors = np.array([wv.similarity(center_word, word) for word in words])

# Create a 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=vectors[0],
    y=vectors[1],
    z=vectors[2],
    mode='markers+text',
    marker=dict(
        size=5,
        color=colors,
        colorscale='Plasma',
        opacity=0.8,
        # The colour bar is configured on the marker itself. A layout-level
        # `coloraxis_colorbar` only applies to traces that reference a shared
        # coloraxis, which this trace does not.
        showscale=True,
        colorbar=dict(
            title=dict(text="Similarity"),
            thickness=20,
            len=0.75,
            x=1.1  # Position the colorbar slightly outside the plot
        ),
    ),
    text=words  # Add word labels
)])

fig.update_layout(
    title=f"3D Scatter Plot of Word Vectors for '{center_word}'",
    scene=dict(
        xaxis_title=f'{indices[0]}-axis',
        yaxis_title=f'{indices[1]}-axis',
        zaxis_title=f'{indices[2]}-axis'
    ),
    width=1500,
    height=1000,
)

# Save the plot as an HTML file and open it in the browser.
# The browser is handed an absolute file:// URL, because opening a bare
# relative filename is not portable across platforms.
plot_path = Path("word_vectors_plot.html")
fig.write_html(plot_path)
webbrowser.open(plot_path.resolve().as_uri())

True

In [23]:
from sklearn.decomposition import PCA

# Project the neighbour word vectors (one row per word) onto their first
# three principal components.
pca = PCA(n_components=3)
reduced_data = pca.fit_transform(wv[words])

# Marker colour: similarity of each word to the centre word.
colors = np.array([wv.similarity(center_word, word) for word in words])

# Create a 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_data[:, 0],
    y=reduced_data[:, 1],
    z=reduced_data[:, 2],
    mode='markers+text',
    marker=dict(
        size=5,
        color=colors,
        colorscale='Plasma',
        opacity=0.8,
        # The colour bar is configured on the marker itself. A layout-level
        # `coloraxis_colorbar` only applies to traces that reference a shared
        # coloraxis, which this trace does not.
        showscale=True,
        colorbar=dict(
            title=dict(text="Similarity"),
            thickness=20,
            len=0.75,
            x=1.1  # Position the colorbar slightly outside the plot
        ),
    ),
    text=words  # Add word labels
)])

fig.update_layout(
    title=f"3D Scatter Plot of Word Vectors for '{center_word}'",
    scene=dict(
        xaxis_title='PCA 1',
        yaxis_title='PCA 2',
        zaxis_title='PCA 3'
    ),
    width=1000,
    height=800,
)

fig.show()

In [21]:
from sklearn.manifold import TSNE

# Reduce to 3 dimensions using t-SNE (fixed random_state for repeatable layouts)
tsne = TSNE(n_components=3, random_state=42)
reduced_data = tsne.fit_transform(wv[words])

# Marker colour: similarity of each word to the centre word.
colors = np.array([wv.similarity(center_word, word) for word in words])

# Create a 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_data[:, 0],
    y=reduced_data[:, 1],
    z=reduced_data[:, 2],
    mode='markers+text',
    marker=dict(
        size=5,
        color=colors,
        colorscale='Plasma',
        opacity=0.8,
        # The colour bar is configured on the marker itself. A layout-level
        # `coloraxis_colorbar` only applies to traces that reference a shared
        # coloraxis, which this trace does not.
        showscale=True,
        colorbar=dict(
            title=dict(text="Similarity"),
            thickness=20,
            len=0.75,
            x=1.1  # Position the colorbar slightly outside the plot
        ),
    ),
    text=words  # Add word labels
)])

fig.update_layout(
    title=f"3D Scatter Plot of Word Vectors for '{center_word}'",
    scene=dict(
        xaxis_title='TSNE 1',
        yaxis_title='TSNE 2',
        zaxis_title='TSNE 3'
    ),
    width=1000,
    height=800,
)

fig.show()

In [123]:
import umap

# Reduce to 3 dimensions using UMAP (fixed random_state for repeatable layouts)
reducer = umap.UMAP(n_components=3, random_state=42)
reduced_data = reducer.fit_transform(wv[words])

# Marker colour: similarity of each word to the centre word.
colors = np.array([wv.similarity(center_word, word) for word in words])

# Create a 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_data[:, 0],
    y=reduced_data[:, 1],
    z=reduced_data[:, 2],
    mode='markers+text',
    marker=dict(
        size=5,
        color=colors,
        colorscale='Plasma',
        opacity=0.8,
        # The colour bar is configured on the marker itself. A layout-level
        # `coloraxis_colorbar` only applies to traces that reference a shared
        # coloraxis, which this trace does not.
        showscale=True,
        colorbar=dict(
            title=dict(text="Similarity"),
            thickness=20,
            len=0.75,
            x=1.1  # Position the colorbar slightly outside the plot
        ),
    ),
    text=words  # Add word labels
)])

fig.update_layout(
    title=f"3D Scatter Plot of Word Vectors for '{center_word}'",
    scene=dict(
        xaxis_title='UMAP 1',
        yaxis_title='UMAP 2',
        zaxis_title='UMAP 3'
    ),
    width=1000,
    height=800,
)

fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

