In [14]:
import torch 
from torchtext import vocab

In [97]:
glove = vocab.GloVe(name='6B', dim=100)

In [34]:
def get_embd_vector(word):
    word_idx = glove.stoi[word]
    return glove.vectors[word_idx]
    

In [41]:
glove.vectors

tensor([[-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        [-0.1077,  0.1105,  0.5981,  ..., -0.8316,  0.4529,  0.0826],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.3609, -0.1692, -0.3270,  ...,  0.2714, -0.2919,  0.1611],
        [-0.1046, -0.5047, -0.4933,  ...,  0.4253, -0.5125, -0.1705],
        [ 0.2837, -0.6263, -0.4435,  ...,  0.4368, -0.8261, -0.1570]])

In [45]:
x = [(a,i) for a,i in zip('abc',[3,4,1])]
sorted(x,key=lambda x:x[1])

[('c', 1), ('a', 3), ('b', 4)]

In [57]:
def get_closest_words(word,n_words = 10):
    emb = get_embd_vector(word)
    word_dist_vec = [(word, torch.dist(emb,get_embd_vector(word))) for word in glove.itos]
    word_dist_vec = sorted(word_dist_vec,key=lambda x: x[1])
    return word_dist_vec[1:n_words+1] # first word will be the word itself

In [58]:
get_closest_words('man')

[('man', tensor(0.)),
 ('woman', tensor(3.3641)),
 ('boy', tensor(3.5585)),
 ('one', tensor(3.6909)),
 ('turned', tensor(3.8091)),
 ('another', tensor(3.8422)),
 ('person', tensor(3.8886)),
 ('whose', tensor(3.9249)),
 ('once', tensor(3.9997)),
 ('life', tensor(4.0633))]

In [61]:
def get_word_analogy(word1,word2,word3,n_words= 5):
    # word1 - word2 + word3 = word4
    emb = get_embd_vector(word1) - get_embd_vector(word2) + get_embd_vector(word3)
    word_dist_vec = [(word, torch.dist(emb,get_embd_vector(word))) for word in glove.itos]
    word_dist_vec = sorted(word_dist_vec,key=lambda x: x[1])
    return word_dist_vec[1:n_words+1] # first word will be the word itself

In [62]:
get_word_analogy('king','queen','man')

[('brother', tensor(5.3269)),
 ('thought', tensor(5.3961)),
 ('son', tensor(5.4332)),
 ('father', tensor(5.4406)),
 ('another', tensor(5.4755))]

In [63]:
get_word_analogy('queen','king','women')

[('girls', tensor(5.9165)),
 ('men', tensor(6.1727)),
 ('female', tensor(6.2754)),
 ('ladies', tensor(6.3051)),
 ('athletes', tensor(6.6412))]

In [65]:
# Plot word embeddings
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [96]:
no_of_sample= 500
wl = glove.vectors[:no_of_sample]
tsne = TSNE(n_components=2,random_state=42)
transformed_wl = tsne.fit_transform(wl)

import plotly.graph_objects as go
import pandas as pd

# Assuming you have the word embeddings already in `transformed_wl`
# and the list of words in `glove.itos[:1000]`

# Create a DataFrame to store the data for Plotly
data = {
    'x': transformed_wl[:, 0],
    'y': transformed_wl[:, 1],
    'word': glove.itos[:no_of_sample]  # Assuming you want to plot the first 1000 words
}
df = pd.DataFrame(data)

# Create the scatter plot using Plotly
fig = go.Figure()

scatter = go.Scatter(
    x=df['x'],
    y=df['y'],
    mode='markers',  # Show markers and text
    text=df['word'],      # Set text to display on hover
    textfont=dict(size=10),  # Set the size of the text
    marker=dict(size=5),   # Set the size of the markers
)

fig.add_trace(scatter)

# Add annotations for some key points
annotations = [dict(
    x=row['x'],
    y=row['y'],
    text=row['word'],
    showarrow=True,
    arrowhead=1,
    ax=0,
    ay=-20
) for _, row in df.iterrows()]

fig.update_layout(annotations=annotations)

# Add labels and title
fig.update_xaxes(title='Dimension 1')
fig.update_yaxes(title='Dimension 2')
fig.update_layout(title='t-SNE Visualization of Word Embeddings')

# Show the plot
fig.show()
