|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating token embeddings</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: soft-coded analogies in word2vec</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# uncomment the next line if gensim is not yet installed in this environment
# (%pip installs into the environment of the running kernel)
# %pip install gensim

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import gensim.downloader as api

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# load the pretrained 'word2vec-google-news-300' embeddings via gensim's downloader API
w2v = api.load(name='word2vec-google-news-300')

# Exercise 1: Create the analogy vectors

In [None]:
# collect every vocabulary word (the keys of the word->index dictionary) into a list
allwords = list(w2v.key_to_index)

In [None]:
# soft-code the three words
word2start = 'king'
word2subtract = 'man'
word2add = 'woman'

# test that all three words are in the vocabulary
# (membership is tested on the model's word->index dictionary, which is a hash
#  lookup; testing against a Python list of all words would scan the list each time)
for w in [word2start,word2subtract,word2add]:
  if w not in w2v.key_to_index:
    print(f'WARNING: "{w}" is not in word2vec!')

# print the analogy
print(f'\n"{word2start}" is to "{word2subtract}" as "_____" is to "{word2add}"')

In [None]:
# pull out the three embeddings: base word (v1), word to subtract (v2), word to add (v3)
v1,v2,v3 = (w2v[word] for word in (word2start,word2subtract,word2add))

# combine them into the analogy vector
analogyVector = v1 - v2 + v3

# draw the three word vectors and the analogy vector on one axis
_,ax = plt.subplots(figsize=(10,4))
for vec,lab in zip((v1,v2,v3),(word2start,word2subtract,word2add)):
  ax.plot(vec,label=lab)
ax.plot(analogyVector,'k',linewidth=2,label='analogy')

# axis cosmetics
ax.set_xlim([0,len(v1)])
ax.set_xlabel('Embedding dimension')
ax.set_ylabel('Value')
ax.legend()
plt.show()

# Exercise 2: Cosine similarity to all vectors

In [None]:
# cosine similarity between the analogy vector and every embedding in the model
cossim2all = w2v.cosine_similarities(analogyVector,w2v.vectors)
print(cossim2all.shape)

# two-panel figure: subsampled similarities (left) and the largest 1000 (right)
skip4plotting = 1000
_,axs = plt.subplots(1,2,figsize=(13,4))

# left panel: every skip4plotting-th similarity, colored by its own value
xidx = range(0,len(cossim2all),skip4plotting)
cossimSub = cossim2all[::skip4plotting]
axs[0].scatter(xidx,cossimSub,c=cossimSub,marker='o',alpha=.5,cmap='seismic')
axs[0].set_xticks([])
axs[0].set_xlabel(f'Index (skip every {skip4plotting})')
axs[0].set_ylabel('Cosine similarity')
axs[0].set_title('Similarity with analogy vector')

# right panel: the 1000 largest similarities in ascending order
largest = np.sort(cossim2all)[-1000:]
axs[1].plot(largest,'ko',markerfacecolor='gray',alpha=.5)
axs[1].set_xticks([])
axs[1].set_xlabel('Sorted index')
axs[1].set_ylabel('Cosine similarity')
axs[1].set_title('Top 1000 similarities')

plt.tight_layout()
plt.show()

In [None]:
# indices of the 10 largest similarities, ordered from most to least similar
top10 = np.argsort(cossim2all)[::-1][:10]

# report each of those words with its similarity score
print('Top 10 closest words:')
for widx in top10:
  score,word = cossim2all[widx],w2v.index_to_key[widx]
  print(f'  Similarity of {score:.3f} with "{word}"')

# Exercise 3: Create and test an analogy function

In [None]:
def analogyCalculator(word2start,word2subtract,word2add):
  """
  Solve the analogy "word2start is to word2subtract as _____ is to word2add"
  and print the 10 words whose embeddings are closest to the solution.

  The analogy vector is  w2v[word2start] - w2v[word2subtract] + w2v[word2add].
  Its cosine similarity with every vector in the global `w2v` model is computed,
  and the 10 highest-scoring words are printed (the three input words are not
  excluded from that list).

  Parameters
  ----------
  word2start    : str, the base word
  word2subtract : str, the word whose vector is subtracted
  word2add      : str, the word whose vector is added

  Returns
  -------
  None (all results are printed).

  Raises
  ------
  ValueError if any of the three words is not in the model's vocabulary.
  """

  # give error if a word is not in the vocabulary
  # (hash lookup in the model's word->index dictionary; this avoids scanning a list
  #  of all words and makes the function independent of any other global variable)
  for w in [word2start,word2subtract,word2add]:
    if w not in w2v.key_to_index:
      raise ValueError(f'Error: "{w}" is not in word2vec!')

  # print the analogy
  print(f'\n"{word2start}" is to "{word2subtract}" as "_____" is to "{word2add}"')

  # get the vectors
  v1 = w2v[word2start]    # base word
  v2 = w2v[word2subtract] # to subtract
  v3 = w2v[word2add]      # to add

  # analogy vector
  analogyVector = v1 - v2 + v3

  # cossim with all
  cossim2all = w2v.cosine_similarities(analogyVector,w2v.vectors)

  # print out the top 10 highest scores (argsort is ascending, so take the tail and flip it)
  top10 = cossim2all.argsort()[-10:][::-1]
  print('\nTop 10 closest words:')
  for widx in top10:
    print(f'  Similarity of {cossim2all[widx]:.3f} with "{w2v.index_to_key[widx]}"')

In [None]:
# sanity check of the function on the king/man/woman analogy (words passed by keyword)
analogyCalculator(word2start='king',word2subtract='man',word2add='woman')

In [None]:
# check error handling: 'woman0' is passed to trigger the function's ValueError.
# The exception is caught and its message printed, so that running all cells
# top-to-bottom does not stop at this cell.
try:
  analogyCalculator('king','man','woman0')
except ValueError as err:
  print(err)

In [None]:
### explore other analogies (swap which call is uncommented to try a different one)

analogyCalculator(word2start='tree',word2subtract='leaf',word2add='petal')
# analogyCalculator('leaf','tree','flower')    # turn it around for better results?
# analogyCalculator('dachshund','dog','bird')
# analogyCalculator('tired','yawn','scratch')   # expecting "itch"
# analogyCalculator('finger','hand','foot')