|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Embedding spaces</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Wikipedia vs. Twitter embeddings</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# svg figure format
# (request SVG output from the inline matplotlib backend for the figures in this notebook)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Download and inspect the models

In [None]:
# NOTE: If the gensim import below fails, uncomment the !pip line and run it,
# then restart your session (from Runtime menu) and comment the pip line out again.
# !pip install gensim

import gensim.downloader as api

# fetch both pretrained models in one pass: Wikipedia first, then Twitter
wiki,twit = (api.load(modelname) for modelname in ('glove-wiki-gigaword-50','glove-twitter-50'))

In [None]:
# list the names of the attributes and methods available on the Twitter model object
dir(twit)

In [None]:
# report the size of each embedding matrix: rows are words, columns are dimensions
for label,model in (('Wikipedia',wiki),('Twitter',twit)):
  nwords,ndims = model.vectors.shape[:2]
  print(f'{label} model has {nwords:,} words and {ndims} embedding dimensions.')

# Exercise 2: Visualize the embeddings for one word

In [None]:
targetword = 'table'

# look up the embedding vector of the target word in each model
wiki_vec = wiki[targetword]
twit_vec = twit[targetword]
panel_title = f'Embeddings for "{targetword}"'

_,axs = plt.subplots(1,2,figsize=(12,4.5))
ax_dims,ax_scatter = axs

# left panel: embedding value as a function of dimension index, both models overlaid
ax_dims.plot(wiki_vec,'ks',markerfacecolor=[.7,.7,.9],markersize=8,label='Wikipedia')
ax_dims.plot(twit_vec,'ko',markerfacecolor=[.7,.9,.7],markersize=8,label='Twitter')
ax_dims.set(xlabel='Dimension',ylabel='Value',title=panel_title)
ax_dims.legend()

# right panel: the two embeddings plotted against each other, dimension by dimension
ax_scatter.plot(wiki_vec,twit_vec,'k^',markerfacecolor=[.9,.7,.7],markersize=8)
ax_scatter.set(xlabel='Wiki embedding',ylabel='Twitter embedding',title=panel_title)

plt.tight_layout()
plt.show()

# Exercise 3: Embeddings for word pairs within each model

In [None]:
# the two words to compare within each model
word1 = 'table'
word2 = 'chair'

# one panel per model: (title label, model, marker style, marker face color)
panels = (
  ('WIKI',   wiki,'ks',[.9,.7,.7]),
  ('TWITTER',twit,'ko',[.7,.9,.7]),
)

_,axs = plt.subplots(1,2,figsize=(12,4.5))
for ax,(label,model,marker,facecolor) in zip(axs,panels):

  # cosine similarity between the two words, shown in the panel title
  cossim = model.similarity(word1,word2)

  # scatter the embedding of word1 against the embedding of word2
  ax.plot(model[word1],model[word2],marker,markersize=9,markerfacecolor=facecolor)
  ax.set(xlabel=f'Embedding for "{word1}"',ylabel=f'Embedding for "{word2}"',
         title=f'{label} (Cosine similarity: {cossim:.3f})')

plt.tight_layout()
plt.show()

# Exercise 4: Similar words within each model

In [None]:
# print the nearest neighbours of "battery" in each model, one header per model
for header,model in (('10 words most similar to "battery" in wiki:',wiki),
                     ('\nAnd in twitter:',twit)):
  print(header)
  for neighbor,cossim in model.most_similar('battery'):
    print(f' {neighbor:>15} with similarity {cossim:.4f}')

# Exercise 5: foxes and dogs

In [None]:
text = 'The quick brown fox jumps over the lazy dog'

import re
# split on single whitespace characters; the raw string keeps \s as a regex
# token instead of an (invalid) Python string escape
words = re.split(r'\s',text)#.lower())

# index sequence in the two embeddings
# (words that are not in a model's vocabulary get np.inf as a placeholder;
#  np.inf is the spelling that works on NumPy 1.x and 2.x, np.Inf was removed in 2.0)
wiki_idx = [wiki.key_to_index.get(w,np.inf) for w in words ]
twit_idx = [twit.key_to_index.get(w,np.inf) for w in words ]

# table of each word and its row index in the two vocabularies ("inf" = not found)
print(' Word |  Wiki | Twitter')
print('-'*23)
for o,w,t in zip(words,wiki_idx,twit_idx):
  print(f'{o:>5} | {w:>5} | {t:>5}')

In [None]:
# get all unique inter-word similarities
# Each word pair is drawn at (wiki similarity, twitter similarity) and colored
# by its distance to the unity line, i.e., by how much the two models disagree.

plt.figure(figsize=(9,7))

# A word can only be compared if it has a valid index in BOTH vocabularies.
# Missing words carry a non-finite placeholder index, which similarity() cannot
# look up, so they are flagged here and skipped in the loops below.
comparable = [np.isfinite(w) and np.isfinite(t) for w,t in zip(wiki_idx,twit_idx)]
skipped = list(dict.fromkeys(word for word,ok in zip(words,comparable) if not ok))
if skipped:
  print(f'Skipping words missing from at least one model: {skipped}')

# direction of the unity line (constant, so defined once outside the loops)
u = np.array([1,1])

# start range at 0 or 1? (either works: words without an embedding are skipped)
for i in range(0,len(words)):

  # skip words without an embedding in both models
  if not comparable[i]: continue

  for j in range(i+1,len(words)):

    # skip words without an embedding in both models
    if not comparable[j]: continue

    # skip identity
    if words[i]==words[j]: continue

    # calculate the cosine similarities for the two embeddings
    cs_wiki = wiki.similarity(wiki_idx[i],wiki_idx[j])
    cs_twit = twit.similarity(twit_idx[i],twit_idx[j])

    # calculate the distance to the unity line
    # (norm of the residual after projecting v onto u)
    v = np.array([cs_wiki,cs_twit])
    dist = np.linalg.norm(v - (sum(v*u))/(np.linalg.norm(u)**2)*u)

    # draw the results at the coordinates (distance is scaled by 5 for the colormap)
    plt.plot(cs_wiki,cs_twit,'ks',markersize=9,markerfacecolor=mpl.cm.plasma(dist*5))

    # and write the word pair
    plt.text(cs_wiki,cs_twit+.02,f'{words[i]}-{words[j]}',va='bottom',ha='center')



# plot the unity line
xylims = [.05,.95]
plt.plot(xylims,xylims,'--',color=[.4,.4,.4],zorder=-30)

# final adjustments
plt.gca().set(xlim=xylims,ylim=xylims,xlabel='Wiki inter-word similarities',
              ylabel='Twitter inter-word similarities',title='Inter-word similarities')
plt.show()