<a href="https://colab.research.google.com/github/Nawapon19/NLP/blob/main/Word_Embeddings_Demo_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Word Embeddings Demo Practice**

**Word2Vec (Google)**

In [32]:
# import gdown, a helper for downloading files from Google Drive
# (the download cell below calls its command-line tool via `!gdown`)
import gdown

In [33]:
# download pre-trained vectors of Google News dataset
# the model contains 300-dimensional vectors for 3 million words and phrases
!gdown https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM

Downloading...
From: https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
To: /content/GoogleNews-vectors-negative300.bin.gz
100% 1.65G/1.65G [00:19<00:00, 84.4MB/s]


In [34]:
# unzip using gunzip command for gzip file
!gunzip GoogleNews-vectors-negative300.bin.gz

gzip: GoogleNews-vectors-negative300.bin already exists; do you wish to overwrite (y or n)? ^C


In [35]:
# import KeyedVectors from gensim.models;
# it is the API used here to load and query pre-trained word embeddings
from gensim.models import KeyedVectors

In [36]:
# Load the pre-trained word2vec embeddings into a KeyedVectors object by calling
# load_word2vec_format on the extracted file.
# binary=True is passed because this file is the binary (.bin) variant; the
# text-format file loaded later in the notebook uses binary=False instead.
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [37]:
# implement a function called find_analogies
def find_analogies(w1, w2, w3):
  """Print the word that completes the analogy ``w1 - w2 = ? - w3``.

  The missing word is looked up in the word2vec model ``word_vectors`` by
  asking for the entry most similar to ``+w1 +w3 -w2``.

  e.g. king - man = ? - woman   ->   ? = +king +woman -man

  Args:
    w1: word that is added on the left-hand side of the analogy.
    w2: word that is subtracted on the left-hand side of the analogy.
    w3: word that is subtracted from the unknown on the right-hand side.

  Returns:
    None. The completed analogy is printed as ``w1 - w2 = <answer> - w3``.
  """
  # only the best match is printed, so ask for a single result instead of
  # the default-length neighbour list
  best_match = word_vectors.most_similar(positive=[w1, w3], negative=[w2], topn=1)
  answer = best_match[0][0]  # first entry, first field: the matched word
  print("%s - %s = %s - %s" % (w1, w2, answer, w3))

In [38]:
find_analogies('king', 'man', 'woman')

king - man = queen - woman


In [39]:
find_analogies('france', 'paris', 'london')

france - paris = england - london


In [40]:
find_analogies('france', 'paris', 'rome')

france - paris = italy - rome


In [41]:
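# the order of the arguments matters: this asks paris - france = ? - italy
# (the expected answer is rome, but word2vec misses it here)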
find_analogies('paris', 'france', 'italy')

paris - france = lohan - italy


In [42]:
find_analogies('france', 'french', 'english')

france - french = england - english


In [43]:
# word embeddings are not perfect: the expected answer here is china
find_analogies('japan', 'japanese', 'chinese')

japan - japanese = tibet - chinese


In [44]:
find_analogies('japan', 'japanese', 'italian')

japan - japanese = italy - italian


In [45]:
# this one is also a miss (the expected answer is july), but the result is still a month
# GloVe is more accurate for this type of word
find_analogies('december', 'november', 'june')

december - november = september - june


In [46]:
find_analogies('miami', 'florida', 'texas')

miami - florida = dallas - texas


In [47]:
find_analogies('einstein', 'scientist', 'painter')

einstein - scientist = jude - painter


In [48]:
find_analogies('man', 'woman', 'she')

man - woman = he - she


In [49]:
find_analogies('man', 'woman', 'aunt')

man - woman = uncle - aunt


In [50]:
find_analogies('man', 'woman', 'sister')

man - woman = brother - sister


In [51]:
find_analogies('man', 'woman', 'wife')

man - woman = son - wife


In [52]:
find_analogies('man', 'woman', 'actress')

man - woman = actor - actress


In [53]:
find_analogies('man', 'woman', 'mother')

man - woman = father - mother


In [54]:
find_analogies('nephew', 'niece', 'aunt')

nephew - niece = uncle - aunt


In [55]:
# implement a function called nearest_neighbors
def nearest_neighbors(w):
  """Print the words closest to ``w`` in the word2vec model ``word_vectors``.

  Args:
    w: the query word.

  Returns:
    None. A header line is printed, followed by one tab-indented neighbour
    per line, in the order returned by ``most_similar``.
  """
  # every entry unpacks to a (word, score) pair; only the word is shown
  neighbours = word_vectors.most_similar(positive=[w])
  print("neighbors of: %s" % w)
  for neighbour, _similarity in neighbours:
    print("\t%s" % neighbour)

In [56]:
nearest_neighbors('king')

neighbors of: king
	kings
	queen
	monarch
	crown_prince
	prince
	sultan
	ruler
	princes
	Prince_Paras
	throne


In [57]:
nearest_neighbors('france')

neighbors of: france
	spain
	french
	germany
	europe
	italy
	england
	european
	belgium
	usa
	serbia


In [58]:
nearest_neighbors('japan')

neighbors of: japan
	japanese
	tokyo
	america
	europe
	germany
	chinese
	india
	hawaii
	usa
	korea


In [59]:
nearest_neighbors('einstein')

neighbors of: einstein
	nikki
	lmfao
	albert
	armstrong
	joan
	becky
	mcmahon
	conrad
	lori
	haley


In [60]:
nearest_neighbors('woman')

neighbors of: woman
	man
	girl
	teenage_girl
	teenager
	lady
	teenaged_girl
	mother
	policewoman
	boy
	Woman


In [61]:
nearest_neighbors('nephew')

neighbors of: nephew
	son
	uncle
	brother
	grandson
	cousin
	father
	niece
	younger_brother
	nephews
	stepson


In [62]:
nearest_neighbors('february')

neighbors of: february
	january
	april
	september
	december
	july
	october
	november
	june
	feb
	norway


**GloVe (Stanford)**

In [63]:
# Download the GloVe 6B archive from Stanford (about 822 MB).
# -nc (no-clobber) keeps wget from fetching the file again when it already exists.
!wget -nc https://nlp.stanford.edu/data/glove.6B.zip

--2023-09-18 15:02:48--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-09-18 15:02:49--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-09-18 15:05:43 (4.72 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [64]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [65]:
from gensim.scripts.glove2word2vec import glove2word2vec

In [67]:
# Convert the 300-dimensional GloVe text file into word2vec text format so that
# KeyedVectors.load_word2vec_format can read it; the value returned by the call
# is displayed as the cell output.
# NOTE(review): gensim 4.x marks glove2word2vec as deprecated and can read GloVe
# files directly with load_word2vec_format(..., binary=False, no_header=True) —
# confirm against the installed gensim version before switching.
glove2word2vec('glove.6B.300d.txt','glove.6B.300d.word2vec.txt')

  glove2word2vec('glove.6B.300d.txt','glove.6B.300d.word2vec.txt')


(400000, 300)

In [68]:
# Load the converted GloVe vectors into a KeyedVectors object.
# binary=False is passed because the converted file is plain text (.txt).
word_vectors_glove = KeyedVectors.load_word2vec_format('glove.6B.300d.word2vec.txt', binary=False)

In [69]:
def find_analogies_glove(w1, w2, w3):
  """Print the word that completes ``w1 - w2 = ? - w3`` using the GloVe vectors.

  The missing word is looked up in ``word_vectors_glove`` by asking for the
  entry most similar to ``+w1 +w3 -w2``.

  e.g. king - man = ? - woman   ->   ? = +king +woman -man

  Args:
    w1: word that is added on the left-hand side of the analogy.
    w2: word that is subtracted on the left-hand side of the analogy.
    w3: word that is subtracted from the unknown on the right-hand side.

  Returns:
    None. The completed analogy is printed as ``w1 - w2 = <answer> - w3``.
  """
  # a single result is enough because only the top match is printed
  top_hit = word_vectors_glove.most_similar(positive=[w1, w3], negative=[w2], topn=1)
  completion = top_hit[0][0]  # first entry, first field: the matched word
  print("%s - %s = %s - %s" % (w1, w2, completion, w3))

In [70]:
find_analogies_glove('king', 'man', 'woman')

king - man = queen - woman


In [71]:
find_analogies_glove('france', 'paris', 'london')

france - paris = britain - london


In [72]:
find_analogies_glove('france', 'paris', 'rome')

france - paris = italy - rome


In [73]:
find_analogies_glove('paris', 'france', 'italy')

paris - france = rome - italy


In [74]:
find_analogies_glove('france', 'french', 'english')

france - french = england - english


In [75]:
find_analogies_glove('japan', 'japanese', 'chinese')

japan - japanese = china - chinese


In [76]:
find_analogies_glove('japan', 'japanese', 'italian')

japan - japanese = italy - italian


In [77]:
find_analogies_glove('december', 'november', 'june')

december - november = july - june


In [78]:
find_analogies_glove('miami', 'florida', 'texas')

miami - florida = dallas - texas


In [79]:
find_analogies_glove('einstein', 'scientist', 'painter')

einstein - scientist = picasso - painter


In [80]:
find_analogies_glove('man', 'woman', 'she')

man - woman = he - she


In [81]:
find_analogies_glove('man', 'woman', 'aunt')

man - woman = uncle - aunt


In [82]:
find_analogies_glove('man', 'woman', 'sister')

man - woman = brother - sister


In [83]:
find_analogies_glove('man', 'woman', 'wife')

man - woman = brother - wife


In [84]:
find_analogies_glove('man', 'woman', 'actress')

man - woman = actor - actress


In [85]:
find_analogies_glove('man', 'woman', 'mother')

man - woman = father - mother


In [86]:
find_analogies_glove('nephew', 'niece', 'aunt')

nephew - niece = uncle - aunt


In [87]:
def nearest_neighbors_glove(w):
  """Print the words closest to ``w`` in the GloVe model ``word_vectors_glove``.

  Args:
    w: the query word.

  Returns:
    None. A header line is printed, followed by one tab-indented neighbour
    per line, in the order returned by ``most_similar``.
  """
  print_header = "neighbors of: %s" % w
  # each hit unpacks to a (word, score) pair; the score is not displayed
  hits = word_vectors_glove.most_similar(positive=[w])
  print(print_header)
  for hit_word, _hit_score in hits:
    print("\t%s" % hit_word)

In [88]:
nearest_neighbors_glove('king')

neighbors of: king
	queen
	prince
	monarch
	kingdom
	throne
	ii
	iii
	crown
	reign
	kings


In [89]:
nearest_neighbors_glove('france')

neighbors of: france
	french
	paris
	belgium
	spain
	italy
	germany
	prohertrib
	britain
	chirac
	switzerland


In [90]:
nearest_neighbors_glove('japan')

neighbors of: japan
	japanese
	tokyo
	korea
	china
	asia
	osaka
	hashimoto
	taiwan
	philippines
	thailand


In [91]:
nearest_neighbors_glove('einstein')

neighbors of: einstein
	relativity
	bohr
	physicists
	heisenberg
	sigmund
	freud
	equations
	theory
	physics
	physicist


In [92]:
nearest_neighbors_glove('woman')

neighbors of: woman
	girl
	man
	mother
	she
	her
	female
	herself
	person
	women
	wife


In [93]:
nearest_neighbors_glove('nephew')

neighbors of: nephew
	brother
	cousin
	grandson
	uncle
	son
	niece
	eldest
	brother-in-law
	son-in-law
	father


In [94]:
nearest_neighbors_glove('february')

neighbors of: february
	october
	december
	january
	november
	april
	august
	september
	june
	july
	march
