# **Go through link to know more about word embedding**
https://www.analyticsvidhya.com/blog/2021/06/part-7-step-by-step-guide-to-master-nlp-word-embedding/


In [1]:
## pretrained GloVe embeddings model
## check more at https://github.com/RaRe-Technologies/gensim-data

import gensim.downloader as api
model = api.load("glove-wiki-gigaword-50")

In [2]:
## any word vector can be accessed by pasing word like below
v_king = model['king']
v_queen = model['queen']

print("Length/Dimension of vector :" , len(v_king))
print("Length/Dimension of vector :" , len(v_queen))
print(" ")
print("vector")
print(v_king)

Length/Dimension of vector : 50
Length/Dimension of vector : 50
 
vector
[ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012  -0.076666  1.493    -0.034189 -0.98173
  0.68229   0.81722  -0.51874  -0.31503  -0.55809   0.66421   0.1961
 -0.13495  -0.11476  -0.30344   0.41177  -2.223    -1.0756   -1.0783
 -0.34354   0.33505   1.9927   -0.04234  -0.64319   0.71125   0.49159
  0.16754   0.34344  -0.25663  -0.8523    0.1661    0.40102   1.1685
 -1.0137   -0.21585  -0.15155   0.78321  -0.91241  -1.6106   -0.64426
 -0.51042 ]


**Check Similarity of words using word vector**




In [3]:
## check similarity beween two words/word-vectors
print("similarity:", model.similarity('king', 'queen'))

similarity: 0.7839043


In [4]:
##most similar words to king
model.most_similar('king', topn=5)

[('prince', 0.8236179351806641),
 ('queen', 0.7839042544364929),
 ('ii', 0.7746230363845825),
 ('emperor', 0.7736247181892395),
 ('son', 0.766719400882721)]

In [5]:
w1=model['science']
w2= model['physics']
w3 = model['cricket']
w4 = model['chemistry']

model.cosine_similarities(w1, [w2, w3, w4])

array([0.8314354 , 0.27207178, 0.8080922 ], dtype=float32)

In [6]:
## Vector operation on word vector gives correct new word vector
## e.g. King+Woman-Man = Queen
print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=3))


[('queen', 0.8523603677749634), ('throne', 0.7664334177970886), ('prince', 0.759214460849762)]


In [7]:
model.most_similar(positive=['india', 'capital'], topn=5)

[('indian', 0.8180676698684692),
 ('central', 0.8107988834381104),
 ('delhi', 0.8071692585945129),
 ('indonesia', 0.8021898865699768),
 ('pakistan', 0.8015152812004089)]

# **Training and Evaluate Your Own Embeddings**

In [8]:
import urllib
import re

# change to your own path if you have downloaded the file locally
url = 'https://dataskat.s3.eu-west-3.amazonaws.com/data/Shakespeare_alllines.txt'
# read file into list of lines
lines = urllib.request.urlopen(url).read().decode('utf-8').split("\n")

In [9]:
sentences = []
for line in lines:
    # remove punctuation
    line = re.sub(r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]','',line).strip()
    # tokenizer
    tokens = re.findall(r'\b\w+\b', line)
    if len(tokens) > 1:
        sentences.append(tokens)

In [10]:
sentences[0:10]

[['ACT', 'I'],
 ['SCENE', 'I', 'London', 'The', 'palace'],
 ['Enter',
  'KING',
  'HENRY',
  'LORD',
  'JOHN',
  'OF',
  'LANCASTER',
  'the',
  'EARL',
  'of',
  'WESTMORELAND',
  'SIR',
  'WALTER',
  'BLUNT',
  'and',
  'others'],
 ['So', 'shaken', 'as', 'we', 'are', 'so', 'wan', 'with', 'care'],
 ['Find', 'we', 'a', 'time', 'for', 'frighted', 'peace', 'to', 'pant'],
 ['And', 'breathe', 'shortwinded', 'accents', 'of', 'new', 'broils'],
 ['To', 'be', 'commenced', 'in', 'strands', 'afar', 'remote'],
 ['No', 'more', 'the', 'thirsty', 'entrance', 'of', 'this', 'soil'],
 ['Shall',
  'daub',
  'her',
  'lips',
  'with',
  'her',
  'own',
  'children',
  's',
  'blood'],
 ['Nor', 'more', 'shall', 'trenching', 'war', 'channel', 'her', 'fields']]

## **Train model using Gensim**

In [11]:
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

In [12]:
from gensim.models import Word2Vec

cust_model = Word2Vec(sentences,       # tokenized input sentences
                 size=100,    # size of word vectors (default 100)
                 window=2,    # context window size (default 5)
                 sg=1,        # use skip-gram (default 0 = CBOW)
                 negative=5,  # number of negative samples (default 5)
                 min_count=5, # ignore infrequent words (default 5)
                 workers=4,   # number of threads (default 3)
                 iter=5)      # number of epochs (default 5)


2022-01-20 11:36:46,511: INFO: collecting all words and their counts
2022-01-20 11:36:46,513: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-01-20 11:36:46,540: INFO: PROGRESS: at sentence #10000, processed 79207 words, keeping 8710 word types
2022-01-20 11:36:46,563: INFO: PROGRESS: at sentence #20000, processed 156921 words, keeping 12509 word types
2022-01-20 11:36:46,586: INFO: PROGRESS: at sentence #30000, processed 230335 words, keeping 15538 word types
2022-01-20 11:36:46,609: INFO: PROGRESS: at sentence #40000, processed 306343 words, keeping 18757 word types
2022-01-20 11:36:46,632: INFO: PROGRESS: at sentence #50000, processed 381777 words, keeping 20753 word types
2022-01-20 11:36:46,655: INFO: PROGRESS: at sentence #60000, processed 455792 words, keeping 23041 word types
2022-01-20 11:36:46,680: INFO: PROGRESS: at sentence #70000, processed 535002 words, keeping 24926 word types
2022-01-20 11:36:46,704: INFO: PROGRESS: at sentence #80000, proce

In [13]:
cust_model.wv.most_similar("king")

2022-01-20 11:36:56,766: INFO: precomputing L2-norms of word weight vectors


[('duke', 0.8617936372756958),
 ('queen', 0.8092128038406372),
 ('prince', 0.7938957214355469),
 ('doctor', 0.7876814007759094),
 ('Moor', 0.7867355346679688),
 ('Jew', 0.7733607292175293),
 ('Caesar', 0.767480194568634),
 ('cardinal', 0.7622534036636353),
 ('shepherd', 0.7612801790237427),
 ('emperor', 0.7564812898635864)]

In [14]:
## save model
##model.save('./word2vec_custom_model.bin')

##load model
##model = Word2Vec.load('./word2vec_custom_model.bin')

##**Train Word2Vec , FastText , Custom model**

In [15]:
from gensim.models import Word2Vec, FastText
import numpy as np

## change param grid 
param_grid = {'w2v': {'variant': ['cbow', 'sg'], 'window': [2,3 ]},
              'ft': {'variant': ['sg'], 'window': [5]},
               'cust':{'variant': ['sg'] ,'window': [2]}
              }
size = 100
models=[]
index=0
for algo, params in param_grid.items(): 

    for variant in params['variant']:
        sg = 1 if variant == 'sg' else 0
        for window in params['window']:
            index+=1
            print("")
            print(f"===>>Model : {index}, Algo : {algo} , Variant: {variant}, Window: {window}, Size: {size}") ###
            print("==========================================================================================")
            print("")
            np.random.seed(1) ### to ensure repeatability
            if algo == 'w2v':
                model = Word2Vec(sentences, size=size, window=window, sg=sg)
            elif algo=='cust':
                model = cust_model
            else:
                model = FastText(sentences, size=size, window=window, sg=sg)

            models.append(model)

2022-01-20 11:36:56,829: INFO: collecting all words and their counts
2022-01-20 11:36:56,831: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-01-20 11:36:56,864: INFO: PROGRESS: at sentence #10000, processed 79207 words, keeping 8710 word types
2022-01-20 11:36:56,897: INFO: PROGRESS: at sentence #20000, processed 156921 words, keeping 12509 word types
2022-01-20 11:36:56,918: INFO: PROGRESS: at sentence #30000, processed 230335 words, keeping 15538 word types
2022-01-20 11:36:56,940: INFO: PROGRESS: at sentence #40000, processed 306343 words, keeping 18757 word types
2022-01-20 11:36:56,961: INFO: PROGRESS: at sentence #50000, processed 381777 words, keeping 20753 word types
2022-01-20 11:36:56,986: INFO: PROGRESS: at sentence #60000, processed 455792 words, keeping 23041 word types
2022-01-20 11:36:57,007: INFO: PROGRESS: at sentence #70000, processed 535002 words, keeping 24926 word types



===>>Model : 1, Algo : w2v , Variant: cbow, Window: 2, Size: 100



2022-01-20 11:36:57,039: INFO: PROGRESS: at sentence #80000, processed 610642 words, keeping 26451 word types
2022-01-20 11:36:57,068: INFO: PROGRESS: at sentence #90000, processed 687819 words, keeping 27905 word types
2022-01-20 11:36:57,089: INFO: PROGRESS: at sentence #100000, processed 763170 words, keeping 29478 word types
2022-01-20 11:36:57,113: INFO: collected 30675 word types from a corpus of 830015 raw words and 108774 sentences
2022-01-20 11:36:57,115: INFO: Loading a fresh vocabulary
2022-01-20 11:36:57,156: INFO: effective_min_count=5 retains 9106 unique words (29% of original 30675, drops 21569)
2022-01-20 11:36:57,157: INFO: effective_min_count=5 leaves 795574 word corpus (95% of original 830015, drops 34441)
2022-01-20 11:36:57,193: INFO: deleting the raw counts dictionary of 30675 items
2022-01-20 11:36:57,196: INFO: sample=0.001 downsamples 55 most-common words
2022-01-20 11:36:57,198: INFO: downsampling leaves estimated 619881 word corpus (77.9% of prior 795574)
202


===>>Model : 2, Algo : w2v , Variant: cbow, Window: 3, Size: 100



2022-01-20 11:37:04,269: INFO: PROGRESS: at sentence #80000, processed 610642 words, keeping 26451 word types
2022-01-20 11:37:04,292: INFO: PROGRESS: at sentence #90000, processed 687819 words, keeping 27905 word types
2022-01-20 11:37:04,334: INFO: PROGRESS: at sentence #100000, processed 763170 words, keeping 29478 word types
2022-01-20 11:37:04,359: INFO: collected 30675 word types from a corpus of 830015 raw words and 108774 sentences
2022-01-20 11:37:04,361: INFO: Loading a fresh vocabulary
2022-01-20 11:37:04,392: INFO: effective_min_count=5 retains 9106 unique words (29% of original 30675, drops 21569)
2022-01-20 11:37:04,394: INFO: effective_min_count=5 leaves 795574 word corpus (95% of original 830015, drops 34441)
2022-01-20 11:37:04,425: INFO: deleting the raw counts dictionary of 30675 items
2022-01-20 11:37:04,428: INFO: sample=0.001 downsamples 55 most-common words
2022-01-20 11:37:04,430: INFO: downsampling leaves estimated 619881 word corpus (77.9% of prior 795574)
202


===>>Model : 3, Algo : w2v , Variant: sg, Window: 2, Size: 100



2022-01-20 11:37:11,543: INFO: PROGRESS: at sentence #80000, processed 610642 words, keeping 26451 word types
2022-01-20 11:37:11,568: INFO: PROGRESS: at sentence #90000, processed 687819 words, keeping 27905 word types
2022-01-20 11:37:11,592: INFO: PROGRESS: at sentence #100000, processed 763170 words, keeping 29478 word types
2022-01-20 11:37:11,612: INFO: collected 30675 word types from a corpus of 830015 raw words and 108774 sentences
2022-01-20 11:37:11,614: INFO: Loading a fresh vocabulary
2022-01-20 11:37:11,646: INFO: effective_min_count=5 retains 9106 unique words (29% of original 30675, drops 21569)
2022-01-20 11:37:11,655: INFO: effective_min_count=5 leaves 795574 word corpus (95% of original 830015, drops 34441)
2022-01-20 11:37:11,690: INFO: deleting the raw counts dictionary of 30675 items
2022-01-20 11:37:11,692: INFO: sample=0.001 downsamples 55 most-common words
2022-01-20 11:37:11,697: INFO: downsampling leaves estimated 619881 word corpus (77.9% of prior 795574)
202


===>>Model : 4, Algo : w2v , Variant: sg, Window: 3, Size: 100



2022-01-20 11:37:21,741: INFO: PROGRESS: at sentence #100000, processed 763170 words, keeping 29478 word types
2022-01-20 11:37:21,761: INFO: collected 30675 word types from a corpus of 830015 raw words and 108774 sentences
2022-01-20 11:37:21,763: INFO: Loading a fresh vocabulary
2022-01-20 11:37:21,797: INFO: effective_min_count=5 retains 9106 unique words (29% of original 30675, drops 21569)
2022-01-20 11:37:21,799: INFO: effective_min_count=5 leaves 795574 word corpus (95% of original 830015, drops 34441)
2022-01-20 11:37:21,832: INFO: deleting the raw counts dictionary of 30675 items
2022-01-20 11:37:21,834: INFO: sample=0.001 downsamples 55 most-common words
2022-01-20 11:37:21,835: INFO: downsampling leaves estimated 619881 word corpus (77.9% of prior 795574)
2022-01-20 11:37:21,869: INFO: estimated required memory for 9106 words and 100 dimensions: 11837800 bytes
2022-01-20 11:37:21,871: INFO: resetting layer weights
2022-01-20 11:37:23,809: INFO: training model with 3 workers 


===>>Model : 5, Algo : ft , Variant: sg, Window: 5, Size: 100



2022-01-20 11:37:33,117: INFO: PROGRESS: at sentence #90000, processed 687819 words, keeping 27905 word types
2022-01-20 11:37:33,137: INFO: PROGRESS: at sentence #100000, processed 763170 words, keeping 29478 word types
2022-01-20 11:37:33,162: INFO: collected 30675 word types from a corpus of 830015 raw words and 108774 sentences
2022-01-20 11:37:33,165: INFO: Loading a fresh vocabulary
2022-01-20 11:37:33,200: INFO: effective_min_count=5 retains 9106 unique words (29% of original 30675, drops 21569)
2022-01-20 11:37:33,204: INFO: effective_min_count=5 leaves 795574 word corpus (95% of original 830015, drops 34441)
2022-01-20 11:37:33,243: INFO: deleting the raw counts dictionary of 30675 items
2022-01-20 11:37:33,245: INFO: sample=0.001 downsamples 55 most-common words
2022-01-20 11:37:33,249: INFO: downsampling leaves estimated 619881 word corpus (77.9% of prior 795574)
2022-01-20 11:37:33,384: INFO: estimated required memory for 9106 words, 63081 buckets and 100 dimensions: 388978


===>>Model : 6, Algo : cust , Variant: sg, Window: 2, Size: 100



##**Compare Word2Vec, FastText and Custom model**

In [16]:
import pandas as pd

def compare_models(models, **kwargs):
    index=0
    df = pd.DataFrame()
    for m in models:
        index+=1
        name= ("Model : " + str(index) )
        df[name] = [f"{word} {score:.3f}"
                    for word, score in model.wv.most_similar(**kwargs)]
    df.index = df.index + 1 # let row index start at 1
    return df

In [17]:
compare_models(models, positive='king', topn=10)

Unnamed: 0,Model : 1,Model : 2,Model : 3,Model : 4,Model : 5,Model : 6
1,duke 0.862,duke 0.862,duke 0.862,duke 0.862,duke 0.862,duke 0.862
2,queen 0.809,queen 0.809,queen 0.809,queen 0.809,queen 0.809,queen 0.809
3,prince 0.794,prince 0.794,prince 0.794,prince 0.794,prince 0.794,prince 0.794
4,doctor 0.788,doctor 0.788,doctor 0.788,doctor 0.788,doctor 0.788,doctor 0.788
5,Moor 0.787,Moor 0.787,Moor 0.787,Moor 0.787,Moor 0.787,Moor 0.787
6,Jew 0.773,Jew 0.773,Jew 0.773,Jew 0.773,Jew 0.773,Jew 0.773
7,Caesar 0.767,Caesar 0.767,Caesar 0.767,Caesar 0.767,Caesar 0.767,Caesar 0.767
8,cardinal 0.762,cardinal 0.762,cardinal 0.762,cardinal 0.762,cardinal 0.762,cardinal 0.762
9,shepherd 0.761,shepherd 0.761,shepherd 0.761,shepherd 0.761,shepherd 0.761,shepherd 0.761
10,emperor 0.756,emperor 0.756,emperor 0.756,emperor 0.756,emperor 0.756,emperor 0.756


In [18]:
!pip3 install umap-learn



In [19]:
from umap import UMAP

model =cust_model
words = model.wv.vocab
wv = [model[word] for word in words]

reducer = UMAP(n_components=2, metric='cosine', n_neighbors = 15, min_dist=0.1, random_state = 12)
reduced_wv = reducer.fit_transform(wv)

  """


In [20]:
!pip install plotly==4.8.2



In [33]:
import plotly.express as px


px.defaults.template = "plotly_white" ### plotly style

plot_df = pd.DataFrame.from_records(reduced_wv, columns=['x', 'y'])
plot_df['word'] = words
params = {'hover_data': {c: False for c in plot_df.columns}, 
          'hover_name': 'word'}
params.update({'width': 800, 'height': 600}) ###


fig = px.scatter(plot_df, x="x", y="y", opacity=0.3, size_max=3, **params)
fig.update_traces(marker={'line': {'width': 0}}) ###
fig.update_xaxes(showticklabels=False, showgrid=True, zeroline=False, visible=True) ###
fig.update_yaxes(showticklabels=False, showgrid=True, zeroline=False, visible=True) ###

fig.update_layout(
    autosize=False,
    width=1000,
    height=1000,
    yaxis=dict(
        title_text="",
        ticktext=["Very long label", "long label", "3", "label"],
        tickvals=[1, 2, 3, 4],
        tickmode="array",
        titlefont=dict(size=50)))

fig.show()