**1. FastText Word Embeddings**

In [1]:
# Install gensim and load pre-trained FastText model
!pip install gensim



In [17]:
import gensim.downloader as api

# Load the pre-trained FastText vectors through gensim's downloader.
# The "-300" in the dataset name matches the vector size: the 'cricket'
# lookup further down reports shape (300,).
fasttext_model = api.load("fasttext-wiki-news-subwords-300")



In [18]:
# Pairs of related words (base form vs. derived form) to score
word_pairs = [('learn', 'learning'), ('pakistan', 'pakistani'), ('fame', 'famous')]

# Report the FastText similarity of every pair, rounded to three decimals
for first, second in word_pairs:
    score = fasttext_model.similarity(first, second)
    print(f"Similarity between '{first}' and '{second}' using FastText: {score:.3f}")

Similarity between 'learn' and 'learning' using FastText: 0.642
Similarity between 'pakistan' and 'pakistani' using FastText: 0.784
Similarity between 'fame' and 'famous' using FastText: 0.519


In [19]:
# Look up the embedding vector stored for the word 'cricket'
fasttext_model['cricket']

array([-5.6223e-02, -1.3269e-01,  8.0792e-02,  4.8387e-03, -7.2780e-02,
       -1.1366e-01,  3.2758e-02, -1.0579e-01,  5.5781e-02,  8.1121e-02,
       -2.6884e-02, -5.6836e-02,  3.4722e-02, -1.8339e-02,  2.4359e-02,
        5.4740e-02,  1.2844e-01, -1.1079e-02,  4.5804e-02,  3.4510e-03,
       -8.2912e-03, -1.3103e-02,  6.4645e-02,  6.1126e-02,  9.1010e-02,
       -7.0568e-02,  3.2366e-02, -2.5020e-02,  9.4346e-03, -5.7183e-02,
        5.2997e-02,  4.6864e-02,  4.9109e-02, -3.3272e-02,  5.7041e-02,
        2.2314e-03, -1.8304e-02, -4.2963e-03,  4.2639e-02, -7.5454e-02,
       -5.0088e-02, -1.2924e-01, -4.4175e-02, -7.7405e-02,  4.7315e-03,
       -2.2751e-02,  8.2874e-02, -1.2767e-01, -3.0402e-02,  7.6816e-02,
       -1.2038e-04, -2.2981e-02,  1.0717e-01, -8.3321e-02, -5.9552e-02,
       -3.0353e-02,  2.3399e-02,  9.7110e-03, -1.1167e-01,  2.1441e-02,
        4.9749e-02, -4.7759e-02,  9.3138e-02,  4.5799e-03,  1.0022e-02,
       -1.0357e-01, -5.1326e-02, -5.2696e-03, -2.5817e-03,  1.48

In [28]:
# Check the dimensionality of a single FastText word vector
fasttext_model['cricket'].shape

(300,)

In [20]:
# List the nearest neighbours of 'man' together with their similarity scores
fasttext_model.most_similar('man')

[('woman', 0.8119808435440063),
 ('man--', 0.7323855757713318),
 ('man--and', 0.7230692505836487),
 ('person', 0.7203925848007202),
 ('mad-man', 0.7037578225135803),
 ('guy', 0.6992257833480835),
 ('god-man', 0.69350266456604),
 ('boy-man', 0.6925113797187805),
 ('man--the', 0.6904609203338623),
 ('man-love', 0.687400221824646)]

In [21]:
# Similarity of an unrelated pair, as a point of comparison with the related pairs scored earlier
fasttext_model.similarity('man','bottle')

0.40124792

In [22]:
# Ask the model which word in the list fits least well with the others
fasttext_model.doesnt_match(['PHP','java','monkey'])

'monkey'

**2. GloVe Word Embeddings**

In [24]:
# Load the pre-trained GloVe vectors (6B tokens, 100-dimensional) through gensim's downloader.
# These vectors are 100-dimensional, i.e. smaller than the 300-dimensional FastText vectors above.
glove_model = api.load("glove-wiki-gigaword-100")



In [25]:
# Same related-word pairs as in the FastText section, so the scores are comparable
word_pairs = [('learn', 'learning'), ('pakistan', 'pakistani'), ('fame', 'famous')]

# Report the GloVe similarity of every pair, rounded to three decimals
for first, second in word_pairs:
    score = glove_model.similarity(first, second)
    print(f"Similarity between '{first}' and '{second}' using GloVe: {score:.3f}")

Similarity between 'learn' and 'learning' using GloVe: 0.734
Similarity between 'pakistan' and 'pakistani' using GloVe: 0.813
Similarity between 'fame' and 'famous' using GloVe: 0.468


In [26]:
# Look up the embedding vector stored for the word 'cricket'
glove_model['cricket']

array([-0.55541  ,  0.45894  ,  0.51851  , -0.045938 , -1.4064   ,
        0.49701  , -0.085008 ,  0.63442  , -1.7949   , -0.31881  ,
       -0.13673  , -1.1583   ,  0.45505  ,  0.21464  , -0.21751  ,
       -0.21984  ,  0.60619  ,  0.55812  , -0.01031  ,  0.66228  ,
        0.22206  ,  0.25498  ,  0.8452   , -0.72988  ,  0.26195  ,
        0.26418  ,  0.22577  , -0.051338 ,  0.024459 ,  0.86389  ,
       -0.35585  ,  0.48662  , -0.49752  , -0.44777  , -0.040533 ,
       -0.18376  , -1.32     ,  0.54899  , -1.2289   , -0.22673  ,
       -0.93431  ,  0.78923  ,  0.9565   , -1.3996   ,  1.0314   ,
        0.39573  ,  0.7956   , -0.27184  ,  0.51776  , -1.0387   ,
       -0.38121  ,  0.21772  ,  0.52486  ,  0.63307  , -0.21206  ,
       -1.6741   , -1.3811   ,  0.079469 ,  0.46871  ,  0.29956  ,
       -0.90023  , -0.16781  , -0.30873  ,  0.16586  ,  0.12141  ,
        0.50219  ,  0.049859 ,  0.54896  ,  0.55576  , -0.14683  ,
        0.55657  , -0.0060587,  0.25941  , -0.91918  ,  0.23  

In [29]:
# Check the dimensionality of a single GloVe word vector
glove_model['cricket'].shape

(100,)

In [30]:
# List the nearest neighbours of 'man' together with their similarity scores
glove_model.most_similar('man')

[('woman', 0.8323495388031006),
 ('boy', 0.7914870977401733),
 ('one', 0.7788748741149902),
 ('person', 0.7526816725730896),
 ('another', 0.7522234916687012),
 ('old', 0.7409117221832275),
 ('life', 0.7371697425842285),
 ('father', 0.7370322346687317),
 ('turned', 0.7347694635391235),
 ('who', 0.7345511317253113)]

In [31]:
# Similarity of an unrelated pair, as a point of comparison with the related pairs scored earlier
glove_model.similarity('man','bottle')

0.3487146

In [32]:
# Ask the model which word in the list fits least well with the others
glove_model.doesnt_match(['PHP','java','monkey'])



'monkey'

**3. BERT Word Embeddings**

In [1]:
# Import necessary libraries
from transformers import BertTokenizer, BertModel
import torch

In [2]:
# Load the pre-trained BERT model and its matching tokenizer ('bert-base-uncased' checkpoint).
# NOTE: the name `model` is rebound to the Word2Vec vectors in the Word2Vec section,
# so the BERT cells must be run before that section.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
word_pairs = [('learn', 'learning'), ('pakistan', 'pakistani'), ('fame', 'famous')]

# Compute similarity for each pair of words
for pair in word_pairs:
    # Encode the two words as a batch of two one-word sequences.
    # padding=True keeps the batch rectangular when the two words split into a
    # different number of word pieces; without it the rows cannot be stacked
    # into the single tensor that return_tensors='pt' asks for.
    tokens = tokenizer(list(pair), padding=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**tokens)

    # Extract the [CLS] embedding (sequence position 0) of each of the two inputs.
    # This is a vector for the whole one-word sequence, not for the word piece itself.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    # Cosine similarity between the two [CLS] vectors (row 0 vs. row 1 of the batch)
    similarity = torch.nn.functional.cosine_similarity(cls_embedding[0], cls_embedding[1], dim=0)

    print(f"Similarity between '{pair[0]}' and '{pair[1]}' using BERT: {similarity:.3f}")

Similarity between 'learn' and 'learning' using BERT: 0.930
Similarity between 'pakistan' and 'pakistani' using BERT: 0.913
Similarity between 'fame' and 'famous' using BERT: 0.956


In [7]:
# Tokenize the word "cricket"
tokens = tokenizer("cricket", return_tensors='pt')

# Get the model's output (no gradients are needed for a plain forward pass)
with torch.no_grad():
    outputs = model(**tokens)

# Take the hidden state at sequence position 0.
# NOTE(review): the similarity cell above describes this same slice as the [CLS]
# embedding, so position 0 is presumably the [CLS] token rather than the
# "cricket" word piece itself — confirm, and index the word-piece positions
# instead if a per-word vector is wanted.
embedding = outputs.last_hidden_state[:, 0, :]
print(embedding)

tensor([[-2.5019e-01,  2.1354e-02, -4.3675e-01, -5.8682e-01,  3.4105e-02,
         -2.8385e-02,  1.3506e-01,  5.2249e-01, -4.7748e-01,  2.8922e-02,
         -2.7474e-01, -3.2879e-01, -1.1539e-01,  3.5818e-01,  1.4294e-01,
          3.1087e-01, -5.2249e-01,  4.1769e-01,  4.6940e-01, -2.7974e-01,
         -1.2542e-01, -1.4642e-01, -4.5150e-01, -2.7796e-01,  6.0341e-02,
         -2.4179e-01,  3.6778e-02,  6.8280e-02, -2.7100e-02, -1.8102e-02,
          7.1989e-02,  3.1395e-01, -3.4609e-02,  3.3215e-01, -1.1463e-01,
         -2.6011e-01,  3.0103e-02, -2.6199e-01,  4.4755e-02,  2.2186e-01,
         -3.6590e-02, -2.0875e-01,  1.1753e-01, -2.0598e-01,  3.3432e-01,
          4.8508e-02, -2.0608e+00, -8.8270e-02, -3.1192e-01,  4.4332e-02,
          1.8787e-01,  3.8772e-01,  4.3683e-01,  9.3089e-02,  2.0869e-01,
          4.8024e-01,  5.3258e-03,  5.7429e-01,  1.8835e-01,  1.2618e-01,
          1.7460e-01, -9.7302e-02, -1.2189e-01,  3.8829e-02,  3.7123e-01,
          2.5130e-01,  3.6287e-02,  3.

In [8]:
# Shape of the extracted vector: a batch of one, with the hidden size as the second dimension
embedding.shape

torch.Size([1, 768])

**4. Word2Vec Word Embeddings**

In [4]:
import gensim
from gensim.models import Word2Vec,KeyedVectors

In [5]:
import gensim.downloader

# Show the names of all pre-trained models that gensim's downloader knows about
print(list(gensim.downloader.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [6]:
# The downloader returns the Word2Vec Google News vectors already loaded as a KeyedVectors object.
# NOTE: this rebinds `model`, which held the BERT model in the previous section.
model = gensim.downloader.load('word2vec-google-news-300')



In [7]:
# `model` holds the Word2Vec vector representations of 3 million words; each vector has 300 dimensions.
# Look up the vector stored for 'cricket'.
model['cricket']

array([-3.67187500e-01, -1.21582031e-01,  2.85156250e-01,  8.15429688e-02,
        3.19824219e-02, -3.19824219e-02,  1.34765625e-01, -2.73437500e-01,
        9.46044922e-03, -1.07421875e-01,  2.48046875e-01, -6.05468750e-01,
        5.02929688e-02,  2.98828125e-01,  9.57031250e-02,  1.39648438e-01,
       -5.41992188e-02,  2.91015625e-01,  2.85156250e-01,  1.51367188e-01,
       -2.89062500e-01, -3.46679688e-02,  1.81884766e-02, -3.92578125e-01,
        2.46093750e-01,  2.51953125e-01, -9.86328125e-02,  3.22265625e-01,
        4.49218750e-01, -1.36718750e-01, -2.34375000e-01,  4.12597656e-02,
       -2.15820312e-01,  1.69921875e-01,  2.56347656e-02,  1.50146484e-02,
       -3.75976562e-02,  6.95800781e-03,  4.00390625e-01,  2.09960938e-01,
        1.17675781e-01, -4.19921875e-02,  2.34375000e-01,  2.03125000e-01,
       -1.86523438e-01, -2.46093750e-01,  3.12500000e-01, -2.59765625e-01,
       -1.06933594e-01,  1.04003906e-01, -1.79687500e-01,  5.71289062e-02,
       -7.41577148e-03, -

In [8]:
# Check the dimensionality of a single Word2Vec word vector
model['cricket'].shape

(300,)

In [9]:
# Nearest neighbours of 'man'. Crime-related terms such as 'robber' show up in the list,
# presumably because 'man' often appears in crime reports in the Google News text.
model.most_similar('man')

[('woman', 0.7664012908935547),
 ('boy', 0.6824871301651001),
 ('teenager', 0.6586930155754089),
 ('teenage_girl', 0.6147903203964233),
 ('girl', 0.5921714305877686),
 ('suspected_purse_snatcher', 0.571636438369751),
 ('robber', 0.5585119128227234),
 ('Robbery_suspect', 0.5584409832954407),
 ('teen_ager', 0.5549196600914001),
 ('men', 0.5489763021469116)]

In [10]:
# List the nearest neighbours of 'cricket' together with their similarity scores
model.most_similar('cricket')

[('cricketing', 0.8372225761413574),
 ('cricketers', 0.8165745735168457),
 ('Test_cricket', 0.8094819188117981),
 ('Twenty##_cricket', 0.8068488240242004),
 ('Twenty##', 0.7624265551567078),
 ('Cricket', 0.75413978099823),
 ('cricketer', 0.7372578382492065),
 ('twenty##', 0.7316356897354126),
 ('T##_cricket', 0.7304614186286926),
 ('West_Indies_cricket', 0.6987985968589783)]

In [11]:
# Looks up the vectors of the two given words and computes the cosine similarity between them
model.similarity('man','bottle')

0.18008712

In [12]:
# Pass a list of words and get back the one that does not match the rest:
# 'monkey' is not similar to 'PHP' and 'java'
model.doesnt_match(['PHP','java','monkey'])

'monkey'

In [13]:
# Vector arithmetic: "king - man + woman" should land near words like "queen" or "princess".
# Removing the 'man' vector from 'king' and adding 'woman' gives a query vector whose
# top 10 most similar words include 'queen' and 'princess'.
# Because the query is the raw result vector, the source word 'king' itself comes back as the top match.
# NOTE(review): most_similar(positive=['king', 'woman'], negative=['man']) should leave the
# query words out of the results — confirm against the gensim documentation.
vec = model['king'] - model['man'] + model['woman']
model.most_similar([vec])

[('king', 0.8449392318725586),
 ('queen', 0.7300517559051514),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676352500916),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376775860786438),
 ('Queen_Consort', 0.5344247817993164),
 ('queens', 0.5289887189865112)]

In [14]:
# Same analogy idea applied to 'PKR': subtract 'Pakistan', add 'England', then list the
# words closest to the resulting vector.
# In the recorded output the neighbours are political-party terms such as
# 'Parti_Keadilan_Rakyat_PKR' and 'UMNO', so the token 'PKR' presumably refers to that
# party in this corpus rather than to a currency.
vec = model['PKR'] - model['Pakistan'] + model['England']
model.most_similar([vec])

[('PKR', 0.700527548789978),
 ('Parti_Keadilan_Rakyat_PKR', 0.49439042806625366),
 ('Parti_Keadilan_Rakyat', 0.479891300201416),
 ('Pakatan', 0.4773881733417511),
 ('Keadilan', 0.47646868228912354),
 ('BN', 0.4756278693675995),
 ('exco', 0.46868693828582764),
 ('Umno', 0.4659350514411926),
 ('UMNO', 0.46452951431274414),
 ('Barisan', 0.46387091279029846)]