In [14]:
# Word Embeddings (Word2Vec)
# Fit a Word2Vec model on one tokenized sentence and look up the vector
# stored for a single vocabulary word.
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

text = "I love programming and solving problems"

# Word2Vec consumes an iterable of tokenized sentences, so the single token
# list is wrapped in an outer list to form a one-sentence corpus.
tokens = word_tokenize(text)
w2v = Word2Vec(
    sentences=[tokens],
    min_count=1,  # keep every word, even those that occur only once
)

# Fetch the vector stored for the word 'programming'.
w2v_embedding = w2v.wv.get_vector('programming')
w2v_embedding

array([-8.2426779e-03,  9.2993546e-03, -1.9766092e-04, -1.9672764e-03,
        4.6036304e-03, -4.0953159e-03,  2.7431143e-03,  6.9399667e-03,
        6.0654259e-03, -7.5107943e-03,  9.3823504e-03,  4.6718083e-03,
        3.9661205e-03, -6.2435055e-03,  8.4599797e-03, -2.1501649e-03,
        8.8251876e-03, -5.3620026e-03, -8.1294188e-03,  6.8245591e-03,
        1.6711927e-03, -2.1985089e-03,  9.5136007e-03,  9.4938548e-03,
       -9.7740470e-03,  2.5052286e-03,  6.1566923e-03,  3.8724565e-03,
        2.0227872e-03,  4.3050171e-04,  6.7363144e-04, -3.8206363e-03,
       -7.1402504e-03, -2.0888723e-03,  3.9238976e-03,  8.8186832e-03,
        9.2591504e-03, -5.9759365e-03, -9.4026709e-03,  9.7643770e-03,
        3.4297847e-03,  5.1661171e-03,  6.2823449e-03, -2.8042626e-03,
        7.3227035e-03,  2.8302716e-03,  2.8710044e-03, -2.3803699e-03,
       -3.1282497e-03, -2.3701417e-03,  4.2764368e-03,  7.6057913e-05,
       -9.5842788e-03, -9.6655441e-03, -6.1481940e-03, -1.2856961e-04,
      

In [15]:
# Rank the other vocabulary words by similarity to the query word.
# NOTE(review): `w2v` was fit on a single six-token sentence, so these
# scores are presumably only useful as a demonstration of the API.
query_word = 'programming'
similar_words = w2v.wv.most_similar(positive=[query_word], topn=5)

similar_words

[('I', 0.13149002194404602),
 ('solving', 0.06797593086957932),
 ('and', -0.013514931313693523),
 ('love', -0.04461710527539253),
 ('problems', -0.1116705983877182)]

In [16]:
# Contextual Embeddings (BERT)
# Run the sentence in `text` through a pretrained BERT encoder and keep the
# final hidden layer as the per-token embeddings.
from transformers import BertTokenizer, BertModel
import torch

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ce = BertModel.from_pretrained('bert-base-uncased')

# return_tensors='pt' yields PyTorch tensors that can be unpacked straight
# into the model call.
ce_inputs = bert_tokenizer(text, return_tensors='pt')

# Embedding extraction is inference only. Disabling autograd avoids building
# a computation graph, saves memory, and leaves `ce_embeddings` as a plain
# tensor (no grad_fn) that can be converted or compared without .detach().
with torch.no_grad():
    ce_outputs = ce(**ce_inputs)

ce_embeddings = ce_outputs.last_hidden_state
ce_embeddings


tensor([[[ 0.1407,  0.1971, -0.4170,  ..., -0.1294,  0.2620,  0.5057],
         [ 0.8725,  0.1432, -0.6234,  ...,  0.1477,  0.5541,  0.3443],
         [ 1.0669,  1.2485,  0.1577,  ...,  0.0895,  0.4967,  0.0659],
         ...,
         [-0.3901,  0.3799,  0.6719,  ..., -0.0116, -0.6614, -0.3459],
         [-0.5052,  0.5227, -0.2586,  ...,  0.4635, -0.0789, -0.2289],
         [ 0.7185, -0.1601, -0.2851,  ...,  0.1205, -0.6713, -0.2215]]])

In [17]:
# Round-trip check: map the BERT input ids back to text, dropping any
# special tokens.
bert_token_ids = ce_inputs['input_ids'][0]  # first sequence in the batch
decoded_sentence = bert_tokenizer.decode(bert_token_ids, skip_special_tokens=True)

decoded_sentence

'i love programming and solving problems'

In [18]:
# Sentence Embeddings (Sentence-BERT)
# Encode whole sentences into vectors with a pretrained SentenceTransformer.
from sentence_transformers import SentenceTransformer

# Two short sentences to embed; they are compared in the next cell.
sentences = [
    "I love programming",
    "I enjoy coding",
]

st = SentenceTransformer('all-MiniLM-L6-v2')
st_embeddings = st.encode(sentences)  # one row per input sentence

st_embeddings

array([[-3.61786596e-02, -1.27737075e-02,  3.00630671e-03,
        -1.69034563e-02,  9.48425848e-03, -6.51517287e-02,
         9.37663764e-02,  7.14234710e-02,  1.85262896e-02,
         5.35826907e-02, -8.24747756e-02,  1.80089399e-02,
         3.44095752e-02,  1.51711507e-02,  3.18937674e-02,
        -1.89523716e-02, -8.73143151e-02, -3.99797037e-02,
        -5.35775488e-03, -7.61753768e-02, -1.42133236e-01,
        -6.11381140e-03, -4.14068177e-02, -3.20393480e-02,
         5.18483780e-02,  9.73950624e-02, -1.16528745e-03,
        -4.83810306e-02,  1.42009296e-02, -7.73504227e-02,
        -9.08908844e-02,  9.55949202e-02,  8.43655765e-02,
         6.60044551e-02,  1.65076889e-02,  5.63961603e-02,
         8.12914073e-02, -6.16290495e-02,  8.56058020e-03,
         5.73751982e-03, -1.09042332e-01,  5.53962328e-02,
         3.23738493e-02, -3.92971449e-02, -4.51566046e-03,
        -3.88993919e-02, -3.04347947e-02, -5.73263951e-02,
         8.27749372e-02,  3.39200869e-02,  1.93316967e-0

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity operates on 2-D inputs, so each sentence embedding is
# passed as a one-row slice. The result is a 1x1 matrix whose only entry is
# the similarity between the two sentences.
similarity_score = cosine_similarity(st_embeddings[0:1], st_embeddings[1:2])
similarity_score[0, 0]

0.8172091

In [20]:
# Transformer-based Embeddings (GPT-2)
# Run the sentence in `text` through a pretrained GPT-2 model and keep the
# final hidden layer as the per-token embeddings.
import torch  # imported here so this cell does not rely on an earlier cell
from transformers import GPT2Tokenizer, GPT2Model

gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2Model.from_pretrained('gpt2')

# return_tensors='pt' yields PyTorch tensors that can be unpacked straight
# into the model call.
gpt_inputs = gpt_tokenizer(text, return_tensors='pt')

# Embedding extraction is inference only. Disabling autograd avoids building
# a computation graph, saves memory, and leaves `gpt_embeddings` as a plain
# tensor (no grad_fn) that can be converted or compared without .detach().
with torch.no_grad():
    gpt_outputs = gpt_model(**gpt_inputs)

gpt_embeddings = gpt_outputs.last_hidden_state
gpt_embeddings

tensor([[[-0.0796, -0.0654, -0.0842,  ..., -0.1442, -0.0456,  0.0143],
         [ 0.0757, -0.0351, -0.4318,  ..., -0.3136,  0.1932,  0.1943],
         [-0.1802,  0.2625, -1.8288,  ...,  0.0091,  0.3509, -0.1083],
         [ 0.4898, -0.0774, -0.5044,  ..., -0.2248, -0.4967,  0.1105],
         [-0.2229,  1.2018, -0.7035,  ...,  0.0351,  0.3151, -0.3190],
         [-0.0163,  0.0032, -2.9161,  ...,  0.0815, -0.1141,  0.1891]]])

In [21]:
# Round-trip check for GPT-2: turn the input ids back into text.
# NOTE: this reassigns `decoded_sentence`, replacing the value the BERT
# decoding cell stored under the same name.
gpt_token_ids = gpt_inputs['input_ids'][0]  # first sequence in the batch
decoded_sentence = gpt_tokenizer.decode(gpt_token_ids, skip_special_tokens=True)

decoded_sentence

'I love programming and solving problems'