# Implement GloVe using TensorFlow gradient descent

In [1]:
import tensorflow as tf
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, Dot, Reshape

# Toy corpus: one sentence per list entry
corpus = [
    "the cat in the hat",
    "the quick brown fox",
    "the lazy dog",
    # Add more sentences as needed
]

# Fit a Keras tokenizer on the corpus so every distinct word gets an integer id
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size for the embedding tables: number of known words plus one
# extra slot, because the tokenizer's word ids start at 1 and id 0 stays unused
total_words = 1 + len(tokenizer.word_index)

# Generate word pairs for context and target words
def generate_word_pairs(corpus, window_size=1):
    """Build (target, context) training pairs from a list of sentences.

    Each sentence is converted to word ids with the module-level ``tokenizer``.
    Every word is then paired with the words located up to ``window_size``
    positions to its left and to its right.

    Args:
        corpus: Iterable of sentence strings.
        window_size: Number of neighbouring positions considered on each side
            of the target word.

    Returns:
        Integer ``np.ndarray`` of shape ``(n_pairs, 2)`` whose rows are
        ``[target_id, context_id]``; shape ``(0, 2)`` when no pair exists.
    """
    word_pairs = []
    for sentence in corpus:
        words = tokenizer.texts_to_sequences([sentence])[0]
        for i, target_word in enumerate(words):
            # The slice end is exclusive, so "+ 1" is required to reach the
            # word that sits window_size positions to the right of the target.
            window = words[max(0, i - window_size) : i + window_size + 1]
            for context_word in window:
                # Skips the target itself (and any repeat of the same word id).
                if context_word != target_word:
                    word_pairs.append([target_word, context_word])
    # reshape keeps the (n, 2) layout even when the list is empty.
    return np.array(word_pairs, dtype=int).reshape(-1, 2)

# Build the GloVe model
embedding_size = 50  # Choose an appropriate size for your embeddings
context_size = 2  # Context window size

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling are reproducible across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

input_target = Input(shape=(1,))
input_context = Input(shape=(1,))

# Explicit layer names: the trained weights are looked up by name below, and
# Keras' auto-generated names ("embedding", "embedding_1", "embedding_2", ...)
# change every time this cell is re-run in the same kernel.
# input_length is omitted because the Input layers already fix the length to 1.
embedding = Embedding(total_words, embedding_size, name="embedding")(input_target)
context_embedding = Embedding(
    total_words, embedding_size, name="context_embedding"
)(input_context)

# Dot product over the embedding axis:
# (batch, 1, d) . (batch, 1, d) -> (batch, 1, 1), then flattened to (batch, 1).
dot_product = Dot(axes=2)([embedding, context_embedding])
dot_product = Reshape((1,))(dot_product)

# Define the GloVe model
glove_model = Model(inputs=[input_target, input_context], outputs=dot_product)
glove_model.compile(optimizer="adam", loss="mean_squared_error")

# Generate training data
word_pairs = generate_word_pairs(corpus, window_size=context_size)
assert len(word_pairs) > 0, "corpus produced no (target, context) pairs"
target = word_pairs[:, 0].astype("int32")
context = word_pairs[:, 1].astype("int32")
# NOTE(review): every observed pair is regressed towards the constant 1.0 with
# plain MSE. The original GloVe objective instead fits log co-occurrence counts
# with a weighting function and bias terms -- confirm whether this
# simplification is intended.
labels = np.ones(len(word_pairs))

# Train the model
glove_model.fit([target, context], labels, epochs=100, batch_size=32)

# Extract word embeddings (weight matrix of the target-word embedding layer)
word_embeddings = glove_model.get_layer("embedding").get_weights()[0]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [2]:
# Now, word_embeddings contains the trained GloVe embeddings:
# the weight matrix of the "embedding" layer, with one row per tokenizer id
# (total_words rows) and embedding_size columns.
# Leaving it as the last expression makes the notebook display it.
word_embeddings

array([[-0.03011609,  0.03164754, -0.01227959, -0.01718123, -0.02108562,
        -0.04339051, -0.03055278,  0.02313313,  0.02580244,  0.02080562,
         0.02674754,  0.01736661,  0.04634222,  0.02139523, -0.03735017,
         0.02572913, -0.01494846, -0.0322681 ,  0.04311777,  0.02931016,
        -0.00747167, -0.04543829,  0.04021717,  0.04912562, -0.01601272,
         0.00941927,  0.00574072,  0.01799511, -0.01849099,  0.01693786,
        -0.00949676,  0.0082594 ,  0.0120337 , -0.03889183,  0.00913298,
         0.01177325,  0.02934283,  0.04777331,  0.03583412, -0.02616826,
         0.01232226,  0.03500963, -0.04925743,  0.03835196, -0.01760975,
        -0.02751219, -0.04651996,  0.0348387 , -0.01815857, -0.04568649],
       [-0.13559745,  0.13986742,  0.09990193, -0.02728223, -0.16652799,
        -0.10410953,  0.10235941, -0.11882525,  0.13646148,  0.09205011,
         0.15423088, -0.13670278,  0.11939931,  0.10564298, -0.14911763,
        -0.16938724,  0.15442142,  0.10704798,  0.

In [3]:
# Sample words for interpretation
sample_words = ["the", "cat", "in", "hat", "quick", "brown", "fox", "lazy", "dog"]

# Create a dictionary mapping each sample word to its trained vector.
# The tokenizer id of a word is also its row index in word_embeddings.
# A word that is missing from the tokenizer vocabulary raises KeyError.
word_embedding_dict = {
    word: word_embeddings[tokenizer.word_index[word]] for word in sample_words
}

# Print the word embeddings.
# The loop variable is called `vector` so that it does not overwrite the
# `embedding` tensor created when the model was built.
for word, vector in word_embedding_dict.items():
    print(f"{word}: {vector}")


the: [-0.13559745  0.13986742  0.09990193 -0.02728223 -0.16652799 -0.10410953
  0.10235941 -0.11882525  0.13646148  0.09205011  0.15423088 -0.13670278
  0.11939931  0.10564298 -0.14911763 -0.16938724  0.15442142  0.10704798
  0.16037163 -0.11131185 -0.12112981 -0.11738084  0.14015874 -0.15223126
  0.15391329  0.14183487 -0.11505135  0.1162683  -0.08307496  0.12118861
  0.14915636  0.14954105  0.10461583  0.11644858  0.08718924  0.13985723
 -0.12490056 -0.06064502  0.13727891 -0.11223114 -0.15250792 -0.15971738
  0.10271975 -0.14162458  0.1683922   0.15837696  0.1650399  -0.15990724
  0.12750906 -0.1271096 ]
cat: [-0.14928666  0.11549943  0.13652483  0.1122572  -0.16353552  0.06756092
  0.15149362 -0.10322514  0.11409543  0.14993636 -0.16653559  0.12831257
  0.15037337  0.16811304 -0.12538877 -0.14168267  0.14784168 -0.02596062
  0.10879837  0.16144928 -0.00637459 -0.09580443  0.12715404  0.1295661
  0.09505559  0.07303328 -0.08666128 -0.11936042  0.11307866  0.12823392
 -0.15479355 -0.