# GloVe Algorithme

In [1]:
import numpy as np
import spacy
from scipy.spatial.distance import cosine

nlp = spacy.load("fr_core_news_sm")

corpus = [
    "the king is a man who rules a kingdom",
    "the queen is a woman who rules a kingdom",
    "the man is strong and wise",
    "the woman is graceful and intelligent",
    "the king married the queen to unite their kingdoms",
    "a queen can reign in the absence of a king",
    "the man aspired to be a king one day",
    "the woman aspired to be a queen one day",
    "king and queen often host grand ceremonies",
    "the king and the queen govern the kingdom together",
]

def build_vocab(corpus):
    vocab = set([word for sentence in corpus for word in sentence.split()])
    word_to_id = {word: idx for idx, word in enumerate(vocab)}
    id_to_word = {idx: word for idx, word in enumerate(vocab)}
    return vocab, word_to_id, id_to_word

vocab, word_to_id, id_to_word = build_vocab(corpus)
vocab_size = len(vocab)
print(word_to_id)

{'ceremonies': 0, 'married': 1, 'rules': 2, 'day': 3, 'woman': 4, 'man': 5, 'reign': 6, 'govern': 7, 'grand': 8, 'of': 9, 'a': 10, 'who': 11, 'aspired': 12, 'in': 13, 'absence': 14, 'one': 15, 'their': 16, 'and': 17, 'the': 18, 'intelligent': 19, 'wise': 20, 'unite': 21, 'host': 22, 'together': 23, 'to': 24, 'king': 25, 'queen': 26, 'is': 27, 'strong': 28, 'often': 29, 'be': 30, 'can': 31, 'kingdoms': 32, 'kingdom': 33, 'graceful': 34}


In [2]:
def build_cooccurrence_matrix(corpus, word_to_id, window_size=5):
    coocurrence_matrix = np.zeros((vocab_size, vocab_size))
    for sentence in corpus:
        words = sentence.split()
        for idx, word in enumerate(words):
            word_id = word_to_id[word]
            neighbours = words[max(idx-window_size, 0) : min(idx+window_size+1, len(words))]
            for neighbour in neighbours:
                if word != neighbour:
                    neighbour_id = word_to_id[neighbour]
                    coocurrence_matrix[word_id, neighbour_id] += 1
    return coocurrence_matrix

coocurrence_matrix = build_cooccurrence_matrix(corpus, word_to_id)
coocurrence_matrix
            

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 2., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 2., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [3]:
embedding_dim = 50
np.random.seed(42)
W = np.random.rand(vocab_size, embedding_dim)
W_context = np.random.rand(vocab_size, embedding_dim)
bias = np.random.rand(vocab_size)
bias_context = np.random.rand(vocab_size)

def glove_loss(X, W, W_context, bias, bias_context, x_max=100, alpha=0.75):
    loss = 0
    for i in range(vocab_size):
        for j in range(vocab_size):
            if X[i, j] > 0:
                weight = (X[i, j]/x_max) ** alpha if X[i, j] < x_max else 1
                J = weight * (W[i] @ W_context[j] + bias[i] + bias_context[j] - np.log(X[i, j])) ** 2
                loss += J
    loss /= vocab_size ** 2
    return loss

learning_rate = 0.01
epochs = 100
x_max = 10
alpha = 0.75

for epoch in range(epochs):
    for i in range(vocab_size):
        for j in range(vocab_size):
            if coocurrence_matrix[i, j] > 0:
                weight = (coocurrence_matrix[i, j] / x_max) ** alpha if coocurrence_matrix[i, j] < x_max else 1
                diff = W[i] @ W_context[j] + bias[i] + bias_context[j] - np.log(coocurrence_matrix[i, j])
                
                grad_W = weight * diff * W_context[j]
                grad_W_context = weight * diff * W[i]
                grad_bias = weight * diff
                grad_bias_context = weight * diff
                
                W[i] -= learning_rate * grad_W
                W_context[j] -= learning_rate * grad_W_context
                bias[i] -= learning_rate * grad_bias
                bias_context[j] -= learning_rate * grad_bias_context
            
    if epoch % 10 == 0:
        loss = glove_loss(coocurrence_matrix, W, W_context, bias, bias_context)
        print(f"epoch {epoch}, loss: {loss}")
        
word_embeddings = W + W_context


epoch 0, loss: 0.5037277975815238
epoch 10, loss: 0.027474525614644987
epoch 20, loss: 0.007520292766907942
epoch 30, loss: 0.0029732131917531816
epoch 40, loss: 0.0014963563424770473
epoch 50, loss: 0.000885435287521995
epoch 60, loss: 0.0005795710357287714
epoch 70, loss: 0.0004033112365595446
epoch 80, loss: 0.00029154663312451153
epoch 90, loss: 0.0002161046815966535


In [4]:
def search_analogy(word1, word2, word3, word_embeddings, word_to_id, id_to_word):
    idx_word1 = word_to_id[word1]
    idx_word2 = word_to_id[word2]
    idx_word3 = word_to_id[word3]
    
    analogy = word_embeddings[idx_word1] - word_embeddings[idx_word2] + word_embeddings[idx_word3]
    
    nearest_distance = np.inf
    most_similar_word = None
    
    for word_index, word_vector in enumerate(word_embeddings):
        distance = np.linalg.norm(word_vector - analogy)
        print(f"{id_to_word[word_index]}, {distance}")
        if distance < nearest_distance:
            nearest_distance = distance
            most_similar_word = id_to_word[word_index]
    return most_similar_word
    
search_analogy('king', 'man', 'woman', word_embeddings, word_to_id, id_to_word)
    
    

ceremonies, 5.515369111864009
married, 5.815243849299505
rules, 5.44058810356872
day, 5.853181441406126
woman, 3.4291271570638173
man, 6.040977575232007
reign, 4.649872527785306
govern, 5.9988481611436235
grand, 5.185778695661402
of, 5.451295701388337
a, 4.142715274763036
who, 5.4617038863462355
aspired, 4.902933485044255
in, 4.844102795130681
absence, 5.777869724997661
one, 5.997296528745032
their, 4.953411516018894
and, 4.8068420865102315
the, 4.727379151122312
intelligent, 5.7391957323782705
wise, 5.723526002679257
unite, 5.4502375460451775
host, 5.755791368769518
together, 6.017254638557424
to, 4.6654507446651925
king, 3.692987251408548
queen, 4.80188066956214
is, 4.987628113671329
strong, 5.2767207411149215
often, 5.297463272123986
be, 5.140922589322133
can, 5.8588476301907235
kingdoms, 6.21177694665099
kingdom, 5.299561657255372
graceful, 5.849915750667705


'woman'