# GloVe Algorithm

In [None]:
import numpy as np
import spacy
from scipy.spatial.distance import cosine

# Load spaCy's "fr_core_news_sm" pipeline.
# NOTE(review): `nlp` is not referenced anywhere else in the visible cells, and
# the model name suggests a French pipeline while the corpus below is English —
# confirm whether this load is still needed.
nlp = spacy.load("fr_core_news_sm")

# Toy training corpus: ten lowercase English sentences without punctuation,
# so that a plain `str.split()` is enough to tokenize them.
corpus = [
    "the king is a man who rules a kingdom",
    "the queen is a woman who rules a kingdom",
    "the man is strong and wise",
    "the woman is graceful and intelligent",
    "the king married the queen to unite their kingdoms",
    "a queen can reign in the absence of a king",
    "the man aspired to be a king one day",
    "the woman aspired to be a queen one day",
    "king and queen often host grand ceremonies",
    "the king and the queen govern the kingdom together",
]

def build_vocab(corpus):
    """Build the vocabulary and the word/id lookup tables for a corpus.

    Sentences are tokenized on whitespace with ``str.split()``. Ids are
    assigned in sorted word order so the mapping is identical on every run:
    iterating a ``set`` of strings directly is subject to hash randomization
    and would hand out different ids from one interpreter session to the next.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A tuple ``(vocab, word_to_id, id_to_word)`` where ``vocab`` is the set
        of distinct tokens, ``word_to_id`` maps each token to its integer id
        and ``id_to_word`` is the inverse mapping.
    """
    vocab = {word for sentence in corpus for word in sentence.split()}
    ordered_words = sorted(vocab)
    word_to_id = {word: idx for idx, word in enumerate(ordered_words)}
    id_to_word = dict(enumerate(ordered_words))
    return vocab, word_to_id, id_to_word

# Build the vocabulary and both lookup tables from the toy corpus.
vocab, word_to_id, id_to_word = build_vocab(corpus)
# Number of distinct tokens; later cells read this as a module-level global.
vocab_size = len(vocab)
# Show the word -> id mapping.
print(word_to_id)

In [None]:
def build_cooccurrence_matrix(corpus, word_to_id, window_size=5):
    """Count how often each pair of words appears within a context window.

    The matrix is sized from ``word_to_id`` itself rather than from a
    module-level global, so the function works for any vocabulary passed in.

    Args:
        corpus: Iterable of sentence strings, tokenized with ``str.split()``.
        word_to_id: Mapping from every token in ``corpus`` to its row/column
            index.
        window_size: Number of tokens considered on each side of the centre
            word.

    Returns:
        A ``(V, V)`` float array with ``V = len(word_to_id)``; entry
        ``[i, j]`` is the number of times word ``j`` was seen in the window
        around word ``i``.

    Raises:
        KeyError: If a token of ``corpus`` is missing from ``word_to_id``.
    """
    size = len(word_to_id)
    matrix = np.zeros((size, size))
    for sentence in corpus:
        words = sentence.split()
        for idx, word in enumerate(words):
            word_id = word_to_id[word]
            start = max(idx - window_size, 0)
            # Slicing clamps the upper bound to len(words) on its own.
            for neighbour in words[start : idx + window_size + 1]:
                # Skips the centre word; because the comparison is on the
                # token text, repeated occurrences of the same word inside
                # the window are skipped as well.
                if neighbour != word:
                    matrix[word_id, word_to_id[neighbour]] += 1
    return matrix

# Count co-occurrences over the whole corpus using the default window size.
coocurrence_matrix = build_cooccurrence_matrix(corpus, word_to_id)
# Bare expression: shows the matrix when it is the last statement of a
# notebook cell.
coocurrence_matrix
            

In [None]:
# Dimensionality of each word vector.
embedding_dim = 50
# Fix the NumPy global RNG seed so the initial parameters are reproducible.
np.random.seed(42)
# Parameters are drawn uniformly from [0, 1). The order of the four calls
# below determines which random draws each array receives, so keep it stable.
# One "main" and one "context" vector per word ...
W = np.random.rand(vocab_size, embedding_dim)
W_context = np.random.rand(vocab_size, embedding_dim)
# ... and one scalar bias per word for each of the two roles.
bias = np.random.rand(vocab_size)
bias_context = np.random.rand(vocab_size)

def glove_loss(X, W, W_context, bias, bias_context, x_max=100, alpha=0.75):
    """Compute the weighted least-squares GloVe objective.

    For every positive entry ``X[i, j]`` the squared error between
    ``W[i] @ W_context[j] + bias[i] + bias_context[j]`` and ``log(X[i, j])``
    is weighted by ``(X[i, j] / x_max) ** alpha`` (capped at 1 once the count
    reaches ``x_max``). The sum is divided by the total number of matrix
    cells. Dimensions are taken from ``X`` itself, not from a global.

    Args:
        X: 2-D NumPy co-occurrence matrix.
        W: Main word vectors, one row per row of ``X``.
        W_context: Context word vectors, one row per column of ``X``.
        bias: Main-word biases, indexed like the rows of ``X``.
        bias_context: Context-word biases, indexed like the columns of ``X``.
        x_max: Count at which the weighting function saturates at 1.
        alpha: Exponent of the weighting function.

    Returns:
        The normalized loss as a float; ``0.0`` for an empty matrix.
    """
    n_rows, n_cols = X.shape
    if n_rows == 0 or n_cols == 0:
        # Nothing to average over; avoids a division by zero below.
        return 0.0

    loss = 0
    # np.argwhere walks the positive entries in row-major order, i.e. the
    # same order as a nested scan over rows then columns.
    for i, j in np.argwhere(X > 0):
        count = X[i, j]
        weight = (count / x_max) ** alpha if count < x_max else 1
        error = W[i] @ W_context[j] + bias[i] + bias_context[j] - np.log(count)
        loss += weight * error ** 2
    return loss / (n_rows * n_cols)

# Training hyper-parameters.
learning_rate = 0.01
epochs = 100
x_max = 10   # count at which the weighting function saturates at 1
alpha = 0.75  # exponent of the weighting function

# The observed (i, j) pairs, their weights and their log-counts never change
# during training, so compute them once instead of on every epoch.
# np.argwhere yields the pairs in row-major order, i.e. the same order as a
# nested scan over rows then columns, which keeps the update sequence fixed.
training_pairs = []
for i, j in np.argwhere(coocurrence_matrix > 0):
    count = coocurrence_matrix[i, j]
    weight = (count / x_max) ** alpha if count < x_max else 1
    training_pairs.append((i, j, weight, np.log(count)))

for epoch in range(epochs):
    # One stochastic-gradient step per observed co-occurrence pair.
    for i, j, weight, log_count in training_pairs:
        diff = W[i] @ W_context[j] + bias[i] + bias_context[j] - log_count

        # Compute every gradient before touching any parameter so that both
        # vector gradients are based on the pre-update values. The constant
        # factor 2 of the squared-error derivative is left out; it only
        # rescales the learning rate.
        grad_W = weight * diff * W_context[j]
        grad_W_context = weight * diff * W[i]
        grad_bias = weight * diff
        grad_bias_context = weight * diff

        W[i] -= learning_rate * grad_W
        W_context[j] -= learning_rate * grad_W_context
        bias[i] -= learning_rate * grad_bias
        bias_context[j] -= learning_rate * grad_bias_context

    if epoch % 10 == 0:
        # Report the loss with the same x_max/alpha used for the updates;
        # relying on glove_loss's defaults would print a different objective
        # from the one being optimised.
        loss = glove_loss(
            coocurrence_matrix, W, W_context, bias, bias_context,
            x_max=x_max, alpha=alpha,
        )
        print(f"epoch {epoch}, loss: {loss}")

# Final embedding of each word: sum of its main and context vectors.
word_embeddings = W + W_context


In [None]:
def search_analogy(word1, word2, word3, word_embeddings, word_to_id, id_to_word):
    """Find the word closest to ``word1 - word2 + word3`` in embedding space.

    The three query words are not eligible as the answer: the target vector
    usually stays very close to ``word1`` or ``word3``, so without this
    exclusion the search tends to return one of its own inputs.

    The Euclidean distance of every vocabulary word (query words included)
    to the target vector is printed as ``"<word>, <distance>"``.

    Args:
        word1: Word whose vector is the starting point.
        word2: Word whose vector is subtracted.
        word3: Word whose vector is added.
        word_embeddings: 2-D array with one embedding per row, indexed by
            word id.
        word_to_id: Mapping from word to row index.
        id_to_word: Mapping from row index to word.

    Returns:
        The non-query word with the smallest Euclidean distance to the target
        vector, or ``None`` when the vocabulary contains no other word.

    Raises:
        KeyError: If one of the query words is missing from ``word_to_id``.
    """
    idx_word1 = word_to_id[word1]
    idx_word2 = word_to_id[word2]
    idx_word3 = word_to_id[word3]
    query_indices = {idx_word1, idx_word2, idx_word3}

    analogy = word_embeddings[idx_word1] - word_embeddings[idx_word2] + word_embeddings[idx_word3]

    nearest_distance = np.inf
    most_similar_word = None

    for word_index, word_vector in enumerate(word_embeddings):
        distance = np.linalg.norm(word_vector - analogy)
        print(f"{id_to_word[word_index]}, {distance}")
        if word_index in query_indices:
            # Still printed above, but never returned as the answer.
            continue
        if distance < nearest_distance:
            nearest_distance = distance
            most_similar_word = id_to_word[word_index]
    return most_similar_word
    
# Query the classic analogy "king - man + woman"; the call prints the distance
# of every vocabulary word to the target vector and evaluates to the word the
# function selects as the answer.
search_analogy('king', 'man', 'woman', word_embeddings, word_to_id, id_to_word)
    
    