In [1]:
import re
from collections import Counter

def preprocess_text(text):
    """Lowercase a text, strip punctuation, and split it into word tokens.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: Lowercased tokens in their original order, made only of
        alphanumeric characters (accented letters included).
    """
    text = text.lower()
    # `\w` is Unicode-aware for str patterns, so accented letters such as
    # "è" or "é" are preserved. An ASCII-only class would delete them and
    # mangle French words ("modèle" -> "modle"). Underscore belongs to `\w`,
    # so it is removed explicitly to keep treating it as punctuation.
    text = re.sub(r"[^\w\s]|_", "", text)
    return text.split()


In [2]:
def build_vocab(words):
    """Build a vocabulary and its word/index lookup tables.

    Args:
        words: Iterable of tokens (hashable and mutually orderable).

    Returns:
        tuple: ``(vocab, word2idx, idx2word)`` where ``vocab`` is the sorted
        list of distinct tokens, ``word2idx`` maps each token to its position
        in ``vocab``, and ``idx2word`` is the inverse mapping.
    """
    # Sorting makes the word -> index assignment deterministic. Iterating a
    # bare set of strings follows hash order, which can change from one
    # interpreter process to the next, so indices (and therefore embedding
    # rows) would differ between kernel restarts.
    vocab = sorted(set(words))
    word2idx = {w: i for i, w in enumerate(vocab)}
    idx2word = dict(enumerate(vocab))
    return vocab, word2idx, idx2word


In [3]:
def generate_cbow_pairs(words, window_size):
    """Build (context, target) training pairs for a CBOW model.

    Only positions that have a full window on both sides are used as
    targets, so for a non-negative ``window_size`` the first and last
    ``window_size`` words never appear as a target.

    Args:
        words: Indexable sequence of tokens.
        window_size (int): Number of context words taken on each side of
            the target.

    Returns:
        list: ``(context, target)`` tuples, where ``context`` is the list of
        neighbouring words in left-to-right order (target excluded).
    """
    # Relative positions of the context words around the target.
    offsets = [off for off in range(-window_size, window_size + 1) if off != 0]
    return [
        ([words[center + off] for off in offsets], words[center])
        for center in range(window_size, len(words) - window_size)
    ]


In [4]:
import numpy as np

def one_hot_vector(word, word2idx, vocab_size):
    """Return the one-hot encoding of ``word``.

    Args:
        word: Token to encode; must be a key of ``word2idx``.
        word2idx: Mapping from token to its index in the vocabulary.
        vocab_size (int): Length of the returned vector.

    Returns:
        np.ndarray: Float vector of length ``vocab_size`` that is 0
        everywhere except for a 1 at position ``word2idx[word]``.
    """
    encoded = np.zeros(vocab_size)
    hot_position = word2idx[word]
    encoded[hot_position] = 1
    return encoded


In [5]:
def create_training_data(pairs, word2idx, vocab_size):
    """Encode CBOW (context, target) pairs as dense training matrices.

    Each context is encoded as the sum of the one-hot vectors of its words
    (a word-count vector, so a word repeated in the context counts twice);
    each target is encoded as a single one-hot vector.

    Args:
        pairs: Iterable of ``(context, target)`` tuples, where ``context``
            is an iterable of words and ``target`` is a word.
        word2idx: Mapping from word to its column index.
        vocab_size (int): Number of columns of the returned matrices.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(X_train, y_train)``, both float
        arrays of shape ``(number of pairs, vocab_size)``.

    Raises:
        KeyError: If a context or target word is missing from ``word2idx``.
    """
    pairs = list(pairs)  # accept any iterable, including generators
    # Pre-allocating keeps the (n_pairs, vocab_size) shape even when `pairs`
    # or an individual context is empty, and avoids building one temporary
    # dense one-hot vector per context word.
    X_train = np.zeros((len(pairs), vocab_size))
    y_train = np.zeros((len(pairs), vocab_size))
    for row, (context, target) in enumerate(pairs):
        for context_word in context:
            X_train[row, word2idx[context_word]] += 1
        y_train[row, word2idx[target]] = 1
    return X_train, y_train


In [6]:
def softmax(x, axis=0):
    """Numerically stable softmax along one axis.

    Args:
        x: Array-like of scores.
        axis (int): Axis along which the outputs are normalized. Defaults
            to 0, which for a 1-D vector is the only axis.

    Returns:
        np.ndarray: Array with the same shape as ``x`` whose entries are
        non-negative and sum to 1 along ``axis``.
    """
    x = np.asarray(x)
    # Shift by the maximum of each slice along `axis`, not by the global
    # maximum: with a global shift, a slice whose values are all far below
    # that maximum underflows to exp(...) == 0 everywhere and yields 0/0 = NaN.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)

class CBOW:
    """Minimal Continuous Bag-of-Words model with one linear hidden layer.

    ``W1`` (vocab_size x embedding_dim) holds the input embeddings, one row
    per vocabulary index; ``W2`` (embedding_dim x vocab_size) projects the
    hidden vector back to vocabulary scores.
    """

    def __init__(self, vocab_size, embedding_dim, seed=None):
        """Create the two weight matrices with uniform values in [0, 1).

        Args:
            vocab_size (int): Number of words in the vocabulary.
            embedding_dim (int): Size of the hidden (embedding) vector.
            seed (int | None): If given, the weights are drawn from a
                dedicated generator seeded with this value so that the
                initialization is reproducible. If None, NumPy's global
                generator is used.
        """
        # With no seed, keep drawing from the global generator so that a
        # caller's np.random.seed(...) still controls the initialization.
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.W1 = rng.rand(vocab_size, embedding_dim)
        self.W2 = rng.rand(embedding_dim, vocab_size)

    def forward(self, x):
        """Compute the predicted distribution over the vocabulary for ``x``.

        The hidden vector, the raw scores, and the prediction are cached on
        the instance (``h``, ``u``, ``y_pred``) for use by ``backward``.

        Args:
            x (np.ndarray): Context vector of length vocab_size.

        Returns:
            np.ndarray: Softmax of the output scores.
        """
        self.h = np.dot(x, self.W1)
        self.u = np.dot(self.h, self.W2)
        self.y_pred = softmax(self.u)
        return self.y_pred

    def backward(self, x, y_true, learning_rate):
        """Apply one gradient-descent step to ``W1`` and ``W2`` in place.

        Reads ``self.h`` and ``self.y_pred`` cached by the last ``forward``
        call, so ``forward`` must have been called with the same ``x`` first.

        Args:
            x (np.ndarray): Context vector passed to ``forward``.
            y_true (np.ndarray): One-hot vector of the target word.
            learning_rate (float): Step size.
        """
        e = self.y_pred - y_true  # prediction error
        # Both gradients are computed before either matrix is modified, so
        # dW1 is based on the same W2 that produced the prediction.
        dW2 = np.outer(self.h, e)
        dW1 = np.outer(x, np.dot(self.W2, e))

        self.W1 -= learning_rate * dW1
        self.W2 -= learning_rate * dW2


In [7]:
def train(model, X, y, epochs=100, learning_rate=0.1):
    """Train ``model`` with per-sample gradient descent and log the loss.

    For every sample the model's ``forward`` is called, then its
    ``backward``; the cross-entropy of the prediction made by ``forward``
    (minus the log of the probability at the target index) is accumulated.
    The mean loss is printed on every epoch whose number is a multiple of 10.

    Args:
        model: Object exposing ``forward(x)`` and
            ``backward(x, y_true, learning_rate)``.
        X: Indexable collection of input vectors.
        y: Indexable collection of one-hot target vectors, aligned with X.
        epochs (int): Number of passes over the data.
        learning_rate (float): Step size forwarded to ``model.backward``.
    """
    n_samples = len(X)
    for epoch in range(epochs):
        total_nll = 0
        for row in range(n_samples):
            sample = X[row]
            probs = model.forward(sample)
            target = y[row]
            model.backward(sample, target, learning_rate)
            # argmax of a one-hot target is the index of the true word
            total_nll -= np.log(probs[np.argmax(target)])
        if epoch % 10:
            continue
        print(f"Epoch {epoch}, Loss: {total_nll/n_samples:.4f}")


In [8]:
# Example paragraph
paragraph = """
Le traitement du langage naturel permet aux machines de comprendre le langage humain.
Word2Vec est un modèle populaire pour représenter les mots sous forme de vecteurs.
"""

# Hyperparameters, gathered in one place so they are easy to tune.
SEED = 42
WINDOW_SIZE = 2
EMBEDDING_DIM = 10
EPOCHS = 100
LEARNING_RATE = 0.05

# Full pipeline: tokens -> vocabulary -> (context, target) pairs -> matrices
words = preprocess_text(paragraph)
vocab, word2idx, idx2word = build_vocab(words)
pairs = generate_cbow_pairs(words, window_size=WINDOW_SIZE)
vocab_size = len(vocab)

X_train, y_train = create_training_data(pairs, word2idx, vocab_size)

# Seed NumPy's global generator before the model draws its random initial
# weights, so that re-running this cell starts from the same weights.
np.random.seed(SEED)
cbow_model = CBOW(vocab_size=vocab_size, embedding_dim=EMBEDDING_DIM)

train(cbow_model, X_train, y_train, epochs=EPOCHS, learning_rate=LEARNING_RATE)


Epoch 0, Loss: 4.3363
Epoch 10, Loss: 1.5539
Epoch 20, Loss: 0.2866
Epoch 30, Loss: 0.1023
Epoch 40, Loss: 0.0579
Epoch 50, Loss: 0.0394
Epoch 60, Loss: 0.0295
Epoch 70, Loss: 0.0234
Epoch 80, Loss: 0.0193
Epoch 90, Loss: 0.0163


In [9]:
# Embedding of the word "vecteurs": the row of W1 at that word's index
word = "vecteurs"
word_index = word2idx[word]
embedding = cbow_model.W1[word_index]
print(f"Embedding de '{word}' :", embedding)


Embedding de 'vecteurs' : [-0.63304883  0.3104305   0.46244799  0.03192678  0.37470317  0.86157857
  0.68808391  0.50728333  1.08083407  0.81998858]
