In [1]:
words = ["I", "love", "deep", "learning"]


word_to_idx = {word: idx for idx, word in enumerate(words)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

X_seq = ["I", "love", "deep"]
Y_target = "learning"

In [2]:
def zero_vector(size):
    return [0.0] * size

def dot(v1, v2):
    return sum(x * y for x, y in zip(v1, v2))

def add(v1, v2):
    return [x + y for x, y in zip(v1, v2)]

def tanh(v):
    from math import tanh
    return [tanh(x) for x in v]

def softmax(v):
    from math import exp
    exp_v = [exp(x) for x in v]
    total = sum(exp_v)
    return [x / total for x in exp_v]


In [3]:
import random


vocab_size = len(words)
embedding_size = vocab_size
hidden_size = 4
output_size = vocab_size

def random_matrix(rows, cols):
    return [[random.uniform(-1, 1) for _ in range(cols)] for _ in range(rows)]

Wxh = random_matrix(hidden_size, vocab_size)
Whh = random_matrix(hidden_size, hidden_size)
Why = random_matrix(output_size, hidden_size)
bh = zero_vector(hidden_size)
by = zero_vector(output_size)


In [4]:
def one_hot(idx, size):
    vec = [0.0] * size
    vec[idx] = 1.0
    return vec

def forward(X_seq):
    h_prev = zero_vector(hidden_size)
    for word in X_seq:
        x = one_hot(word_to_idx[word], vocab_size)


        h_input = zero_vector(hidden_size)
        for i in range(hidden_size):
            h_input[i] = dot(Wxh[i], x) + dot(Whh[i], h_prev) + bh[i]
        h_prev = tanh(h_input)

    y = zero_vector(output_size)
    for i in range(output_size):
        y[i] = dot(Why[i], h_prev) + by[i]

    probs = softmax(y)
    return probs


In [5]:
probs = forward(X_seq)

predicted_idx = probs.index(max(probs))
predicted_word = idx_to_word[predicted_idx]

print(f"Predicted word: {predicted_word}")
print(f"Target word: {Y_target}")


Predicted word: deep
Target word: learning


In [6]:
def cross_entropy_loss(predicted_probs, target_idx):
    from math import log
    return -log(predicted_probs[target_idx] + 1e-9)


In [7]:
def d_softmax_cross_entropy(probs, target_idx):

    d = [p for p in probs]
    d[target_idx] -= 1.0
    return d


In [8]:
learning_rate = 0.1
epochs = 1000

for epoch in range(epochs):

    h_prev = zero_vector(hidden_size)
    xs = []
    hs = [h_prev]

    for word in X_seq:
        x = one_hot(word_to_idx[word], vocab_size)
        xs.append(x)

        h_input = zero_vector(hidden_size)
        for i in range(hidden_size):
            h_input[i] = dot(Wxh[i], x) + dot(Whh[i], h_prev) + bh[i]
        h_prev = tanh(h_input)
        hs.append(h_prev)


    y = zero_vector(output_size)
    for i in range(output_size):
        y[i] = dot(Why[i], h_prev) + by[i]

    probs = softmax(y)
    loss = cross_entropy_loss(probs, word_to_idx[Y_target])


    dy = d_softmax_cross_entropy(probs, word_to_idx[Y_target])

    dWhy = [[0.0]*hidden_size for _ in range(output_size)]
    dby = [0.0]*output_size
    for i in range(output_size):
        for j in range(hidden_size):
            dWhy[i][j] = dy[i] * hs[-1][j]
        dby[i] = dy[i]

    dh = [0.0] * hidden_size
    for i in range(hidden_size):
        for j in range(output_size):
            dh[i] += Why[j][i] * dy[j]

    dtanh = [1 - h**2 for h in hs[-1]]
    dhraw = [dh[i] * dtanh[i] for i in range(hidden_size)]

    dWxh = [[0.0]*vocab_size for _ in range(hidden_size)]
    dWhh = [[0.0]*hidden_size for _ in range(hidden_size)]
    dbh = [0.0]*hidden_size

    for i in range(hidden_size):
        for j in range(vocab_size):
            dWxh[i][j] = dhraw[i] * xs[-1][j]
        for j in range(hidden_size):
            dWhh[i][j] = dhraw[i] * hs[-2][j]
        dbh[i] = dhraw[i]

    for i in range(output_size):
        for j in range(hidden_size):
            Why[i][j] -= learning_rate * dWhy[i][j]
        by[i] -= learning_rate * dby[i]

    for i in range(hidden_size):
        for j in range(vocab_size):
            Wxh[i][j] -= learning_rate * dWxh[i][j]
        for j in range(hidden_size):
            Whh[i][j] -= learning_rate * dWhh[i][j]
        bh[i] -= learning_rate * dbh[i]


    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")


Epoch 0, Loss: 2.2446
Epoch 100, Loss: 0.0229
Epoch 200, Loss: 0.0099
Epoch 300, Loss: 0.0062
Epoch 400, Loss: 0.0045
Epoch 500, Loss: 0.0035
Epoch 600, Loss: 0.0029
Epoch 700, Loss: 0.0024
Epoch 800, Loss: 0.0021
Epoch 900, Loss: 0.0018


In [9]:
probs = forward(X_seq)
predicted_idx = probs.index(max(probs))
predicted_word = idx_to_word[predicted_idx]

print(f"\nFinal predicted word: {predicted_word}")
print(f"Target word: {Y_target}")



Final predicted word: learning
Target word: learning
