In [2]:
data = ["AI", "will", "change", "future"]
word_to_idx = {w: i for i, w in enumerate(data)}
idx_to_word = {i: w for w, i in word_to_idx.items()}

X = [word_to_idx["AI"], word_to_idx["will"], word_to_idx["change"]]
Y = word_to_idx["future"]

def one_hot(index, size):
    vec = [0] * size
    vec[index] = 1
    return vec

seed = 1
def rand():
    global seed
    seed = (seed * 9301 + 49297) % 233280
    return (seed / 233280.0) - 0.5

def random_matrix(rows, cols):
    return [[rand() for _ in range(cols)] for _ in range(rows)]

def add_vectors(a, b): return [a[i] + b[i] for i in range(len(a))]
def scalar_multiply(vec, s): return [v * s for v in vec]
def clip(x, min_val=-10, max_val=10): return [min(max(i, min_val), max_val) for i in x]
def tanh(x): return [i - (i**3)/3 for i in clip(x)]
def tanh_derivative(x): return [1 - i**2 for i in tanh(clip(x))]

def softmax(x):
    exp_x = [1 + i + (i**2)/2 for i in x]  
    total = sum(exp_x)
    return [i / total for i in exp_x]

vocab_size = len(data)
input_size = hidden_size = output_size = vocab_size
lr = 0.1

Wxh = random_matrix(hidden_size, input_size)
Whh = random_matrix(hidden_size, hidden_size)
Why = random_matrix(output_size, hidden_size)
bh = [0] * hidden_size
by = [0] * output_size

for epoch in range(100):
    inputs = [one_hot(i, input_size) for i in X]
    target = one_hot(Y, output_size)
    hs = [[0]*hidden_size]

    for t in range(len(inputs)):
        x = inputs[t]
        h_prev = hs[-1]
        xh = [sum(x[i]*Wxh[j][i] for i in range(input_size)) for j in range(hidden_size)]
        hh = [sum(h_prev[i]*Whh[j][i] for i in range(hidden_size)) for j in range(hidden_size)]
        h_raw = add_vectors(add_vectors(xh, hh), bh)
        h = tanh(h_raw)
        hs.append(h)

    y_raw = [sum(h[i]*Why[j][i] for i in range(hidden_size)) + by[j] for j in range(output_size)]
    y = softmax(y_raw)
    loss = -sum(target[i]*y[i] for i in range(output_size))

    dy = [y[i] - target[i] for i in range(output_size)]

    for i in range(output_size):
        for j in range(hidden_size):
            Why[i][j] -= lr * dy[i] * hs[-1][j]
        by[i] -= lr * dy[i]

    dh_next = [0] * hidden_size
    for t in reversed(range(len(inputs))):
        h = hs[t+1]
        h_prev = hs[t]
        dh = [sum(dy[i]*Why[i][j] for i in range(output_size)) + dh_next[j] for j in range(hidden_size)]
        dh_raw = [dh[j] * tanh_derivative(h)[j] for j in range(hidden_size)]

        for i in range(hidden_size):
            for j in range(input_size):
                Wxh[i][j] -= lr * dh_raw[i] * inputs[t][j]
            for j in range(hidden_size):
                Whh[i][j] -= lr * dh_raw[i] * h_prev[j]
            bh[i] -= lr * dh_raw[i]
        dh_next = dh_raw

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {round(loss, 4)}")

inputs = [one_hot(i, input_size) for i in X]
h = [0]*hidden_size
for t in range(len(inputs)):
    x = inputs[t]
    xh = [sum(x[i]*Wxh[j][i] for i in range(input_size)) for j in range(hidden_size)]
    hh = [sum(h[i]*Whh[j][i] for i in range(hidden_size)) for j in range(hidden_size)]
    h = tanh(add_vectors(add_vectors(xh, hh), bh))

y = [sum(h[i]*Why[j][i] for i in range(hidden_size)) + by[j] for j in range(output_size)]
pred = softmax(y)

print("\nFinal Prediction:")
for i in range(len(pred)):
    print(idx_to_word[i], ":", round(pred[i], 4))
print("\nExpected:",idx_to_word[Y])

Epoch 0, Loss: -0.2352
Epoch 10, Loss: -0.6185
Epoch 20, Loss: -0.5496
Epoch 30, Loss: -0.5247
Epoch 40, Loss: -0.5162
Epoch 50, Loss: -0.512
Epoch 60, Loss: -0.5095
Epoch 70, Loss: -0.5079
Epoch 80, Loss: -0.5067
Epoch 90, Loss: -0.5059

Final Prediction:
AI : 0.0
will : 0.4948
change : 0.0
future : 0.5052

Expected: future
