In [1]:
import numpy as np

# Define the sentence and map words to indices
sentence = ["I", "love", "deep", "learning"]
word_to_idx = {word: i for i, word in enumerate(set(sentence))}
idx_to_word = {i: word for word, i in word_to_idx.items()}

# Convert words to indices
X = [word_to_idx[word] for word in sentence[:-1]]  # input: first 3 words
y = word_to_idx[sentence[-1]]                      # target: 4th word

print("Word to index:", word_to_idx)
print("Input indices:", X)
print("Target index:", y)

Word to index: {'deep': 0, 'love': 1, 'learning': 2, 'I': 3}
Input indices: [3, 1, 0]
Target index: 2


In [2]:
# Model dimensions
vocab_size = len(word_to_idx)
hidden_size = 5

# Initialize weights and biases
np.random.seed(1)
W1 = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
W2 = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
W3 = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output

b1 = np.zeros((hidden_size, 1))  # hidden bias
b2 = np.zeros((vocab_size, 1))   # output bias

print("W1 shape:", W1.shape)
print("W2 shape:", W2.shape)
print("W3 shape:", W3.shape)


W1 shape: (5, 4)
W2 shape: (5, 5)
W3 shape: (4, 5)


In [3]:
def one_hot(index, size):
    vec = np.zeros((size, 1))
    vec[index] = 1
    return vec

# Prepare input one-hot vectors
inputs = [one_hot(i, vocab_size) for i in X]

# Store hidden states
h = {}
h[-1] = np.zeros((hidden_size, 1))

# Forward through time
for t in range(len(inputs)):
    h[t] = np.tanh(np.dot(W1, inputs[t]) + np.dot(W2, h[t-1]) + b1)
    print(f"Hidden state at time {t}:\n", h[t])

# Output layer
y_out = np.dot(W3, h[len(X)-1]) + b2
probs = np.exp(y_out) / np.sum(np.exp(y_out))  # softmax

print("\nOutput vector (before softmax):\n", y_out)
print("Predicted probabilities:\n", probs.ravel())
print("Predicted word:", idx_to_word[np.argmax(probs)])

Hidden state at time 0:
 [[-0.01072927]
 [-0.00761192]
 [-0.02059849]
 [-0.01099847]
 [ 0.00582809]]
Hidden state at time 1:
 [[-0.00627501]
 [-0.02267566]
 [-0.0021939 ]
 [-0.00394171]
 [-0.00871949]]
Hidden state at time 2:
 [[ 0.01593346]
 [ 0.00870947]
 [ 0.00343067]
 [-0.00310528]
 [-0.00156572]]

Output vector (before softmax):
 [[-2.62098862e-05]
 [-7.92881217e-06]
 [ 1.75756294e-04]
 [ 8.12122318e-06]]
Predicted probabilities:
 [0.24998409 0.24998866 0.25003458 0.24999267]
Predicted word: learning


In [4]:
# Prepare the target
target = one_hot(y, vocab_size)
dy = probs - target  # derivative of loss wrt output

# Initialize gradients
dW3 = np.dot(dy, h[len(X)-1].T)
db2 = dy.copy()

dW1 = np.zeros_like(W1)
dW2 = np.zeros_like(W2)
db1 = np.zeros_like(b1)
dh_next = np.zeros_like(h[0])

# Backpropagation through time
for t in reversed(range(len(inputs))):
    dh = np.dot(W3.T, dy) + dh_next
    dh_raw = (1 - h[t] ** 2) * dh
    db1 += dh_raw
    dW1 += np.dot(dh_raw, inputs[t].T)
    dW2 += np.dot(dh_raw, h[t-1].T)
    dh_next = np.dot(W2.T, dh_raw)

# Print gradients
print("\ndW3:\n", dW3)
print("dW2:\n", dW2)
print("dW1:\n", dW1)
print("db2:\n", db2.ravel())
print("db1:\n", db1.ravel())


dW3:
 [[ 0.00398311  0.00217723  0.00085761 -0.00077627 -0.0003914 ]
 [ 0.00398318  0.00217727  0.00085763 -0.00077628 -0.00039141]
 [-0.01194954 -0.0065318  -0.00257288  0.00232885  0.00117423]
 [ 0.00398325  0.0021773   0.00085764 -0.0007763  -0.00039142]]
dW2:
 [[1.19738723e-04 2.14804752e-04 1.59669946e-04 1.04994943e-04
  2.13781165e-05]
 [6.04246206e-05 1.07801124e-04 8.08983769e-05 5.30660032e-05
  1.03906505e-05]
 [5.60469284e-05 1.00019430e-04 7.50220464e-05 4.92175566e-05
  9.65674055e-06]
 [6.02296307e-05 1.05783164e-04 8.15402863e-05 5.31230181e-05
  9.24566972e-06]
 [7.57048928e-05 1.33826260e-04 1.02024236e-04 6.66543168e-05
  1.21958367e-05]]
dW1:
 [[-0.0071256  -0.0069926   0.         -0.00699357]
 [-0.00356306 -0.0035479   0.         -0.00354584]
 [-0.00330648 -0.00328995  0.         -0.00328727]
 [-0.00345992 -0.00359005  0.         -0.00359284]
 [-0.00439628 -0.00448476  0.         -0.00448479]]
db2:
 [ 0.24998409  0.24998866 -0.74996542  0.24999267]
db1:
 [-0.02111

In [5]:
# Gradient descent step
lr = 0.1
W1 -= lr * dW1
W2 -= lr * dW2
W3 -= lr * dW3
b1 -= lr * db1
b2 -= lr * db2

print("\nWeights updated.")



Weights updated.


In [6]:
print("\nUpdated W1:\n", W1)
print("Updated W2:\n", W2)
print("Updated W3:\n", W3)
print("Updated b1:\n", b1.ravel())
print("Updated b2:\n", b2.ravel())


Updated W1:
 [[ 0.01695601 -0.0054183  -0.00528172 -0.01003033]
 [ 0.00901038 -0.0226606   0.01744812 -0.00725748]
 [ 0.00352104 -0.00216471  0.01462108 -0.02027268]
 [-0.00287818 -0.00348154  0.01133769 -0.01063963]
 [-0.00128465 -0.00833011  0.00042214  0.00627663]]
Updated W2:
 [[-0.01101817  0.01142576  0.00899994  0.00501444  0.00900642]
 [-0.00684332 -0.00123968 -0.00936578 -0.00268419  0.00530252]
 [-0.00692221 -0.00397754 -0.00687923 -0.00845698 -0.00671343]
 [-0.00013267 -0.01118368  0.002336    0.01659271  0.00741952]
 [-0.00192593 -0.00888967 -0.00748179  0.01691788  0.00050686]]
Updated W3:
 [[-0.00676827  0.00169143  0.02091679  0.00127922  0.00621117]
 [ 0.00260338 -0.00374023 -0.01151094 -0.0034158  -0.0020498 ]
 [ 0.00706119  0.00904301  0.00956831  0.00262299  0.00873399]
 [-0.0079423   0.01231095  0.00504353 -0.0029033   0.00492432]]
Updated b1:
 [0.00211118 0.00106568 0.00098837 0.00106428 0.00133658]
Updated b2:
 [-0.02499841 -0.02499887  0.07499654 -0.02499927]
