In [1]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict


class GRUCell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.W_z = np.random.randn(hidden_size, input_size)
        self.U_z = np.random.randn(hidden_size, hidden_size)
        self.b_z = np.zeros((hidden_size, 1))
        
        self.W_r = np.random.randn(hidden_size, input_size)
        self.U_r = np.random.randn(hidden_size, hidden_size)
        self.b_r = np.zeros((hidden_size, 1))
        
        self.W_h = np.random.randn(hidden_size, input_size)
        self.U_h = np.random.randn(hidden_size, hidden_size)
        self.b_h = np.zeros((hidden_size, 1))
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def tanh(self, x):
        return np.tanh(x)
    
    def forward(self, x, h_prev):
        z = self.sigmoid(self.W_z @ x + self.U_z @ h_prev + self.b_z)
        r = self.sigmoid(self.W_r @ x + self.U_r @ h_prev + self.b_r)
        h_tilde = self.tanh(self.W_h @ x + self.U_h @ (r * h_prev) + self.b_h)
        h = (1 - z) * h_prev + z * h_tilde
        return h, (h, z, r, h_tilde, h_prev, x)
    
    def backward(self, dh, cache):
        h, z, r, h_tilde, h_prev, x = cache
        
        dh_tilde = dh * z * (1 - h_tilde ** 2)
        dz = dh * (h_tilde - h_prev) * z * (1 - z)
        dr = (self.U_h.T @ dh_tilde) * h_prev * r * (1 - r)
        
        dW_z = dz @ x.T
        dU_z = dz @ h_prev.T
        db_z = dz.sum(axis=1, keepdims=True)
        
        dW_r = dr @ x.T
        dU_r = dr @ h_prev.T
        db_r = dr.sum(axis=1, keepdims=True)
        
        dW_h = dh_tilde @ x.T
        dU_h = dh_tilde @ (r * h_prev).T
        db_h = dh_tilde.sum(axis=1, keepdims=True)
        
        dx = self.W_z.T @ dz + self.W_r.T @ dr + self.W_h.T @ dh_tilde
        dh_prev = self.U_z.T @ dz + self.U_r.T @ dr + self.U_h.T @ (dh_tilde * r)
        
        return dx, dh_prev, dW_z, dU_z, db_z, dW_r, dU_r, db_r, dW_h, dU_h, db_h

class GRULayer:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.cell = GRUCell(input_size, hidden_size)
    
    def forward(self, X):
        h = np.zeros((self.hidden_size, 1))
        h_seq = []
        caches = []
        for x in X:
            h, cache = self.cell.forward(x, h)
            h_seq.append(h)
            caches.append(cache)
        return h_seq, caches
    
    def backward(self, dh_seq, caches):
        dW_z, dU_z, db_z = np.zeros_like(self.cell.W_z), np.zeros_like(self.cell.U_z), np.zeros_like(self.cell.b_z)
        dW_r, dU_r, db_r = np.zeros_like(self.cell.W_r), np.zeros_like(self.cell.U_r), np.zeros_like(self.cell.b_r)
        dW_h, dU_h, db_h = np.zeros_like(self.cell.W_h), np.zeros_like(self.cell.U_h), np.zeros_like(self.cell.b_h)
        
        dh = np.zeros((self.hidden_size, 1))
        for t in reversed(range(len(dh_seq))):
            dh += dh_seq[t]
            dx, dh, dW_z_t, dU_z_t, db_z_t, dW_r_t, dU_r_t, db_r_t, dW_h_t, dU_h_t, db_h_t = self.cell.backward(dh, caches[t])
            dW_z += dW_z_t
            dU_z += dU_z_t
            db_z += db_z_t
            dW_r += dW_r_t
            dU_r += dU_r_t
            db_r += db_r_t
            dW_h += dW_h_t
            dU_h += dU_h_t
            db_h += db_h_t
        
        return dW_z, dU_z, db_z, dW_r, dU_r, db_r, dW_h, dU_h, db_h

class GRUModel:
    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size
        self.gru_layer = GRULayer(input_size, hidden_size)
        self.W_y = np.random.randn(output_size, hidden_size)
        self.b_y = np.zeros((output_size, 1))
    
    def forward(self, X):
        h_seq, caches = self.gru_layer.forward(X)
        y_seq = [self.W_y @ h + self.b_y for h in h_seq]
        return y_seq, caches
    
    def backward(self, dy_seq, caches):
        dh_seq = [self.W_y.T @ dy for dy in dy_seq]
        dW_y = sum(dy @ h.T for dy, (h, _, _, _, _, _) in zip(dy_seq, caches))
        db_y = sum(dy for dy in dy_seq)
        
        dW_z, dU_z, db_z, dW_r, dU_r, db_r, dW_h, dU_h, db_h = self.gru_layer.backward(dh_seq, caches)
        return dW_y, db_y, dW_z, dU_z, db_z, dW_r, dU_r, db_r, dW_h, dU_h, db_h
    
    def update(self, grads, lr):
        dW_y, db_y, dW_z, dU_z, db_z, dW_r, dU_r, db_r, dW_h, dU_h, db_h = grads
        self.W_y -= lr * dW_y
        self.b_y -= lr * db_y
        self.gru_layer.cell.W_z -= lr * dW_z
        self.gru_layer.cell.U_z -= lr * dU_z
        self.gru_layer.cell.b_z -= lr * db_z
        self.gru_layer.cell.W_r -= lr * dW_r
        self.gru_layer.cell.U_r -= lr * dU_r
        self.gru_layer.cell.b_r -= lr * db_r
        self.gru_layer.cell.W_h -= lr * dW_h
        self.gru_layer.cell.U_h -= lr * dU_h
        self.gru_layer.cell.b_h -= lr * db_h
    
    def train(self, X, Y, epochs, lr):
        for epoch in range(epochs):
            for x_seq, y_seq in zip(X, Y):
                y_pred, caches = self.forward(x_seq)
                loss = sum((y - y_pred) ** 2 for y, y_pred in zip(y_seq, y_pred))
                dy_seq = [2 * (y_pred - y) for y, y_pred in zip(y_seq, y_pred)]
                grads = self.backward(dy_seq, caches)
                self.update(grads, lr)
            print(f'Epoch {epoch + 1}, Loss: {loss}')

    def predict(self, X, h0=None):
        h = h0 if h0 is not None else np.zeros((self.hidden_size, 1))
        preds = []
        for x in X:
            h, _ = self.gru_layer.cell.forward(x, h)
            y = self.W_y @ h + self.b_y
            preds.append(y)
        return preds

In [2]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Ensure you have the NLTK data downloaded
nltk.download('punkt')

# GRUCell, GRULayer, GRUModel classes as provided by you

def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

def build_vocab(tokens):
    vocab = defaultdict(lambda: len(vocab))
    _ = [vocab[token] for token in tokens]
    return vocab

def tokens_to_one_hot(tokens, vocab):
    one_hot_vectors = []
    vocab_size = len(vocab)
    for token in tokens:
        vector = np.zeros((vocab_size, 1))
        vector[vocab[token]] = 1
        one_hot_vectors.append(vector)
    return one_hot_vectors

def one_hot_to_tokens(one_hot_vectors, vocab):
    inv_vocab = {v: k for k, v in vocab.items()}
    tokens = [inv_vocab[np.argmax(vec)] for vec in one_hot_vectors]
    return tokens

# Sample paragraph for training
training_paragraph = """
The quick brown fox jumps over the lazy dog. This is a common English sentence used for typing practice.
It contains all the letters of the alphabet, making it a good example for training language models.
"""

# Tokenize the text
tokens = tokenize_text(training_paragraph)

# Build vocabulary
vocab = build_vocab(tokens)

# Convert tokens to one-hot vectors
one_hot_vectors = tokens_to_one_hot(tokens, vocab)

# Prepare training data (input sequences and corresponding next words)
X_train = []
Y_train = []
for i in range(len(one_hot_vectors) - 1):
    X_train.append(one_hot_vectors[i])
    Y_train.append(one_hot_vectors[i + 1])

X_train = [X_train]  # GRUModel expects a list of sequences
Y_train = [Y_train]  # GRUModel expects a list of sequences

# Initialize GRU model
input_size = len(vocab)
hidden_size = 10  # Example hidden size
output_size = len(vocab)

gru_model = GRUModel(input_size, hidden_size, output_size)

# Train the model
gru_model.train(X_train, Y_train, epochs=100, lr=0.01)

# Predict the next word
def predict_next_word(model, input_seq, vocab):
    one_hot_vectors = tokens_to_one_hot(input_seq, vocab)
    predictions = model.predict(one_hot_vectors)
    next_word_one_hot = predictions[-1]
    next_word = one_hot_to_tokens([next_word_one_hot], vocab)[0]
    return next_word



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\K-Gen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1, Loss: [[105.75777028]
 [135.4786441 ]
 [ 58.94127845]
 [ 57.18938242]
 [ 55.85693106]
 [ 84.91339423]
 [ 59.24044228]
 [305.32933411]
 [ 52.02393692]
 [ 25.93773411]
 [100.04398447]
 [ 70.94946406]
 [ 68.75001481]
 [ 38.69197917]
 [ 41.41739102]
 [144.57464939]
 [ 36.9602354 ]
 [ 42.63373581]
 [ 47.50611279]
 [ 74.53584703]
 [ 98.03945257]
 [123.7622026 ]
 [137.10375591]
 [162.49111352]
 [ 38.31790337]
 [ 27.38158659]
 [ 62.88264971]
 [ 68.42682487]
 [ 65.16103411]
 [178.26321244]
 [100.13738986]
 [143.23607253]
 [103.19001309]
 [ 58.64976433]]
Epoch 2, Loss: [[ 216.94901981]
 [ 196.59176627]
 [ 249.20820723]
 [ 257.83310858]
 [ 145.58327596]
 [  93.10129019]
 [ 606.32363208]
 [ 677.20864986]
 [  19.90507031]
 [  17.291711  ]
 [  94.98954667]
 [ 141.41845373]
 [  93.22304716]
 [  66.16047088]
 [  32.00877954]
 [  35.5939631 ]
 [  20.92085174]
 [ 215.93368857]
 [ 145.22858769]
 [  15.98321575]
 [ 282.54577222]
 [ 546.55513965]
 [ 266.72263679]
 [ 157.74775003]
 [ 255.52835105]


In [6]:
# Given a sequence, predict the next word
input_sequence = ["The", "quick", "brown"]
next_word = predict_next_word(gru_model, input_sequence, vocab)
print(f'Next word prediction: {next_word}')

Next word prediction: .
