In [40]:
import tensorflow as tf 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU, Layer 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import numpy as np 
import regex as re 

In [41]:
def file_to_sentence_list(file_path): 
	with open(file_path, 'r') as file: 
		text = file.read() 

	# Splitting the text into sentences using 
	# delimiters like '.', '?', and '!' 
	sentences = [sentence.strip() for sentence in re.split( 
		r'(?<=[.!?])\s+', text) if sentence.strip()] 

	return sentences 

file_path = 'pizza.txt'
text_data = file_to_sentence_list(file_path) 

# Tokenize the text data 
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(text_data) 
total_words = len(tokenizer.word_index) + 1

# Create input sequences 
input_sequences = [] 
for line in text_data: 
	token_list = tokenizer.texts_to_sequences([line])[0] 
	for i in range(1, len(token_list)): 
		n_gram_sequence = token_list[:i+1] 
		input_sequences.append(n_gram_sequence) 

# Pad sequences and split into predictors and label 
max_sequence_len = max([len(seq) for seq in input_sequences]) 
input_sequences = np.array(pad_sequences( 
	input_sequences, maxlen=max_sequence_len, padding='pre')) 
X, y = input_sequences[:, :-1], input_sequences[:, -1] 

# Convert target data to one-hot encoding 
y = tf.keras.utils.to_categorical(y, num_classes=total_words) 

In [42]:
# Define the model 
model = Sequential() 
model.add(Embedding(total_words, 10, 
					input_length=max_sequence_len-1)) 
model.add(GRU(128))
model.add(Dense(total_words, activation='softmax')) 
model.compile(loss='categorical_crossentropy', 
			optimizer='adam', metrics=['accuracy']) 

In [None]:
# Train the model 
model.fit(X, y, epochs=100, verbose=1) 

In [None]:
# Generate next word predictions 
seed_text = "Pizza have different "
next_words = 5

for _ in range(next_words): 
	token_list = tokenizer.texts_to_sequences([seed_text])[0] 
	token_list = pad_sequences( 
		[token_list], maxlen=max_sequence_len-1, padding='pre') 
	predicted_probs = model.predict(token_list) 
	predicted_word = tokenizer.index_word[np.argmax(predicted_probs)] 
	seed_text += " " + predicted_word 

print("Next predicted words:", seed_text) 

Next predicted words: Pizza have different  become a symbol of comfort


In [82]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def softmax(x):
    e_x = np.exp(x - np.max(x))  # subtract max for numerical stability
    return e_x / e_x.sum(axis=1, keepdims=True)

class GRU:
    def __init__(self, input_dim, hidden_dim, output_dim):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim  # Store the output dimension

        # Initialize weights
        self.W_z = np.random.randn(input_dim + hidden_dim, hidden_dim)
        self.W_r = np.random.randn(input_dim + hidden_dim, hidden_dim)
        self.W_h = np.random.randn(input_dim + hidden_dim, hidden_dim)
        self.b_z = np.zeros((1, hidden_dim))
        self.b_r = np.zeros((1, hidden_dim))
        self.b_h = np.zeros((1, hidden_dim))
        self.W_out = np.random.randn(hidden_dim, output_dim)  # Output layer weights
        self.b_out = np.zeros((1, output_dim))  # Output layer bias


    def forward(self, x, h_prev):
        # Add an extra dimension to the input vector
        x = np.expand_dims(x, axis=0)

        # Concatenate input and previous hidden state
        combined = np.column_stack((x, h_prev))

        # Calculate gates
        z = sigmoid(np.dot(combined, self.W_z) + self.b_z)
        r = sigmoid(np.dot(combined, self.W_r) + self.b_r)
        h_tilde = np.tanh(np.dot(np.column_stack((x, r * h_prev)), self.W_h) + self.b_h)

        # Calculate next hidden state
        h_next = (1 - z) * h_prev + z * h_tilde

        return h_next

    def backward(self, x, h_prev, h_next, dh_next):
        # Concatenate input and previous hidden state
        combined = np.column_stack((x, h_prev))

        # Calculate gates
        z = sigmoid(np.dot(combined, self.W_z) + self.b_z)
        r = sigmoid(np.dot(combined, self.W_r) + self.b_r)
        h_tilde = np.tanh(np.dot(np.column_stack((x, r * h_prev)), self.W_h) + self.b_h)

        # Calculate gradients
        dh_tilde = dh_next * z
        dz = dh_next * (h_tilde - h_prev) * z * (1 - z)
        dr = dh_next * (1 - z) * (1 - h_tilde**2) * h_prev
        dx = np.dot(dh_next * (1 - z), self.W_h[:, :self.input_dim].T) + np.dot(dr * h_prev, self.W_r[:, :self.input_dim].T)
        dh_prev = dh_next * (1 - z) * r + np.dot(dr, self.W_r[:, self.input_dim:].T) + np.dot(dh_next * (1 - z) * (1 - h_tilde**2), self.W_h[:, self.input_dim:].T)

        # Calculate weight gradients
        dW_z = np.dot(combined.T, dz)
        dW_r = np.dot(combined.T, dr)
        dW_h = np.dot(np.column_stack((x, r * h_prev)).T, dh_tilde)
        db_z = np.sum(dz, axis=0, keepdims=True)
        db_r = np.sum(dr, axis=0, keepdims=True)
        db_h = np.sum(dh_tilde, axis=0, keepdims=True)

        # Update weights and biases
        self.learning_rate = 0.01  # Example learning rate
        self.W_z -= self.learning_rate * dW_z
        self.W_r -= self.learning_rate * dW_r
        self.W_h -= self.learning_rate * dW_h
        self.b_z -= self.learning_rate * db_z
        self.b_r -= self.learning_rate * db_r
        self.b_h -= self.learning_rate * db_h

        return dx, dh_prev, dW_z, dW_r, dW_h, db_z, db_r, db_h
    
    def train(self, X_train, y_train, epochs=100):
        # Initialize the hidden state
        h_prev = np.zeros((1, self.hidden_dim))

        # Training loop
        for epoch in range(epochs):
            # Reset the total loss for this epoch
            total_loss = 0

            # Loop over the training data
            for t in range(len(X_train)):
                # Forward pass
                h_prev = self.forward(X_train[t], h_prev)

                # Compute the output
                output = np.dot(h_prev, self.W_out) + self.b_out

                # Compute the loss
                loss = np.sum((output - y_train[t])**2) / 2

                # Backward pass
                dh_next = (output - y_train[t]) * self.W_out.T
                _, _, dW_z, dW_r, dW_h, db_z, db_r, db_h = self.backward(X_train[t], h_prev, h_prev, dh_next)

                # Update the total loss
                total_loss += loss

            # Print the loss for this epoch
            print(f'Epoch {epoch+1}/{epochs}: Loss = {total_loss}')



In [93]:
hidden_dim = 128
gru = GRU(max_sequence_len-1, hidden_dim, 1)

gru.train(X, y, epochs=100) 

ValueError: operands could not be broadcast together with shapes (1,687) (1,128) 

In [89]:
input_sequence = ['I', 'am', 'a', 'bot']

# Create a dictionary to map words to indices
word_to_idx = {word: idx for idx, word in enumerate(set(input_sequence))}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# One-hot encode the input sequence
input_sequence = [np.eye(len(word_to_idx))[word_to_idx[word]] for word in input_sequence]

# Initialize the GRU
input_dim = len(word_to_idx)
hidden_dim = 128
gru = GRU(input_dim, hidden_dim, 1)

# Forward pass through the GRU
h_prev = np.zeros((1, hidden_dim))
for x in input_sequence[:-1]:
    h_prev = gru.forward(x, h_prev)

# Predict the last word using the output layer
output = np.dot(h_prev, gru.W_out) + gru.b_out
output = softmax(output)  # apply softmax to get probabilities
last_word_idx = np.argmax(output)
predicted_word = idx_to_word[last_word_idx]

print(f"Predicted last word: {predicted_word}")
output


Predicted last word: I


array([[1.]])