In [21]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

def tanh_derivative(x):
    return 1 - np.tanh(x)**2

def mse_loss(y_pred, y_true):
    return ((y_pred - y_true) ** 2).mean()

def mse_loss_derivative(y_pred, y_true):
    return 2 * (y_pred - y_true) / y_true.size

class GRUCell:
    def __init__(self, input_size, hidden_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        # Initialize weights
        self.init_weights()
        
    def init_weights(self):
        self.Wz = np.random.randn(self.hidden_size, self.input_size) * 0.1
        self.Uz = np.random.randn(self.hidden_size, self.hidden_size) * 0.1
        self.bz = np.zeros((self.hidden_size, 1))
        
        self.Wr = np.random.randn(self.hidden_size, self.input_size) * 0.1
        self.Ur = np.random.randn(self.hidden_size, self.hidden_size) * 0.1
        self.br = np.zeros((self.hidden_size, 1))
        
        self.Wh = np.random.randn(self.hidden_size, self.input_size) * 0.1
        self.Uh = np.random.randn(self.hidden_size, self.hidden_size) * 0.1
        self.bh = np.zeros((self.hidden_size, 1))
        
    def forward(self, x, h_prev):
        # Store values for backpropagation
        self.x, self.h_prev = x, h_prev
        
        # Update gate
        self.z = sigmoid(np.dot(self.Wz, x) + np.dot(self.Uz, h_prev) + self.bz)
        
        # Reset gate
        self.r = sigmoid(np.dot(self.Wr, x) + np.dot(self.Ur, h_prev) + self.br)
        
        # Candidate hidden state
        self.h_tilde = tanh(np.dot(self.Wh, x) + np.dot(self.Uh, self.r * h_prev) + self.bh)
        
        # Final hidden state
        h_next = self.z * h_prev + (1 - self.z) * self.h_tilde
        
        return h_next

    def backward(self, d_h_next):
        # Derivative of final hidden state
        d_z = d_h_next * (self.h_prev - self.h_tilde)
        d_h_prev = d_h_next * self.z
        d_h_tilde = d_h_next * (1 - self.z)
        
        # Derivatives of the gates
        d_h_tilde_raw = d_h_tilde * tanh_derivative(self.h_tilde)
        d_r = np.dot(self.Uh.T, d_h_tilde_raw) * self.h_prev
        d_h_prev += np.dot(self.Uh.T, d_h_tilde_raw) * self.r
        
        # Update weights and biases
        self.Wh -= self.learning_rate * np.dot(d_h_tilde_raw, self.x.T)
        self.Uh -= self.learning_rate * np.dot(d_h_tilde_raw, (self.r * self.h_prev).T)
        self.bh -= self.learning_rate * d_h_tilde_raw.sum(axis=1, keepdims=True)
        
        self.Wr -= self.learning_rate * np.dot(d_r * sigmoid_derivative(self.r), self.x.T)
        self.Ur -= self.learning_rate * np.dot(d_r * sigmoid_derivative(self.r), self.h_prev.T)
        self.br -= self.learning_rate * (d_r * sigmoid_derivative(self.r)).sum(axis=1, keepdims=True)
        
        self.Wz -= self.learning_rate * np.dot(d_z * sigmoid_derivative(self.z), self.x.T)
        self.Uz -= self.learning_rate * np.dot(d_z * sigmoid_derivative(self.z), self.h_prev.T)
        self.bz -= self.learning_rate * (d_z * sigmoid_derivative(self.z)).sum(axis=1, keepdims=True)

        return d_h_prev

    def train(self, inputs, targets, epochs):
        h_prev = np.zeros((self.hidden_size, 1))
        
        for epoch in range(epochs):
            loss = 0
            for t in range(len(inputs)):
                x = inputs[t]
                y_true = targets[t]
                
                # Forward pass
                y_pred = self.forward(x, h_prev)
                
                # Calculate loss (for monitoring)
                loss += mse_loss(y_pred, y_true)
                
                # Backpropagate error
                d_loss = mse_loss_derivative(y_pred, y_true)
                d_h_next = self.backward(d_loss)
                h_prev = y_pred  # update state
                
            loss /= len(inputs)
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss}')

In [22]:
# Example setup (Assuming single feature input and output)
input_size = 687
hidden_size = 687
epochs = 10
learning_rate = 0.01

# Example synthetic data
inputs = [np.random.randn(input_size, 1) for _ in range(1628)]
targets = [np.random.randn(input_size, 1) for _ in range(1628)]

gru_cell = GRUCell(input_size, hidden_size, learning_rate)
gru_cell.train(inputs, targets, epochs)

Epoch 1/10, Loss: 1.4631657011012291
Epoch 2/10, Loss: 1.4544157093686538
Epoch 3/10, Loss: 1.4454900379272293
Epoch 4/10, Loss: 1.4366358377377924
Epoch 5/10, Loss: 1.4278639750239068
Epoch 6/10, Loss: 1.4191842887337933
Epoch 7/10, Loss: 1.4106053160286918
Epoch 8/10, Loss: 1.4021341260261126
Epoch 9/10, Loss: 1.3937762332885062
Epoch 10/10, Loss: 1.3855356080947219


In [23]:
len(targets)

1628

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import tensorflow as tf 
import regex as re

def file_to_sentence_list(file_path): 
	with open(file_path, 'r') as file: 
		text = file.read() 

	# Splitting the text into sentences using 
	# delimiters like '.', '?', and '!' 
	sentences = [sentence.strip() for sentence in re.split( 
		r'(?<=[.!?])\s+', text) if sentence.strip()] 

	return sentences 

file_path = 'pizza.txt'
text_data = file_to_sentence_list(file_path) 

# Tokenize the text data 
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(text_data) 
total_words = len(tokenizer.word_index) + 1

# Create input sequences 
input_sequences = [] 
for line in text_data: 
	token_list = tokenizer.texts_to_sequences([line])[0] 
	for i in range(1, len(token_list)): 
		n_gram_sequence = token_list[:i+1] 
		input_sequences.append(n_gram_sequence) 

# Pad sequences and split into predictors and label 
max_sequence_len = 687 
input_sequences = np.array(pad_sequences( 
	input_sequences, maxlen=max_sequence_len+1, padding='pre')) 
X, y = input_sequences[:, :-1], input_sequences[:, -1] 

# Convert target data to one-hot encoding 
y = tf.keras.utils.to_categorical(y, num_classes=total_words) 

In [28]:
# Example setup (Assuming single feature input and output)
input_size = 687
hidden_size = 687
epochs = 10
learning_rate = 0.01

# Example synthetic data
inputs = np.array(X)
targets = np.array(y)


In [29]:
len(targets)

1628

In [30]:
gru_cell = GRUCell(input_size, hidden_size, learning_rate)
gru_cell.train(inputs, targets, epochs)

ValueError: shapes (687,687) and (1,687) not aligned: 687 (dim 1) != 1 (dim 0)