In [13]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

def tanh_derivative(x):
    return 1 - np.tanh(x)**2

def mse_loss(y_pred, y_true):
    return ((y_pred - y_true) ** 2).mean()

def mse_loss_derivative(y_pred, y_true):
    return 2 * (y_pred - y_true) / y_true.size

class GRUCell:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        # Initialize weights
        self.init_weights()
        
    def init_weights(self):
        self.Wz = np.random.randn(self.hidden_size, self.input_size) * 0.1
        self.Uz = np.random.randn(self.hidden_size, self.hidden_size) * 0.1
        self.bz = np.zeros((self.hidden_size, 1))
        
        self.Wr = np.random.randn(self.hidden_size, self.input_size) * 0.1
        self.Ur = np.random.randn(self.hidden_size, self.hidden_size) * 0.1
        self.br = np.zeros((self.hidden_size, 1))
        
        self.Wh = np.random.randn(self.hidden_size, self.input_size) * 0.1
        self.Uh = np.random.randn(self.hidden_size, self.hidden_size) * 0.1
        self.bh = np.zeros((self.hidden_size, 1))
        
        self.Wy = np.random.randn(self.output_size, self.hidden_size) * 0.1
        self.by = np.zeros((self.output_size, 1))
        
    def forward(self, x, h_prev):
        # Store values for backpropagation
        self.x, self.h_prev = x, h_prev
        
        # Update gate
        self.z = sigmoid(np.dot(self.Wz, x) + np.dot(self.Uz, h_prev) + self.bz)
        
        # Reset gate
        self.r = sigmoid(np.dot(self.Wr, x) + np.dot(self.Ur, h_prev) + self.br)
        
        # Candidate hidden state
        self.h_tilde = tanh(np.dot(self.Wh, x) + np.dot(self.Uh, self.r * h_prev) + self.bh)
        
        # Final hidden state
        h_next = self.z * h_prev + (1 - self.z) * self.h_tilde
        
        # Output
        y_pred = np.dot(self.Wy, h_next) + self.by
        
        return y_pred, h_next

    def backward(self, d_y_pred, d_h_next):
        # Gradient of the output layer
        d_Wy = np.dot(d_y_pred, self.h_prev.T)
        d_by = d_y_pred.sum(axis=1, keepdims=True)
        d_h_next += np.dot(self.Wy.T, d_y_pred)
        
        # Derivative of final hidden state
        d_z = d_h_next * (self.h_prev - self.h_tilde)
        d_h_prev = d_h_next * self.z
        d_h_tilde = d_h_next * (1 - self.z)
        
        # Derivatives of the gates
        d_h_tilde_raw = d_h_tilde * tanh_derivative(self.h_tilde)
        d_r = np.dot(self.Uh.T, d_h_tilde_raw) * self.h_prev
        d_h_prev += np.dot(self.Uh.T, d_h_tilde_raw) * self.r
        
        # Update weights and biases
        self.Wh -= self.learning_rate * np.dot(d_h_tilde_raw, self.x.T)
        self.Uh -= self.learning_rate * np.dot(d_h_tilde_raw, (self.r * self.h_prev).T)
        self.bh -= self.learning_rate * d_h_tilde_raw.sum(axis=1, keepdims=True)
        
        self.Wr -= self.learning_rate * np.dot(d_r * sigmoid_derivative(self.r), self.x.T)
        self.Ur -= self.learning_rate * np.dot(d_r * sigmoid_derivative(self.r), self.h_prev.T)
        self.br -= self.learning_rate * (d_r * sigmoid_derivative(self.r)).sum(axis=1, keepdims=True)
        
        self.Wz -= self.learning_rate * np.dot(d_z * sigmoid_derivative(self.z), self.x.T)
        self.Uz -= self.learning_rate * np.dot(d_z * sigmoid_derivative(self.z), self.h_prev.T)
        self.bz -= self.learning_rate * (d_z * sigmoid_derivative(self.z)).sum(axis=1, keepdims=True)
        
        self.Wy -= self.learning_rate * d_Wy
        self.by -= self.learning_rate * d_by

        return d_h_prev

    def train(self, inputs, targets, epochs):
        h_prev = np.zeros((self.hidden_size, 1))

        for epoch in range(epochs):
            loss = 0
            for x, y_true in zip(inputs, targets):
                x = x.reshape(-1, 1)  # Reshape x to (input_size, 1)
                y_true = y_true.reshape(-1, 1)  # Reshape y_true if needed

                # Forward pass
                y_pred, h_next = self.forward(x, h_prev)

                # Calculate loss (for monitoring)
                loss += mse_loss(y_pred, y_true)

                # Backpropagate error
                d_loss = mse_loss_derivative(y_pred, y_true)
                d_h_next = self.backward(d_loss, h_next)
                h_prev = h_next  # update state
                
            loss /= len(inputs)
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss}')

    def predict(self, inputs):
        h_prev = np.zeros((self.hidden_size, 1))
        predictions = []
        for x in inputs:
            x = x.reshape(-1, 1)  # Ensure x is correctly shaped
            y_pred, h_prev = self.forward(x, h_prev)
            predictions.append(y_pred.flatten())  # Flatten the prediction to 1-D
        return predictions

In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import regex as re
from gensim.models import FastText

def file_to_sentence_list(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    sentences = [sentence.strip() for sentence in re.split(r'(?<=[.!?])\s+', text) if sentence.strip()]
    return sentences

def load_fasttext_model(sentences, vector_size=50, window=10, min_count=2):
    fasttext_model = FastText(sentences=sentences, vector_size=vector_size, window=window, min_count=min_count)
    return fasttext_model

def create_embedding_matrix(word_index, fasttext_model):
    embedding_matrix = np.zeros((len(word_index) + 1, fasttext_model.vector_size))
    for word, i in word_index.items():
        if word in fasttext_model.wv:
            embedding_matrix[i] = fasttext_model.wv[word]
    return embedding_matrix

# Read and process the text data
file_path = 'pizza.txt'
text_data = file_to_sentence_list(file_path)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
word_index = tokenizer.word_index
total_words = len(word_index) + 1

# Create input sequences
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences and prepare input data
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Load or train FastText model
sentences = [text.split() for text in text_data]
fasttext_model = load_fasttext_model(sentences)

# Create an embedding matrix
embedding_matrix = create_embedding_matrix(tokenizer.word_index, fasttext_model)

In [15]:
# Assume X and y are your input and output sequences from the first code block
# X has indices that need to be transformed into embeddings

# Transform each index in X to its corresponding embedding
X_embeddings = np.array([np.mean([embedding_matrix[idx] for idx in sequence if idx > 0], axis=0) for sequence in X])
y_embeddings = np.array([embedding_matrix[target] for target in y])  # only if y is a sequence of indices

# Initialize the GRU
input_size = embedding_matrix.shape[1]  # Embedding size
hidden_size = 100  # Choose your hidden size
output_size = embedding_matrix.shape[1]  # Output size is same as input if you're predicting next word embeddings

gru = GRUCell(input_size, hidden_size, output_size)

# Train the model
gru.train(X_embeddings, y_embeddings, epochs=10)


Epoch 1/10, Loss: 1.9529582357766996e-05
Epoch 2/10, Loss: 1.8279929504754862e-05
Epoch 3/10, Loss: 1.7969110205282275e-05
Epoch 4/10, Loss: 1.7877931302374975e-05
Epoch 5/10, Loss: 1.7850743048232447e-05
Epoch 6/10, Loss: 1.7842298978598402e-05
Epoch 7/10, Loss: 1.7839396555843233e-05
Epoch 8/10, Loss: 1.7838157742109638e-05
Epoch 9/10, Loss: 1.7837431186776915e-05
Epoch 10/10, Loss: 1.7836868978637277e-05


In [21]:
# Example input sequence from the dataset for prediction
# Assuming `X_embeddings` is prepared as before
test_input = X_embeddings[50]  # Example input embedding
predicted_embedding = gru.predict([test_input])[0]  # Predict the embedding for the next word

from scipy.spatial import distance

def find_closest_word(embedding, embedding_matrix, tokenizer):
    # Ensure embedding is 1-D
    embedding = embedding.flatten()  # This line ensures the embedding is 1-D
    # Compute cosine similarity between the predicted embedding and all embeddings in the matrix
    similarities = [1 - distance.cosine(embedding, word_embedding) if not np.all(word_embedding == 0) else -np.inf for word_embedding in embedding_matrix]
    closest_word_index = np.argmax(similarities)  # Get the index of the closest embedding
    return tokenizer.index_word[closest_word_index]  # Convert index to word

predicted_word = find_closest_word(predicted_embedding, embedding_matrix, tokenizer)
print("Predicted word:", predicted_word)


Predicted word: the
