In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Input, Attention, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
import numpy as np



In [2]:
# Read the word list: one entry per line, whitespace-trimmed and lower-cased.
with open('data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
word_list = [entry.strip().lower() for entry in lines]

# Every listed word is an output class; the character inventory is collected
# from all words together.
vocab = word_list
characters = set(''.join(word_list))

# Characters get ids 1..N in sorted order. Id 0 is assigned to the space
# character; it is also the value `pad_sequences` fills with by default.
char_to_id = {symbol: rank for rank, symbol in enumerate(sorted(characters), start=1)}
char_to_id[" "] = 0
word_to_id = {entry: position for position, entry in enumerate(vocab)}

# Reverse lookup used to turn predicted class ids back into words.
id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))

# Length of the longest word; every encoded sequence is padded to this length.
max_char_length = max(map(len, word_list))

# Training pairs: each proper, non-empty prefix of a word (encoded as
# character ids) is an input, and the id of the full word is its label.
X = [
    [char_to_id[symbol] for symbol in entry[:prefix_len]]
    for entry in word_list
    for prefix_len in range(1, len(entry))
]
Y = [
    word_to_id[entry]
    for entry in word_list
    for _ in range(1, len(entry))
]

# Right-pad the prefixes with zeros so they form a rectangular array.
X_padded = np.array(pad_sequences(X, maxlen=max_char_length, padding='post'))
Y = np.array(Y)

In [21]:
# Hyperparameters
vocab_size = len(vocab)  # Number of output classes: one per entry in the word list

# Number of rows in the character-embedding table. Character ids are dense
# ranks assigned by `char_to_id` (not code points), so this is an upper bound
# on the number of distinct ids rather than an ASCII assumption.
# NOTE(review): presumably the saved weight files loaded later were trained
# with 256 rows; changing this value would then break loading them -- confirm.
char_vocab_size = 256
# Every id fed to the Embedding layer must be a valid row index.
assert max(char_to_id.values()) < char_vocab_size, (
    f"char_to_id assigns ids up to {max(char_to_id.values())}, "
    f"which does not fit an embedding table of {char_vocab_size} rows"
)

embedding_dim = 64  # Width of each character embedding vector
rnn_units = 128  # Size of the SimpleRNN hidden state

def model_create():
    """Build and compile the character-level word-prediction model.

    Pipeline: character ids -> Embedding -> SimpleRNN (per-step outputs plus
    final state) -> Attention, with the final state as the single query over
    the per-step outputs -> Flatten -> softmax Dense over word ids.

    Reads the module-level hyperparameters ``vocab_size``,
    ``char_vocab_size``, ``embedding_dim`` and ``rnn_units``.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    # Variable-length sequences of integer character ids.
    input_chars = Input(shape=(None,), dtype='int32')
    # NOTE(review): the Embedding is created without `mask_zero`, so padded
    # positions are fed through the RNN like real characters. Left as is,
    # since changing it would alter the behavior of already-trained weights.
    char_embeddings = Embedding(input_dim=char_vocab_size, output_dim=embedding_dim)(input_chars)
    rnn_out, state = SimpleRNN(units=rnn_units, return_sequences=True, return_state=True)(char_embeddings)

    # Attention mechanism: use the last RNN state as a length-1 query sequence.
    # A Reshape layer is used instead of `tf.expand_dims` because raw
    # TensorFlow ops cannot be applied to symbolic tensors under Keras 3; the
    # resulting shape (batch, 1, rnn_units) is the same and the layer has no
    # weights, so previously saved weight files remain loadable.
    query = tf.keras.layers.Reshape((1, rnn_units))(state)
    # The attention scores were never used, so only the attended output is requested.
    attention_output = Attention(use_scale=True)([query, rnn_out])
    flattened_output = Flatten()(attention_output)

    # Output layer to predict the word index
    word_pred = Dense(vocab_size, activation='softmax')(flattened_output)

    model = Model(inputs=input_chars, outputs=word_pred)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [24]:
# Six independent, identically structured networks, built in the same order
# as before; each one receives its own set of trained weights afterwards.
model, model2, model3, model4, model5, model6 = (model_create() for _ in range(6))

In [25]:
from keras.models import load_model
# Restore each network from its own weight file (paths are relative to the
# notebook's working directory).
for network, weights_file in (
    (model, '03102024Final1.h5'),
    (model2, '03102024Final1_2.h5'),
    (model3, '03102024Final2.h5'),
    (model4, '03102024Final2_2.h5'),
    (model5, '03102024Final3.h5'),
    (model6, '03102024Final3_2.h5'),
):
    network.load_weights(weights_file)

In [26]:
def prediction(models, input_chars):
    """Print the three most probable words for a typed prefix.

    The prefix is stripped and lower-cased in the same way as the training
    words, encoded with ``char_to_id`` (characters missing from the mapping
    become 0), and right-padded to ``max_char_length`` before being passed to
    the model.

    Args:
        models: A single trained model exposing ``predict`` (despite the
            plural name), e.g. one returned by ``model_create``.
        input_chars: The prefix typed so far, as a string.

    Returns:
        None. One ``Word: ..., Softmax: ...`` line is printed per candidate,
        most probable first.
    """
    # Apply the same normalization as the word list; otherwise upper-case
    # letters would not be found in `char_to_id` and silently turn into 0.
    normalized = input_chars.strip().lower()
    input_ids = [char_to_id.get(char, 0) for char in normalized]
    input_padded = pad_sequences([input_ids], maxlen=max_char_length, padding='post')

    # Probabilities over all words for the single input row.
    word_probs = models.predict(input_padded)[0]
    # Sort once and reuse the indices for both the words and their probabilities.
    top_3_indices = np.argsort(word_probs)[-3:][::-1]
    for idx in top_3_indices:
        print(f"Word: {id_to_word[idx]}, Softmax: {word_probs[idx]*100:.2f}%")

In [28]:
import warnings
# Silence library warnings so only the prediction lines are shown.
warnings.filterwarnings("ignore")
inputs = "al"

# Query every trained network with the same prefix. Training setup per model:
#   model  : epochs=300,  batch_size=32
#   model2 : epochs=1000, batch_size=32
#   model3 : epochs=300,  batch_size=16
#   model4 : epochs=1000, batch_size=16
#   model5 : epochs=300,  batch_size=64
#   model6 : epochs=1000, batch_size=64
for trained_model in (model, model2, model3, model4, model5, model6):
    prediction(trained_model, inputs)

Word: athletes, Softmax: 1.22%
Word: academics, Softmax: 1.01%
Word: academic, Softmax: 0.96%
Word: alternatives, Softmax: 3.33%
Word: allah, Softmax: 2.77%
Word: alternatively, Softmax: 2.70%
Word: allow, Softmax: 2.69%
Word: colorado, Softmax: 2.52%
Word: collaboration, Softmax: 2.39%
Word: altered, Softmax: 5.39%
Word: allowing, Softmax: 4.01%
Word: alternatively, Softmax: 3.51%
Word: altered, Softmax: 4.66%
Word: almost, Softmax: 3.52%
Word: alternatively, Softmax: 3.33%
Word: albany, Softmax: 4.04%
Word: alto, Softmax: 3.45%
Word: algeria, Softmax: 3.19%
