In [14]:
import tensorflow as tf
import numpy as np
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation, Embedding, Dropout
from tensorflow.keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [3]:
path = '1661-0.txt'
with open(path,'r', encoding = 'utf-8') as file:
    text = file.read().lower()
print('corpus length:', len(text))

corpus length: 581888


In [4]:
# Preprocessing
text = text.lower()  # Convert to lowercase
words = text.split()  # Tokenization

In [5]:
# Unique words and frequency count
unique_words = set(words)
word_freq = {word: words.count(word) for word in unique_words}

print("Unique Words:", unique_words)



In [6]:
print("Word Frequencies:", word_freq)



In [18]:
# Prepare sequences
input_sequences = []
output_words = []

for i in range(len(words) - sequence_length):
    input_sequences.append(words[i:i + sequence_length])
    output_words.append(words[i + sequence_length])  # Ensure this corresponds to the input

# Encode the sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_sequences)
total_words = len(tokenizer.word_index) + 1

input_sequences = tokenizer.texts_to_sequences(input_sequences)

# Convert output_words to sequences
output_words = tokenizer.texts_to_sequences(output_words)

# Flatten output_words
output_words = [word[0] for word in output_words if len(word) > 0]

# Convert to arrays
input_sequences = pad_sequences(input_sequences, maxlen=sequence_length)
input_sequences = np.array(input_sequences)
output_words = np.array(output_words)

# Make sure they are of the same length
min_length = min(len(input_sequences), len(output_words))
input_sequences = input_sequences[:min_length]
output_words = output_words[:min_length]

# Check lengths
print("Input Sequences Shape:", input_sequences.shape)
print("Output Words Shape:", output_words.shape)


Input Sequences Shape: (105131, 5)
Output Words Shape: (105131,)


In [19]:
# Build the model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=sequence_length))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(150))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# Train the model
model.fit(input_sequences, output_words, epochs=50, batch_size=64)

Epoch 1/50
[1m1643/1643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 40ms/step - accuracy: 0.5056 - loss: 2.3095
Epoch 2/50
[1m1643/1643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 39ms/step - accuracy: 0.5192 - loss: 2.2404
Epoch 3/50
[1m1643/1643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 39ms/step - accuracy: 0.5261 - loss: 2.1885
Epoch 4/50
[1m1643/1643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 39ms/step - accuracy: 0.5375 - loss: 2.1339
Epoch 5/50
[1m1643/1643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 40ms/step - accuracy: 0.5450 - loss: 2.0941
Epoch 6/50
[1m1643/1643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 40ms/step - accuracy: 0.5521 - loss: 2.0456
Epoch 7/50
[1m1643/1643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 40ms/step - accuracy: 0.5621 - loss: 2.0012
Epoch 8/50
[1m1643/1643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 40ms/step - accuracy: 0.5664 - loss: 1.9668
Epoch 9/

<keras.src.callbacks.history.History at 0x1ae2556e3d0>

In [42]:
with open('1661-0.txt', 'r', encoding='utf-8') as f:
    text_data = f.read()
text_data = text_data.lower()  # Convert to lowercase
text_data = text_data.replace('\n', ' ')

In [43]:
tokenizer = Tokenizer(num_words=14556)  # Setting vocabulary size
tokenizer.fit_on_texts([text_data])

In [44]:
def sample(preds, top_n=3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-10)  # Adding a small constant to prevent log(0)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [45]:
def predict_next_word(model, tokenizer, input_text, n=1):
    predicted_words = []
    for text in input_text:
        sequence = tokenizer.texts_to_sequences([text.lower()])[-1]  # Ensure it's a string
        sequence = pad_sequences([sequence], maxlen=sequence_length)  # Use your defined sequence length
        predicted_probs = model.predict(sequence, verbose=0)[0]
        next_indices = sample(predicted_probs, n)
        predicted_words.append([indices_char[idx] for idx in next_indices])  # Adjust for your character mapping
    return predicted_words

In [46]:
input_text = [
    "It is not a lack of love, but a lack of friendship that makes unhappy marriages.",
    "That which does not kill us makes us stronger.",
    "I'm not upset that you lied to me, I'm upset that from now on I can't believe you.",
    "And those who were seen dancing were thought to be insane by those who could not hear the music.",
    "It is hard enough to remember my opinions, without also remembering my reasons for them!"
]

predicted_words = predict_next_word(model, tokenizer, input_text, n=3)
for i, prediction in enumerate(predicted_words):
    print(f"Input: '{input_text[i]}'")
    print(f"Predicted next words: {prediction}")
    print()

Input: 'It is not a lack of love, but a lack of friendship that makes unhappy marriages.'
Predicted next words: ['ready', 'afterwards', 'business']

Input: 'That which does not kill us makes us stronger.'
Predicted next words: ['might', 'who', 'all']

Input: 'I'm not upset that you lied to me, I'm upset that from now on I can't believe you.'
Predicted next words: ['at', 'when', 'for']

Input: 'And those who were seen dancing were thought to be insane by those who could not hear the music.'
Predicted next words: ['listened', 'whom', 'but']

Input: 'It is hard enough to remember my opinions, without also remembering my reasons for them!'
Predicted next words: ['there', 'but', 'through']

