In [None]:
!pip install tensorflow streamlit nltk



In [None]:
import pandas as pd

# Read the dataset and join every entry of the 'PlayerLine' column into one
# space-separated string.
data = pd.read_csv('Shakespeare_data.csv')
player_lines = data['PlayerLine']
text = player_lines.str.cat(sep=' ')

In [None]:
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Preprocessing: strip punctuation, lowercase, and split into a list of word tokens.
# The raw string is rebuilt from `data` instead of being read from `text`, because this
# cell rebinds `text` to a list; reading `text` here would raise AttributeError
# (list has no `.translate`) if the cell were executed a second time.
punctuation_table = str.maketrans('', '', string.punctuation)
raw_text = data['PlayerLine'].str.cat(sep=' ')
text = raw_text.translate(punctuation_table).lower().split()

In [None]:
# Collect the distinct tokens of the cleaned corpus and report how many there are.
unique_words = set()
unique_words.update(text)
print(f'Total unique words: {len(unique_words)}')

Total unique words: 27381


In [None]:
# Fit a tokenizer (capped at num_words=15000) on the token list, treated as a single
# document, then encode that same document as a flat list of integer ids.
tokenizer = Tokenizer(num_words=15000)
corpus = [text]
tokenizer.fit_on_texts(corpus)
encoded_corpus = tokenizer.texts_to_sequences(corpus)
sequences = encoded_corpus[0]

In [None]:
# Vocabulary size: every entry of the tokenizer's word index, plus one extra slot
# (index 0 is never assigned to a word and can serve as padding).
word_index_size = len(tokenizer.word_index)
total_words = word_index_size + 1
print(f'Total words in vocabulary: {total_words}')

Total words in vocabulary: 27382


In [None]:
# Number of distinct ids the encoded sequences can contain (used as the class count).
# The cap is read from the tokenizer instead of repeating the literal 15000, so the two
# cannot drift apart. Index 0 is reserved, so an uncapped vocabulary needs
# len(word_index) + 1 slots; when a cap is set only ids below `num_words` are emitted.
word_index = tokenizer.word_index
vocab_cap = tokenizer.num_words
if vocab_cap is None:
    total_unique_words = len(word_index) + 1
else:
    total_unique_words = min(len(word_index) + 1, vocab_cap)

print("Total unique words:", total_unique_words)

Total unique words: 15000


In [None]:
import numpy as np
# Sliding-window settings: each sample is `sequence_length` consecutive token ids,
# and only the first `max_sequences` tokens of the corpus are used.
sequence_length = 5
max_sequences = 10000

# Every position from `sequence_length` up to the cut-off yields one
# (preceding window, next token) training pair.
window_stop = min(len(sequences), max_sequences)
target_positions = range(sequence_length, window_stop)

input_sequences = [sequences[pos - sequence_length:pos] for pos in target_positions]
next_words = [sequences[pos] for pos in target_positions]

# Convert lists to numpy arrays
X = np.array(input_sequences)
y = np.array(next_words)

print(f"Input sequences: {X}")
print(f"Next words: {y}")

Input sequences: [[ 307    3  132    3  807]
 [   3  132    3  807    1]
 [ 132    3  807    1  568]
 ...
 [ 171 1947 1048   62  171]
 [1947 1048   62  171   59]
 [1048   62  171   59   43]]
Next words: [  1 568  63 ...  59  43   8]


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [None]:
# One-hot encode the target labels (next words).
# The labels are taken from `next_words` rather than from `y` itself: this cell rebinds
# `y`, so reading `y` would one-hot encode an already one-hot matrix if the cell
# were executed a second time.
y = to_categorical(np.array(next_words), num_classes=total_unique_words)
y

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Number of training targets (rows of the one-hot label matrix).
print(y.shape[0])

9995


In [None]:
# Build the LSTM model: token embedding -> single LSTM layer -> softmax over the classes.
embedding_layer = Embedding(input_dim=total_words, output_dim=10, input_length=sequence_length)
recurrent_layer = LSTM(100, return_sequences=False)
output_layer = Dense(total_unique_words, activation='softmax')
model = Sequential([embedding_layer, recurrent_layer, output_layer])

# Compile with categorical cross-entropy, since the targets are one-hot encoded.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train; the returned History object is the cell's displayed value.
model.fit(X, y, batch_size=4, epochs=50)

Epoch 1/50
[1m2499/2499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 9ms/step - accuracy: 0.0339 - loss: 7.4492
Epoch 2/50
[1m2499/2499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 9ms/step - accuracy: 0.0330 - loss: 6.2042
Epoch 3/50
[1m2499/2499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - accuracy: 0.0431 - loss: 6.0128
Epoch 4/50
[1m2499/2499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - accuracy: 0.0575 - loss: 5.8296
Epoch 5/50
[1m2499/2499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - accuracy: 0.0552 - loss: 5.6163
Epoch 6/50
[1m2499/2499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - accuracy: 0.0646 - loss: 5.3284
Epoch 7/50
[1m2499/2499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - accuracy: 0.0718 - loss: 5.0838
Epoch 8/50
[1m2499/2499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - accuracy: 0.0820 - loss: 4.8169
Epoch 9/50
[1m2

<keras.src.callbacks.history.History at 0x7f082a5c07f0>

In [None]:
# Persist the trained model to an HDF5 file; app.py loads it from this same path.
model_path = '/content/lstm.h5'
model.save(model_path)



In [None]:
import pickle

# Serialise the fitted tokenizer so app.py can reuse the same word index.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))

In [None]:
%%writefile app.py
import streamlit as st
import numpy as np
import tensorflow as tf
import pickle

@st.cache_resource
def load_artifacts():
    """Load the fitted tokenizer and the trained LSTM model from disk.

    Streamlit re-executes this script on every interaction; caching the loaded
    objects as a resource avoids re-reading both files on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only load a
    # tokenizer.pickle that this project produced itself.
    with open('tokenizer.pickle', 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)

    loaded_model = tf.keras.models.load_model('/content/lstm.h5')
    return loaded_tokenizer, loaded_model


# Module-level names used by the prediction function below.
tokenizer, model = load_artifacts()

# Create a Streamlit app
st.title("Shakespeare Next Word Predictor")
st.write("Type a sentence and get the next word prediction:")

# Text input for the user to enter a sentence
input_text = st.text_input("Enter a partial sentence:")

# Function to predict the next word
def predict_next_word(text, sequence_length=5):
    """Predict the word most likely to follow ``text``.

    Args:
        text: Partial sentence typed by the user.
        sequence_length: Number of trailing word ids fed to the model. It must match
            the window size used to build the training samples (5).

    Returns:
        The predicted word; an empty string when the predicted index has no entry in
        ``tokenizer.index_word``; or the message "Input is too short." when the
        tokenizer yields fewer than ``sequence_length`` ids for the input.
    """
    import string  # local import: the app's import header does not bring it in

    # Clean the input exactly as the training corpus was cleaned (delete punctuation,
    # lowercase, split on whitespace) and hand the tokenizer a token list. Passing the
    # raw string would apply the tokenizer's own filters instead, which treat
    # apostrophes and hyphens differently from the training-time cleaning.
    cleaned_words = text.translate(str.maketrans('', '', string.punctuation)).lower().split()
    sequences = tokenizer.texts_to_sequences([cleaned_words])[0]
    if len(sequences) < sequence_length:
        return "Input is too short."

    # Use the last `sequence_length` word ids as a single-row batch.
    input_sequence = np.array(sequences[-sequence_length:]).reshape(1, -1)

    predictions = model.predict(input_sequence, verbose=0)
    predicted_word_index = int(np.argmax(predictions))

    return tokenizer.index_word.get(predicted_word_index, "")

# Only run a prediction once the user has typed something.
if input_text:
    st.write(f"Predicted next word: **{predict_next_word(input_text)}**")

Overwriting app.py


In [None]:
# Print this machine's public IPv4 address as reported by ipv4.icanhazip.com.
# NOTE(review): presumably needed as the access password on the localtunnel landing page — confirm.
!wget -q -O - ipv4.icanhazip.com

35.234.56.210


In [None]:
# Start the Streamlit app in the background, then open a localtunnel to port 8501
# (the port Streamlit reports in its startup output) so the app gets a public URL.
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.234.56.210:8501[0m
[0m
your url is: https://better-laws-listen.loca.lt
2024-09-29 12:23:41.418951: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-29 12:23:41.438329: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-29 12:23:41.443439: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has al