# Get Data to train model

In [None]:
!pip install pdfplumber

# Importing Required Libraries

In [None]:
import pdfplumber
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# Extracting Text from PDF

In [None]:
# Location of the PDF used as the training corpus.
# NOTE: "Path to PDF" is a placeholder; replace it with a real file path before running.
pdf_path = "Path to PDF"


def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``pdfplumber.open``.

    Returns:
        str: The text of each page, in page order, with a single space
        appended after every page. A page for which ``extract_text()``
        yields nothing contributes only that separating space.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None (or an empty string) for pages without
        # extractable text, e.g. scanned images; treat both as "no text"
        # instead of failing on `None + " "`.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Single join instead of repeated string concatenation in a loop.
    return "".join(page_text + " " for page_text in page_texts)

# Raw text of the whole PDF (all pages concatenated).
text = extract_text_from_pdf(pdf_path)

In [None]:
# Show only a short preview; printing the whole document floods the cell
# output and bloats the saved notebook.
print(text[:1000])

# Preprocess The Text

In [None]:
def preprocess_text(text):
    """Return ``text`` lowercased with all ASCII punctuation removed.

    The characters removed are exactly those in ``string.punctuation``;
    whitespace and every other character are left untouched.
    """
    import string

    # Translation table that deletes each punctuation character.
    punctuation_remover = str.maketrans("", "", string.punctuation)
    lowered = text.lower()
    return lowered.translate(punctuation_remover)

# Normalise the corpus in place: lowercase and strip ASCII punctuation.
# preprocess_text is idempotent, so re-running this cell is harmless.
text = preprocess_text(text)

# Show only a short preview; printing the whole document floods the cell
# output and bloats the saved notebook.
print(text[:1000])

# Create dataset from extracted text

In [None]:
def create_dataset(text, sequence_length=5):
    """Build (input window, next word) training pairs from ``text``.

    The text is passed through ``preprocess_text`` and split on whitespace.
    Every run of ``sequence_length`` consecutive words becomes an input
    window, paired with the word that immediately follows it.

    Args:
        text: Source text.
        sequence_length: Number of words in each input window.

    Returns:
        list[tuple[list[str], str]]: One ``(window, target_word)`` pair per
        window position; empty when the text has no more than
        ``sequence_length`` words.
    """
    tokens = preprocess_text(text).split()
    window_count = len(tokens) - sequence_length
    return [
        (tokens[start:start + sequence_length], tokens[start + sequence_length])
        for start in range(window_count)
    ]

# List of (input word window, next word) pairs, using the default window of 5 words.
dataset = create_dataset(text)

In [None]:
# Preview the first few (input window, target word) pairs instead of
# displaying the entire list.
dataset[:5]

# Tokenize text and create input sequences

In [None]:
tokenizer = Tokenizer()
# Fit on the input windows AND the target words. The very last word of the
# text only ever appears as a target, never inside an input window; fitting on
# inputs alone would leave it out of the vocabulary when it is unique, and
# texts_to_sequences() would then return an empty list for it (no OOV token is
# configured), breaking the target lookup later on.
tokenizer.fit_on_texts(
    [" ".join(input_seq) for input_seq, _ in dataset]
    + [target_word for _, target_word in dataset]
)
# +1 because Keras word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Number of distinct words known to the tokenizer, plus one.
vocab_size

2003

# Generate input sequences and targets

In [None]:
# Convert every (input window, target word) pair to integer ids.
# The tokenizer is called once for all inputs and once for all targets rather
# than twice per sample.
tokenized_inputs = tokenizer.texts_to_sequences(
    [" ".join(input_seq) for input_seq, _ in dataset]
)
tokenized_targets = tokenizer.texts_to_sequences(
    [target_word for _, target_word in dataset]
)

input_sequences = []
target_words = []
for tokenized_input_seq, tokenized_target in zip(tokenized_inputs, tokenized_targets):
    # A target the tokenizer does not know maps to an empty list; skip that
    # sample instead of failing with IndexError on tokenized_target[0].
    if not tokenized_target:
        continue
    input_sequences.append(tokenized_input_seq)
    target_words.append(tokenized_target[0])

In [None]:
# target_words

# Pad sequences

In [None]:
# Labels as a numpy array of word ids.
target_words = np.array(target_words)

# Left-pad every input sequence to the length of the longest one.
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

In [None]:
# Inspect the padded integer input sequences (one row per training sample).
input_sequences

array([[2001,    9, 1035,  702,  335],
       [   9, 1035,  702,  335,  336],
       [1035,  702,  335,  336,  527],
       ...,
       [  48, 1998, 1999,  334,    2],
       [1998, 1999,  334,    2, 2000],
       [1999,  334,    2, 2000, 2002]], dtype=int32)

# One-hot encode target variable

In [None]:
# One-hot encode the targets by picking, for each target id, the matching row
# of an identity matrix. float32 halves the memory of numpy's default float64
# matrix of shape (num_samples, vocab_size).
y = np.eye(vocab_size, dtype=np.float32)[target_words]

In [None]:
# (number of training samples, vocab_size)
y.shape

(11421, 2003)

# Build and train the LSTM model

In [None]:
# NOTE(review): 2003 equals the vocabulary size displayed earlier in this
# notebook. Confirm this is intentional, as embedding dimensions are usually
# chosen independently of (and much smaller than) the vocabulary size.
embedding_dim = 2003

# Next-word classifier: embedding -> two stacked LSTMs -> softmax over the
# vocabulary. `input_length` is not passed to Embedding: the argument is
# deprecated in Keras 3 and the sequence length is taken from the data.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(500, return_sequences=True),
    LSTM(500),
    Dense(vocab_size, activation='softmax')
])

# categorical_crossentropy matches the one-hot encoded targets in `y`.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the History object so the loss/accuracy curves can be inspected later
# (and so its repr is not echoed as the cell output).
history = model.fit(input_sequences, y, epochs=30, verbose=1)



Epoch 1/30
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.0510 - loss: 6.6531
Epoch 2/30
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.0667 - loss: 5.9815
Epoch 3/30
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.0963 - loss: 5.5901
Epoch 4/30
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.1363 - loss: 5.0723
Epoch 5/30
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.1961 - loss: 4.4053
Epoch 6/30
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.2656 - loss: 3.6893
Epoch 7/30
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.3911 - loss: 2.8639
Epoch 8/30
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.5600 - loss: 2.0513
Epoch 9/30
[1m357/357[0m 

<keras.src.callbacks.history.History at 0x79e1a0356ef0>

# Save the model

In [None]:
# Write the trained model to 'pdf_next_word_predictor.h5' (path relative to the
# current working directory).
# NOTE(review): only the model is saved here; the fitted tokenizer is not.
# Reloading the model elsewhere presumably needs the same tokenizer, so confirm
# whether it should be persisted too.
model.save('pdf_next_word_predictor.h5')

# Generate text

In [None]:
def generate_text(seed_text, next_words=5):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    Relies on the module-level ``tokenizer``, ``model`` and
    ``max_sequence_len``. On each step the current text is tokenized, padded
    to ``max_sequence_len`` and fed to the model; the highest-probability
    word is appended.

    Args:
        seed_text: Starting text.
        next_words: Maximum number of words to append.

    Returns:
        str: ``seed_text`` followed by the generated words, separated by
        single spaces. Fewer than ``next_words`` words are appended if the
        model predicts an index with no associated word.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = int(np.argmax(predicted, axis=-1)[0])

        # Direct index -> word lookup instead of scanning word_index.
        word = tokenizer.index_word.get(predicted_word_index)
        if word is None:
            # No word for this index (e.g. the padding index 0). The text
            # would stay unchanged, so further steps would only repeat the
            # same prediction; stop here.
            break
        seed_text += " " + word
    return seed_text

In [None]:
# Demo: extend a seed phrase by up to 15 predicted words and print the result.
seed_text = "You both peered into"
generated_text = generate_text(seed_text, next_words=15)
print("Generated text:", generated_text)

Generated text: You both peered into the cage and you could see that the cat was holding the canary in its
