In [None]:
# Objective: The goal of this assignment is to design and implement, in Python, a generative model for text data.
# Given an input prompt, the model should generate new, creative, and grammatically correct sentences.

In [None]:
# Import the necessary libraries
import requests
from bs4 import BeautifulSoup
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Load the training corpus: the plain-text edition of Project Gutenberg
# ebook #64317.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently using an error page as the training text.
r = requests.get('https://www.gutenberg.org/cache/epub/64317/pg64317.txt', timeout=30)
r.raise_for_status()
text_data = r.text

In [None]:
# Flatten the raw text onto one line: every newline, carriage return and tab
# is replaced by a single space (one character for one character).
#
# The third entry is the two-character string backslash + "d". It used to be
# spelled "\d", which is not a Python escape sequence: the interpreter keeps
# the backslash but warns about it, and newer versions are stricter. "\\d" is
# the same string written explicitly, so the result of this cell is unchanged.
# NOTE(review): if the intent was to strip digits, a literal replace cannot do
# that -- it would need re.sub(r"\d", " ", text_data). Confirm with the author.
for char in ["\n", "\r", "\\d", "\t"]:
    text_data = text_data.replace(char, " ")

In [None]:
# Remove the Project Gutenberg introduction (characters 0:1433) and the
# trailing Gutenberg matter (characters 277912 onward) so only the body of
# the book is left. The offsets are character positions in the
# whitespace-flattened download and are specific to this exact file.
BODY_START = 1433
BODY_END = 277912

# The slice is destructive: running this cell a second time would cut into
# the book itself. Only the untrimmed text is long enough to reach BODY_END,
# so fail loudly here rather than silently corrupting the corpus.
if len(text_data) < BODY_END:
    raise ValueError(
        f"text_data has {len(text_data)} characters but the body is expected "
        f"to end at offset {BODY_END}; was this cell already run, or did the "
        "source file change?"
    )
text_data = text_data[BODY_START:BODY_END]

# Show just the two ends of the body -- enough to check both cut points --
# instead of printing the entire book into the notebook output.
print(text_data[:300])
print("...")
print(text_data[-300:])

In [None]:
# Normalise the text for modelling: lowercase it, then strip punctuation.
text_data = text_data.lower()

# Keep ".", "!" and "?". The preprocessing step splits the text with
# sent_tokenize, which relies on these marks to find sentence boundaries; with
# every punctuation mark removed the whole book comes back as one sentence and
# the per-sentence prefix expansion that follows grows quadratically with the
# length of the book.
# The retained marks do not reach the vocabulary: the Keras Tokenizer's
# default `filters` remove them when the sentences are tokenised into words.
text_data = re.sub(r"[^\w\s.!?]", "", text_data)

In [None]:
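# One-time setup: sent_tokenize needs NLTK's "punkt" sentence-splitting data.
# Uncomment and run once per environment. Name the resource explicitly --
# a bare nltk.download() opens the interactive downloader instead.
# import nltk
# nltk.download("punkt")  # newer NLTK releases may ask for "punkt_tab"; check the LookupError text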

In [None]:
# Step 2: Data Preprocessing
# Turn the cleaned text into (context -> next word) training pairs.
sentences = sent_tokenize(text_data)  # Split text into sentences

# Give every distinct word an integer id. Keras never assigns id 0 to a word
# (it is the value used for padding below), hence the +1 on the vocabulary size.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = len(tokenizer.word_index) + 1

# For each sentence emit every prefix of two or more word ids: the last id of
# a prefix is the word to predict, the ids before it are the context.
input_sequences = [
    token_list[:end]
    for token_list in tokenizer.texts_to_sequences(sentences)
    for end in range(2, len(token_list) + 1)
]
if not input_sequences:
    # Without this guard the max() below fails with an unhelpful
    # "max() arg is an empty sequence".
    raise ValueError(
        "No training sequences were produced: text_data needs at least one "
        "sentence containing two or more words."
    )

# Left-pad with zeros so every row is as long as the longest prefix.
# pad_sequences already returns a NumPy array, so it is not wrapped again.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Last column = target word id; everything before it = model input.
# Slicing with -1: keeps the labels as an (n_samples, 1) column.
predictors = input_sequences[:, :-1]
label = input_sequences[:, -1:]

# NOTE(review): the split is made over individual prefixes, so prefixes of the
# same sentence can land in both the training and the test set; the held-out
# score is therefore probably optimistic. Splitting by sentence before
# building the prefixes would avoid that -- confirm whether it matters here.
predictors_train, predictors_test, label_train, label_test = train_test_split(
    predictors, label, test_size=0.2, random_state=42
)

In [None]:
# Step 3: Model Design and Training
# Next-word classifier: word ids -> 100-dimensional embeddings -> one LSTM
# layer with 150 units -> softmax over the whole vocabulary. The targets are
# integer word ids, which is what the sparse categorical cross-entropy expects.
# NOTE(review): recent Keras releases reportedly warn that `input_length` is
# deprecated -- check against the installed version before removing it.
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_len-1),
    LSTM(150),
    Dense(total_words, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Fit on the training split for 50 epochs, printing per-epoch progress.
model.fit(x=predictors_train, y=label_train, epochs=50, verbose=1)

In [None]:
# Step 4: Model Evaluation
# The model is compiled with a loss and no extra metrics, so evaluate() gives
# back a single number: the cross-entropy on the test split. Its exponential
# is reported as the perplexity.
loss = model.evaluate(
    x=predictors_test,
    y=label_test,
    verbose=0,
)
print(f"Perplexity: {np.exp(loss):.2f}")

In [None]:
# Generate text given a seed sentence
def generate_text(seed_text, next_words):
    """Extend ``seed_text`` with up to ``next_words`` words chosen by the model.

    Each step encodes the text produced so far with the notebook-level
    ``tokenizer``, left-pads it to the model's input length
    (``max_sequence_len - 1``) and appends the word that the notebook-level
    ``model`` scores highest, i.e. greedy decoding.

    Args:
        seed_text: Prompt to start from.
        next_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Fewer than ``next_words`` words are appended if the model
        predicts an id that has no word (the padding id).
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # verbose=0: otherwise predict() prints a progress bar for every
        # single generated word.
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_index = int(np.argmax(predicted_probs))
        # index_word is the tokenizer's own id -> word table, so the lookup is
        # one dict access instead of a scan over the whole vocabulary.
        output_word = tokenizer.index_word.get(predicted_index, "")
        if not output_word:
            # No word for this id (e.g. padding id 0). Appending nothing would
            # leave the model input unchanged, so every remaining step would
            # repeat the same prediction; stop instead of adding blanks.
            break
        seed_text += " " + output_word
    return seed_text

In [None]:
# Example text generation
# seed_sentence is the prompt that seeds the generator; next_words is how
# many words to generate after it.
seed_sentence = "The Great Gatsby"
generated_text = generate_text(seed_text=seed_sentence, next_words=100)
print(f"Generated Text: {generated_text}")