In [1]:
#CREATE A TEXT GENERATION MODEL USING GPT OR LSTM TO GENERATE COHERENT PARAGRAPHS ON SPECIFIC TOPICS.
#DELIVERABLE: A NOTEBOOK DEMONSTRATING GENERATED TEXT BASED ON USER PROMPTS.

#Text generation is a technique that involves the creation of human-like text using artificial intelligence 
#  and machine learning algorithms. It enables computers to generate coherent and contextually relevant text 
#  based on patterns and structures learned from existing textual data.

#Text generation is a fascinating application of deep learning in natural language processing (NLP). 
#  It involves training a model on a given text dataset, which can then generate new, coherent sequences 
#  of text based on the patterns it has learned.

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Define the Sample text that will be used to train the model 

input_text ="""In a survey of nearly 75,000 adults, researchers compared the participants' preferred sleep timing, known as chronotype, with their actual sleep behavior. They determined that regardless of one's preferred bedtime, everyone benefits from turning in early. Morning larks and night owls alike tended to have higher rates of mental and behavioral disorders if they stayed up late," the researchers from Stanford Medicine have said."""

In [3]:
# To train the model, the input text needs to be converted into a numerical format. 
# This is done using Keras’s Tokenizer, which converts the words in the text to sequences of integers. 
# Each unique word is assigned a specific index, and the tokenizer creates a mapping of words to indices.

# Tokenize the text
tokenizer = Tokenizer()

#fit_on_texts(): Creates the word-to-index mapping
tokenizer.fit_on_texts([input_text])

#Stores the number of unique words in the given input text (plus one for padding).
total_words = len(tokenizer.word_index) + 1

In [4]:
#fit_on_texts used in conjunction with texts_to_sequences produces the one-hot encoding for a text
# Create sequences for text generation
# For example 'Morning Larks and night' will generate the below sequences
#"Morning Larks"
#"Morning Larks and"
#"Morning Larks and night"

input_sequences = []
for line in input_text.split('.'):
   token_list = tokenizer.texts_to_sequences([line])[0]
   for i in range(1, len(token_list)):
       n_gram_sequence = token_list[:i+1]
       input_sequences.append(n_gram_sequence)

In [5]:
input_sequences

[[2, 11],
 [2, 11, 12],
 [2, 11, 12, 1],
 [2, 11, 12, 1, 13],
 [2, 11, 12, 1, 13, 14],
 [2, 11, 12, 1, 13, 14, 15],
 [2, 11, 12, 1, 13, 14, 15, 16],
 [2, 11, 12, 1, 13, 14, 15, 16, 3],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4, 18],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4, 18, 5],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4, 18, 5, 6],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4, 18, 5, 6, 19],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4, 18, 5, 6, 19, 20],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4, 18, 5, 6, 19, 20, 21],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4, 18, 5, 6, 19, 20, 21, 22],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4, 18, 5, 6, 19, 20, 21, 22, 23],
 [2, 11, 12, 1, 13, 14, 15, 16, 3, 17, 4, 18, 5, 6, 19, 20, 21, 22, 23, 24],
 [2,
  11,
  12,
  1,
  13,
  14,
  15,
  16,
  3,
  17,
  4,
  18,
  5,
  6,
  19,
  20,
  21,
  22,
  23,
  24,
  25],
 [2,
  11,
  12,
  1,
  13,
  14,
  15,
  16,
  3,
  17,
  

In [6]:
# The above command split each sentence into sequences of increasing length, creating the n-gram sequences necessary for training.
# The sequences generated in the previous step are of varying lengths. Since the model requires all input sequences 
# to be of the same length, we pad them to a uniform size using pad_sequences(). 
# The padding is done at the beginning of the sequences to ensure that the sequences are aligned properly.
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [7]:
# input sequences are now padded with zeros
input_sequences

array([[ 0,  0,  0, ...,  0,  2, 11],
       [ 0,  0,  0, ...,  2, 11, 12],
       [ 0,  0,  0, ..., 11, 12,  1],
       ...,
       [ 0,  0, 36, ...,  8, 52, 53],
       [ 0, 36, 37, ..., 52, 53, 10],
       [36, 37,  9, ..., 53, 10, 54]])

In [8]:
# Preparing the input and output for the model 
# Each sequence is split into input (X) and output (y). 
# The input consists of all words in the sequence except the last one
# the output is the last word in the sequence. 
# The output (y) is one-hot encoded to allow the model to predict the next word from a vocabulary of all possible words.
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [9]:
# Build LSTM model for text generation
# Embedding Layer - Converts the input word indices into dense vectors of fixed size.
# LSTM Layer - Processes the input sequences and learns the temporal relationships between words.
# Dense Layer - Outputs a probability distribution over the vocabulary using a softmax activation function, which predicts the next word in the sequence.

model_textgen_lstm = Sequential()
model_textgen_lstm.add(Embedding(total_words, 64, input_length=max_sequence_len-1))
model_textgen_lstm.add(LSTM(100))
model_textgen_lstm.add(Dense(total_words, activation='softmax'))



In [10]:
# Compile the model using the Adam optimizer and categorical crossentropy as the loss function. 
model_textgen_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model for 100 epochs.
model_textgen_lstm.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 60ms/step - accuracy: 0.0000e+00 - loss: 4.0086
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.0743 - loss: 4.0015
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.1381 - loss: 3.9953
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.0850 - loss: 3.9880
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.0954 - loss: 3.9786
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.0954 - loss: 3.9666
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.0850 - loss: 3.9508
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.0638 - loss: 3.9183
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x2139939f0e0>

In [11]:
# A trained model can be used to generate new text. 
# The function generate_text() takes a seed text and generates a specified number of new words by predicting
#   the next word repeatedly, updating the seed text with the new predictions.
def generate_text(seed_text, next_words, max_sequence_len):
   for _ in range(next_words):
       token_list = tokenizer.texts_to_sequences([seed_text])[0]
       token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
       predicted = np.argmax(model_textgen_lstm.predict(token_list), axis=-1)
       output_word = ""
       for word, index in tokenizer.word_index.items():
           if index == predicted:
               output_word = word
               break
       seed_text += " " + output_word
   return seed_text

In [12]:
# Test your LSTM model. Pass a seed text and see how the model generate the text
print(generate_text("Morning larks", 20, max_sequence_len))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 372ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5