<a href="https://colab.research.google.com/github/KevinJayne2023/LSTM_text_generation/blob/main/LSTM_Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
# Load the large English pipeline; the listed components are switched off
# at load time because the notebook only reads token.text from the result.
nlp = spacy.load(
    'en_core_web_lg',
    disable=["tagger", "ner", "lemmatizer"],
)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read in the text file with the read_file function.
def read_file(filepath, encoding='utf-8'):
    """Return the entire contents of a text file as one string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded contents of the file.
    """
    # The context manager closes the file even if read() raises.
    with open(filepath, encoding=encoding) as f:
        return f.read()

## Tokenize and Clean Text

In [None]:
# Use the separate_punc function to remove the punctuation.
def separate_punc(md_text):
    """Tokenize md_text with the spaCy pipeline and return lowercased tokens.

    A token is dropped when its text occurs as a substring of the filter
    string below. This is a substring test on the whole token text, not a
    per-character check, so it removes whitespace runs and standalone
    punctuation tokens that appear in that string.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'
    kept = []
    for tok in nlp(md_text):
        if tok.text in unwanted:
            continue
        kept.append(tok.text.lower())
    return kept

In [None]:
# Pass in the first four chapters of Moby Dick to the read_file function.
md_text = read_file('/content/drive/MyDrive/moby_dick_four_chapters.txt')
# Clean and tokenize the text using the separate_punc function.
tokens = separate_punc(md_text)

In [None]:
len(tokens)

11338

In [None]:
# Look over the tokens to make sure all the punctuation was removed.
print(tokens[:300])

['call', 'me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', 'i', 'thought', 'i', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', 'it', 'is', 'a', 'way', 'i', 'have', 'of', 'driving', 'off', 'the', 'spleen', 'and', 'regulating', 'the', 'circulation', 'whenever', 'i', 'find', 'myself', 'growing', 'grim', 'about', 'the', 'mouth', 'whenever', 'it', 'is', 'a', 'damp', 'drizzly', 'november', 'in', 'my', 'soul', 'whenever', 'i', 'find', 'myself', 'involuntarily', 'pausing', 'before', 'coffin', 'warehouses', 'and', 'bringing', 'up', 'the', 'rear', 'of', 'every', 'funeral', 'i', 'meet', 'and', 'especially', 'whenever', 'my', 'hypos', 'get', 'such', 'an', 'upper', 'hand', 'of', 'me', 'that', 'it', 'requires', 'a', 'strong', 'moral', 'principle', 'to', 'prevent', 'me', 'from', '

In [None]:
# Count the tokens that still contain the substring "?--".
count = sum(1 for token in tokens if "?--" in token)
print(count)

6


## Create Sequences of Tokens

In [None]:
# Organize the tokens into overlapping training sequences.
# Each sequence holds 26 tokens: 25 input words plus the 26th word to predict.
train_len = 26

# Slide a window of train_len tokens over the text one token at a time.
# Each window covers tokens[end - train_len:end], with `end` running from
# train_len up to len(tokens) - 1.
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens))
]

In [None]:
# Print the first 3 sequences of 26 words, one per line.
for seq_idx in range(3):
    print(text_sequences[seq_idx])

['call', 'me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on']
['me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore']
['ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', 'i']


In [None]:
# Join each of the first 3 sequences into a single space-separated string.
for seq_idx in range(3):
    print(' '.join(text_sequences[seq_idx]))

call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on
me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore
ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i


In [None]:
# The list of text_sequences should be 26 less than the total tokens.
len(text_sequences)

11312

## Tokenization with Keras

In [None]:
# Import the Keras tokenization to format the data from words into a numerical format.
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Initialize the Keras Tokenizer class and map each word with an index
tokenizer = Tokenizer()

tokenizer.fit_on_texts(text_sequences)

In [None]:
# Get the dictionary mapping of words to their indices
print(tokenizer.word_index)

{'the': 1, 'a': 2, 'and': 3, 'of': 4, 'i': 5, 'to': 6, 'in': 7, 'it': 8, 'that': 9, 'he': 10, 'his': 11, 'was': 12, 'but': 13, 'me': 14, 'with': 15, 'as': 16, 'at': 17, 'this': 18, 'you': 19, 'is': 20, 'all': 21, 'for': 22, 'my': 23, 'on': 24, 'be': 25, "'s": 26, 'not': 27, 'from': 28, 'there': 29, 'one': 30, 'up': 31, 'what': 32, 'him': 33, 'so': 34, 'bed': 35, 'now': 36, 'about': 37, 'no': 38, 'into': 39, 'by': 40, 'were': 41, 'out': 42, 'or': 43, 'harpooneer': 44, 'had': 45, 'then': 46, 'have': 47, 'an': 48, 'upon': 49, 'little': 50, 'some': 51, 'old': 52, 'like': 53, 'if': 54, 'they': 55, 'would': 56, 'do': 57, 'over': 58, 'landlord': 59, 'thought': 60, 'room': 61, 'when': 62, 'could': 63, "n't": 64, 'night': 65, 'here': 66, 'head': 67, 'such': 68, 'which': 69, 'man': 70, 'did': 71, 'sea': 72, 'time': 73, 'other': 74, 'very': 75, 'go': 76, 'these': 77, 'more': 78, 'though': 79, 'first': 80, 'sort': 81, 'said': 82, 'last': 83, 'down': 84, 'most': 85, 'been': 86, 'never': 87, 'your':

In [None]:
# Get the dictionary of words and the number of times they appear in the text.
print(tokenizer.word_counts)

OrderedDict([('call', 27), ('me', 2471), ('ishmael', 133), ('some', 758), ('years', 135), ('ago', 84), ('never', 449), ('mind', 164), ('how', 321), ('long', 374), ('precisely', 37), ('having', 142), ('little', 767), ('or', 950), ('no', 1003), ('money', 120), ('in', 5647), ('my', 1786), ('purse', 71), ('and', 9646), ('nothing', 281), ('particular', 152), ('to', 6497), ('interest', 24), ('on', 1716), ('shore', 26), ('i', 7150), ('thought', 676), ('would', 702), ('sail', 104), ('about', 1014), ('a', 10377), ('see', 416), ('the', 15540), ('watery', 26), ('part', 234), ('of', 8287), ('world', 234), ('it', 4238), ('is', 1950), ('way', 390), ('have', 806), ('driving', 26), ('off', 416), ('spleen', 26), ('regulating', 26), ('circulation', 26), ('whenever', 130), ('find', 78), ('myself', 416), ('growing', 26), ('grim', 26), ('mouth', 130), ('damp', 78), ('drizzly', 26), ('november', 26), ('soul', 78), ('involuntarily', 52), ('pausing', 52), ('before', 364), ('coffin', 104), ('warehouses', 52), 

In [None]:
# What is the size of the vocabulary
vocabulary_size = len(tokenizer.word_counts)
print(vocabulary_size)

2718


In [None]:
# Encode each word in the text_sequences to the indices.
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# Get the encoded indices for the first 26 words
print(sequences[0])

[956, 14, 263, 51, 261, 408, 87, 219, 129, 111, 954, 260, 50, 43, 38, 314, 7, 23, 546, 3, 150, 259, 6, 2713, 14, 24]


In [None]:
# Get the word associated with the indices for the first sequence.
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
314 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2713 : interest
14 : me
24 : on


In [None]:
# Get the word associated with a specific index.
tokenizer.index_word.get(953)

'wrapped'

In [None]:
# Number of times the word "call" appears in the text.
tokenizer.word_counts['call']

27

## Convert the List of Sequences to Arrays.

In [None]:
# Import numpy to convert the list of sequences to arrays.
import numpy as np

In [None]:
# Convert the list of 26-word sequences (a list of lists) to a NumPy array.
num_sequences = np.array(sequences)
print(num_sequences)

[[ 956   14  263 ... 2713   14   24]
 [  14  263   51 ...   14   24  957]
 [ 263   51  261 ...   24  957    5]
 ...
 [ 952   12  166 ...  262   53    2]
 [  12  166 2712 ...   53    2 2718]
 [ 166 2712    3 ...    2 2718   26]]


In [None]:
len(num_sequences)

11312

In [None]:
# Get the first array.
print(num_sequences[0])

[ 956   14  263   51  261  408   87  219  129  111  954  260   50   43
   38  314    7   23  546    3  150  259    6 2713   14   24]


## Create input sequences and one-hot encode the target variable.

In [None]:
# Import the to_categorical function to convert the arrays to binary values.
import keras
from keras.utils import to_categorical

In [None]:
# Preview the first 25 numbers of each array; these columns become our X.
# Only the first 5 rows are printed: looping over every row writes one
# output block per sequence and floods the notebook output.
for sequence in num_sequences[:5, :-1]:
    print(sequence)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[ 170    1 2354    1  379 2355    3   21    1 2356  835   41   75 2357
   34    9    5   60   18  196  108  128    2   75 2358]
[   1 2354    1  379 2355    3   21    1 2356  835   41   75 2357   34
    9    5   60   18  196  108  128    2   75 2358   50]
[2354    1  379 2355    3   21    1 2356  835   41   75 2357   34    9
    5   60   18  196  108  128    2   75 2358   50 2359]
[   1  379 2355    3   21    1 2356  835   41   75 2357   34    9    5
   60   18  196  108  128    2   75 2358   50 2359   43]
[ 379 2355    3   21    1 2356  835   41   75 2357   34    9    5   60
   18  196  108  128    2   75 2358   50 2359   43 2360]
[2355    3   21    1 2356  835   41   75 2357   34    9    5   60   18
  196  108  128    2   75 2358   50 2359   43 2360   22]
[   3   21    1 2356  835   41   75 2357   34    9    5   60   18  196
  108  128    2   75 2358   50 2359   43 2360   22   11]
[  21    1 2356  835   41   75 2357   3

In [None]:
# Preview the last number (number 26) of each array; this column becomes our y value.
# Only the first 5 targets are printed: looping over every row writes one
# output line per sequence and floods the notebook output.
for sequence in num_sequences[:5, -1]:
    print(sequence)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
98
38
239
4
23
44
59
82
5
32
81
4
2
511
20
10
226
10
337
368
68
211
269
8
12
36
241
49
2078
523
1
59
2079
144
15
11
2080
2081
3
101
6
25
2082
2083
17
133
709
23
838
38
10
2084
812
10
26
48
2085
839
840
6
35
3
840
6
2086
328
10
26
1
839
32
2087
1
2088
13
6
65
10
192
42
2
394
19
94
3
5
57
64
94
32
24
841
2089
33
34
211
329
134
25
10
392
64
311
11
67
392
64
311
11
2090
81
4
2
2091
275
20
18
19
97
306
14
307
39
2
2092
2093
57
19
2094
6
137
59
9
18
44
20
2095
842
18
2096
466
65
43
184
395
216
7
394
11
67
418
18
224
9
26
954
8
82
1
59
3
5
274
33
10
63
64
311
8
66
1
2097
26
2098
15
32
532
362
15
123
6
25
293
336
64
29
118
345
123
7
1
165
5
91
19
32
8
20
59
82
5
442
2099
19
784
180
190
2100
9
2101
6
14
2102
27
270
134
25
27
333
42
2
2103
3
843
2
2104
13
5
2105
844
19
162
25
845
521
54
9
470
44
2106
19
2
2107
135
11
67
5
162
846
8
22
33
82
5
36
703
39
2
2108
144
17
18
347
2109
4
1
59
26
8
26
847
2110
82
10
847
82
2111
57
19
276
21

In [None]:
# X to be the first 25 numbers of each array.
X = num_sequences[:,:-1]
# y to be the last number of each array.
y = num_sequences[:,-1]

In [None]:
# Shape of X
print(X.shape)

seq_len = X.shape[1]
print(seq_len)

(11312, 25)
25


In [None]:
# Shape of y
y.shape

(11312,)

In [None]:
# Next, one-hot encode the target variable to transform each index to a binary vector.
# The tokenizer's word indices start at 1 and run up to vocabulary_size, so
# vocabulary_size + 1 classes are needed for the largest index to fit
# (class 0 is never used as a target).
y = to_categorical(y, num_classes=vocabulary_size+1)

In [None]:
# Get the shape of y again.
y.shape

(11312, 2719)

In [None]:
# Print the first 25 binary values in the first array.
print(y[0,:25])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1.]


## LSTM  Model

In [None]:
# Import the dependencies for LSTM model.
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

def create_model(vocabulary_size, seq_len):
    """Build, compile and summarize a two-layer LSTM next-word classifier.

    Args:
        vocabulary_size: Number of output classes of the final softmax layer;
            also passed as the input dimension of the embedding layer.
        seq_len: Number of token indices in each input sequence.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    stack = (
        Embedding(vocabulary_size, 25, input_length=seq_len),
        LSTM(150, return_sequences=True),
        Dropout(0.2),  # dropout after the first LSTM layer
        LSTM(150),
        Dropout(0.2),  # dropout after the second LSTM layer
        Dense(150, activation='leaky_relu'),
        Dense(vocabulary_size, activation='softmax'),
    )

    model = Sequential()
    for layer in stack:
        model.add(layer)

    # Batch dimension is left open; each sample is a row of seq_len indices.
    model.build((None, seq_len))
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()

    return model

In [None]:
model = create_model(vocabulary_size + 1, seq_len)



In [None]:
# Stop training once the training loss has not improved for 5 consecutive
# epochs, and restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# Train on the full data set. No validation_data is passed: evaluating on the
# same (X, y) used for training adds an extra pass over the data every epoch
# without giving an independent estimate, and the callback monitors 'loss'.
model.fit(X, y, epochs=75, batch_size=50, callbacks=[early_stopping])

Epoch 1/75
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 206ms/step - accuracy: 0.0724 - loss: 5.7188 - val_accuracy: 0.0736 - val_loss: 5.6136
Epoch 2/75
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 242ms/step - accuracy: 0.0717 - loss: 5.6776 - val_accuracy: 0.0742 - val_loss: 5.5329
Epoch 3/75
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 201ms/step - accuracy: 0.0755 - loss: 5.5622 - val_accuracy: 0.0763 - val_loss: 5.4232
Epoch 4/75
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 205ms/step - accuracy: 0.0745 - loss: 5.4571 - val_accuracy: 0.0805 - val_loss: 5.3448
Epoch 5/75
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 201ms/step - accuracy: 0.0798 - loss: 5.4182 - val_accuracy: 0.0843 - val_loss: 5.2418
Epoch 6/75
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 242ms/step - accuracy: 0.0819 - loss: 5.3478 - val_accuracy: 0.0827 - val_loss: 5.1472
Epoch 7/75

In [None]:
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarize a two-layer LSTM next-word classifier.

    Args:
        vocabulary_size: Number of output classes of the final softmax layer;
            also passed as the input dimension of the embedding layer.
        seq_len: Number of token indices in each input sequence.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='leaky_relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # The first entry of the input shape is the batch dimension. It must stay
    # open (None) so the model accepts any batch size; the vocabulary size
    # does not belong here.
    model.build((None, seq_len))
    model.summary()

    return model

## Training the Model

In [None]:
# Define the model and pass in the vocabulary (+1) and the seq_len (25 words).
model = create_model(vocabulary_size + 1, seq_len)



In [None]:
model.fit(X, y, batch_size=128, epochs=300,verbose=1)

Epoch 1/300


KeyboardInterrupt: 

In [None]:
from pickle import dump

In [None]:
# Save the model to file
model.save('four_chapters_moby_dick_model_300.keras')
# Save the tokenizer; the context manager flushes and closes the file once
# the pickle has been written.
with open('four_chapters_moby_dick_tokenizer_300', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

## Generating New Text

In [None]:
# Import the dependencies needed for the LSTM.
from random import randint
from pickle import load
from keras.models import load_model
# May need to run `pip install Keras-Preprocessing` first.
from keras_preprocessing.sequence import pad_sequences

In [None]:
def _pad_to_window(token_ids, seq_len):
    '''Return a (1, seq_len) int32 array holding the last seq_len ids, left-padded with 0.'''
    # Guard seq_len == 0: a [-0:] slice would return the whole list.
    window = list(token_ids)[-seq_len:] if seq_len > 0 else []
    padded = np.zeros((1, seq_len), dtype='int32')
    if window:
        padded[0, seq_len - len(window):] = window
    return padded


def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate text using a trained language model.

    INPUTS:
     - model: Trained language model (e.g., LSTM) capable of text generation.
              Its predict() must return one row of class probabilities per
              input row, where class i corresponds to tokenizer.index_word[i].
     - tokenizer: Tokenizer that was fit on text data.
     - seq_len: Length of the training sequences used to train the model.
     - seed_text: A raw string text serving as the seed for text generation.
     - num_gen_words: The number of words to be generated by model.

    RETURNS:
     - The generated words joined by single spaces (the seed is not included).
    '''

    # Final Output
    output_text = []

    # Encode the seed once and keep a running list of word indices. Appending
    # each predicted index avoids re-tokenizing an ever-growing string and
    # avoids pushing generated words through the tokenizer's text filters.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])

    # Create num_gen_words
    for _ in range(num_gen_words):

        # Keep the most recent seq_len indices, left-padded with zeros.
        pad_encoded = _pad_to_window(token_ids, seq_len)

        # Predict class probabilities for each word
        pred_w = np.asarray(model.predict(pad_encoded, verbose=0)[0])

        # Index 0 is the padding value and has no entry in index_word, so it
        # is excluded from the argmax instead of raising a KeyError below.
        pred_word_ind = int(np.argmax(pred_w[1:], axis=-1)) + 1

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Shift the input window one step by appending the new word's index.
        token_ids.append(pred_word_ind)

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

## Test: Grab a random seed sequence

In [None]:
# Import the random module.
import random

In [None]:
# Pick a random sequence of 26 words.

# Join the words


In [None]:
# Import the load_model method.
from keras.models import load_model

In [None]:
# Set the model to the saved trained 300 epoch model.

# Set the tokenizer to the trained tokenizer from the model.


In [None]:
# Call the generate_text function and pass in the required parameters. We set the num_gen_words = 25.


- **The next 25 words aren't that accurate.**

## Explore Generating Text

In [None]:
# Choose a 26 word text string from the first four chapters of Moby Dick.
# If less than 26 the accuracy is off.
text = """provide 25 words from the first four chapters of Mody Dick"""

In [None]:
# Create tokens by using the separate_punc function.

# Join the tokens and set them to the "seed_text" variable.


['seeing', 'now', 'that', 'there', 'were', 'no', 'curtains', 'to', 'the', 'window', 'and', 'that', 'the', 'street', 'being', 'very', 'narrow', 'the', 'house', 'opposite', 'commanded', 'a', 'plain', 'view', 'into', 'the', 'room', 'and']
seeing now that there were no curtains to the window and that the street being very narrow the house opposite commanded a plain view into the room and


In [None]:
# Call the generate_text function and pass in the required parameters. Set the `num_gen_words` to 25.
# Build the seed from `text` with the same cleaning used for the training tokens.
seed_text = ' '.join(separate_punc(text))
generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=25)

**Question: How would we gain better accuracy for the next 50 words?**

- Increase or decrease the length of the sequence?
- Decrease the batch size?