## Import Libraries

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM
from keras.utils import to_categorical
from pickle import dump, load
from nltk.corpus import words
import nltk
import spacy
import keras

import numpy as np
import fitz
import string
import PyPDF2
import random
import json
import re

nltk.download('words')
random.seed(42)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\jayaw\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
nlp = spacy.load('en_core_web_lg')
nlp.max_length = 1198623

In [3]:
vocabulary = set(words.words())

## Open and Convert PDF File to Python String

In [4]:
pdf_document = "Datasets\THE MAGIC OF THINKING BIG by David J Schwartz (z-lib.org).pdf"
doc = fitz.open(pdf_document)
text = ""
for page in doc:
    text += page.get_text()
print(text)

CONTENTS
Cover
About the Book
About the Author
Title Page
Dedication
Preface
What This Book Will Do for You
1. Believe You Can Succeed and You Will
2. Cure Yourself of Excusitis, the Failure Disease
3. Build Confidence and Destroy Fear
4. How to Think Big
5. How to Think and Dream Creatively
6. You Are What You Think You Are
7. Manage Your Environment: Go First Class
8. Make Your Attitudes Your Allies
9. Think Right Toward People
10. Get the Action Habit
11. How to Turn Defeat into Victory
12. Use Goals to Help You Grow
13. How to Think like a Leader
Copyright
ABOUT THE BOOK
More than 6 million readers around the world have improved their
lives by reading The Magic of Thinking Big.
First published in 1959, David J Schwartz’s classic teachings are as
powerful today as they were then. Practical, empowering and hugely
engaging, this book will not only inspire you, it will give you the tools to
change your life for the better – starting from now. His step-by-step
approach will show you how

## Data / Text Cleaning

### Create a list of Sentences

In [5]:
text_copy = text
text_copy = text_copy.replace('\n', ' ')
text_copy



In [6]:
sentences = re.split(r'\.\s', text_copy)
sentences

['CONTENTS Cover About the Book About the Author Title Page Dedication Preface What This Book Will Do for You 1',
 'Believe You Can Succeed and You Will 2',
 'Cure Yourself of Excusitis, the Failure Disease 3',
 'Build Confidence and Destroy Fear 4',
 'How to Think Big 5',
 'How to Think and Dream Creatively 6',
 'You Are What You Think You Are 7',
 'Manage Your Environment: Go First Class 8',
 'Make Your Attitudes Your Allies 9',
 'Think Right Toward People 10',
 'Get the Action Habit 11',
 'How to Turn Defeat into Victory 12',
 'Use Goals to Help You Grow 13',
 'How to Think like a Leader Copyright ABOUT THE BOOK More than 6 million readers around the world have improved their lives by reading The Magic of Thinking Big',
 'First published in 1959, David J Schwartz’s classic teachings are as powerful today as they were then',
 'Practical, empowering and hugely engaging, this book will not only inspire you, it will give you the tools to change your life for the better – starting from n

### Filter some Sentences

In [None]:
sentences_copy = []

for idx, i in enumerate(sentences):
    if 'Success means winning admiration' in i:
        sentences_copy = sentences[idx:]  # Keep sentences from this point onward
        break


sentences = sentences_copy
sentences

['Success means winning admiration, leadership, being looked up to by people in your business and social life',
 'Success means freedom: freedom from worries, fears, frustrations, and failure',
 'Success means self-respect, continually finding more real happiness and satisfaction from life, being able to do more for those who depend on you',
 'Success means winning',
 'Success—achievement—is the goal of life! Every human being wants success',
 'Everybody wants the best this life can deliver',
 'Nobody enjoys crawling, living in mediocrity',
 'No one likes feeling second-class and feeling forced to go that way',
 'Some of the most practical success-building wisdom is found in that biblical quotation stating that faith can move mountains',
 'Believe, really believe, you can move a mountain, and you can',
 'Not many people believe that they can move mountains',
 'So, as a result, not many people do',
 'On some occasion you’ve probably heard someone say something like “It’s nonsense to thi

In [8]:
text = ' '.join(sentences)
text



## Split Text into Words (Tokenization)

In [9]:
# Create tokens from the document
tokens = [token.text for token in nlp(text)]
print("Number of words:", len(tokens))
tokens

Number of words: 90278


['Success',
 'means',
 'winning',
 'admiration',
 ',',
 'leadership',
 ',',
 'being',
 'looked',
 'up',
 'to',
 'by',
 'people',
 'in',
 'your',
 'business',
 'and',
 'social',
 'life',
 'Success',
 'means',
 'freedom',
 ':',
 'freedom',
 'from',
 'worries',
 ',',
 'fears',
 ',',
 'frustrations',
 ',',
 'and',
 'failure',
 'Success',
 'means',
 'self',
 '-',
 'respect',
 ',',
 'continually',
 'finding',
 'more',
 'real',
 'happiness',
 'and',
 'satisfaction',
 'from',
 'life',
 ',',
 'being',
 'able',
 'to',
 'do',
 'more',
 'for',
 'those',
 'who',
 'depend',
 'on',
 'you',
 'Success',
 'means',
 'winning',
 'Success',
 '—',
 'achievement',
 '—',
 'is',
 'the',
 'goal',
 'of',
 'life',
 '!',
 'Every',
 'human',
 'being',
 'wants',
 'success',
 'Everybody',
 'wants',
 'the',
 'best',
 'this',
 'life',
 'can',
 'deliver',
 'Nobody',
 'enjoys',
 'crawling',
 ',',
 'living',
 'in',
 'mediocrity',
 'No',
 'one',
 'likes',
 'feeling',
 'second',
 '-',
 'class',
 'and',
 'feeling',
 'forced'

## Create Text Sequences

In [10]:
TRAIN_LEN = 26 # (25 input words and 1 output word)

In [11]:
text_sequences = []

for i in range(TRAIN_LEN, len(tokens)):
    text_sequences.append(tokens[i-TRAIN_LEN:i])

text_sequences

[['Success',
  'means',
  'winning',
  'admiration',
  ',',
  'leadership',
  ',',
  'being',
  'looked',
  'up',
  'to',
  'by',
  'people',
  'in',
  'your',
  'business',
  'and',
  'social',
  'life',
  'Success',
  'means',
  'freedom',
  ':',
  'freedom',
  'from',
  'worries'],
 ['means',
  'winning',
  'admiration',
  ',',
  'leadership',
  ',',
  'being',
  'looked',
  'up',
  'to',
  'by',
  'people',
  'in',
  'your',
  'business',
  'and',
  'social',
  'life',
  'Success',
  'means',
  'freedom',
  ':',
  'freedom',
  'from',
  'worries',
  ','],
 ['winning',
  'admiration',
  ',',
  'leadership',
  ',',
  'being',
  'looked',
  'up',
  'to',
  'by',
  'people',
  'in',
  'your',
  'business',
  'and',
  'social',
  'life',
  'Success',
  'means',
  'freedom',
  ':',
  'freedom',
  'from',
  'worries',
  ',',
  'fears'],
 ['admiration',
  ',',
  'leadership',
  ',',
  'being',
  'looked',
  'up',
  'to',
  'by',
  'people',
  'in',
  'your',
  'business',
  'and',
  'socia

In [12]:
for i in range(5):
    print(f"Text #{i+1}: {(' '.join(text_sequences[i]))}")

Text #1: Success means winning admiration , leadership , being looked up to by people in your business and social life Success means freedom : freedom from worries
Text #2: means winning admiration , leadership , being looked up to by people in your business and social life Success means freedom : freedom from worries ,
Text #3: winning admiration , leadership , being looked up to by people in your business and social life Success means freedom : freedom from worries , fears
Text #4: admiration , leadership , being looked up to by people in your business and social life Success means freedom : freedom from worries , fears ,
Text #5: , leadership , being looked up to by people in your business and social life Success means freedom : freedom from worries , fears , frustrations


## Encode Sequences of Words

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [14]:
for i in range(5):
    print(f"Sequence #{i+1}: {sequences[i]}")

Sequence #1: [85, 597, 911, 2690, 1, 342, 1, 173, 473, 75, 3, 73, 24, 9, 20, 123, 5, 662, 144, 85, 597, 2115, 43, 2115, 60, 2691]
Sequence #2: [597, 911, 2690, 1, 342, 1, 173, 473, 75, 3, 73, 24, 9, 20, 123, 5, 662, 144, 85, 597, 2115, 43, 2115, 60, 2691, 1]
Sequence #3: [911, 2690, 1, 342, 1, 173, 473, 75, 3, 73, 24, 9, 20, 123, 5, 662, 144, 85, 597, 2115, 43, 2115, 60, 2691, 1, 1538]
Sequence #4: [2690, 1, 342, 1, 173, 473, 75, 3, 73, 24, 9, 20, 123, 5, 662, 144, 85, 597, 2115, 43, 2115, 60, 2691, 1, 1538, 1]
Sequence #5: [1, 342, 1, 173, 473, 75, 3, 73, 24, 9, 20, 123, 5, 662, 144, 85, 597, 2115, 43, 2115, 60, 2691, 1, 1538, 1, 2692]


In [15]:
sequences = np.array(sequences)
sequences

array([[  85,  597,  911, ..., 2115,   60, 2691],
       [ 597,  911, 2690, ...,   60, 2691,    1],
       [ 911, 2690,    1, ..., 2691,    1, 1538],
       ...,
       [ 191,   24,  909, ...,    3,   54,   58],
       [  24,  909,   63, ...,   54,   58,    4],
       [ 909,   63,    2, ...,   58,    4,  547]])

In [16]:
VOCABULARY_SIZE = len(tokenizer.word_counts)
VOCABULARY_SIZE

6527

## Train Test Split

In [17]:
X = sequences[:,:-1]
y = sequences[:,-1]

y = to_categorical(y, num_classes=VOCABULARY_SIZE+1)

In [18]:
SEQ_LEN = X.shape[1]

## Create Model

In [19]:
def create_model(vocab_size, sequence_len):
    model = Sequential()
    model.add(Embedding(vocab_size, 25, input_length=sequence_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

## Train Model

In [20]:
model = create_model(VOCABULARY_SIZE+1, SEQ_LEN)
model.summary()
# model.fit(X, y, batch_size=128, epochs=300, verbose=1)



In [21]:
# model.save('lstm.h5')
# dump(tokenizer, open('lstm_tokenizer', 'wb'))



## Load Models

In [22]:
model = load_model('models/lstm.keras')
with open('models/lstm_tokenizer', 'rb') as f:
    tokenizer = load(f)

In [30]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text

    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_word_ind = np.argmax(model.predict(pad_encoded, verbose=0)[0])
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    return ' '.join(output_text)

## Test the Model using a Random Sequence

In [27]:
seed_text = text_sequences[random.randint(0, len(text_sequences))]
seed_text

['of',
 'raw',
 'material',
 'for',
 'your',
 'new',
 'thought',
 'The',
 'teller',
 'in',
 'your',
 'memory',
 'bank',
 'is',
 'tremendously',
 'reliable',
 'He',
 'never',
 'crosses',
 'you',
 'up',
 'If',
 'you',
 'approach',
 'him',
 'and']

In [28]:
seed_text = ' '.join(seed_text)
seed_text

'of raw material for your new thought The teller in your memory bank is tremendously reliable He never crosses you up If you approach him and'

In [31]:
generate_text(model, tokenizer, SEQ_LEN, seed_text, 25)

'say , , , , , , , , , , , , , , , , , , , , , , , ,'