# Importing Data

In [1]:
data = open("BUISNESS ADVENTURES.txt", "r", encoding= 'utf8').read()
print (data[:56])

Business Adventures
Twelve Classic Tales from the World


In [2]:
text = open("BUISNESS ADVENTURES.txt", 'r', encoding='utf-8').read().lower()
print('corpus length:', len(text))

corpus length: 889256


In [3]:
import string

In [4]:
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    tokens = [' ' if w in string.punctuation else w for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens
 
tokens = clean_doc(text)

number_of_unique_tokens = len(set(tokens))


In [5]:
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % number_of_unique_tokens)
print('These are the first 200 tokens: %s' % tokens[:200])

Total Tokens: 124437
Unique Tokens: 10353
These are the first 200 tokens: ['business', 'adventures', 'twelve', 'classic', 'tales', 'from', 'the', 'world', 'of', 'wall', 'street', 'john', 'brooks', 'contents', 'the', 'fluctuation', 'the', 'little', 'crash', 'in', 'the', 'fate', 'of', 'the', 'edsel', 'a', 'cautionary', 'tale', 'the', 'federal', 'income', 'tax', 'its', 'history', 'and', 'peculiarities', 'a', 'reasonable', 'amount', 'of', 'time', 'insiders', 'at', 'texas', 'gulf', 'sulphur', 'xerox', 'xerox', 'xerox', 'xerox', 'making', 'the', 'customers', 'whole', 'the', 'death', 'of', 'a', 'president', 'the', 'impacted', 'philosophers', 'at', 'ge', 'the', 'last', 'great', 'corner', 'a', 'company', 'called', 'piggly', 'wiggly', 'a', 'second', 'sort', 'of', 'life', 'david', 'businessman', 'stockholder', 'season', 'annual', 'meetings', 'and', 'corporate', 'power', 'one', 'free', 'bite', 'a', 'his', 'and', 'his', 'job', 'in', 'defense', 'of', 'sterling', 'the', 'the', 'and', 'the', 'dollar',

In [6]:
sequence_length = 2

# organize into sequences of tokens of input words plus one output word
length = sequence_length + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)

In [7]:
print ('Total Sequences: %d' % len(sequences))
print ('This is the first sequence: {0}'.format(sequences[0]))

Total Sequences: 124434
This is the first sequence: business adventures twelve


# Basic Liabraries

In [8]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# Applying LSTM Model

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)
# vocab_size = len(tokenizer.word_index) + 1
vocab_size = number_of_unique_tokens + 1
 
sequences0 = np.array(sequences)
X, y = sequences0[:,:-1], sequences0[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [10]:
dimensions_to_represent_word = 100
 
model = Sequential()
model.add(Embedding(vocab_size, sequence_length, input_length=sequence_length))
# We will use a two LSTM hidden layers with 100 memory cells each. 
# More memory cells and a deeper network may achieve better results.
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 2)              20708     
                                                                 
 lstm (LSTM)                 (None, 2, 100)            41200     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 10354)             1045754   
                                                                 
Total params: 1,198,162
Trainable params: 1,198,162
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
model.fit(X, y, batch_size=201, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1f49c5478e0>

In [12]:
print (X.shape)
prediction = model.predict(X[0].reshape(1,sequence_length))
print (prediction.shape)
print (prediction)

(124434, 2)
(1, 10354)
[[2.0884431e-18 1.4248555e-02 5.7897653e-02 ... 0.0000000e+00
  0.0000000e+00 1.3772051e-18]]


In [13]:
test = ['thank you',
'welcome to',
'when there',
'more than',
'it cannot',
'is that',
'although this',
'do you',
'I was',
'the only',
'a great',
'thats very']

In [14]:
for t in test:
    example = tokenizer.texts_to_sequences([t])
    prediction = model.predict(np.array(example))
    predicted_word = np.argmax(prediction)
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    print ("{0} -> {1}".format(t, reverse_word_map[predicted_word]))

thank you -> have
welcome to -> agreement
when there -> was
more than -> the
it cannot -> be
is that -> the
although this -> unexpectedly
do you -> desire
I was -> the
the only -> exchange
a great -> appeal
thats very -> permanent
