## Words Prediction
### Use cases of Words Prediction/text generation

- Search engines
- Chatbots
- Text summarize
- Question answering

## Workflow
1. Import Corpus text data
2. Preprocessing & Feature Engineering
3. Build LSTM Model
4. Train & Evalute Model
5. Predict words on test data

In [32]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [33]:
filename = 'sherlock.txt'
text = open(filename, 'r', encoding='utf-8').read()
text = text.lower().split('\n')

In [34]:
len(text)

13052

In [35]:
print(text[5000:6000])

['shaken than i had ever seen him.', '', '"that hurts my pride, watson," he said at last. "it is a petty', 'feeling, no doubt, but it hurts my pride. it becomes a personal', 'matter with me now, and, if god sends me health, i shall set my', 'hand upon this gang. that he should come to me for help, and that', 'i should send him away to his death--!" he sprang from his chair', 'and paced about the room in uncontrollable agitation, with a', 'flush upon his sallow cheeks and a nervous clasping and', 'unclasping of his long thin hands.', '', '"they must be cunning devils," he exclaimed at last. "how could', 'they have decoyed him down there? the embankment is not on the', 'direct line to the station. the bridge, no doubt, was too', 'crowded, even on such a night, for their purpose. well, watson,', 'we shall see who will win in the long run. i am going out now!"', '', '"to the police?"', '', '"no; i shall be my own police. when i have spun the web they may', 'take the flies, but not before."

In [36]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
total_words = len(tokenizer.word_index) + 1
print(tokenizer.word_index)
print(total_words)

8464


In [37]:
input_sequences = []
for line in text:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [38]:
input_sequences[:10]

[[142, 4687],
 [142, 4687, 1],
 [142, 4687, 1, 987],
 [142, 4687, 1, 987, 5],
 [142, 4687, 1, 987, 5, 125],
 [142, 4687, 1, 987, 5, 125, 33],
 [142, 4687, 1, 987, 5, 125, 33, 47],
 [142, 4687, 1, 987, 5, 125, 33, 47, 558],
 [142, 4687, 1, 987, 5, 125, 33, 47, 558, 2162],
 [142, 4687, 1, 987, 5, 125, 33, 47, 558, 2162, 2163]]

In [39]:
# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
max_sequence_len

17

In [40]:
input_sequences[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,  142, 4687],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,  142, 4687,    1],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,  142, 4687,    1,  987]])

In [41]:
#create predictores and label
X, labels = input_sequences[:,:-1],input_sequences[:,-1]
y = to_categorical(labels, num_classes=total_words)

In [42]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(learning_rate=0.01)
model.compile( optimizer=adam,loss="categorical_crossentropy", metrics=['acccuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 16, 100)           846400    
                                                                 
 lstm_3 (LSTM)               (None, 100)               80400     
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_3 (Dense)             (None, 8464)              854864    
                                                                 
Total params: 1781664 (6.80 MB)
Trainable params: 1781664 (6.80 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
histroy=model.fit(X, y, batch_size=128, epochs=10)

Epoch 1/10


TypeError: in user code:

    File "C:\Users\acer\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\acer\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\acer\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\acer\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1085, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "C:\Users\acer\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1179, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "C:\Users\acer\anaconda3\lib\site-packages\keras\src\engine\compile_utils.py", line 605, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "C:\Users\acer\anaconda3\lib\site-packages\keras\src\utils\metrics_utils.py", line 77, in decorated
        update_op = update_state_fn(*args, **kwargs)
    File "C:\Users\acer\anaconda3\lib\site-packages\keras\src\metrics\base_metric.py", line 140, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "C:\Users\acer\anaconda3\lib\site-packages\keras\src\metrics\base_metric.py", line 723, in update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)

    TypeError: 'str' object is not callable


In [27]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted=np.argmax(model.predict(token_list), axis=1)
        
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
        return seed_text.title()

In [29]:
print(generate_text("the adventure of the blue carbuncle ", 3, model, max_sequence_len))
print(generate_text("the adventure of the blue carbuncle ", 5, model, max_sequence_len))
print(generate_text("the adventure of the blue carbuncle ", 10, model, max_sequence_len))

The Adventure Of The Blue Carbuncle  He'D
The Adventure Of The Blue Carbuncle  He'D
The Adventure Of The Blue Carbuncle  He'D
