# Persona Dialog Generation

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes, so edits to the local `persona` package
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from persona.preprocess import prepare_json_data, pad_sequences, integer_encode, one_hot_encode, one_hot_encode_target
import random

## Preprocess the data

In [3]:
# Sequence length (in tokens) shared by data preparation, padding and encoding.
MAX_SEQ_LEN = 15

# Read the dialog pairs from the JSON file; this returns one word model per
# side of the dialog plus the list of (input, output) sentence pairs.
input_word_model, output_word_model, pairs = prepare_json_data(
    'input',
    'output',
    './data/persona.dialog.json',
    MAX_SEQ_LEN,
)

READ 804 sentence pairs
Trimmed to 715 sentence pairs
Counting words...
Counted Words:
input 261
output 131


In [4]:
# Split the pairs into parallel lists of input and output sentences.
input_seqs = [pair[0] for pair in pairs]
output_seqs = [pair[1] for pair in pairs]

# Spot-check ten randomly chosen pairs (sampled with replacement).
for i in range(10):
    # randrange excludes its upper bound; randint(0, len(pairs)) could return
    # len(pairs) itself and raise IndexError on the lookups below.
    rando = random.randrange(len(pairs))
    print(input_seqs[rando], "-->", output_seqs[rando])

:open.path: start a file manager in {path} --> opening {path} in file manager.. .
:date: is today {weekday} --> today is the {day}{day suffix} of {month} .
:remove.all: clear the todo --> i ve removed all tasks from your todo list .
:say: repeat the phrase {words} --> {words} .
:update: regenerate {skill} s intent cache --> the intent cache for {skill} has been updated .
:show: what s on my todo --> the following are on your todo list: {tasks}
:open.path: open file manager to {path} --> opening {path} in file manager.. .
:pair: re pair my device --> please pair with the following code: {code}
:say: repeat the word {words} --> {words} .
:mem.total: total ram --> {total short} of memory is installed .


#### Pad the input and output
This step adds the PAD, SOS, EOS, and UNK tokens to each sequence, padding it to `MAX_SEQ_LEN` tokens.

In [5]:
# Pad the input sentences to MAX_SEQ_LEN tokens and print one example to
# check where the special tokens were placed.
padded_input = pad_sequences(input_seqs, MAX_SEQ_LEN)
print(padded_input[1])

['SOS', ':why.did.skill.fail:', 'why', 'did', '{skill}', 'fail', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


In [6]:
# Pad the output sentences the same way and print the example at the same
# index so it can be compared with its input.
padded_output = pad_sequences(output_seqs, MAX_SEQ_LEN)
print(padded_output[1])

['SOS', '{skill}', 'failed', 'because', 'of', '{error}', '.', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


#### One-hot encode

In [7]:
# One-hot encode the padded inputs against the input word model.
# The displayed shape is (sentence pairs, MAX_SEQ_LEN, input vocabulary size).
one_hot_input = one_hot_encode(padded_input, input_word_model, MAX_SEQ_LEN)
one_hot_input.shape

(715, 15, 261)

In [8]:
# One-hot encode the padded outputs against the output word model, print the
# resulting shape, then display the encoding of one example sentence.
one_hot_output = one_hot_encode(padded_output, output_word_model, MAX_SEQ_LEN)
print(one_hot_output.shape)
one_hot_output[1]

(715, 15, 131)


array([[ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [9]:
# Build the training targets from the same padded outputs.
# NOTE(review): presumably the outputs shifted one timestep ahead for
# teacher forcing; confirm in persona.preprocess.one_hot_encode_target.
one_hot_target = one_hot_encode_target(padded_output, output_word_model, MAX_SEQ_LEN)
one_hot_target.shape

(715, 15, 131)

## Build and Train the Model

In [10]:
from persona.model.dialog import DialogModel

Using TensorFlow backend.


In [11]:
# Build the dialog model from the encoded inputs, outputs and targets.
model = DialogModel(one_hot_input, one_hot_output, one_hot_target)
# The encoder and decoder are each configured with the word count of their
# own vocabulary.
model.encoder(input_word_model.n_words)
model.decoder(output_word_model.n_words)
# Train for 100 epochs with the Adam optimizer.
# NOTE(review): summary=True is presumably what prints the layer table
# before the epoch log; confirm in DialogModel.train.
model.train(optimizer='adam', epochs=100, summary=True)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, None, 261)     0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, None, 131)     0                                            
____________________________________________________________________________________________________
lstm_1 (LSTM)                    [(None, 128), (None,  199680      input_1[0][0]                    
____________________________________________________________________________________________________
lstm_2 (LSTM)                    [(None, None, 128), ( 133120      input_2[0][0]                    
                                                                   lstm_1[0][1]            

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [12]:
from persona.preprocess import pad_sequence

# Interactive loop: read an utterance, pad and one-hot encode it with the
# input word model, then print the decoded response and its confidence.
# Interrupt the kernel (KeyboardInterrupt) to leave the loop.
try:
    while True:
        _input = input("input: ")
        # Use a dedicated name for the padded query. Rebinding `padded_input`
        # here would overwrite the padded training inputs, so re-running the
        # one-hot encoding cell afterwards would encode only the last query.
        padded_query = [pad_sequence(_input, MAX_SEQ_LEN).split()]
        one_hot = one_hot_encode(padded_query, input_word_model, MAX_SEQ_LEN)
        prediction, confidence = model.decode(one_hot, output_word_model, MAX_SEQ_LEN)
        print("response: ", prediction, confidence)
except KeyboardInterrupt:
    # Deliberate: interrupting the kernel is how the user ends the session.
    pass

input: :pair: re pair my device
response:  i ve removed all tasks from your todo list . 0.674955844879
input: :open.path: start a file manager in {path}
response:  opening {path} in file manager.. . 0.989449818929
input: :say: repeat the word {words}
response:  {words} . 0.972003757954
input: :show: what s on my todo
response:  the following are on your todo list: {tasks} 0.986812509596
