In [1]:
import numpy as np
import csv
import os
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import backend as K

In [2]:
def load_dataset(file, encoding=None):
    """Read a CSV file and return the last two fields of every row.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV file to read.
    encoding : str, optional
        Text encoding forwarded to ``open``. The default ``None`` keeps the
        platform default encoding, i.e. the behaviour of a plain ``open(file)``.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(n_rows, 2)`` containing the last two fields
        of each row. A file without any data rows yields shape ``(0, 2)``.

    Raises
    ------
    ValueError
        If a non-blank row has fewer than two fields.
    """
    pairs = []
    # The csv module requires newline='' so that line breaks embedded in
    # quoted fields reach the reader untouched.
    with open(file, newline='', encoding=encoding) as handle:
        for row_number, row in enumerate(csv.reader(handle), start=1):
            if not row:
                # csv.reader yields [] for blank lines; they carry no data.
                continue
            if len(row) < 2:
                raise ValueError(
                    "row %d of %r has fewer than two fields" % (row_number, file)
                )
            # Slice per row so rows with differing field counts still align.
            pairs.append(row[-2:])
    if not pairs:
        # Keep the two-column contract even when there is nothing to load.
        return np.empty((0, 2), dtype=str)
    return np.array(pairs)

# fit a tokenizer
def create_tokenizer(lines):
    """Build a word-level Keras ``Tokenizer`` and fit it on ``lines``.

    ``lines`` is handed unchanged to ``Tokenizer.fit_on_texts``; the fitted
    tokenizer is returned.
    """
    word_tokenizer = Tokenizer(char_level=False)
    word_tokenizer.fit_on_texts(lines)
    return word_tokenizer

# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``.

    Each element of ``lines`` is split with ``str.split()`` and the longest
    resulting word list determines the result. An empty ``lines`` returns 0
    instead of raising ``ValueError``.
    """
    return max((len(line.split()) for line in lines), default=0)

# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` is ``integer``.

    Returns the first matching word in the mapping's iteration order, or
    ``None`` when no word carries that index.
    """
    matches = (
        token
        for token, position in tokenizer.word_index.items()
        if position == integer
    )
    return next(matches, None)

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for ``source`` into a sentence.

    Parameters
    ----------
    model
        Object exposing ``predict(source, verbose=0)``; only the first
        element of the returned batch is decoded.
    tokenizer
        Object exposing a ``word_index`` mapping of word -> integer index.
    source
        Encoded input forwarded unchanged to ``model.predict``.

    Returns
    -------
    str
        Space-joined words for the highest-scoring index of each prediction
        step. Decoding stops at the first step whose index has no entry in
        ``tokenizer.word_index``.

    Side effects
    ------------
    Stores the raw prediction for the first batch element in the module-level
    global ``prediksi``.
    """
    global prediksi
    prediction = model.predict(source, verbose=0)[0]
    # Kept for backward compatibility: other cells may inspect the raw scores.
    prediksi = prediction

    # Invert the vocabulary once instead of scanning it for every output step.
    # setdefault keeps the first word seen for an index, matching a
    # first-match linear search.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    target = []
    for vector in prediction:
        word = index_to_word.get(int(argmax(vector)))
        if word is None:
            # Unknown index: treat it as the end of the reply.
            break
        target.append(word)

    return ' '.join(target)

# translate
def translate(model, all_tokenizer, sources):
    """Run ``predict_sequence`` on every row of ``sources``.

    Parameters
    ----------
    model, all_tokenizer
        Forwarded unchanged to ``predict_sequence``.
    sources
        Iterable of 1-D arrays; each one is reshaped to ``(1, length)`` before
        being passed on.

    Returns
    -------
    str
        The translation of the LAST row only (earlier rows are still
        predicted, but their results are overwritten). An empty ``sources``
        returns ``''``.
    """
    # Start from '' so an empty batch yields an empty reply instead of an
    # UnboundLocalError at the return statement.
    translation = ''
    for source in sources:
        # The model expects a batch dimension: (length,) -> (1, length).
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, all_tokenizer, source)
    return translation

def recall(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    Counts are obtained by clipping to [0, 1], rounding and summing:
    the count for ``y_true * y_pred`` is divided by the count for ``y_true``
    plus ``K.epsilon()`` (which guards against division by zero).
    """
    def _count_ones(tensor):
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    return _count_ones(y_true * y_pred) / (_count_ones(y_true) + K.epsilon())

def precision(y_true, y_pred):
    """Precision metric built from Keras backend ops.

    Counts are obtained by clipping to [0, 1], rounding and summing:
    the count for ``y_true * y_pred`` is divided by the count for ``y_pred``
    plus ``K.epsilon()`` (which guards against division by zero).
    """
    def _count_ones(tensor):
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    return _count_ones(y_true * y_pred) / (_count_ones(y_pred) + K.epsilon())
    
def f1(y_true, y_pred):
    """F1 score: harmonic mean of ``precision`` and ``recall``.

    ``K.epsilon()`` is added to the denominator so the result stays finite
    when both precision and recall are zero.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [5]:
# Load the last two CSV columns as an (n, 2) array of text pairs.
dataset = load_dataset('cleaning_data.csv').reshape(-1, 2)
# Flatten both columns into a single (2n, 1) column so the tokenizer is
# fitted on every sentence in the file.
token_dataset = dataset.reshape(-1, 1)

In [6]:
# Tokenizing: padding length and vocabulary both come from the flattened
# sentence column built above.
maxlength = max_length(token_dataset[:, 0])
tokenizer = create_tokenizer(token_dataset[:, 0])
wordindex = tokenizer.word_index
# One more than the number of indexed words.
len_vocab = 1 + len(wordindex)

In [14]:
# The saved model references the custom metrics by name, so they have to be
# supplied again when it is deserialized.
model = load_model(
    'Model/model_gru_gru.h5',
    custom_objects={'precision': precision, 'recall': recall, 'f1': f1},
)

In [17]:
# Peek at the first-column text of row 1.
dataset[1, 0]

'well i thought we would start with pronunciation if that is okay with you'

In [None]:
# Print the model's reply to 20 randomly chosen first-column sentences.
short_questions = dataset[:, 0]  # loop-invariant, so computed once
for i in range(20):
    # The lower bound of 1 means row 0 is never drawn.
    # NOTE(review): presumably row 0 is a CSV header - confirm.
    seq_index = np.random.randint(1, len(short_questions))
    q = short_questions[seq_index]

    # texts_to_sequences expects a LIST of texts. Passing the bare string
    # makes it iterate character by character, so the reply would be
    # generated from the final character only.
    X = tokenizer.texts_to_sequences([q])
    X = pad_sequences(X, maxlength, padding='post')

    # find reply and print it out
    a = translate(model, tokenizer, X)
    words = a.split()
    # Drop repeated words while keeping first-occurrence order.
    output = " ".join(sorted(set(words), key=words.index))
    print('Q:', q)
    print('A:', output)

In [20]:
# Interactive chat: read a line, answer it, repeat until the user types 'bye'.
while True:
    q = input("YOU: ")
    if q == 'bye':
        break
    # Wrap the cleaned input in a list, which is what the tokenizer expects.
    q = q.strip().split('\n')

    # Encode and pad to the length used for the rest of the notebook.
    X = pad_sequences(tokenizer.texts_to_sequences(q), maxlength, padding='post')

    # Generate the reply, then drop repeated words while keeping
    # first-occurrence order.
    a = translate(model, tokenizer, X)
    words = a.split()
    print('ANSWER: ' + " ".join(sorted(set(words), key=words.index)))

ANSWER: have do you
ANSWER: have anything to you
ANSWER: all respect you to
