In [None]:
import os
import yaml
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, activations, models, preprocessing
from tensorflow.keras import preprocessing, utils
import re
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import corpus_bleu


In [None]:
print(tf.__version__)

# Seed NumPy's global RNG once, up front: the later np.random.shuffle call decides
# which pairs land in the train/validation split, so without a seed every run
# trains and evaluates on different data.
SEED = 42
np.random.seed(SEED)

dir_path = '/kaggle/input/chatterbotenglisch'
# os.listdir returns entries in arbitrary order; sorting keeps the corpus order
# (and therefore the seeded shuffle) identical from run to run.
files_list = sorted(os.listdir(dir_path + os.sep))

batch_size = 16  # Batch size for training.
epochs = 120  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
embedding_dim = 100  # Size of the word-embedding vectors.
val = 10  # Percentage of the pairs held out for validation.

In [None]:
# Build aligned question/answer lists from the corpus files.
# Each conversation is a list whose first entry is the question and whose
# remaining entries are the replies.
questions = []
answers_with_tags = []

for filepath in files_list:
    # Context manager so the file handle is closed even if parsing fails.
    with open(os.path.join(dir_path, filepath), 'rb') as stream:
        docs = yaml.safe_load(stream)

    for con in docs['conversations']:
        if len(con) < 2:
            # A question without any reply cannot form a training pair.
            continue

        question, replies = con[0], con[1:]

        # YAML may parse some entries as non-strings (numbers, booleans, mappings).
        # Drop the whole pair in that case so questions and answers stay aligned.
        if not isinstance(question, str) or not all(isinstance(rep, str) for rep in replies):
            continue

        if len(replies) > 1:
            # Several replies are merged into one answer, each preceded by a space.
            answer = ''.join(' ' + rep for rep in replies)
        else:
            answer = replies[0]

        questions.append(question)
        answers_with_tags.append(answer)

# Wrap every answer in the start/end markers used by the decoder.
answers = ['<START> ' + answer + ' <END>' for answer in answers_with_tags]

assert len(questions) == len(answers), "questions and answers must stay aligned"

In [None]:
def tokenize(sentences):
    """Lower-case each sentence and expand common English contractions.

    Despite the name, no splitting into tokens happens here; the function
    returns a new list of cleaned sentence strings in the input order.
    """
    # Applied strictly in this order: "can't" has to be handled before the
    # generic "n't" rule.
    contraction_rules = (
        ("can't", 'can not'),
        ("n't", ' not'),
        ("'ve", ' have'),
        ("'ll", ' will'),
        ("'s", ' is'),
        ("'m", ' am'),
        ("'re", ' are'),
        ("'d", ' would'),
    )

    def _expand(text):
        text = text.lower()
        for pattern, replacement in contraction_rules:
            text = re.sub(pattern, replacement, text)
        return text

    return [_expand(sentence) for sentence in sentences]

In [None]:
# Show one raw (tagged) answer before cleaning.
print(answers[0])

# Lower-case and expand contractions in both sides of every pair.
answersTok = tokenize(answers)
questionsTok = tokenize(questions)
# `val` is a percentage: hold out that share of the pairs for validation
# (rounded down) and keep the remainder for training.
val_data_count = int(len(answersTok) * val / 100)
train_data_count = len(answersTok) - val_data_count
# Same answer as above, after cleaning.
print(answersTok[0])
print("val_data_count: ", val_data_count)
print("train_data_count: ", train_data_count)

In [None]:
# Fit one shared vocabulary on questions and answers together.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questionsTok + answersTok)
word_index = tokenizer.word_index
# One extra slot on top of the distinct words: index 0 is used for padding
# (the sequences are zero-padded and the embeddings use mask_zero=True).
VOCAB_SIZE = len(word_index) + 1
# Report the real word count; the previous message printed VOCAB_SIZE (count + 1)
# as the number of unique tokens.
print('Found %s unique tokens (VOCAB_SIZE = %s including the padding index).'
      % (len(word_index), VOCAB_SIZE))

In [None]:
# Shuffle questions and answers with one shared permutation so that every
# question keeps its own answer.
indices = np.arange(len(questions))
np.random.shuffle(indices)
answers = [answersTok[position] for position in indices]
questions = [questionsTok[position] for position in indices]

In [None]:
# TODO: compute the maximum lengths separately for train and validation
# (optionally the vocabulary size as well).

# Encoder side: questions as lists of word indices, plus the longest length.
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max(map(len, tokenized_questions))
print(maxlen_questions)

# Decoder side: answers as lists of word indices, plus the longest length.
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max(map(len, tokenized_answers))
print(maxlen_answers)


In [None]:
# Split the shuffled pairs: the first `train_data_count` items are used for
# training, everything after them for validation.
tokenized_questions_train = tokenized_questions[:train_data_count]
tokenized_answers_train = tokenized_answers[:train_data_count]
questions_train = questions[:train_data_count]
answers_train = answers[:train_data_count]

# The validation slice starts exactly where the training slice ends.
# (Starting at train_data_count + 1 silently dropped one pair from both splits.)
tokenized_questions_val = tokenized_questions[train_data_count:]
tokenized_answers_val = tokenized_answers[train_data_count:]
questions_val = questions[train_data_count:]
answers_val = answers[train_data_count:]

# Every pair must end up in exactly one of the two splits.
assert len(tokenized_questions_train) + len(tokenized_questions_val) == len(tokenized_questions)
print(len(tokenized_questions_val))

In [None]:
def _build_seq2seq_arrays(question_seqs, answer_seqs):
    """Turn index sequences into the three arrays the seq2seq model consumes.

    Args:
        question_seqs: list of word-index lists for the questions.
        answer_seqs: list of word-index lists for the answers (start marker first).

    Returns:
        (encoder_input, decoder_input, decoder_target) where the inputs are
        zero-padded ("post") index matrices and the target is the one-hot
        encoding of the answers shifted left by one step, i.e. at each position
        the decoder is asked to predict the next word.
    """
    encoder_input = preprocessing.sequence.pad_sequences(
        question_seqs, maxlen=maxlen_questions, padding='post')
    decoder_input = preprocessing.sequence.pad_sequences(
        answer_seqs, maxlen=maxlen_answers, padding='post')
    # Build the shifted copy in a new list. Overwriting the caller's sequences in
    # place stripped one more token every time this cell was re-run.
    shifted_answers = [seq[1:] for seq in answer_seqs]
    padded_targets = preprocessing.sequence.pad_sequences(
        shifted_answers, maxlen=maxlen_answers, padding='post')
    decoder_target = utils.to_categorical(padded_targets, VOCAB_SIZE)
    return encoder_input, decoder_input, decoder_target


(encoder_input_data_train,
 decoder_input_data_train,
 decoder_output_data_train) = _build_seq2seq_arrays(tokenized_questions_train, tokenized_answers_train)

(encoder_input_data_val,
 decoder_input_data_val,
 decoder_output_data_val) = _build_seq2seq_arrays(tokenized_questions_val, tokenized_answers_val)

print(encoder_input_data_train)

In [None]:
# --- Encoder -----------------------------------------------------------------
# Variable-length sequence of word indices -> embedding -> LSTM.
# Only the LSTM's final hidden and cell states are kept; its output is unused.
encoder_inputs = tf.keras.layers.Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(latent_dim, dropout=0.2, recurrent_dropout=0.2,
                                                         return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# --- Decoder -----------------------------------------------------------------
# Variable-length sequence of word indices -> embedding -> LSTM initialised with
# the encoder states -> softmax over the vocabulary at every time step.
# The LSTM and Dense layer objects are kept in variables because the inference
# decoder built later in the notebook calls the same layers again.
decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(latent_dim, dropout=0.2, recurrent_dropout=0.2, return_state=True,
                                    return_sequences=True)
# The decoder's own final states are not needed for training, hence the `_`.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(VOCAB_SIZE, activation=tf.keras.activations.softmax)
output = decoder_dense(decoder_outputs)

In [None]:
# Training model: takes the question indices and the answer indices as its two
# inputs and outputs a vocabulary distribution for every decoder time step.
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)
model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot decoder targets.
# The metric is registered under the name 'acc'; the plotting cell reads the
# history keys 'acc' and 'val_acc'.
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Train on the training split and evaluate on the validation split after each
# epoch; `history` keeps the per-epoch metrics for the plots below.
# NOTE(review): `epochs` must still be the integer from the config cell here -
# the plotting cell rebinds that name, so re-running this cell afterwards may fail.
history = model.fit([encoder_input_data_train, decoder_input_data_train], decoder_output_data_train, batch_size=batch_size, epochs=epochs,
                    validation_data=([encoder_input_data_val, decoder_input_data_val],decoder_output_data_val))
# Persist the trained model to the working directory.
model.save('model.h5')

In [None]:
#history = model.fit_generator(generator=my_training_batch_generator
#                              , steps_per_epoch = int(len(tokenized_questions_train) // batch_size)
#                              , epochs=epochs
#                              , validation_data = my_validation_batch_generator
#                              , validation_steps = int(len(tokenized_questions_val) // batch_size))

In [None]:
# Per-epoch metrics recorded by model.fit.
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Dedicated name for the x axis. Rebinding the global `epochs` (the integer
# training setting) to a range made the fit cell fail when it was re-run.
epoch_axis = range(1, len(acc) + 1)

plt.plot(epoch_axis, acc, 'bo', label='Training acc')
plt.plot(epoch_axis, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Second figure for the loss curves.
plt.figure()

plt.plot(epoch_axis, loss, 'bo', label='Training loss')
plt.plot(epoch_axis, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
# Encoder model: question indices -> [hidden state, cell state].
encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

# At inference time the decoder states are fed in explicitly on every step,
# so they become additional model inputs.
decoder_state_input_h = tf.keras.layers.Input(shape=(latent_dim,))
decoder_state_input_c = tf.keras.layers.Input(shape=(latent_dim,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Re-use the trained decoder LSTM and Dense layers (same layer objects, hence
# shared weights). Note that `decoder_outputs`, `state_h` and `state_c` are
# rebound here and no longer refer to the training-graph tensors.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Decoder model: [token input, h, c] -> [vocabulary distribution, new h, new c].
decoder_model = tf.keras.models.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
# Inverse vocabulary: maps each word index back to its word so that decoder
# output can be turned into readable text.
reverse_word_index = {index: word for word, index in word_index.items()}

In [None]:
def decode_sequence(input_tokens):
    """Greedily decode an answer for one encoded question.

    Args:
        input_tokens: padded index array for a single question (batch of size 1),
            as returned by ``str_to_tokens``.

    Returns:
        The generated words joined by single spaces, without the end marker.
        Generation stops at the ``end`` marker or once more than
        ``maxlen_answers`` words have been produced.

    Note:
        The start/end markers are the ordinary vocabulary words ``start`` and
        ``end``, so a genuine word "end" in a reply also terminates decoding.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_tokens)

    # Target sequence of length 1, seeded with the start word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_index['start']

    decoded_words = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + list(states_value))

        # Greedy choice: most probable word at the last (only) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is padding and has no entry in the reverse vocabulary;
        # treat it like an end marker instead of raising KeyError.
        sampled_word = reverse_word_index.get(sampled_token_index)

        if sampled_word is None or sampled_word == 'end':
            break

        decoded_words.append(sampled_word)

        # Same length bound as before: stop once more than maxlen_answers
        # words have been produced.
        if len(decoded_words) > maxlen_answers:
            break

        # Feed the sampled word and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    # The end marker is never appended, which replaces the previous trailing-'end'
    # strip whose inverted length check meant it never ran.
    return ' '.join(decoded_words)

In [None]:
def str_to_tokens(sentence: str):
    """Convert one sentence into a padded index array for the encoder.

    The fitted ``tokenizer`` is used so that the sentence is normalised the same
    way as the training text. A plain whitespace split with exact lookups drops
    every word that touches punctuation ("ai?") or contains upper-case letters.

    Args:
        sentence: the question text.

    Returns:
        An array with one row of length ``maxlen_questions`` (zero-padded at the
        end), or ``None`` when no word of the sentence is in the vocabulary.
    """
    # Words missing from the vocabulary are skipped, as before.
    tokens_list = tokenizer.texts_to_sequences([sentence])[0]
    if len(tokens_list) == 0:
        return None
    return preprocessing.sequence.pad_sequences([tokens_list], maxlen=maxlen_questions, padding='post')

In [None]:
# Evaluate the model
def eval_model(raw_questions, raw_answers):
    """Decode every question and print corpus-level BLEU scores.

    Args:
        raw_questions: list of question strings.
        raw_answers: list of reference answer strings, aligned with
            ``raw_questions``; ``<start>``/``<end>`` tags are removed here.

    Questions with no in-vocabulary word are skipped. The first few
    source/target/prediction triples are printed for inspection. Nothing is
    returned; the scores are printed.
    """
    predicted, actual = list(), list()
    for i, raw_question in enumerate(raw_questions):
        question_tokens = str_to_tokens(raw_question)
        if question_tokens is None:
            continue

        hypothesis = decode_sequence(question_tokens).split()
        # Drop a trailing end marker as a whole word. Slicing off the last three
        # characters also cut words that merely end in "end" (e.g. "friend").
        if hypothesis and hypothesis[-1] == 'end':
            hypothesis = hypothesis[:-1]
        translation = ' '.join(hypothesis)

        raw_answer = raw_answers[i]
        raw_answer = raw_answer.replace('<start>', '').replace('<end>', '')

        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_question, raw_answer, translation))
        # corpus_bleu expects a list of reference sentences per hypothesis.
        actual.append([raw_answer.split()])
        predicted.append(hypothesis)

    # Nothing decodable (empty input or every question out of vocabulary):
    # avoid the IndexError on actual[0] below.
    if not predicted:
        print('No question could be tokenized; nothing to score.')
        return

    # Bleu Scores
    print("##############################################")
    print(actual[0])
    print(predicted[0])
    print(' 1-gram score1: %f' % corpus_bleu(actual, predicted, weights=(1, 0, 0, 0)))
    print(' 2-gram score1: %f' % corpus_bleu(actual, predicted, weights=(0, 1, 0, 0)))
    print(' 3-gram score1: %f' % corpus_bleu(actual, predicted, weights=(0, 0, 1, 0)))

    print(' 4-gram score1: %f' % corpus_bleu(actual, predicted, weights=(0, 0, 0, 1)))
    print(' 4-gram score2: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
print('Testing on trained examples')


def tok(sentences):
    """Prepare reference answers for scoring.

    Lower-cases each sentence, removes the ``<start>``/``<end>`` tags and strips
    every character that is neither a word character nor whitespace.

    Args:
        sentences: iterable of answer strings.

    Returns:
        A new list with the cleaned strings, in the same order.
    """
    sentences_clear = []
    for sentence in sentences:
        sentence = sentence.lower()
        sentence = sentence.replace('<start>', '').replace('<end>', '')
        # Raw string: '\w' and friends are invalid escapes in a normal literal.
        sentence = re.sub(r'[^\w\d\s]', '', sentence)
        sentences_clear.append(sentence)
    return sentences_clear
# Replace the training answers with their cleaned versions (tags and punctuation
# removed). The tagged originals are no longer available under this name afterwards.
answers_train=tok(answers_train)
# Prints the complete list of cleaned training answers.
print(answers_train)


In [None]:
# Score the model on the pairs it was trained on (answers cleaned by the cell above).
eval_model(questions_train, answers_train)

In [None]:
# Evaluate the model
# NOTE: this redefinition replaces the eval_model defined in an earlier cell.
def eval_model(raw_questions, raw_answers):
    """Decode every question and print corpus-level BLEU scores.

    Args:
        raw_questions: list of question strings.
        raw_answers: list of reference answer strings, aligned with
            ``raw_questions``. ``<start>``/``<end>`` tags and punctuation are
            removed here, so both tagged and already cleaned answers work.

    Questions with no in-vocabulary word are skipped. Nothing is returned; the
    scores are printed.
    """
    predicted, actual = list(), list()
    for i, raw_question in enumerate(raw_questions):
        question_tokens = str_to_tokens(raw_question)
        if question_tokens is None:
            continue

        hypothesis = decode_sequence(question_tokens).split()
        # Remove a trailing end marker only when it is really there; an
        # unconditional [:-3] cut real text whenever decoding stopped on length.
        if hypothesis and hypothesis[-1] == 'end':
            hypothesis = hypothesis[:-1]
        translation = ' '.join(hypothesis)

        raw_answer = raw_answers[i]
        raw_answer = raw_answer.replace('<start>', '').replace('<end>', '')
        # Same punctuation stripping the notebook applies to the training answers
        # before scoring, so train and validation BLEU are comparable.
        raw_answer = re.sub(r'[^\w\d\s]', '', raw_answer)

        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_question, raw_answer, translation))
        # corpus_bleu expects a list of reference sentences per hypothesis.
        actual.append([raw_answer.split()])
        predicted.append(hypothesis)

    if not predicted:
        print('No question could be tokenized; nothing to score.')
        return

    # Bleu Scores: computed on the sentences collected above rather than on the
    # unrelated globals `reference` / `candidate`, with weights that sum to 1.
    score1 = corpus_bleu(actual, predicted, weights=(1, 0, 0, 0))
    score2 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))
    print(' 1-gram BLEU: %f' % score1)
    print(' cumulative 4-gram BLEU: %f' % score2)


print('Testing on validation examples')
eval_model(questions_val, answers_val)

In [None]:
# Interactive chat: read questions until the user types "exit".
while True:
    input_seq = input('Enter question or exit : ')
    input_seq = input_seq.lower().strip()
    if input_seq == "exit":
        break

    # Expand contractions exactly like the training data before the lookup.
    input_tokens = str_to_tokens(tokenize([input_seq])[0])
    # Check the tokenised input. The previous check looked at `decoded_sentence`,
    # which does not exist yet on the first question, and passed None on to
    # decode_sequence.
    if input_tokens is None:
        print("Sorry, I can't answer this question")
    else:
        decoded_sentence = decode_sequence(input_tokens)
        print(decoded_sentence)

In [None]:
# Scratch check: slicing off the last three characters removes a trailing "end".
st = "you are a cheat end"

# The stray extra closing parenthesis was a syntax error.
print(st[:-3])

In [None]:
# Scratch cell: try corpus_bleu on a tiny hand-made example.
# NOTE(review): sentence_bleu is imported here but not used in this cell.
from nltk.translate.bleu_score import sentence_bleu
# One list of reference sentences per candidate sentence.
reference = [[['1','3']],[['1','2']]]
candidate = [['1','2'],['1','3']]


# NOTE(review): these weights do not sum to 1, unlike the (0.25, 0.25, 0.25, 0.25)
# used elsewhere in this notebook - confirm this is intended.
score = corpus_bleu(reference, candidate, weights=(1,1,1,1))
print(score)