In [0]:
%tensorflow_version 1.0

import json                                                       
import numpy as np
import pandas as pd
import string
import tensorflow as tf
from string import digits
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split


`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1.0`. This will be interpreted as: `1.x`.


TensorFlow 1.x selected.


In [0]:
print(tf.__version__)      # sanity check of the selected TF 1.x runtime; the recorded run printed 1.15.2

1.15.2


In [0]:
# Preprocess
# Converting json to clean pandas format
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
            verbose = 1):
    """
    Flatten a SQuAD-style json file into one pandas DataFrame row per question.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']. The default list is only sliced,
    never mutated. The column names used below ('context', 'qas', 'id',
    'answers', 'question') are fixed, so a custom record_path must lead to
    records carrying those keys.
    verbose: 0 to suppress it default is 1

    Returns a DataFrame with the columns produced by the outer join of the
    question level and the answer level (question id as 'index', 'question',
    'context', plus the answer fields), and 'c_id', an integer code per
    distinct context. Questions without any answer keep NaN answer fields.
    """
    if verbose:
        print("Reading the json file")
    # Context manager closes the handle deterministically; JSON is read as UTF-8
    # instead of depending on the platform default encoding.
    with open(input_file_path, encoding="utf-8") as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")
    # pandas >= 1.0 exposes json_normalize at top level; the pd.io.json alias is
    # deprecated and missing from recent releases, so it is only a fallback.
    normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize
    # parsing the different levels of the json file
    js = normalize(file, record_path)          # answer level
    m = normalize(file, record_path[:-1])      # question level
    r = normalize(file, record_path[:-2])      # paragraph level
    # combining it into a single dataframe:
    # repeat each context once per question, and each question id once per answer
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx = np.repeat(m['id'].values, m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    # axis is passed by keyword: newer pandas no longer accepts it positionally
    main = pd.concat(
        [m[['id', 'question', 'context']].set_index('id'), js.set_index('q_idx')],
        axis=1, sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [0]:
# Mount Google Drive at /content/gdrive so files stored on Drive can be read by later cells.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# training data
# Flatten the SQuAD json file into the `train` DataFrame (one row per question).
# NOTE(review): the path is relative while Drive is mounted at '/content/gdrive',
# so this presumably relies on the working directory being '/content' — confirm.

input_file_path = "gdrive/My Drive/Colab Notebooks/train-v2.0.json"
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe_train(input_file_path,record_path)

Reading the json file
processing...




shape of the dataframe is (130319, 6)
Done


In [0]:
# Preview the first rows of the flattened training frame.
train.head()

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0,0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0


In [0]:
# Keep only the (question, answer) pairs in which both fields are strings.
# Rows whose "text" is not a string are skipped (presumably the questions
# without an answer, which hold NaN after the outer join — confirm).

train_question_list = list()
train_answer_list = list()

# Iterate the two columns in lockstep instead of indexing the frame per element;
# isinstance is the idiomatic type check and the manual counter was dead code.
for question, answer in zip(train["question"], train["text"]):
    if isinstance(answer, str) and isinstance(question, str):
        train_answer_list.append(answer)
        train_question_list.append(question)

In [0]:
# Normalise questions and answers in place: lower-case first, then strip
# every character listed in string.punctuation.

punctuation_table = str.maketrans('', '', string.punctuation)

for texts in (train_question_list, train_answer_list):
    texts[:] = [text.lower().translate(punctuation_table) for text in texts]

In [0]:
# Number of (question, answer) pairs kept after the string filter; both lists have this length.
print("Question and Answer List Length is {}".format(len(train_question_list)))

Question and Answer List Length is 86821


In [0]:
# Preparing input data for the Encoder
# The first question is printed before and after encoding as a quick sanity check.

print(train_question_list[0])
tokenizer_train_question = Tokenizer()
tokenizer_train_question.fit_on_texts(train_question_list)                                    # Builds the vocabulary: finds all unique words in the questions and assigns each a unique integer.
train_question_sequences = tokenizer_train_question.texts_to_sequences(train_question_list)   # Converts every question from a string of words to a list of integers.
print(train_question_sequences[0])

when did beyonce start becoming popular
[14, 8, 326, 151, 1474, 211]


In [0]:
 #Find the max input length for question

# Token count of every encoded question; the largest one is the encoder padding length.
length_list = [len(seq) for seq in train_question_sequences]
max_input_length = np.array(length_list).max()
print('Max input length is {}'.format(max_input_length))

Max input length is 40


In [0]:
# Creating the question vocabulary (word -> integer id) from the fitted tokenizer

train_question_dictionary = tokenizer_train_question.word_index
# Vocabulary size passed to the encoder Embedding; the + 1 presumably accounts for
# index 0, which the embedding treats as padding (mask_zero=True) — confirm.
num_train_question_tokens = len(train_question_dictionary) + 1
print('Number of question tokens = {}\n'.format(num_train_question_tokens))
# NOTE(review): this prints the whole vocabulary (39982 tokens were reported in the
# recorded run); consider printing only a small sample to keep the output readable.
print(train_question_dictionary)   # All question words

Number of question tokens = 39982



In [0]:
# Preparing input data for the Decoder
# Every answer is wrapped in a start marker and an end marker. Plain words
# ('starttt' / 'enddd') are used rather than '<START>' / '<END>' because
# punctuation has already been stripped from the text.

print(train_answer_list[0])
train_answer_lines = ['starttt ' + answer + ' enddd' for answer in train_answer_list]
print(train_answer_lines[0])

in the late 1990s
starttt in the late 1990s enddd


In [0]:
# Separate tokenizer (and therefore separate vocabulary) for the marker-wrapped answers.
tokenizer_train_answer = Tokenizer()
tokenizer_train_answer.fit_on_texts(train_answer_lines)                                # Builds the vocabulary: finds all unique words in the answers and assigns each a unique integer.
train_answer_sequences = tokenizer_train_answer.texts_to_sequences(train_answer_lines) # Converts every answer line from a string of words to a list of integers.
print(train_answer_sequences[0])

[1, 8, 3, 147, 558, 2]


In [0]:
# Find the max output length: the longest encoded answer, start/end markers included

length_list = [len(seq) for seq in train_answer_sequences]
max_output_length = np.array(length_list).max()
print('Max output length is {}'.format(max_output_length))

Max output length is 45


In [0]:
# Answer vocabulary (word -> integer id) from the fitted answer tokenizer.
train_answer_dictionary = tokenizer_train_answer.word_index
# Size used for the decoder Embedding and the softmax layer; the + 1 presumably
# accounts for index 0, which is treated as padding (mask_zero=True) — confirm.
num_train_answer_tokens = len(train_answer_dictionary) + 1
print('Number of answer tokens = {}\n'.format(num_train_answer_tokens))

# NOTE(review): this prints the whole vocabulary (42276 tokens were reported in the
# recorded run); consider printing only a small sample to keep the output readable.
print(train_answer_dictionary)

Number of answer tokens = 42276



In [0]:
# The full data set is used for training: questions are the inputs and the
# marker-wrapped answers are the targets. The train/validation split below is disabled.
X_train, Y_train = train_question_list, train_answer_lines
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1,random_state=53)     # Validation does not work properly; more data is needed
#print(len(X_train), len(X_test))

In [0]:
# To avoid Memory Crash batches are generated

def generate_batch(X_train, Y_train, batch_size, *, question_dictionary=None,
                   answer_dictionary=None, max_in_len=None, max_out_len=None,
                   num_answer_tokens=None):
    ''' Generate batches of training data, cycling over the data forever.

    X_train: sequence of whitespace-separated question strings.
    Y_train: sequence of whitespace-separated answer strings, aligned with X_train.
    batch_size: number of rows in every yielded array (must be positive).
    question_dictionary / answer_dictionary: word -> integer id mappings.
    max_in_len / max_out_len: number of timesteps in the encoder / decoder arrays.
    num_answer_tokens: size of the one-hot axis of the decoder target.
    Every keyword argument left as None falls back to the notebook-level value
    (train_question_dictionary, train_answer_dictionary, max_input_length,
    max_output_length, num_train_answer_tokens).

    Yields ([encoder_input, decoder_input], decoder_target) where
      encoder_input  has shape (batch_size, max_in_len),
      decoder_input  has shape (batch_size, max_out_len) and holds every target
                     word except the last one,
      decoder_target has shape (batch_size, max_out_len, num_answer_tokens) and is
                     the one-hot target shifted one timestep to the left (the
                     first word is not included).
    All arrays are float32. When the data does not fill the last batch, the
    remaining rows stay all-zero.

    Raises ValueError for empty X_train or a non-positive batch_size (otherwise
    the generator would loop forever without yielding), and KeyError for a
    question word missing from question_dictionary.
    '''
    if question_dictionary is None:
        question_dictionary = train_question_dictionary
    if answer_dictionary is None:
        answer_dictionary = train_answer_dictionary
    if max_in_len is None:
        max_in_len = max_input_length
    if max_out_len is None:
        max_out_len = max_output_length
    if num_answer_tokens is None:
        num_answer_tokens = num_train_answer_tokens

    if len(X_train) == 0 or batch_size <= 0:
        raise ValueError("generate_batch needs non-empty data and a positive batch_size")

    while True:
        for j in range(0, len(X_train), batch_size):
            encoder_input_data = np.zeros((batch_size, max_in_len), dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_out_len), dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_out_len, num_answer_tokens), dtype='float32')
            batch_pairs = zip(X_train[j:j+batch_size], Y_train[j:j+batch_size])
            for i, (input_text, target_text) in enumerate(batch_pairs):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = question_dictionary[word] # encoder input seq
                target_words = target_text.split()   # split once, not once per token
                last = len(target_words) - 1
                for t, word in enumerate(target_words):
                    if t < last:
                        # Explicit lookup instead of a bare except: only a word that is
                        # missing from the vocabulary is skipped (its timestep stays zero
                        # in both decoder arrays); any other error now surfaces.
                        token_id = answer_dictionary.get(word)
                        if token_id is None:
                            continue
                        decoder_input_data[i, t] = token_id # decoder input seq
                    if t > 0:
                        # decoder target sequence (one hot encoded)
                        # does not include the first (start) token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, answer_dictionary[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)
            

In [0]:
# Model definition: encoder half of the seq2seq network.
# NOTE(review): weights are later copied into this model with set_weights, which
# presumably depends on the layers being created in this exact order — confirm
# before reordering anything here.

dimension = 256  # used both as the embedding size and as the number of LSTM units

# Variable-length sequence of question token ids
encoder_inputs = tf.keras.layers.Input(shape=( None , ))

# mask_zero=True: index 0 is treated as padding by the layers that follow
encoder_embedding = tf.keras.layers.Embedding( num_train_question_tokens, dimension , mask_zero=True ) (encoder_inputs)

# Only the final hidden and cell states are used afterwards; encoder_outputs is not referenced again in the visible code
encoder_outputs, state_h, state_c = LSTM(dimension, return_state=True)(encoder_embedding) #recurrent_dropout=0.2 ,dropout=0.2 )    # dropout not needed because there is no validation

# These states seed the decoder LSTM
encoder_states = [state_h, state_c]

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
# Decoder half of the seq2seq network (training graph).
# Variable-length sequence of answer token ids
decoder_inputs = Input(shape=(None,))

# mask_zero=True: index 0 is treated as padding by the layers that follow
decoder_embedding = Embedding(num_train_answer_tokens, dimension, mask_zero=True) (decoder_inputs)

# Kept as a layer object because make_inference_models reuses it for step-by-step decoding
decoder_lstm = LSTM( dimension , return_state=True , return_sequences=True) #recurrent_dropout=0.2 , dropout=0.2)            # dropout not needed because there is no validation

# During training the decoder starts from the encoder's final states; its own states are discarded
decoder_outputs , _ , _ = decoder_lstm (decoder_embedding, initial_state=encoder_states)

# Softmax over the answer vocabulary at every timestep; also reused by make_inference_models
decoder_dense = Dense( num_train_answer_tokens, activation="softmax") 

output = decoder_dense (decoder_outputs)

In [0]:
# Training model: (question ids, answer ids) -> per-timestep distribution over the answer vocabulary
model = Model([encoder_inputs, decoder_inputs], output)

# categorical_crossentropy matches the one-hot decoder targets produced by generate_batch
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    10235392    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    10822656    input_2[0][0]                    
______________________________________________________________________________________________

In [0]:
# NOTE(review): relative path — presumably relies on the working directory being '/content', where Drive is mounted as 'gdrive' — confirm.
saved_model = load_model('gdrive/My Drive/Colab Notebooks/model_final.h5')    # Loading the previously trained model from Drive

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [0]:
# Copy the trained weights into the freshly built graph so that the layer objects
# defined in this notebook (reused by make_inference_models) carry them.
# NOTE(review): assumes the saved model has the same layers in the same order as `model` — confirm.
weights=saved_model.get_weights()          # Weights are transferred to our model
model.set_weights(weights)

In [0]:
# Training is disabled in this run: the weights are taken from the saved model
# file loaded earlier instead. Uncomment the lines below to train from scratch.
#batch_size = 32
#train_samples = len(X_train)
#epochs = 5

#model.fit_generator(generate_batch(X_train, Y_train, batch_size),
                    #steps_per_epoch = train_samples//batch_size,
                    #epochs=epochs)

In [0]:
#model.save("gdrive/My Drive/Colab Notebooks/model_final.h5")       # Saving model for later use

In [0]:
def make_inference_models():
    """Build the encoder and decoder models used for step-by-step decoding.

    Both models are assembled from the notebook-level layers and tensors
    (encoder_inputs, encoder_states, decoder_inputs, decoder_embedding,
    decoder_lstm, decoder_dense), so they share those layers with the
    training model.

    Returns (encoder_model, decoder_model):
      encoder_model maps encoder_inputs to the encoder states [h, c];
      decoder_model maps [decoder_inputs, h, c] to
      [dense output of the decoder LSTM, new h, new c].
    """
    inference_encoder = Model(encoder_inputs, encoder_states)

    # Placeholders through which the caller feeds the previous LSTM states
    previous_h = Input(shape=(dimension,))
    previous_c = Input(shape=(dimension,))
    previous_states = [previous_h, previous_c]

    # Same LSTM and dense layers as in training, but started from the fed states
    lstm_output, next_h, next_c = decoder_lstm(
        decoder_embedding, initial_state=previous_states)
    token_scores = decoder_dense(lstm_output)

    inference_decoder = Model(
        [decoder_inputs] + previous_states,
        [token_scores] + [next_h, next_c])

    return inference_encoder, inference_decoder

In [0]:
def str_to_tokens(sentence:str):
    """Encode a user question the same way the training questions were encoded.

    The sentence is lower-cased and stripped of string.punctuation (the same
    normalisation applied to the training questions), converted to token ids
    with tokenizer_train_question, and padded after the sequence to
    max_input_length.

    Returns the array produced by pad_sequences for this single sentence.
    """
    # The training questions had punctuation removed before the tokenizer was
    # fitted; without the same step a word such as "popular?" would not match
    # the vocabulary entry "popular".
    cleaned = sentence.lower().translate(str.maketrans('', '', string.punctuation))
    # The tokenizer receives the string itself (not a pre-split word list) so it
    # tokenises the text with the same procedure it used while being fitted.
    encoded = tokenizer_train_question.texts_to_sequences([cleaned])[0]
    encoded = pad_sequences([encoded], maxlen=max_input_length, padding ="post")
    return encoded

In [0]:
# Predict
# Interactive loop: read a question, encode it, then decode the answer greedily
# one token at a time until the end marker is produced. Type "esc" to leave.

enc_model , dec_model = make_inference_models()

# Reverse vocabulary (id -> word), built once instead of scanning the whole
# dictionary at every decoding step.
answer_index_to_word = {index: word for word, index in train_answer_dictionary.items()}

while True:
    user_question=input( 'Enter question:  ' )

    # Check the exit keyword before running the encoder on it.
    if user_question == "esc":
        print("Exiting from the program")
        break

    states_values = enc_model.predict(str_to_tokens(user_question))
    empty_target_seq = np.zeros((1,1))
    empty_target_seq[0, 0] = train_answer_dictionary['starttt']
    decoded_translation = ''

    # Hard cap on the number of decoding steps: at most max_output_length + 1
    # words are emitted, and the loop also terminates when the model keeps
    # predicting an id that has no word (e.g. the padding id 0).
    for _ in range(max_output_length + 1):
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = int(np.argmax( dec_outputs[0, -1, :] ))
        sampled_word = answer_index_to_word.get(sampled_word_index)

        if sampled_word == 'enddd':
            break
        if sampled_word is not None:
            decoded_translation += ' {}'.format( sampled_word )

        # Feed the sampled token and the new states back in for the next step
        empty_target_seq = np.zeros((1,1))
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]
    print("Answer        :" ,decoded_translation)

Enter question:  when did beyonce start becoming popular
Answer        :  in the late 1990s
Enter question:  when did beyonce becoming popular
Answer        :  in the late 1990s
Enter question:  when beyonce start becoming popular
Answer        :  in the late 1990s
Enter question:  when did beyonce popular
Answer        :  in the late 1990s to speed
Enter question:  esc
Exiting from the program
