In [None]:
# on a server: start 'screen' first, then start the script inside it
# press 'Ctrl+a' followed by 'd' to detach and return to the terminal
# run 'screen -r' to reattach to the screen session
import numpy as np
import json
import os

from keras.layers.embeddings import Embedding
from keras.layers import Concatenate
from keras.models import Model
from keras.layers import Input,Dense,LSTM
from keras.utils import plot_model
# os.environ['CUDA_VISIBLE_DEVICES']='0'

import process_data as pd

In [None]:
# Stretch the notebook container to the full browser width.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Reference article:
# https://towardsdatascience.com/nlp-sequence-to-sequence-networks-part-1-processing-text-data-d141a5643b72

# output location (note the trailing slash); in this notebook only the
# commented-out plot/save cells at the bottom refer to it
path='testpath/'
# rnn parameters
hidden_size = 100 #100 is the standard
# for training on the GPU the batch size has to be very large,
# otherwise the GPU is used very inefficiently
batch_size = 512
epochs = 100

# number of examples kept from the front of the dataset (applied as [:size])
size=10000

#glove embedding parameters
# path to the GloVe vectors text file (a file, despite the "_dir" in the name)
glove_dir = '../glove/glove.6B.100d.txt'
# width of one embedding vector; has to match the vectors stored in the GloVe file
embedding_dim = 100

In [None]:
# Load the SQuAD v2.0 training set and flatten its nested JSON structure
# (data -> paragraphs -> qas) into three parallel lists that are easier to access.
# The encoding is given explicitly so that reading does not depend on the
# platform's default encoding.
with open('SQuAD/train-v2.0.json', encoding='utf-8') as file:
    train=json.load(file)
train_context=[]
train_question=[]
train_answer=[]
# train_new holds references to the three lists above, so appending to the
# lists also fills the dictionary.
train_new={'context':train_context,'question':train_question,'answer':train_answer}
for article in train['data']:
    for paragraph in article['paragraphs']:
        # lower-case the context once per paragraph; it is shared by all of its questions
        paragraph_context=paragraph['context'].lower()
        for qas in paragraph['qas']:
            # keep only the answerable questions
            if qas['is_impossible']:
                continue
            train_context.append(paragraph_context)
            train_question.append(qas['question'].lower())
            # only the first listed answer is used; a begin-of-sequence and an
            # end-of-sequence token are added to the target
            train_answer.append('START_ '+qas['answers'][0]['text'].lower()+' _END')
print(len(train_new['context']))
print(len(train_new['question']))
print(len(train_new['answer']))

In [None]:
# Keep only the first `size` examples of every field, then hand the three
# parallel lists to the preprocessing module (imported as `pd`).
context, question, answer = (
    train_new[field][:size] for field in ('context', 'question', 'answer')
)
data = [context, question, answer]
input_data = pd.process_data(data)

In [None]:
# Pull the individual entries out of the nested `input_data` dictionary so the
# following cells can refer to them by short names.
_encoder_group = input_data['encoder_input']
_decoder_group = input_data['decoder_input']
_vocab_size_group = input_data['len_vocab']
_token_to_int_group = input_data['token_to_int']
_int_to_token_group = input_data['int_to_token']

# model inputs and the decoder target
context_encoder_input = _encoder_group['context_encoder_input']
question_encoder_input = _encoder_group['question_encoder_input']
answer_decoder_input = _decoder_group['answer_decoder_input']
answer_decoder_target = _decoder_group['answer_decoder_target']

# vocabulary sizes
context_len_vocab = _vocab_size_group['context_len_vocab']
question_len_vocab = _vocab_size_group['question_len_vocab']
answer_len_vocab = _vocab_size_group['answer_len_vocab']

# token -> integer lookups
context_token_to_int = _token_to_int_group['context_token_to_int']
question_token_to_int = _token_to_int_group['question_token_to_int']
answer_token_to_int = _token_to_int_group['answer_token_to_int']

# integer -> token lookup, used to turn predictions back into text
answer_int_to_token = _int_to_token_group['answer_int_to_token']

In [None]:
#FIX_ME: add glove download
# https://nlp.stanford.edu/projects/glove/
# Read the GloVe text file into a dictionary {word: float32 vector}.
# Every line is parsed as a word followed by its whitespace-separated coefficients.
embeddings_index = {}
# `with` closes the file even if parsing fails; the encoding is explicit so
# that reading does not depend on the platform's default encoding.
with open(glove_dir, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
def _build_embedding_matrix(token_to_int, vocab_size, embeddings_index, embedding_dim):
    """Build an embedding matrix for one vocabulary from pretrained vectors.

    Args:
        token_to_int: mapping from token to its row index in the matrix.
        vocab_size: number of rows of the returned matrix.
        embeddings_index: mapping from word to its pretrained vector.
        embedding_dim: number of columns of the returned matrix.

    Returns:
        Array of shape (vocab_size, embedding_dim). Rows of tokens that are
        missing from `embeddings_index` stay all-zeros.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    for token, row in token_to_int.items():
        vector = embeddings_index.get(token)
        if vector is not None:
            matrix[row] = vector
    return matrix


#extract the glove-embedding to one matrix per vocabulary
context_embedding_matrix = _build_embedding_matrix(
    context_token_to_int, context_len_vocab, embeddings_index, embedding_dim)
question_embedding_matrix = _build_embedding_matrix(
    question_token_to_int, question_len_vocab, embeddings_index, embedding_dim)
answer_embedding_matrix = _build_embedding_matrix(
    answer_token_to_int, answer_len_vocab, embeddings_index, embedding_dim)
print(np.shape(context_embedding_matrix),np.shape(question_embedding_matrix),np.shape(answer_embedding_matrix))

In [None]:
#https://medium.com/@dev.elect.iitd/neural-machine-translation-using-word-level-seq2seq-model-47538cba8cd7
# Training model: two LSTM encoders (context and question) whose final states
# are concatenated and used as the initial state of the answer decoder.
# The LSTM widths are taken from `hidden_size` (not `embedding_dim`) so that
# they agree with the inference decoder, whose state inputs are sized from
# `hidden_size*2`.

# encoder #################################
context_encoder_inputs = Input(shape=(None,))
# pretrained embedding, frozen during training
context_embedding_layer = Embedding(context_len_vocab,
                        embedding_dim,weights=[context_embedding_matrix],trainable=False)
context_embedding=context_embedding_layer(context_encoder_inputs)

# NOTE: despite the "decoder" in its name, this LSTM encodes the context.
context_decoder_lstm = LSTM(hidden_size,return_state=True)
# only the final hidden/cell states are used further; context_x is unused
context_x, context_state_h, context_state_c = context_decoder_lstm(context_embedding)
context_encoder_states = [context_state_h, context_state_c]


question_encoder_inputs = Input(shape=(None,))
# pretrained embedding, frozen during training
question_embedding_layer = Embedding(question_len_vocab,
                    embedding_dim,weights=[question_embedding_matrix],trainable=False)
question_embedding=question_embedding_layer(question_encoder_inputs)

# NOTE: despite the "decoder" in its name, this LSTM encodes the question.
question_decoder_lstm = LSTM(hidden_size,return_state=True)
# only the final hidden/cell states are used further; question_x is unused
question_x, question_state_h, question_state_c = question_decoder_lstm(question_embedding)
question_encoder_states = [question_state_h, question_state_c]


# concatenate the context and question states -> state width is 2*hidden_size
encoder_state_h=Concatenate()([context_state_h,question_state_h])
encoder_state_c=Concatenate()([context_state_c,question_state_c])
concat_encoder_states=[encoder_state_h,encoder_state_c]

# decoder #################################
decoder_inputs = Input(shape=(None,))
# initialised from the pretrained matrix; unlike the encoder embeddings,
# trainable=False is not passed here
answer_embedding_layer = Embedding(answer_len_vocab,
                             embedding_dim,weights=[answer_embedding_matrix])
answer_embedding = answer_embedding_layer(decoder_inputs)

# the decoder is twice as wide as one encoder because it starts from the
# concatenated encoder states
decoder_lstm = LSTM(hidden_size*2, return_sequences=True,return_state=True)
decoder_lstm_output,_,_ = decoder_lstm(answer_embedding, initial_state=concat_encoder_states)

# per-timestep distribution over the answer vocabulary
decoder_dense = Dense(answer_len_vocab, activation='softmax')
decoder_output = decoder_dense(decoder_lstm_output)

model = Model([context_encoder_inputs,question_encoder_inputs, decoder_inputs], decoder_output)
# NOTE(review): categorical_crossentropy presumably expects answer_decoder_target
# to be one-hot encoded -- confirm against process_data
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['acc'])
model.summary()

In [None]:
# Train the model; the inputs are given in the same order as the model's
# input layers (context, question, answer).
training_inputs = [
    context_encoder_input,
    question_encoder_input,
    answer_decoder_input,
]
model.fit(training_inputs, answer_decoder_target, batch_size=batch_size, epochs=epochs)

In [None]:
# model.load_weights('models/baseline/baseline_model.h5')

In [None]:
# Inference-time encoder: maps a (context, question) pair to the concatenated
# encoder states [h, c]; it is built from the layers of the training model.
encoder_model = Model(
    inputs=[context_encoder_inputs, question_encoder_inputs],
    outputs=concat_encoder_states,
)
encoder_model.summary()

In [None]:
# Inference-time decoder: runs the trained decoder layers on the given input
# tokens, taking the previous LSTM states as explicit inputs and returning the
# new states together with the token probabilities.

# Read the state width from the trained decoder LSTM itself, so the state
# inputs always match the layer regardless of how its size was configured.
decoder_state_size = decoder_lstm.units
decoder_state_input_h = Input(shape=(decoder_state_size,))
decoder_state_input_c = Input(shape=(decoder_state_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# reuse the answer embedding layer of the training model
final_dex2= answer_embedding_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2,)

decoder_model.summary()

In [None]:
# encoder_model.predict([input_data['encoder_input']['encoder_input_context'][0:1],
#                        input_data['encoder_input']['encoder_input_question'][0:1]])

In [None]:
def decode_sequence(context_input_seq,question_input_seq,max_decoded_len=52):
    """Greedily decode an answer for a single (context, question) pair.

    The inputs are encoded with `encoder_model`; `decoder_model` is then
    called one token at a time, always continuing with the most probable
    token, starting from the 'START_' token.

    Args:
        context_input_seq: encoded context, passed to `encoder_model.predict`
            (a batch of size 1 is assumed).
        question_input_seq: encoded question, passed to `encoder_model.predict`
            (a batch of size 1 is assumed).
        max_decoded_len: decoding stops as soon as the decoded string is
            longer than this many characters (spaces included). Defaults to
            52, the previously hard-coded limit.

    Returns:
        The decoded tokens as one string, each token preceded by a single
        space. When the model emits '_END', that token is included as the
        last token.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict([context_input_seq, question_input_seq])

    # Target sequence of length 1 holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = answer_token_to_int['START_']

    decoded_tokens = []
    # running character count of the decoded string (every token plus its leading space)
    decoded_len = 0
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + list(states_value))

        # Pick the most probable token of the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = answer_int_to_token[sampled_token_index]
        decoded_tokens.append(sampled_token)
        decoded_len += 1 + len(sampled_token)

        # Exit condition: either find the stop token or exceed the maximum length.
        if sampled_token == '_END' or decoded_len > max_decoded_len:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + token for token in decoded_tokens)

In [None]:
# Decode the first ten examples and print each one next to its context and question.
encoder_arrays = input_data['encoder_input']
for seq_index in range(10):
    # one-element slice, so the model inputs keep their batch dimension
    sample = slice(seq_index, seq_index + 1)
    context_input_seq = encoder_arrays['context_encoder_input'][sample]
    question_input_seq = encoder_arrays['question_encoder_input'][sample]

    decoded_sentence = decode_sequence(context_input_seq, question_input_seq)
    print('-')
    print('context: ', train_new['context'][sample], '\n')
    print('question: ', train_new['question'][sample], '\n')
    print('answer: ', decoded_sentence)

In [None]:
# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot

# SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
# plot_model(model, to_file=path+'/model.png')

In [None]:
# print('save model')
# if not os.path.isdir(path):
#     os.makedirs(path)
# model.save_weights(path+str('baseline_model.h5')) #save weights
# model_json = model.to_json()
# with open(path+str('baseline_model.json'),'w') as json_file:
#     json_file.write(model_json)