In [1]:
from tensorflow import keras
import os, pickle, numpy
import numpy as np



# Import Train and Test datasets

In [2]:
with open('train_qa.txt','rb') as file:
    train_data = pickle.load(file)
print('----------------------------------------------------------------------')
print(f'Train data len: {len(train_data)}')

with open('test_qa.txt','rb') as file:
    test_data = pickle.load(file)
print('----------------------------------------------------------------------')
print(f'Test data len: {len(test_data)}')
print('----------------------------------------------------------------------')

----------------------------------------------------------------------
Train data len: 10000
----------------------------------------------------------------------
Test data len: 1000
----------------------------------------------------------------------


# Data structure

In [3]:
x = 3
print(f'Set {x}')
print('----------------------------------------------------------------------')
print('Story:',' '.join(train_data[x-1][0]))
print('----------------------------------------------------------------------')
print('Question:',' '.join(train_data[x-1][1]))
print('----------------------------------------------------------------------')
print('Answer:',train_data[x-1][2])
print('----------------------------------------------------------------------')


Set 3
----------------------------------------------------------------------
Story: Mary moved to the bathroom . Sandra journeyed to the bedroom . Mary went back to the bedroom . Daniel went back to the hallway . Sandra went to the kitchen . Daniel went back to the bathroom .
----------------------------------------------------------------------
Question: Is Daniel in the office ?
----------------------------------------------------------------------
Answer: no
----------------------------------------------------------------------


# Create a vocabulary

This is unique for this particular dataset

In [4]:
all_data = test_data + train_data
len(all_data)

11000

In [5]:
vocabulary = set()

# add unique words to vocabulary
for story, question, answer in all_data:
    vocabulary = vocabulary.union(set(story))
    vocabulary = vocabulary.union(set(question))

vocabulary.add('no')
vocabulary.add('yes')

vocab_size = len(vocabulary)+1 # +1 because in keras paddind function it is required to have a placeholder

print('Total number of unique words in questions and stories:',vocab_size-1)
vocabulary

Total number of unique words in questions and stories: 37


{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

# Check the longest story and longest question

In [6]:
stories = []
questions = []

for story, question, answer in all_data:
    stories.append(len(story))
    questions.append(len(question))

max_story_len = max(stories)
max_question_len = max(questions)

print('Max story length is:', max_story_len,'words')
print('Max question length is:',max_question_len,'words')

Max story length is: 156 words
Max question length is: 6 words


# Tokenize data

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [78]:
tokenizer = Tokenizer(filters=[]) # because all sighs are important
tokenizer.fit_on_texts(vocabulary)
print(tokenizer.word_index)

# save custom tokenizer
import pandas as pd
pd.DataFrame([tokenizer.word_index.keys(),tokenizer.word_index.values()]).T.rename({1:'word_index', 0:'word'},axis=1).to_csv('./model_checkpoints/tokenizer.csv',index=False)

{'down': 1, 'sandra': 2, '.': 3, 'in': 4, 'garden': 5, 'kitchen': 6, 'back': 7, 'up': 8, 'football': 9, 'travelled': 10, 'got': 11, 'the': 12, 'office': 13, 'apple': 14, 'daniel': 15, '?': 16, 'picked': 17, 'yes': 18, 'there': 19, 'bedroom': 20, 'john': 21, 'grabbed': 22, 'no': 23, 'mary': 24, 'bathroom': 25, 'to': 26, 'left': 27, 'journeyed': 28, 'hallway': 29, 'went': 30, 'took': 31, 'milk': 32, 'dropped': 33, 'put': 34, 'moved': 35, 'is': 36, 'discarded': 37}


In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def vectorize_data(data,
                    word_index,
                    max_story_len,
                    max_question_len):

    stories = []
    questions = []

    answers = []

    for story, question, answer in data:
        stories_part = [word_index[word.lower()] for word in story]         # return index of each word according to their position in word index for stories
        questions_part = [word_index[word.lower()] for word in question]    # return index of each word according to their position in word index for questions

        answers_part = np.zeros(len(word_index)+1)                          # placeholder
        answers_part[word_index[answer]] = 1                                # in the index position of 'yes' or 'no' put 1 

        stories.append(stories_part)
        questions.append(questions_part)
        answers.append(answers_part)

    return (pad_sequences(stories,maxlen=max_story_len),pad_sequences(questions,maxlen=max_question_len),np.array(answers))     # return padded data


    

# Create padded train and test data

In [10]:
inputs_train, questions_train, answers_train = vectorize_data(train_data,word_index=tokenizer.word_index, max_story_len=max_story_len,max_question_len=max_question_len)
inputs_test, questions_test, answers_test = vectorize_data(test_data,word_index=tokenizer.word_index, max_story_len=max_story_len,max_question_len=max_question_len)

In [11]:
print('inputs_train shape:',inputs_train.shape, 'questions_train shape:',questions_train.shape,'answers_train shape:', answers_train.shape)
print()
print('inputs_test shape:',inputs_test.shape, 'questions_test shape:',questions_test.shape,'answers_test shape:', answers_test.shape)

inputs_train shape: (10000, 156) questions_train shape: (10000, 6) answers_train shape: (10000, 38)

inputs_test shape: (1000, 156) questions_test shape: (1000, 6) answers_test shape: (1000, 38)


# Import and instatiate model

In [13]:

from memory_network import memory_network

model = memory_network(max_story_len,max_question_len,vocab_size,optimizer='rmsprop',model_name='memory_network')

Model: "memory_network2_linux"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 156)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, None, 64)     2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
______________________________________________________________________________

2022-07-22 13:29:10.265728: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-07-22 13:29:10.265921: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-22 13:29:10.266756: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


# Create a model

In [14]:
callbacks = [
            keras.callbacks.ModelCheckpoint(filepath=f'model_checkpoints/{model.name}.h5',save_best_only=True),
            keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=10, factor=0.1, verbose=2, min_lr=1e-6),
            keras.callbacks.EarlyStopping(monitor='val_loss',patience=15)
            ]

# Training is surprisingly tricky part, remember not to train using GPU, model has to be trained in a sequential manner not parallel, so use CPU instead

In [15]:
history = model.fit([inputs_train, questions_train], 
                    answers_train,
                    batch_size=32,
                    epochs=1200,
                    validation_data=([inputs_test, questions_test], 
                    answers_test),
                    callbacks = callbacks)

2022-07-22 13:29:13.659118: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-07-22 13:29:13.676404: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3600000000 Hz


Epoch 1/1200
Epoch 2/1200
Epoch 3/1200
Epoch 4/1200
Epoch 5/1200
Epoch 6/1200
Epoch 7/1200
Epoch 8/1200
Epoch 9/1200
Epoch 10/1200
Epoch 11/1200
Epoch 12/1200
Epoch 13/1200
Epoch 14/1200
Epoch 15/1200
Epoch 16/1200
Epoch 17/1200
Epoch 18/1200
Epoch 19/1200
Epoch 20/1200
Epoch 21/1200
Epoch 22/1200
Epoch 23/1200
Epoch 24/1200
Epoch 25/1200
Epoch 26/1200
Epoch 27/1200
Epoch 28/1200
Epoch 29/1200
Epoch 30/1200
Epoch 31/1200
Epoch 32/1200
Epoch 33/1200
Epoch 34/1200
Epoch 35/1200
Epoch 36/1200
Epoch 37/1200
Epoch 38/1200
Epoch 39/1200
Epoch 40/1200
Epoch 41/1200
Epoch 42/1200
Epoch 43/1200

Epoch 00043: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 44/1200
Epoch 45/1200
Epoch 46/1200
Epoch 47/1200
Epoch 48/1200


# Check what model has learned

In [82]:
model = keras.models.load_model('./model_checkpoints/memory_network.h5')
qa_tokenizer = pd.read_csv('model_checkpoints/tokenizer.csv',dtype={'word':str})
qa_tokenizer = dict(list(zip(qa_tokenizer['word'],qa_tokenizer['word_index'])))


In [83]:
def generate_question():
    import random

    x = random.randint(a=1,b=len(test_data))

    print('Story:',' '.join(test_data[x-1][0]))
    s = test_data[x-1][0]
    print('Question:',' '.join(test_data[x-1][1]))
    q = test_data[x-1][1]
    print('Answer:',test_data[x-1][2])
    a = test_data[x-1][2]

    return s, q, a, [(test_data[x-1])]


In [84]:
_,_,_,question = generate_question()

my_story,my_ques,my_ans = vectorize_data(question, qa_tokenizer,max_story_len,max_question_len)

pred_results = model.predict(([ my_story, my_ques]))

yes_prob = pred_results[0][qa_tokenizer['yes']]
no_prob = pred_results[0][qa_tokenizer['no']]

if yes_prob > no_prob:
    k = 'yes'
    prob = yes_prob
else:
    k = 'no'
    prob = no_prob

print("Predicted answer is: ", k)
print("Probability of certainty was: ", round(prob*100,2),"%")

Story: Mary travelled to the hallway . Daniel got the apple there . Daniel journeyed to the bathroom . Mary moved to the kitchen . Daniel travelled to the hallway . Daniel dropped the apple .
Question: Is Mary in the bedroom ?
Answer: no
Predicted answer is:  no
Probability of certainty was:  99.41 %
