In [1]:
# Imports

import os
import json
import random
import numpy as np
from six.moves import range

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision

import lc_options
from utils import lc_utilities as utils
from rouge import Rouge
from similarity.normalized_levenshtein import NormalizedLevenshtein
from gensim.models import KeyedVectors
from scipy import spatial
word2vec = KeyedVectors.load_word2vec_format(
    'data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

In [28]:
# Parameters

params = {
    # processed data
    'inputJson': "data/processed_data/processed_data.json",
    # No need for GPU
    'useGPU': False, 
    # Abot checkpoint
    'startFrom': "./checkpoints/rl_rouge/abot_ep_19.vd",
    # Qbot checkpoint
    'qstartFrom': "./checkpoints/rl_rouge/qbot_ep_19.vd",
    'beamSize': 5,
}

# RNG seed
manualSeed = 1597
random.seed(manualSeed)
torch.manual_seed(manualSeed)
if params['useGPU']:
    torch.cuda.manual_seed_all(manualSeed)

print('Loading json file: ' + params['inputJson'])
with open(params['inputJson'], 'r') as fileId:
    info = json.load(fileId)

wordCount = len(info['word2ind'])
# Add <START> and <END> to vocabulary
info['word2ind']['<START>'] = wordCount + 1
info['word2ind']['<END>'] = wordCount + 2
startToken = info['word2ind']['<START>']
endToken = info['word2ind']['<END>']
# Padding token is at index 0
vocabSize = wordCount + 3
print('Vocab size with <START>, <END>: %d' % vocabSize)

# Construct the reverse map
info['ind2word'] = {
    int(ind): word
    for word, ind in info['word2ind'].items()
}

Loading json file: data/processed_data/processed_data.json
Vocab size with <START>, <END>: 4952


In [29]:
# Load Models
def loadModel(params, agent='abot'):
    # should be everything used in encoderParam, decoderParam below
    encoderOptions = [
        'encoder', 'vocabSize', 'embedSize', 'rnnHiddenSize', 'numLayers',
        'useHistory', 'numRounds', 'dropout', 'useSumm'
    ]
    decoderOptions = [
        'decoder', 'vocabSize', 'embedSize', 'rnnHiddenSize', 'numLayers',
        'dropout'
    ]
    modelOptions = encoderOptions + decoderOptions

    mdict = None
    gpuFlag = params['useGPU']
    startArg = 'startFrom' if agent == 'abot' else 'qstartFrom'
    assert params[startArg], "Need checkpoint for {}".format(agent)

    if params[startArg]:
        print('Loading model (weights and config) from {}'.format(
            params[startArg]))

        if gpuFlag:
            mdict = torch.load(params[startArg])
        else:
            mdict = torch.load(params[startArg],
                map_location=lambda storage, location: storage)

        # Model options is a union of standard model options defined
        # above and parameters loaded from checkpoint
        modelOptions = list(set(modelOptions).union(set(mdict['params'])))
        for opt in modelOptions:
            if opt not in params:
                params[opt] = mdict['params'][opt]

            elif params[opt] != mdict['params'][opt]:
                # Parameters are not overwritten from checkpoint
                pass

    # Initialize model class
    encoderParam = {k: params[k] for k in encoderOptions}
    decoderParam = {k: params[k] for k in decoderOptions}

    encoderParam['startToken'] = encoderParam['vocabSize'] - 2
    encoderParam['endToken'] = encoderParam['vocabSize'] - 1
    decoderParam['startToken'] = decoderParam['vocabSize'] - 2
    decoderParam['endToken'] = decoderParam['vocabSize'] - 1

    if agent == 'abot':
        encoderParam['type'] = params['encoder']
        decoderParam['type'] = params['decoder']
        encoderParam['isAnswerer'] = True
        from lc.models.lc_answerer import Answerer
        model = Answerer(encoderParam, decoderParam)

    elif agent == 'qbot':
        encoderParam['type'] = params['qencoder']
        decoderParam['type'] = params['qdecoder']
        encoderParam['isAnswerer'] = False
        encoderParam['useSumm'] = False
        from lc.models.lc_questioner import Questioner
        model = Questioner(
            encoderParam,
            decoderParam,
            summSize=200)

    if params['useGPU']:
        model.cuda()

    if mdict:
        model.load_state_dict(mdict['model'])
        
    print("Loaded agent {}".format(agent))
    return model

In [30]:
# Helpers

from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

ind_map = lambda words: np.array([info['word2ind'].get(word, info['word2ind']['UNK']) 
                                  for word in words], dtype='int64')

tokenize = lambda string: ['<START>'] + word_tokenize(string) + ['<END>']

to_str_gt = lambda w: str(" ".join([info['ind2word'][x] for x in filter(
        lambda x:x>0,w.data.cpu().numpy())]))[8:-6]

to_str_pred = lambda w, l: str(" ".join([info['ind2word'][x] for x in list( filter(
        lambda x:x>0,w.data.cpu().numpy()))][:l.data.cpu()[0]]))[8:]

def var_map(tensor):
    return Variable(tensor.unsqueeze(0), volatile=True)

def string_conv(string):

    words = nltk.word_tokenize(string)
    words = [word.lower() for word in words if word.isalnum()]
    string = word_tokenize(' '.join(words))
    
    clear_seq = []
    for word in string:
        if (word not in stop_words):
            clear_seq.append(word)
            
    string = ' '.join(clear_seq)
    string_tokens = tokenize(string)
    string = ind_map(string_tokens)
    string_tensor = var_map(torch.from_numpy(string))
    string_lens = var_map(torch.LongTensor([len(string)]))
    
    return string_tensor, string_lens


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anest\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# word2vec similarity

def word2vec_transform(sequence):
    vectorized = []
    words = word_tokenize(sequence)
    seq_tokens = [word.lower() for word in words if word.isalnum()]
    
    for word in seq_tokens:
        try:
            vectorized.append(word2vec[word])
        except:
            vectorized.append(np.zeros(300,))
            
    return np.array(vectorized)

def similarity_cosine(vec1, vec2):
    cosine_distance = spatial.distance.cosine(vec1, vec2)
    return 1-cosine_distance

In [32]:
# Load Dataset
doc_data = json.load(open('data/generated_data/gen_dataset.json'))['data']['dialogs']
summ_data = json.load(open('data/generated_data/summary_dataset.json'))

eval_data = []

for doc in doc_data:
    eval_data.append({
        'doc': doc['document'],
        'summ': summ_data[doc['summary']]
    })

# 0-117 -> train data
# 117-156 -> eval data
eval_data = eval_data[117:]

In [59]:
# Number 0-155
num = 25
# Summary-Goal for Example
doc_example = doc_data[num]['document']
print(doc_example)

print('--------------------------------------------------')

# Summary-Goal for Example
summary_example = summ_data[doc_data[num]['summary']]
print(summary_example)

 Congenital heart disease Summary Congenital heart disease (CHD) is a problem with the heart's structure and function that is present at birth. Causes CHD can describe a number of different problems affecting the heart. It is the most common type of birth defect. CHD causes more deaths in the first year of life than any other birth defects. CHD is often divided into two types: cyanotic (blue skin color caused by a lack of oxygen) and non-cyanotic. The following lists cover the most common CHDs: Cyanotic: Ebstein's anomaly Hypoplastic left heart Pulmonary atresia Tetralogy of Fallot Total anomalous pulmonary venous return Transposition of the great vessels Tricuspid atresia Truncus arteriosus Non-cyanotic: Aortic stenosis Bicuspid aortic valve Atrial septal defect (ASD) Atrioventricular canal (endocardial cushion defect) Coarctation of the aorta Patent ductus arteriosus (PDA) Pulmonic stenosis Ventricular septal defect (VSD) These problems may occur alone or together. Most children with

In [60]:
numRounds = 10
beamSize = 5

summary_tensor, summary_lens = string_conv(summary_example)
document_tensor, document_lens = string_conv(doc_example)

aBot = None
qBot = None

# load aBot
if params['startFrom']:
    aBot = loadModel(params, 'abot')
    assert aBot.encoder.vocabSize == vocabSize, "Vocab size mismatch!"
    aBot.eval()
    
# load qBot
if params['qstartFrom']:
    qBot = loadModel(params, 'qbot')
    assert qBot.encoder.vocabSize == vocabSize, "Vocab size mismatch!"
    qBot.eval()


# prepare for dialogue
if aBot:
    aBot.eval(), aBot.reset()
    aBot.train(), aBot.reset()
    aBot.observe(-1, summary=summary_tensor, summaryLens=summary_lens, document=document_tensor,
                    documentLens=document_lens)

if qBot:
    qBot.eval(), qBot.reset()
    qBot.observe(-1, document=document_tensor,
                    documentLens=document_lens)
summ, summ_lens = qBot.predictSummary(inference='greedy')

# dialogue before summ generation
for round in range(numRounds):
    questions, quesLens = qBot.forwardDecode(
        beamSize=beamSize, inference='greedy')
    qBot.observe(round, ques=questions, quesLens=quesLens)
    aBot.observe(round, ques=questions, quesLens=quesLens)
    answers, ansLens = aBot.forwardDecode(
        beamSize=beamSize, inference='greedy')
    aBot.observe(round, ans=answers, ansLens=ansLens)
    qBot.observe(round, ans=answers, ansLens=ansLens)
    
    summ, summ_lens = qBot.predictSummary(inference='greedy')
    print('Q%s:' %round, to_str_pred(questions[0], quesLens))
    print('A%s:' %round, to_str_pred(answers[0], ansLens))

print('S%s:' %-1, to_str_pred(summ[0], summ_lens))

  return Variable(tensor.unsqueeze(0), volatile=True)


Loading model (weights and config) from ./checkpoints/rl_rouge/abot_ep_19.vd
Encoder: hre-ques-lateim-hist
Decoder: gen
Loaded agent abot
Loading model (weights and config) from ./checkpoints/rl_rouge/qbot_ep_19.vd
Encoder: hre-ques-lateim-hist
Decoder: gen
Loaded agent qbot
Q0: how do you get pulmonary hypertension ?
A0: the right side of the heart pumps blood through the lungs where it picks up oxygen blood returns to the left side of the heart where it is pumped to the rest of the body when the small arteries blood
Q1: what is the cause of bulimia eye ?
A1: achondroplasia is a rare genetic disorder characterized by UNK head and facial and skeletal abnormalities delayed intellectual development short stature and hypotonia the disorder is caused by a defective gene UNK which is found in
Q2: what is the name of the eye of the eye of the eye of the eye called ?
A2: syndrome treatment treatment is a group of medication and the emphasis is on the most common type of dwarfism achondroplasi