<a href="https://colab.research.google.com/github/Mayur619/NMT/blob/master/NMT_TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving answers to answers
Saving questions to questions
User uploaded file "answers" with length 502719 bytes
User uploaded file "questions" with length 200488 bytes


In [0]:
import tensorflow as tf
import os
import numpy as np
import copy
import _pickle as pickle

In [0]:
def load_data(path):
    input_file=os.path.join(path)
    with open(input_file,'r') as file:
        data=file.read()
    return data

In [0]:
source_path='questions'
target_path='answers'
source_text=load_data(source_path)
target_text=load_data(target_path)

In [0]:
CODES={'<PAD>':0,'<EOS>':1,'<UNK>':2,'<GO>':3}
def create_lookup_tables(text):
    vocab=set(text.split())
    word2int=copy.copy(CODES)
    for idx,word in enumerate(vocab,len(CODES)):
        word2int[word]=idx
    int2word={idx:word for word,idx in word2int.items()}
    return word2int,int2word

In [0]:
def text_to_ids(source_text,target_text,source_word2int,target_word2int):
    source_text_id=[]
    target_text_id=[]
    
    source_sentences=source_text.split('\n')
    target_sentences=target_text.split('\n')
    
    for i in range(len(source_sentences)):
        source_sentence=[]
        target_sentence=[]
        
        source_tokens=source_sentences[i].split(' ')
        target_tokens=target_sentences[i].split(' ')
        try:
          for word in source_tokens:
              if word!='':
                  source_sentence.append(source_word2int[word])
          for word in target_tokens:
              if word!='':
                  target_sentence.append(target_word2int[word])
        except KeyError:
          continue
        target_sentence.append(target_word2int['<EOS>'])
        source_text_id.append(source_sentence)
        target_text_id.append(target_sentence)
        
    return source_text_id,target_text_id

In [0]:
def preprocess_and_save(source_text,target_text):
    source_text=source_text.lower()
    target_text=target_text.lower()

    source_word2int,source_int2word=create_lookup_tables(source_text)
    target_word2int,target_int2word=create_lookup_tables(target_text)
    
    source_text,target_text=text_to_ids(source_text,target_text,source_word2int,target_word2int)
    pickle.dump(((source_text,target_text),(source_word2int,target_word2int),(source_int2word,target_int2word)),open('preprocessed.pkl','wb'))

In [0]:
preprocess_and_save(source_text,target_text)

In [0]:
def load_preprocessed():
    with open('preprocessed.pkl','rb') as file:
        return pickle.load(file)

In [0]:
(source_text_int,target_text_int),(source_word2int,target_word2int),_=load_preprocessed()

In [0]:
def model_inputs():
    inputs=tf.placeholder(tf.int32,[None,None],name='Inputs')
    targets=tf.placeholder(tf.int32,[None,None],name='Targets')
    target_sequence_length=tf.placeholder(tf.int32,[None],name='target_sequence_length')
    max_target_length=tf.reduce_max(target_sequence_length)
    source_sequence_length=tf.placeholder(tf.int32,shape=(None,),name='source_sequence_length')
    
    return inputs,targets,source_sequence_length,target_sequence_length,max_target_length

In [0]:
def hparams():
    learning_rate=tf.placeholder(tf.float32,name='learning_rate',shape=())
    keep_prob=tf.placeholder(tf.float32,name='keep_prob',shape=())
    return learning_rate,keep_prob

In [0]:
def process_decoder_input(target_data,target_word2int,batch_size):
    go_id=target_word2int['<GO>']
    after_slice=tf.strided_slice(target_data,[0,0],[batch_size,-1],[1,1])
    after_concat=tf.concat([tf.fill([batch_size,1],go_id),after_slice],1)
    return after_concat

In [0]:
def encoding_layer(rnn_inputs,rnn_size,num_layers,keep_prob,source_vocab_size,encoding_embedding_size):
    embedding=tf.contrib.layers.embed_sequence(rnn_inputs,vocab_size=source_vocab_size,embed_dim=encoding_embedding_size)
    stacked_cell=tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size),keep_prob) for _ in range(num_layers)])
    output,state=tf.nn.dynamic_rnn(stacked_cell,embedding,dtype=tf.float32)
    return output,state

In [0]:
def attention(encoder_outputs,rnn_size,source_sequence_length):
  #attention_state=tf.transpose(encoder_outputs,[1,0,2])
  attention_mechanism=tf.contrib.seq2seq.LuongAttention(rnn_size,encoder_outputs,memory_sequence_length=source_sequence_length)
  return attention_mechanism

In [0]:
def decoding_layer_train(encoder_state,encoder_outputs,rnn_size,dec_cell,dec_embed_input,source_sequence_length,target_sequence_length,max_summary_length,output_layer,keep_prob,batch_size):
    dec_cell=tf.contrib.rnn.DropoutWrapper(dec_cell,output_keep_prob=keep_prob)
    dec_cell=tf.contrib.seq2seq.AttentionWrapper(dec_cell,attention(encoder_outputs,rnn_size,source_sequence_length),attention_layer_size=rnn_size)
    helper=tf.contrib.seq2seq.TrainingHelper(dec_embed_input,target_sequence_length)
    decoder=tf.contrib.seq2seq.BasicDecoder(dec_cell,helper,dec_cell.zero_state(dtype=tf.float32,batch_size=batch_size),output_layer)
    outputs,_,_=tf.contrib.seq2seq.dynamic_decode(decoder,impute_finished=True,maximum_iterations=max_summary_length)
    return outputs

In [0]:
def decoding_layer_infer(encoder_state,encoder_outputs,rnn_size,dec_cell,dec_embeddings,source_sequence_length,start_of_sequence_id,end_of_sequence_id,max_target_sequence_length,vocab_size,output_layer,batch_size,keep_prob):
    dec_cell=tf.contrib.rnn.DropoutWrapper(dec_cell,output_keep_prob=keep_prob)
    dec_cell=tf.contrib.seq2seq.AttentionWrapper(dec_cell,attention(encoder_outputs,rnn_size,source_sequence_length),attention_layer_size=rnn_size)
    helper=tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings,tf.fill([batch_size],start_of_sequence_id),end_of_sequence_id)
    decoder=tf.contrib.seq2seq.BasicDecoder(dec_cell,helper,dec_cell.zero_state(dtype=tf.float32,batch_size=batch_size),output_layer)
    output,_,_=tf.contrib.seq2seq.dynamic_decode(decoder,impute_finished=True,maximum_iterations=max_target_sequence_length)
    return output

In [0]:
def decoding_layer(dec_input,encoder_state,encoder_output,source_sequence_length,target_sequence_length,max_target_sequence_length,rnn_size,num_layers,target_word2int,target_vocab_size,batch_size,keep_prob,decoding_embedding_size):
    target_vocab_size=len(target_word2int)
    dec_embeddings=tf.Variable(tf.random_uniform([target_vocab_size,decoding_embedding_size]))
    dec_embed_input=tf.nn.embedding_lookup(dec_embeddings,dec_input)
    cells=tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers)])
    with tf.variable_scope('decode'):
        output_layer=tf.layers.Dense(target_vocab_size)
        train_output=decoding_layer_train(encoder_state,encoder_output,rnn_size,cells,dec_embed_input,source_sequence_length,target_sequence_length,max_target_sequence_length,output_layer,keep_prob,batch_size)
    with tf.variable_scope('decode',reuse=True):
        infer=decoding_layer_infer(encoder_state,encoder_output,rnn_size,cells,dec_embeddings,source_sequence_length,target_word2int['<GO>'],target_word2int['<EOS>'],max_target_sequence_length,target_vocab_size,output_layer,batch_size,keep_prob)
    return train_output,infer

In [0]:
def seq2seq_model(input_data,target_data,keep_prob,batch_size,source_sequence_length,target_sequence_length,max_target_sequence_length,source_vocab_size,target_vocab_size,enc_embedding_size,dec_embedding_size,rnn_size,num_layers,target_word2int):
    enc_output,enc_state=encoding_layer(input_data,rnn_size,num_layers,keep_prob,source_vocab_size,enc_embedding_size)
    dec_inputs=process_decoder_input(target_data,target_word2int,batch_size)
    train_output,infer_output=decoding_layer(dec_inputs,enc_state,enc_output,source_sequence_length,target_sequence_length,max_target_sequence_length,rnn_size,num_layers,target_word2int,target_vocab_size,batch_size,keep_prob,dec_embedding_size)
    return train_output,infer_output

In [0]:
display_step=50
epochs=500
batch_size=32
rnn_size=256
num_layers=3
encoding_embedding_size=200
decoding_embedding_size=200
learning_rate=0.001
keep_prob=0.75

In [0]:
savepath='./model/model_s'
(source_int_text,target_int_text),(source_word2int,target_word2int),_=load_preprocessed()
max_target_sequence_length=max([len(s) for s in source_int_text])
train_graph=tf.Graph()
with train_graph.as_default():
    input_data,target_data,source_sequence_length,target_sequence_length,max_target_sequence_length=model_inputs()
    lr,keep_probability=hparams()
    train_logits,infer_logits=seq2seq_model(input_data,target_data,keep_probability,batch_size,source_sequence_length,target_sequence_length,max_target_sequence_length,len(source_word2int),len(target_word2int),encoding_embedding_size,decoding_embedding_size,rnn_size,num_layers,target_word2int)
    training_logits=tf.identity(train_logits.rnn_output,name='logits')
    inference_logits=tf.identity(infer_logits.sample_id,name='predictions')
    masks=tf.sequence_mask(target_sequence_length,max_target_sequence_length,dtype=tf.float32,name='mask')
    with tf.name_scope('optimization'):
        cost=tf.contrib.seq2seq.sequence_loss(training_logits,target_data,masks)
        optimizer=tf.train.AdamOptimizer(lr)
        gradients=optimizer.compute_gradients(cost)
        capped_grad=[(tf.clip_by_value(grad,-1.,1.),var) for grad,var in gradients if grad is not None]
        train_op=optimizer.apply_gradients(capped_grad)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [0]:
def pad_sentence_batch(sentence_batch,pad_int):
    max_sentence=max([len(s) for s in sentence_batch])
    return [sentence + [pad_int]*(max_sentence-len(sentence)) for sentence in sentence_batch]

def get_batches(sources,targets,batch_size,source_pad_int,target_pad_int):
    for batch_i in range(0,len(sources)//batch_size):
        start_i=batch_i*batch_size
        
        sources_batch=sources[start_i:start_i+batch_size]
        targets_batch=targets[start_i:start_i+batch_size]
        
        pad_source_batch=np.array(pad_sentence_batch(sources_batch,source_pad_int))
        pad_target_batch=np.array(pad_sentence_batch(targets_batch,target_pad_int))
        
        pad_source_lengths=[]
        pad_target_lengths=[]
        
        for source in pad_source_batch:
            pad_source_lengths.append(len(source))
        for target in pad_target_batch:
            pad_target_lengths.append(len(target))
        yield pad_source_batch,pad_target_batch,pad_source_lengths,pad_target_lengths


In [0]:
from datetime import datetime
 
def get_accuracy(target,logits):
    max_seq=max(target.shape[1],logits.shape[1])
    if max_seq-target.shape[1]:
        target=np.pad(target,[(0,0),(0,max_seq-target.shape[1])],'constant')
    if max_seq-logits.shape[1]:
        logits=np.pad(logits,[(0,0),(0,max_seq-logits.shape[1])],'constant')
    return np.mean(np.equal(target,logits))
train_source=source_int_text[batch_size:]
train_target=target_int_text[batch_size:]
validation_source=source_int_text[:batch_size]
validation_target=target_int_text[:batch_size]

(valid_source_batch,valid_target_batch,valid_source_length,valid_target_length)=next(get_batches(validation_source,validation_target,batch_size,source_word2int['<PAD>'],target_word2int['<PAD>']))
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(epochs):
        for batch_i,(source_batch,target_batch,source_length,target_length) in enumerate(get_batches(train_source,train_target,batch_size,source_word2int['<PAD>'],target_word2int['<PAD>'])):
            _,loss=sess.run([train_op,cost],{input_data:source_batch,target_data:target_batch,source_sequence_length:source_length,target_sequence_length:target_length,lr:learning_rate,keep_probability:keep_prob})
            if batch_i%display_step==0 and batch_i>0:
                batch_train_logits=sess.run(inference_logits,{input_data:source_batch,source_sequence_length:source_length,target_sequence_length:target_length,keep_probability:1.0})
                batch_valid_logits=sess.run(inference_logits,{input_data:valid_source_batch,source_sequence_length:valid_source_length,target_sequence_length:valid_target_length,keep_probability:1.0})
                train_acc=get_accuracy(target_batch,batch_train_logits)
                valid_acc=get_accuracy(valid_target_batch,batch_valid_logits)
                print('[',str(datetime.now()),']','Epoch {:>3} batch {:>4}/{} - Train accuracy: {:>6.4f}, Validation accuracy: {:>6.4f}, Loss: {:>6.4f}'.format(epoch_i,batch_i,len(source_int_text)//batch_size,train_acc,valid_acc,loss))
    saver=tf.train.Saver()
    saver.save(sess,savepath)
    print('Model trained and saved')

[ 2019-02-16 06:51:44.727463 ] Epoch   0 batch   50/67 - Train accuracy: 0.5354, Validation accuracy: 0.5379, Loss: 4.0757
[ 2019-02-16 06:52:22.404177 ] Epoch   1 batch   50/67 - Train accuracy: 0.5354, Validation accuracy: 0.5379, Loss: 3.3999
[ 2019-02-16 06:53:00.213909 ] Epoch   2 batch   50/67 - Train accuracy: 0.0145, Validation accuracy: 0.0151, Loss: 3.2796
[ 2019-02-16 06:53:38.146608 ] Epoch   3 batch   50/67 - Train accuracy: 0.0145, Validation accuracy: 0.0151, Loss: 3.2393
[ 2019-02-16 06:54:15.929480 ] Epoch   4 batch   50/67 - Train accuracy: 0.0137, Validation accuracy: 0.0151, Loss: 3.1466
[ 2019-02-16 06:54:53.845400 ] Epoch   5 batch   50/67 - Train accuracy: 0.0141, Validation accuracy: 0.0162, Loss: 3.0897
[ 2019-02-16 06:55:31.619962 ] Epoch   6 batch   50/67 - Train accuracy: 0.0130, Validation accuracy: 0.0151, Loss: 3.0655
[ 2019-02-16 06:56:09.407806 ] Epoch   7 batch   50/67 - Train accuracy: 0.0137, Validation accuracy: 0.0147, Loss: 3.0336
[ 2019-02-16 06:

In [0]:
!mkdir model

mkdir: cannot create directory ‘model’: File exists


In [0]:
with tf.Session(graph=train_graph) as sess:
  sess.run(tf.global_variables_initializer())
  saver=tf.train.Saver()
  saver.save(sess,'./model')
  print('Model trained and saved')

Model trained and saved


In [0]:
def sentence_to_seq(sentence,vocab_to_int):
    results=[]
    for word in sentence.split(" "):
        if word in vocab_to_int:
            results.append(vocab_to_int[word])
        else:
            results.append(vocab_to_int['<UNK>'])
    return results

_,(source_word2int,target_word2int),(source_int2word,target_int2word)=load_preprocessed()
translate_sen='I am taking sulfamethoxazole, can I participate in any sports?'.lower()
translate_sen=sentence_to_seq(translate_sen,source_word2int)
loaded_graph=tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    loader=tf.train.import_meta_graph('./model/model_s.meta')
    loader.restore(sess,'./model/model_s')
    
    input_data=loaded_graph.get_tensor_by_name('Inputs:0')
    logits=loaded_graph.get_tensor_by_name('predictions:0')
    target_sequence_length=loaded_graph.get_tensor_by_name('target_sequence_length:0')
    keep_prob=loaded_graph.get_tensor_by_name('keep_prob:0')
    source_sequence_length=loaded_graph.get_tensor_by_name('source_sequence_length:0')
    translation=sess.run(logits,{input_data:[translate_sen]*batch_size,source_sequence_length:[len(translate_sen)]*batch_size,target_sequence_length:[len(translate_sen)*2]*batch_size,keep_prob:1.0})[0]
    print('Question:',' '.join([source_int2word[word] for word in translate_sen]))
    print('Answer:',' '.join([target_int2word[word] for word in translation]))

INFO:tensorflow:Restoring parameters from ./model/model_s
Question: i am taking sulfamethoxazole, can i participate in any sports?
Answer: no no durring and can be related to no no no no sports and you will be related to all


In [0]:
!zip model.zip model/*

  adding: model/checkpoint (deflated 41%)
  adding: model/model_s.data-00000-of-00001 (deflated 6%)
  adding: model/model_s.index (deflated 54%)
  adding: model/model_s.meta (deflated 93%)


In [0]:
source_text

'questions\nMy son ,17, has a bruised and painful back also he is vomiting after being tackled Friday night while playing football ?\nI get a stitch pain  in  my hip mainly after  playing sports ?\nbeen playing volleyball a lot and cheering,lifting pepl. I was spiking and my shoulder hurts and now if i stretch it a bit it hurts. What should i do\n15 year old soccer player with lower back instability and very little hip instability with irritation, Is it dangerous if she still plays with back?\n16 year old girl, I play catcher. I\'ve been playing for years, but knees have been hurting after games and it feels hard to Bend them. (This is new)\n3 weeks ago hyperextended elbow in basketball game, pain only lasted 3 days. Pain returned suddenlywhen I swung my softball bat at practice last night?\n5 1/2 weeks ago I fell playing softball and hurt my thumb.both sides of my MCP joint are bothering me. Do normal sprains last this long?\n8 months ago I had ACL surgery, I did the patellar tendon w

In [0]:
from google.colab import files

files.download('model.zip')

KeyboardInterrupt: ignored

In [0]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))