In [37]:
import random
import codecs
import math
import time
import sys
import subprocess
import os.path
import pickle
import numpy as np
import gensim

import keras.backend as K
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.core import Lambda, Activation
from keras.utils import np_utils
from keras.preprocessing import sequence

from tqdm import tqdm
from sklearn.cross_validation import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from numpy import inf
from operator import itemgetter

seed = 28
random.seed(seed)
np.random.seed(seed)

top_freq_word_to_use = 40000
embedding_dimension = 300
max_len_head = 25
max_len_desc = 50
max_length = max_len_head + max_len_desc
rnn_layers = 4
rnn_size = 600
# first 40 numebers from hidden layer output used for
# simple context calculation
activation_rnn_size = 50

empty_tag_location = 0
eos_tag_location = 1
unknown_tag_location = 2
learning_rate = 1e-4

#minimum headline should be genrated
min_head_line_gen = 10
dont_repeat_word_in_last = 5

word2vec = []
idx2word = {}
word2idx = {}
# initalize end of sentence, empty and unk tokens
word2idx['<empty>'] = empty_tag_location
word2idx['<eos>'] = eos_tag_location
word2idx['<unk>'] = unknown_tag_location
idx2word[empty_tag_location] = '<empty>'
idx2word[eos_tag_location] = '<eos>'
idx2word[unknown_tag_location] = '<unk>'

idx = 3
model = gensim.models.KeyedVectors.load_word2vec_format('F:/VIGHNESH/Cognitive_Computing/Final_Project/GoogleNews-vectors-negative300.bin', binary = True, limit = 40000)
V = model.index2word
for index, word in enumerate(V):
    vector = model[word]
    word2idx[word] = idx
    idx2word[idx] = word
    idx = idx + 1
    if idx % 10000 == 0:
        print ("working on word2vec ... idx ", idx)

working on word2vec ... idx  10000
working on word2vec ... idx  20000
working on word2vec ... idx  30000
working on word2vec ... idx  40000


In [38]:
def simple_context(X, mask):
    """
    Simple context calculation layer logic
    X = (batch_size, time_steps, units)
    time_steps are nothing but number of words in our case.
    """
    # segregrate heading and desc
    desc, head = X[:, :max_len_desc, :], X[:, max_len_desc:, :]
    # segregrate activation and context part
    head_activations, head_words = head[:, :, :activation_rnn_size], head[:, :, activation_rnn_size:]
    desc_activations, desc_words = desc[:, :, :activation_rnn_size], desc[:, :, activation_rnn_size:]

    # p=(bacth_size, length_desc_words, rnn_units)
    # q=(bacth_size, length_headline_words, rnn_units)
    # K.dot(p,q) = (bacth_size, length_desc_words,length_headline_words)
    activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2))

    # make sure we dont use description words that are masked out
    activation_energies = activation_energies + -1e20 * K.expand_dims(1. - K.cast(mask[:, :max_len_desc], 'float32'), 1)

    # for every head word compute weights for every desc word
    activation_energies = K.reshape(activation_energies, (-1, max_len_desc))
    activation_weights = K.softmax(activation_energies)
    activation_weights = K.reshape(activation_weights, (-1, max_len_head, max_len_desc))

    # for every head word compute weighted average of desc words
    desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1))
    return K.concatenate((desc_avg_word, head_words))

In [39]:
def output_shape_simple_context_layer(input_shape):
    """
    Take input shape tuple and return tuple for output shape
    Output shape size for simple context layer =
    remaining part after activatoion calculation fron input layers avg. +
    remaining part after activatoion calculation fron current hidden layers avg.
    that is 2 * (rnn_size - activation_rnn_size))
    input_shape[0] = batch_size remains as it is
    max_len_head = heading max length allowed
    """
    return (input_shape[0], max_len_head , 2 * (rnn_size - activation_rnn_size))

In [40]:
def create_model():
        """
        RNN model creation
        Layers include Embedding Layer, 3 LSTM stacked,
        Simple Context layer (manually defined),
        Time Distributed Layer
        """
        #length_vocab, embedding_size = word2vec.shape
        #print ("shape of word2vec matrix ", word2vec.shape)

        model = Sequential()

        # TODO: look at mask zero flag
        model.add(
                Embedding(
                        40003, embedding_dimension,
                        input_length=max_length,
                         mask_zero=True,
                        name='embedding_layer'
                )
        )

        for i in range(rnn_layers):
            lstm = LSTM(rnn_size, return_sequences=True,
                name='lstm_layer_%d' % (i + 1)
            )

            model.add(lstm)
            # No drop out added !

        model.add(Lambda(simple_context,
                     mask=lambda inputs, mask: mask[:, max_len_desc:],
                     output_shape=output_shape_simple_context_layer,
                     name='simple_context_layer'))

        vocab_size = 40003
        model.add(TimeDistributed(Dense(vocab_size,
                                name='time_distributed_layer')))
        
        model.add(Activation('softmax', name='activation_layer'))
        
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        K.set_value(model.optimizer.lr, np.float32(learning_rate))
        print (model.summary())
        return model

In [41]:
model = create_model()
model.load_weights('F:/VIGHNESH/Cognitive_Computing/Final_Project/model_weights.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 75, 300)           12000900  
_________________________________________________________________
lstm_layer_1 (LSTM)          (None, 75, 600)           2162400   
_________________________________________________________________
lstm_layer_2 (LSTM)          (None, 75, 600)           2882400   
_________________________________________________________________
lstm_layer_3 (LSTM)          (None, 75, 600)           2882400   
_________________________________________________________________
lstm_layer_4 (LSTM)          (None, 75, 600)           2882400   
_________________________________________________________________
simple_context_layer (Lambda (None, 25, 1100)          0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 25, 40003)         44043303  
__________

In [42]:
def padding(list_idx, curr_max_length, is_left):
    """
    padds with <empty> tag in left side
    """
    print("Padding")
    if len(list_idx) >= curr_max_length:
        return list_idx
    number_of_empty_fill = curr_max_length - len(list_idx)
    if is_left:
        return [empty_tag_location, ] * number_of_empty_fill + list_idx
    else:
        return list_idx + [empty_tag_location, ] * number_of_empty_fill

In [43]:
def headline2idx(list_idx, curr_max_length, is_input):
    """
    if space add <eos> tag in input case, input size = curr_max_length-1
    always add <eos> tag in predication case, size = curr_max_length
    always right pad
    """
    print("Adding <eos> to headline and padding")
    if is_input:
        if len(list_idx) >= curr_max_length - 1:
            return list_idx[:curr_max_length - 1]
        else:
            # space remaning add eos and empty tags
            list_idx = list_idx + [eos_tag_location, ]
            return padding(list_idx, curr_max_length - 1, False)
    else:
        # always add <eos>
        if len(list_idx) == curr_max_length:
            list_idx[-1] = eos_tag_location
            return list_idx
        else:
            # space remaning add eos and empty tags
            list_idx = list_idx + [eos_tag_location, ]
            return padding(list_idx, curr_max_length, False)

In [44]:
def desc2idx(list_idx, curr_max_length):
    """
    always left pad and eos tag to end
    """
    #====== REVERSE THE DESC IDS ========
    list_idx.reverse()
    # padding to the left remain same and 
    # eos tag position also remain same,
    # just description flipped
    #===================================
    # desc padded left
    print("Adding <eos> to description and padding, reversing")
    list_idx = padding(list_idx, curr_max_length, True)
    # eos tag add
    list_idx = list_idx + [eos_tag_location, ]
    return list_idx

In [45]:
def sentence2idx(sentence, is_headline, curr_max_length, is_input=True):
    """
    given a sentence convert it to its ids
    "I like India" => [12, 51, 102]
    if words not present in vocab ignore them
    is_input is only for headlines
    """
    print("Converting sentences to vectors using word2ix obtained from model")
    list_idx = []
    tokens = sentence.split(" ")
    count = 0
    for each_token in tokens:
        if each_token in word2idx:
            list_idx.append(word2idx[each_token])
        else:
            #append unk token as original word not present in word2vec
            list_idx.append(word2idx['<unk>'])
        count = count + 1
        if count >= curr_max_length:
            break

    if is_headline:
        return headline2idx(list_idx, curr_max_length, is_input)
    else:
        return desc2idx(list_idx, curr_max_length)

In [46]:
def convert_inputs(descriptions, headlines):
    """
    convert input to suitable format
    1.Left pad descriptions with <empty> tag
    2.Add <eos> tag
    3.Right padding with <empty> tag after (desc+headline)
    4.input headline doesnot contain <eos> tag
    5.expected/predicated headline contain <eos> tag
    6.One hot endoing for expected output
    """
    # length of headlines and descriptions should be equal
    #assert len(descriptions) == len(headlines)

    X, y = [], []
    input_headline_idx = sentence2idx(headlines, True, max_len_head, True)
    predicted_headline_idx = sentence2idx(headlines, True, max_len_head, False)
    desc_idx = sentence2idx(descriptions, False, max_len_desc)
    #print("Input headline length",len(input_headline_idx))
    #print("Predicted headline length",len(predicted_headline_idx))
    #print("Description length",len(desc_idx))
    # assert size checks
    assert len(input_headline_idx) == max_len_head - 1
    assert len(predicted_headline_idx) == max_len_head
    assert len(desc_idx) == max_len_desc + 1

    X.append(desc_idx + input_headline_idx)
    y.append(predicted_headline_idx)
        
    X, y = np.array(X), np.array(y)
    print("Convert descs and headlines to numpy arrays")
    return X,headlines

In [47]:
def indexes_to_words(list_of_headline):
    """
    indexes => words (for BLUE Score)
    e.g. [2,0] => ["I","am"] (idx2word defined dictionary used)
    """
    list_of_word_headline = []
    for each_headline in list_of_headline:
        each_headline_words = []
        for each_word in each_headline:
            #Dont include <eos> and <empty> tags
            if each_word in (empty_tag_location, eos_tag_location, unknown_tag_location):
                continue
            each_headline_words.append(idx2word[each_word])
        list_of_word_headline.append(each_headline_words)            
    return list_of_word_headline

In [48]:
def is_headline_end(word_index_list,current_predication_position):
    """
    is headline ended checker
    current_predication_position is 0 index based
    """
    #print("Inside is_headline function")
    if (word_index_list is None) or (len(word_index_list)==0):
        return False
    if word_index_list[current_predication_position]==eos_tag_location or current_predication_position>=max_length:
        return True
    return False


In [49]:
def process_word(predication,word_position_index,top_k,X,prev_layer_log_prob):
    """
    Extract top k predications of given position
    """
    #print("Inside process word function")
    #predication conttains only one element
    #shape of predication (1,max_head_line_words,vocab_size)
    predication = predication[0]
    #predication (max_head_line_words,vocab_size)
    predication_at_word_index = predication[word_position_index]
    #http://stackoverflow.com/questions/6910641/how-to-get-indices-of-n-maximum-values-in-a-numpy-array
    sorted_arg = predication_at_word_index.argsort()
    top_probable_indexes = sorted_arg[::-1]
    top_probabilities = np.take(predication_at_word_index,top_probable_indexes)
    log_probabilities = np.log(top_probabilities)
    #make sure elements doesnot contain -infinity
    log_probabilities[log_probabilities == -inf] = -sys.maxsize - 1
    #add prev layer probability
    log_probabilities = log_probabilities + prev_layer_log_prob
    assert len(log_probabilities)==len(top_probable_indexes)
        
    #add previous words ... preparation for next input
    #offset calculate ... description + eos + headline till now
    offset = max_len_desc+word_position_index+1
    ans = []
    count = 0 
    for i,j in zip(log_probabilities, top_probable_indexes):
        #check for word should not repeat in headline ... 
        #checking for last x words, where x = dont_repeat_word_in_last
        if j in X[max_len_desc+1:offset][-dont_repeat_word_in_last:]:
            continue
        if (word_position_index < min_head_line_gen) and (j in [empty_tag_location, unknown_tag_location, eos_tag_location]):
            continue
            
        next_input = np.concatenate((X[:offset], [j,]))
        next_input = next_input.reshape((1,next_input.shape[0]))
        #for the last time last word put at max_length + 1 position 
        #don't truncate that
        if offset!=max_length:
            next_input = sequence.pad_sequences(next_input, maxlen=max_length, value=empty_tag_location, padding='post', truncating='post')
        next_input = next_input[0]
        ans.append((i,next_input))
        count = count + 1
        if count>=top_k:
            break
    #[(prob,list_of_words_as_next_input),(prob2,list_of_words_as_next_input2),...]
    return ans


In [50]:
def beam_search(X,top_k):
    """
    1.Loop over max headline word allowed
    2.predict word prob and select top k words for each position
    3.select top probable combination uptil now for next round
    """
    print("Inside beam search function")
    #contains [(log_p untill now, word_seq), (log_p2, word_seq2)]
    prev_word_index_top_k = []
    curr_word_index_top_k = []
    done_with_pred = []
    #1d => 2d array [1,2,3] => [[1,2,3]]
    data = X.reshape((1,X.shape[0]))
    #shape of predication (1,max_head_line_words,vocab_size)
    predication = model.predict_proba(data,verbose=0)
    #prev layer probability 1 => np.log(0)=0.0
    prev_word_index_top_k = process_word(predication,0,top_k,X,0.0)
        
    #1st time its done above to fill prev word therefore started from 1
    for i in range(1,max_len_head):
        #i = represents current intrested layer ...
        for j in range(len(prev_word_index_top_k)):
            #j = each time loops for top k results ...
            probability_now, current_intput = prev_word_index_top_k[j]
            data = current_intput.reshape((1,current_intput.shape[0]))
            predication = model.predict_proba(data,verbose=1)
            next_top_k_for_curr_word = process_word(predication,i,top_k,current_intput,probability_now)
            curr_word_index_top_k = curr_word_index_top_k + next_top_k_for_curr_word
                
        #sort new list, empty old, copy top k element to old, empty new
        curr_word_index_top_k = sorted(curr_word_index_top_k,key=itemgetter(0),reverse=True)
        prev_word_index_top_k_temp = curr_word_index_top_k[:top_k]
        curr_word_index_top_k = []
        prev_word_index_top_k = []
        #if word predication eos ... put it done list ...
        for each_proba, each_word_idx_list in prev_word_index_top_k_temp:
            offset = max_len_desc+i+1
            if is_headline_end(each_word_idx_list,offset):
                done_with_pred.append((each_proba, each_word_idx_list))
            else:
                prev_word_index_top_k.append((each_proba,each_word_idx_list))
            
    #sort according to most probable
    done_with_pred = sorted(done_with_pred,key=itemgetter(0),reverse=True)
    done_with_pred = done_with_pred[:top_k]
    return done_with_pred

In [51]:
def test(descriptions,headlines,top_k,seperator='#|#'):
    """
    test on given description data file with empty headline ...
    """
    #model.load_weights(model_weights_file_name)
    #print ("model weights loaded")
    print("Inside test function")
    X,Y = convert_inputs(descriptions, headlines)
    X = X[0]
    Y = Y[0]
    assert X[max_len_desc]==eos_tag_location
    #wipe up news headlines present and replace by empty tag ...            
    X[max_len_desc+1:]=empty_tag_location
    result = beam_search(X,top_k)
    #take top most probable element
    list_of_word_indexes = []
    #for results in result:
        #list_of_word_indexes.append(results[1])
    list_of_word_indexes = result[0][1]
    list_of_words = indexes_to_words([list_of_word_indexes])[0]
    print(list_of_words)
    headline = " ".join(list_of_words)
    return str(headline)

In [52]:
import math
def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
     numerator = sum([vec1[x] * vec2[x] for x in intersection])

     sum1 = sum([vec1[x]**2 for x in vec1.keys()])
     sum2 = sum([vec2[x]**2 for x in vec2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

In [53]:
import re, math
from collections import Counter
WORD = re.compile(r'\w+')
def text_to_vector(text):
     words = WORD.findall(text)
     return Counter(words)

In [54]:
from flask import Flask, render_template, request, send_from_directory
app = Flask(__name__)

def get_generated_title(actual_headline,actual_body):
    gen_title = test(actual_headline,actual_body,5)
    print(gen_title)
    #gen_title="I am the generated title"
    return(str(gen_title))

def get_cosine_similarity(actual_headline,gen_title):
    vector1 = text_to_vector(actual_headline)
    vector2 = text_to_vector(gen_title)
    cosine = get_cosine(vector1, vector2)
    return(cosine)

@app.route('/')
def landing_page():
    return ('Welcome!!!!')

@app.route('/index/')
def index_page():
    return render_template('index.html')

@app.route('/result',methods = ['POST', 'GET'])
def result_page():
    if request.method == 'POST':
        result = request.form
        #cosine='99%'
        actual_headline = request.form['newstitle']
        actual_body = request.form['newsbody']
        #print(actual_headline)
        #print(actual_body)
        gen_title = get_generated_title(actual_headline,actual_body)
        cosine = get_cosine_similarity(actual_headline,gen_title)
        return render_template("result.html",result = result, cosine=cosine, gen_title = gen_title)


if __name__ == '__main__':
    app.run()

 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [27/Apr/2018 15:20:15] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/Apr/2018 15:20:22] "GET /index/ HTTP/1.1" 200 -


Inside test function
Converting sentences to vectors using word2ix obtained from model
Adding <eos> to headline and padding
Padding
Converting sentences to vectors using word2ix obtained from model
Adding <eos> to headline and padding
Padding
Converting sentences to vectors using word2ix obtained from model
Adding <eos> to description and padding, reversing
Padding
Convert descs and headlines to numpy arrays
Inside beam search function
['Name', 'The', 'Trump', 'for', 'the', 'New', 'York', 'Times', 'in', 'The', 'Trump']
Name The Trump for the New York Times in The Trump


127.0.0.1 - - [27/Apr/2018 15:21:29] "POST /result HTTP/1.1" 200 -


Inside test function
Converting sentences to vectors using word2ix obtained from model
Adding <eos> to headline and padding
Converting sentences to vectors using word2ix obtained from model
Adding <eos> to headline and padding
Converting sentences to vectors using word2ix obtained from model
Adding <eos> to description and padding, reversing
Padding
Convert descs and headlines to numpy arrays
Inside beam search function
['firing', 'chaplain', 'House', 'over', 'Ryan', 'grill', 'Republicans', 'Some', 'The', 'Trump', 'is', 'the', 'New', 'York', 'on', 'Trump', 'The', 'the']
firing chaplain House over Ryan grill Republicans Some The Trump is the New York on Trump The the


127.0.0.1 - - [27/Apr/2018 15:24:57] "POST /result HTTP/1.1" 200 -
