<a href="https://colab.research.google.com/github/Mahanotrahul/DeepLearning/blob/master/kernel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook implements a sequence-to-sequence model for the text summarization task using an attention model.

In [1]:
import numpy as np
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional, Concatenate, Permute, Dot, Multiply
from keras.layers import RepeatVector, Lambda
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.utils import to_categorical
import matplotlib.pyplot as plt
#from utils import *
import pandas as pd
from nltk.corpus import stopwords
from pickle import dump, load
import re

Using TensorFlow backend.


In [2]:
# Read the reviews CSV into a DataFrame. The relative path must exist in the
# runtime; the recorded run of this cell failed with FileNotFoundError.
reviews = pd.read_csv("../input/amazon-fine-food-reviews/Reviews.csv")

FileNotFoundError: ignored

In [0]:
# Quick look at the raw frame: dimensions, first rows, null count per column.
for overview in (reviews.shape, reviews.head(), reviews.isnull().sum()):
    print(overview)

In [0]:
# Discard incomplete rows, then the identifier/metadata columns; the Summary
# and Text columns used below are kept.
reviews = reviews.dropna()
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and removed in 2.0.
reviews = reviews.drop(columns=['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator', 'Score','Time'])
# Renumber rows 0..n-1 so the positional look-ups below work after dropna().
reviews = reviews.reset_index(drop=True)
print(reviews.head())
# Show the first five summary/review pairs.
for i in range(5):
    print("Review #",i+1)
    print(reviews.Summary[i])
    print(reviews.Text[i])
    print()
    

In [0]:
# Contraction -> expanded form; clean_text looks up whole lower-cased tokens here.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
}

In [0]:
def clean_text(text, remove_stopwords = True):
    """
    Normalise a review or summary string before it is fed to the model.

    The text is lower-cased, whitespace-separated tokens found in the
    module-level `contractions` table are expanded, URLs and HTML remnants are
    removed, punctuation is replaced by spaces and, optionally, English stop
    words are dropped.

    Arguments:
    text -- string to clean
    remove_stopwords -- when True, drop every token listed in NLTK's English
                        stop-word list

    Returns:
    text -- the cleaned string
    """
    text = text.lower()

    # Expand contractions token by token; unknown tokens pass through unchanged.
    # split()/join() also collapses every run of whitespace (newlines included)
    # into a single space.
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Remove URLs. The text no longer contains newlines at this point, so the
    # previous `.*` pattern erased everything after the first URL; `\S+` stops
    # at the end of the URL token.
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r'\<a href', ' ', text)
    # HTML remnants must be handled before the punctuation pass below: that
    # pass replaces '/' and '&', after which these patterns can never match.
    text = re.sub(r'</code>&<code>|&amp;', '', text)
    text = re.sub(r'</code><br /><code>|<br />', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)
    return text

In [0]:
# Clean summaries first, then review bodies; stop words are kept in both.
clean_summaries = [clean_text(summary, remove_stopwords=False) for summary in reviews.Summary]
print("Summaries are complete.")
clean_texts = [clean_text(text, remove_stopwords = False) for text in reviews.Text]
print("Texts are complete.")

In [0]:
# Pair every cleaned review ('story') with its cleaned summary ('highlights').
stories = [{'story': text, 'highlights': summary}
           for text, summary in zip(clean_texts, clean_summaries)]
# Save to file. The context manager closes the handle even if pickling fails;
# the bare open() call used before never closed it.
with open('review_dataset1.pkl', 'wb') as dataset_file:
    dump(stories, dataset_file)

In [3]:
# Read back the dataset pickled by the previous cell. That cell writes
# 'review_dataset1.pkl'; loading 'review_dataset.pkl' raised FileNotFoundError.
# NOTE: unpickling can execute arbitrary code - only load files produced here.
with open('review_dataset1.pkl', 'rb') as dataset_file:
    stories = load(dataset_file)
print('Loaded Stories %d' % len(stories))
print(type(stories))
print(stories[2])

FileNotFoundError: ignored

In [0]:
# Training configuration for the character-level model defined further down.
batch_size = 64  # passed to fit() as batch_size
epochs = 100  # passed to fit() as epochs
latent_dim = 256  # number of units in the encoder and decoder LSTM layers
num_samples = 10000  # NOTE(review): not referenced in the visible cells (the corpus is sliced with a literal 1000) - confirm intent

In [0]:
# Collect the first 1000 story/summary pairs and the characters they use.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
for story in stories[:1000]:
    input_text = story['story']
    # "\t" marks the start of a target sequence and "\n" marks its end.
    target_text = '\t' + story['highlights'] + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    input_characters.update(input_text)
    target_characters.update(target_text)

# Freeze both vocabularies in sorted order so token indices are deterministic.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

for label, value in (
    ('Number of samples:', len(input_texts)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
):
    print(label, value)

In [0]:
# Character -> column index look-ups for the encoder and decoder vocabularies.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# Zero-initialised one-hot tensors of shape (samples, timesteps, vocabulary).
encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32')
decoder_shape = (len(input_texts), max_decoder_seq_length, num_decoder_tokens)
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.

    for t, char in enumerate(target_text):
        column = target_token_index[char]
        decoder_input_data[i, t, column] = 1.
        # The target is the decoder input shifted one step to the left, so it
        # never contains the start character.
        if t > 0:
            decoder_target_data[i, t - 1, column] = 1.

In [0]:
# Encoder: read the one-hot input sequence and keep only the final LSTM states
# (`encoder_outputs` is discarded).
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder: starts from `encoder_states` and returns the full output sequence.
# Its returned states are ignored while training.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# The model turns `encoder_input_data` & `decoder_input_data` into
# `decoder_target_data`.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics = ['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

# Save model
model.save('s2s.h5')

In [0]:
def generic_Model(n_input, n_output, n_units):
    """
    Build an encoder-decoder training model together with the two models used
    for step-by-step inference. All three share the same layers.

    Arguments:
    n_input -- size of the last axis of the encoder input
    n_output -- size of the last axis of the decoder input and of the softmax output
    n_units -- number of units in the encoder LSTM and the decoder LSTM

    Returns:
    model -- maps [encoder input, decoder input] to the decoder softmax sequence
    encoder_model -- maps the encoder input to the encoder's final [h, c] states
    decoder_model -- maps [decoder input, h, c] to [softmax sequence, h, c]
    """
    # Training encoder: only the final states are used.
    enc_in = Input(shape=(None, n_input))
    enc_lstm = LSTM(n_units, return_state=True)
    _, enc_h, enc_c = enc_lstm(enc_in)
    enc_states = [enc_h, enc_c]

    # Training decoder, initialised with the encoder states.
    dec_in = Input(shape=(None, n_output))
    dec_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    train_seq, _, _ = dec_lstm(dec_in, initial_state=enc_states)
    dec_softmax = Dense(n_output, activation='softmax')
    model = Model([enc_in, dec_in], dec_softmax(train_seq))

    # Inference encoder.
    encoder_model = Model(enc_in, enc_states)

    # Inference decoder: the states are fed in explicitly and returned so the
    # caller can carry them from one step to the next.
    state_inputs = [Input(shape=(n_units,)), Input(shape=(n_units,))]
    step_seq, step_h, step_c = dec_lstm(dec_in, initial_state=state_inputs)
    step_probs = dec_softmax(step_seq)
    decoder_model = Model([dec_in] + state_inputs, [step_probs] + [step_h, step_c])

    return model, encoder_model, decoder_model


In [0]:
# generic_Model takes the size of the last (feature) axis of the encoder and
# decoder inputs - for the one-hot character tensors that is the number of
# distinct tokens, not the maximum sequence length.
training_model, encoder_model, decoder_model = generic_Model(num_encoder_tokens, num_decoder_tokens, 128)
training_model.summary()

In [0]:
training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# Train on the one-hot tensors. `input_characters` / `target_characters` are
# the sorted character vocabularies, not training data.
training_model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                   batch_size=batch_size,
                   epochs=epochs,
                   validation_split=0.2)

In [0]:
# Load the 50-d GloVe file into three lookups (presumably word->index,
# index->word and word->vector, going by the names - confirm).
# NOTE(review): read_glove_vecs is not defined or imported in the visible
# notebook (the `from utils import *` line is commented out) - confirm its source.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('dataset/Glove Embeddings/glove.6B.50d.txt')

In [0]:
# Inspect the loaded lookups: vocabulary size, one index, one vector.
# NOTE(review): each subscript raises KeyError if its key is absent from the mapping.
print(len(word_to_vec_map))
print(word_to_index["didn't"])
print(word_to_vec_map["strife-torn"])

In [0]:
# Spot-check the three lookups with one word and one index.
word = "cucumber"
index = 289846  # NOTE(review): hard-coded position - presumably only meaningful for this GloVe file; confirm
print(len(word_to_vec_map[word]))  # dimensionality of one word vector
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])
print(len(word_to_vec_map))  # number of words that have a vector

**Exercise**: Implement `sentence_to_avg()`. This function performs two steps described as follows:
1. Convert every sentence to lower-case, then split the sentence into a list of words.
2. For each word in the sentence, access its GloVe representation. Then, average all these values.

In [0]:
# GRADED FUNCTION: sentence_to_avg

def sentence_to_avg(sentence, word_to_vec_map):
    """
    Converts a sentence (string) into a list of words (strings). Extracts the GloVe representation of each word
    and averages its value into a single vector encoding the meaning of the sentence.

    Words missing from word_to_vec_map are printed and contribute nothing to the sum, but they still count
    towards the divisor. A sentence without any words yields the zero vector.

    Arguments:
    sentence -- string, one training example from X
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation;
                       must contain the key "a", whose vector fixes the output length

    Returns:
    avg -- average vector encoding information about the sentence, numpy-array with one entry per vector dimension
    """
    # Step 1: split the sentence into lower-case words.
    words = [i.lower() for i in sentence.split()]

    # The accumulator has the same length as the word vectors.
    avg = np.zeros((len(word_to_vec_map["a"]),))

    # An empty or whitespace-only sentence used to divide by zero and return a
    # vector of NaN; return the zero vector instead.
    if not words:
        return avg

    # Step 2: sum the vectors of the known words, reporting the unknown ones.
    for w in words:
        try:
            avg += word_to_vec_map[w]
        except KeyError:
            print(w)
            continue

    avg = avg / len(words)

    return avg

In [0]:
# Try sentence_to_avg on one sample sentence and show the resulting vector.
avg = sentence_to_avg("at least two people were killed in a suspected bomb attack on a passenger bus in the strife-torn southern philippines on monday , the military said .\n", word_to_vec_map)
print("avg = ", avg)

In [0]:
def preprocess_data(X):
    """
    Re-tokenise every sentence of X so that out-of-vocabulary words are split around punctuation.

    Each sentence is split on whitespace. A word found in the module-level `word_to_index`
    is kept as is. Any other word has the "-lrb-" / "-rrb-" markers replaced by a space and
    then gets spaces inserted around the first occurrence of each string in `str_check`.
    The words are re-joined with a trailing space after each one.

    The first sentence is printed before and after processing.

    Arguments:
    X -- indexable, mutable sequence of sentences (strings); it is modified in place

    Returns:
    X -- the same object, with every element replaced by its processed string
    """
    from tqdm import tqdm
    
    m = len(X)
    # Strings that get surrounded by spaces when found inside an unknown word.
    str_check = [",", ".", '"', "'", "(", ")", "$", "mph", "-", "_"]
    # Markers that are replaced by a single space inside an unknown word.
    str_remove = ["-lrb-", "-rrb-"]
    
    for i in tqdm(range(m)):
        sentence_words = [w for w in X[i].split()]
        j = 0  # NOTE(review): never read afterwards
        new_string = ""
        if(i == 0):
            print(X[i])
        
        for w in sentence_words:
            try:
               # Lookup only to detect unknown words; the value itself is unused.
               _ = word_to_index[w]
            except KeyError:
                for stri_r in str_remove:
                    if(stri_r in w):
                        #print(stri_r)
                        w = w.replace(stri_r," ")
                for stri in str_check:
                    if(stri in w):
                        # Only the first occurrence of `stri` in `w` is handled.
                        idx = w.index(stri)
                        if(w[idx:idx+2] == "'s"):
                            # Apostrophe directly followed by "s": keep "'s" together as one token.
                            stri = "'s"
                            idx = w.index(stri)
                            a = w[:idx]
                            b = w[idx + 2:]
                            w = a + " " + stri + " " + b
                            continue
                        elif(w[idx:idx+3] == "mph"):
                            stri = "mph"
                            idx = w.index(stri)
                            # NOTE(review): for idx == 0 this reads w[-1], the last character.
                            if(w[idx - 1] == " "):
                                # NOTE(review): this drops everything before idx, not just the space - confirm intent.
                                a = ""
                            else:
                                a = w[:idx]
                            b = w[idx + 3:]
                            w = a + " " + stri + " " + b
                            continue
                        # NOTE(review): for idx == 0 this reads w[-1], the last character.
                        if(w[idx - 1] == " "):
                            # NOTE(review): this drops everything before idx, not just the space - confirm intent.
                            a = ""
                        else:
                            a = w[:idx]

                        b = w[idx + 1:]
                        w = a + " " + stri + " " + b
            
            # Runs for known and unknown words alike.
            new_string += w + " "
        X[i] = new_string
        if(i == 0):
            print(X[i])
    return X

In [0]:
# GRADED FUNCTION: sentences_to_indices

def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()`.

    Words that are missing from `word_to_index` keep the initial value 0. Words beyond position
    `max_len` are ignored.

    Arguments:
    X -- sequence of sentences (strings)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words kept per sentence

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    # The progress bar is cosmetic, so fall back to plain iteration without tqdm.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    m = len(X)                                # number of training examples

    # Every position starts at 0, which therefore doubles as padding/unknown.
    X_indices = np.zeros((m, max_len))

    for i in tqdm(range(m)):                  # loop over training examples
        # Lower-case the sentence and split it into words.
        sentence_words = [w.lower() for w in X[i].split()]

        # Slicing replaces the IndexError that a blanket `except` used to swallow
        # for over-long sentences.
        for j, w in enumerate(sentence_words[:max_len]):
            index = word_to_index.get(w)
            # Skip unknown words explicitly instead of catching every Exception.
            if index is not None:
                X_indices[i, j] = index

    return X_indices

In [0]:
# Articles: read the first 100 lines, split unknown words around punctuation,
# then convert every sentence to a row of word indices.
# `with` closes the file; the bare open(...).readlines() never did.
with open("dataset/sumdata/train/train.article.10000.txt") as article_file:
    X = article_file.readlines()[:100]
X = preprocess_data(X)
print(len(X))
length = [len(x.split()) for x in X]
maxLen = max(length)
print(maxLen)
X1 = sentences_to_indices(X, word_to_index, maxLen)
# The labels now name the element that is actually printed.
print("X[9] =", X[9])
print("X1_indices[9] =", X1[9])

print("longest X =", X[length.index(max(length))])
print("longest X1_indices =", X1[length.index(max(length))])


# Titles (the summaries): same pipeline.
with open("dataset/sumdata/train/train.title.10000.txt") as title_file:
    Y = title_file.readlines()[:100]
Y = preprocess_data(Y)
print(len(Y))
length_Y = [len(y.split()) for y in Y]
maxLen_Y = max(length_Y)
print(maxLen_Y)
Y1 = sentences_to_indices(Y, word_to_index, maxLen_Y)
print("Y[0] =", Y[0])
print("Y1_indices[0] =", Y1[0])
print("longest Y =", Y[length_Y.index(max(length_Y))])
print("longest Y1_indices =", Y1[length_Y.index(max(length_Y))])

In [0]:
# Show the word stored under index 1.
print(index_to_word[1])

In [0]:
def convert_to_one_hot(X, word_to_index):
    """
    One-hot encode an index matrix as next-token targets.

    The result is shifted one step to the left: the token at position j of a row
    is encoded at timestep j - 1, so the token at position 0 is never encoded and
    the last timestep of every row stays all-zero. An index of -1 is encoded in
    the extra last slot of the one-hot axis.

    Arguments:
    X -- numpy array of word indices, of shape (m, dim)
    word_to_index -- dictionary mapping words to indices; only its size is used

    Returns:
    Z -- numpy array of shape (m, dim, len(word_to_index) + 2)
    """
    m = X.shape[0]
    dim = X.shape[1]
    length = len(word_to_index) + 1

    Z = np.zeros((m, dim, length + 1))

    if dim > 1:
        # Work on an integer copy of positions 1..dim-1 so X is left untouched.
        tokens = np.asarray(X[:, 1:]).astype(int)
        # The -1 sentinel gets its own slot. It is shifted like every other
        # token; the loop version wrote it at step j instead of j - 1.
        tokens[tokens == -1] = length
        # Vectorised assignment replaces the per-element loop, which also relied
        # on a `tqdm` name that was never imported at module scope.
        rows, steps = np.indices(tokens.shape)
        Z[rows, steps, tokens] = 1

    return Z

In [0]:
# One-hot encode the title indices as training targets.
# NOTE: this rebinds `decoder_target_data`, which an earlier cell used for the
# character-level targets.
decoder_target_data = convert_to_one_hot(Y1, word_to_index)
decoder_target_data[0]

In [0]:
# Load the DUC 2002 text; `with` closes the file, which the bare
# open(...).readlines() never did.
with open("dataset/DUC/duc2002/data/test/summaries/Text.txt") as duc_file:
    X2 = duc_file.readlines()
# preprocess_data rewrites the list in place, so its return value is not needed.
preprocess_data(X2)
X3 = sentences_to_indices(X2, word_to_index, max_len = 1500)
# The labels now name the element that is actually printed.
print("X2[1] =", X2[1])
print("X3_indices[1] =", X3[1])

In [0]:
# Spot-check both lookups: the word at index 8 and the index of "#".
print(index_to_word[8])
print(word_to_index["#"])

In [0]:
# GRADED FUNCTION: pretrained_embedding_layer

def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Build a Keras Embedding layer whose weights are initialised with GloVe vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation;
                       must contain "cucumber", whose vector fixes the embedding size
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- Keras Embedding layer holding the GloVe matrix. It is created with
                       trainable=True, so the weights can be updated during training.
    """
    n_rows = len(word_to_index) + 1                     # adding 1 to fit Keras embedding (requirement)
    n_cols = word_to_vec_map["cucumber"].shape[0]       # dimensionality of the GloVe word vectors

    # Row `r` holds the vector of the word whose index is `r`. Rows that no
    # word maps to stay zero.
    glove_weights = np.zeros((n_rows, n_cols))
    for token, row in word_to_index.items():
        glove_weights[row, :] = word_to_vec_map[token]

    embedding_layer = Embedding(n_rows, n_cols, trainable = True)

    # The layer has to be built before its weights can be replaced.
    embedding_layer.build((None,))
    embedding_layer.set_weights([glove_weights])

    return embedding_layer

In [0]:
# Build one embedding layer and print a single weight as a sanity check.
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

In [0]:
def Model_Text(encoder_input_shape, word_to_vec_map, word_to_index, decoder_input_shape):
    """
    Word-level encoder-decoder: the decoder LSTM starts from the encoder's final
    states and emits a softmax over the vocabulary at every timestep.

    Arguments:
    encoder_input_shape -- shape tuple of the encoder index input (without the batch axis)
    word_to_vec_map -- dictionary mapping words to their GloVe vectors
    word_to_index -- dictionary mapping words to their vocabulary indices
    decoder_input_shape -- shape tuple of the decoder index input (without the batch axis)

    Returns:
    model -- Keras Model taking [encoder indices, decoder indices]
    """
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # dimensionality of the GloVe word vectors

    # Encoder: indices -> GloVe embeddings -> LSTM; only the states are used.
    sentence_indices = Input(encoder_input_shape, dtype='int32')
    encoder_embedding = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    encoder_embedded = encoder_embedding(sentence_indices)
    _, state_h, state_c = LSTM(emb_dim, return_state=True)(encoder_embedded)
    encoder_states = [state_h, state_c]

    # Decoder: has its own embedding layer and is initialised with `encoder_states`.
    decoder_inputs = Input(decoder_input_shape)
    decoder_embedding = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    decoder_embedded = decoder_embedding(decoder_inputs)
    decoder_sequence = LSTM(emb_dim, return_sequences = True)(decoder_embedded, initial_state=encoder_states)

    decoder_outputs = Dense(vocab_len + 1, activation='softmax')(decoder_sequence)

    return Model(inputs=[sentence_indices, decoder_inputs], outputs= decoder_outputs)

In [0]:
# Shared Concatenate layer (axis 1); Model_Text3 reads it as a global.
concatenator = Concatenate(axis = 1)

In [0]:
def Model_Text3(encoder_input_shape, word_to_vec_map, word_to_index, decoder_input_shape):
    """
    Two-branch variant: the article and the title are each embedded and run
    through their own LSTM, the two final LSTM outputs are joined by the
    module-level `concatenator`, and a single softmax layer follows.

    Arguments:
    encoder_input_shape -- shape tuple of the first index input (without the batch axis)
    word_to_vec_map -- dictionary mapping words to their GloVe vectors
    word_to_index -- dictionary mapping words to their vocabulary indices
    decoder_input_shape -- shape tuple of the second index input (without the batch axis)

    Returns:
    model -- Keras Model taking [first indices, second indices]
    """
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # dimensionality of the GloVe word vectors

    # First branch.
    sentence_indices = Input(encoder_input_shape, dtype='int32')
    first_embedding = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    first_embedded = first_embedding(sentence_indices)
    first_out, _, _ = LSTM(emb_dim, return_state=True)(first_embedded)

    # Both input shapes are echoed while the model is being built.
    print(decoder_input_shape)
    print(encoder_input_shape)

    # Second branch; the LSTM states are discarded in both branches.
    decoder_inputs = Input(decoder_input_shape, dtype='int32')
    second_embedding = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    second_embedded = second_embedding(decoder_inputs)
    second_out, _, _ = LSTM(emb_dim, return_state=True)(second_embedded)

    merged = concatenator([first_out, second_out])
    decoder_outputs = Dense(vocab_len + 1, activation='softmax')(merged)

    return Model(inputs=[sentence_indices, decoder_inputs], outputs= decoder_outputs)

In [0]:
def Model_Text2(encoder_input_shape, word_to_vec_map, word_to_index, decoder_input_shape):
    """
    Single-input variant: embedded indices -> LSTM -> one softmax over the vocabulary.

    Arguments:
    encoder_input_shape -- shape tuple of the index input (without the batch axis)
    word_to_vec_map -- dictionary mapping words to their GloVe vectors
    word_to_index -- dictionary mapping words to their vocabulary indices
    decoder_input_shape -- accepted for signature parity with the other builders; not used

    Returns:
    model -- Keras Model taking the index input
    """
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # dimensionality of the GloVe word vectors

    sentence_indices = Input(encoder_input_shape, dtype='int32')
    glove_embedding = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embedded = glove_embedding(sentence_indices)

    # Only the LSTM output is used; the returned states are discarded.
    lstm_out, _, _ = LSTM(emb_dim, return_state=True)(embedded)

    decoder_outputs = Dense(vocab_len + 1, activation='softmax')(lstm_out)

    return Model(inputs=sentence_indices, outputs= decoder_outputs)

In [0]:
# Build the word-level encoder-decoder. The input lengths are the word counts
# of the longest article (maxLen) and the longest title (maxLen_Y).
# NOTE: this rebinds `model`, which an earlier cell used for the character-level network.
model = Model_Text((maxLen,), word_to_vec_map, word_to_index, (maxLen_Y,))
model.summary()

In [0]:
# Compile the model; training itself is started by a separate fit() call.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics = ['accuracy'])

In [0]:
# Train for 5 epochs in shuffled batches of 6: the inputs are the article and
# title index matrices, the targets are the shifted one-hot title tensor.
model.fit([X1, Y1], decoder_target_data, batch_size = 6, epochs = 5, shuffle = True)

In [0]:
# Display the target tensor of the first sample.
decoder_target_data[0]

In [0]:
# Evaluate against the full target tensor, exactly what fit() was given. The
# decoder LSTM returns one prediction per timestep, so a single-timestep slice
# (`decoder_target_data[:,1,:]`) does not match the model output.
loss, acc = model.evaluate([X1 , Y1], decoder_target_data)
print("Accuracy" + str(acc))

In [0]:
# Display the accuracy value returned by model.evaluate().
acc

In [0]:
# Predict for the first ten samples and turn every timestep into a word.
idx = model.predict([X1[:10,:], Y1[:10,:]], verbose = 0)
print(idx.shape)
print(idx)

# Sample number (as a string) -> list of decoded words.
id1 = {}
for i in range(idx.shape[0]):
    key = str(i)
    id1[key] = []
    for j in range(idx.shape[1]):
        try:
            word = index_to_word[np.argmax(idx[i,j,:])]
        except Exception as e:
            # The direct lookup failed; retry with the next index.
            word = index_to_word[np.argmax(idx[i,j,:]) + 1]
        id1[key].append(word)
print(id1)

In [0]:
# Predict over the complete article/title index matrices.
idx = model.predict([X1, Y1], verbose = 0)

In [0]:
# Decode one word per sample from the predictions.
print(idx.shape)
# NOTE(review): `id` shadows the Python builtin of the same name.
id = []
for i in range(idx.shape[0]):
    # np.argmax without `axis` works on the flattened slice.
    # NOTE(review): if `idx` has a timestep axis, this is a position in the
    # flattened (timesteps x vocabulary) block, not a vocabulary index - confirm.
    id.append(index_to_word[np.argmax(idx[i,:])])


In [0]:
# Display the list of decoded words.
id