In [None]:
import re
import spacy


import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split


from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences

from keras import backend as K 
import gensim

from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

from nltk.corpus import stopwords

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the two news-summary CSV files; both are decoded as latin-1.
df = pd.read_csv('/kaggle/input/news-summary/news_summary.csv', encoding='latin-1')
more_df = pd.read_csv('/kaggle/input/news-summary/news_summary_more.csv', encoding='latin-1')

In [None]:
# Stack the two frames row-wise and renumber the rows 0..n-1 in one step.
concat_df = pd.concat([df, more_df], axis=0, ignore_index=True)
concat_df.shape

In [None]:
# Preview the first rows of the combined frame.
concat_df.head()

In [None]:
# Lower-case the headline and article columns, overwriting them in place.
for col in ('headlines', 'text'):
    concat_df[col] = concat_df[col].apply(lambda s: s.lower())

In [None]:
!pip install text_hammer
import  text_hammer as th

In [None]:
def clean_text(df, column):
    """Normalise one text column of ``df`` in place and return it.

    Each value goes through these steps, in this order:

    1. double quotes are replaced by single quotes;
    2. HTML tags are removed (``th.remove_html_tags``);
    3. URLs are removed (``th.remove_urls``);
    4. contractions are expanded (``th.cont_exp``);
    5. remaining special characters are removed (``th.remove_special_chars``);
    6. every run of non-letters becomes a single space;
    7. tokens shorter than two characters are dropped.

    The tag, URL and contraction steps run *before* special characters are
    stripped. Stripping first would remove the ``<``, ``/``, ``:`` and ``'``
    characters those steps rely on to recognise their input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; ``df[column]`` is overwritten.
    column : str
        Name of a column whose values are strings.

    Returns
    -------
    pandas.Series
        The cleaned column, i.e. ``df[column]`` after the update.
    """
    def _clean(value):
        value = re.sub('"', "'", value)
        value = th.remove_html_tags(value)
        value = th.remove_urls(value)
        value = th.cont_exp(value)
        value = th.remove_special_chars(value)
        value = re.sub('[^a-zA-Z]+', ' ', value)
        return ' '.join(token for token in value.split() if len(token) >= 2)

    series = df[column]
    # progress_apply only exists once tqdm's pandas integration has been
    # registered; fall back to plain apply so the function does not depend on it.
    apply_fn = getattr(series, 'progress_apply', series.apply)
    # One pass over the column: every step is element-wise, so composing them
    # gives the same result as applying them one after another.
    df[column] = apply_fn(_clean)
    return df[column]

In [None]:
# Clean the headline column; clean_text overwrites concat_df['headlines'] and returns it.
processed_headlines = clean_text(concat_df, 'headlines')

In [None]:
# Clean the article column; clean_text overwrites concat_df['text'] and returns it.
processed_text = clean_text(concat_df, 'text')

In [None]:
# Load the English pipeline by package name: the bare 'en' shortcut link is
# not accepted by spaCy 3. NER and the parser are switched off.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
# `n_threads` is dropped: it is deprecated in spaCy 2.1+ and rejected by 3.x.
docs = nlp.pipe(processed_text, batch_size=5000)
# NOTE(review): str(doc) yields the document's text, so this looks like a
# round trip of the cleaned strings - confirm whether lemmas were intended.
text = [str(doc) for doc in docs]

In [None]:
# `n_threads` is dropped: it is deprecated in spaCy 2.1+ and rejected by 3.x.
docs = nlp.pipe(processed_headlines, batch_size=5000)
# Wrap every headline in _START_/_END_ marker words.
# NOTE(review): a later cell also adds 'sostok'/'eostok' around the headline,
# so each headline ends up with two pairs of markers - confirm both are wanted.
headlines = ['_START_ ' + str(doc) + ' _END_' for doc in docs]

In [None]:
# Store the processed strings back on the frame, one Series per column.
for col, values in (('text', text), ('headlines', headlines)):
    concat_df[col] = pd.Series(values)

In [None]:
# Longest article, counted in whitespace-separated tokens (0 when there are no rows).
max_text_len = max((len(article.split()) for article in concat_df['text']), default=0)

print(max_text_len)

In [None]:
# Longest headline, counted in whitespace-separated tokens (0 when there are no rows).
max_headlines_len = max(map(lambda h: len(h.split()), concat_df['headlines']), default=0)

print(max_headlines_len)

In [None]:
# Fixed length caps (in tokens) used from here on; they overwrite the maxima
# measured in the two previous cells.
max_text_len = 60
max_headlines_len = 15

In [None]:
text = np.array(concat_df['text'])
headlines = np.array(concat_df['headlines'])

# The decoder targets later get 'sostok' and 'eostok' added around every
# headline, so two token slots must be kept free here. Without this margin a
# headline can exceed max_headlines_len after wrapping, and pad_sequences
# (which truncates from the front by default) would cut off its start token.
reserved_tokens = 2

tmp_txt = []
tmp_hln = []

# Keep only the pairs where both the article and the headline fit their cap.
for article, headline in zip(text, headlines):
    headline_fits = len(headline.split()) + reserved_tokens <= max_headlines_len
    article_fits = len(article.split()) <= max_text_len
    if headline_fits and article_fits:
        tmp_txt.append(article)
        tmp_hln.append(headline)

trimmed_df = pd.DataFrame({'text': tmp_txt, 'headlines': tmp_hln})

In [None]:
# Preview the length-filtered pairs.
trimmed_df.head()

In [None]:
# Surround every headline with the decoder's start/end marker words.
# Running this cell twice would add the markers twice.
trimmed_df['headlines'] = 'sostok ' + trimmed_df['headlines'] + ' eostok'

In [None]:
# Preview again, now with the 'sostok'/'eostok' markers on the headlines.
trimmed_df.head()

**SEQ2SEQ MODEL BUILDING**

In [None]:
# Hold out 20% of the pairs; the split is shuffled with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    np.array(trimmed_df['text']),
    np.array(trimmed_df['headlines']),
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
# Fit an uncapped tokenizer on the training articles to see the full vocabulary size.
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_train))
print(len(x_tokenizer.word_index) + 1)

In [None]:
# Source-side words seen fewer than `limit` times: how many distinct words
# there are (count) and how many occurrences they account for (freq).
limit = 3
rare_counts = [n for n in x_tokenizer.word_counts.values() if n < limit]
count = len(rare_counts)
freq = sum(rare_counts)

print("count: ", count)
print("freq: ",freq)

In [None]:
# Re-fit the source tokenizer on the training articles, this time with num_words=40000.
x_tokenizer = Tokenizer(num_words=40000) 
x_tokenizer.fit_on_texts(list(x_train))

# Map every article to a list of integer word indices (index encoding, not one-hot).
x_train_seq    =   x_tokenizer.texts_to_sequences(x_train) 
x_test_seq   =   x_tokenizer.texts_to_sequences(x_test)

# Zero-pad at the end ('post') up to max_text_len.
# x_train / x_test are rebound here from raw strings to padded integer arrays,
# so this cell cannot be re-run without re-running the train/test split first.
x_train    =   pad_sequences(x_train_seq,  maxlen=max_text_len, padding='post')
x_test   =   pad_sequences(x_test_seq, maxlen=max_text_len, padding='post')

# Vocabulary size handed to the encoder Embedding layer: num_words plus one.
# NOTE(review): Keras' Tokenizer is documented to emit indices below num_words,
# so the +1 looks like spare headroom rather than a requirement - confirm.
x_voc   =  x_tokenizer.num_words + 1

print("Size of vocabulary: ",x_voc)

In [None]:
# Fit an uncapped tokenizer on the training headlines to see the full vocabulary size.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_train))
print(len(y_tokenizer.word_index) + 1)

In [None]:
# Target-side words seen fewer than `limit` times: number of distinct words
# (count) and the total number of occurrences they cover (freq).
limit = 3
count = sum(1 for n in y_tokenizer.word_counts.values() if n < limit)
freq = sum(n for n in y_tokenizer.word_counts.values() if n < limit)

print("count: ", count)
print("freq: ",freq)

In [None]:
# Re-fit the target tokenizer on the training headlines, this time with num_words=20000.
y_tokenizer = Tokenizer(num_words=20000) 
y_tokenizer.fit_on_texts(list(y_train))

# Map every headline to a list of integer word indices (index encoding, not one-hot).
y_train_seq    =   y_tokenizer.texts_to_sequences(y_train) 
y_test_seq   =   y_tokenizer.texts_to_sequences(y_test) 

# Zero-pad at the end ('post') up to max_headlines_len.
# y_train / y_test are rebound here from raw strings to padded integer arrays.
# NOTE(review): pad_sequences truncates from the front by default, so any
# sequence longer than max_headlines_len would lose its leading 'sostok' -
# confirm the headline lengths produced upstream leave room for the markers.
y_train    =   pad_sequences(y_train_seq, maxlen=max_headlines_len, padding='post')
y_test  =   pad_sequences(y_test_seq, maxlen=max_headlines_len, padding='post')

# Vocabulary size handed to the decoder Embedding and output layers: num_words plus one.
y_voc  =   y_tokenizer.num_words +1
print("Size of vocabulary: ", y_voc)

In [None]:
# Build the training graph: a three-layer LSTM encoder over the article and a
# single-layer LSTM decoder over the headline, joined only through the final
# encoder states.
K.clear_session()

# Size of every LSTM state and of both embedding spaces.
latent_dim = 300
embedding_dim=200

# Encoder input: one padded article of max_text_len word indices.
encoder_inputs = Input(shape=(max_text_len,))

# Trainable source-side embedding.
enc_emb =  Embedding(x_voc, embedding_dim,trainable=True)(encoder_inputs)

# Encoder LSTM 1 (its returned states are not used further in this cell).
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# Encoder LSTM 2 (its returned states are not used further in this cell).
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# Encoder LSTM 3: state_h / state_c from this layer seed the decoder.
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# Decoder input: a variable-length sequence of headline word indices.
decoder_inputs = Input(shape=(None,))

# Trainable target-side embedding; the layer object is kept for reuse.
dec_emb_layer = Embedding(y_voc, embedding_dim,trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM, initialised with the last encoder layer's states.
# The two extra return values are the LSTM's returned states, despite the
# fwd/back names; they are not used in this cell.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.4,recurrent_dropout=0.2)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

# Softmax over the target vocabulary, applied at every decoder time step.
decoder_dense =  TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (article, headline prefix) -> per-step word distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()


In [None]:
# Compile with RMSprop and sparse categorical cross-entropy (no extra metrics).
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [None]:
# Early-stopping callback watching val_loss (mode='min') with a patience of 2.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)

**Start fitting the model with the data**

In [None]:
# Teacher forcing: the decoder is fed each headline without its last position
# and is trained to predict the same headline shifted left by one position
# (given a trailing axis of size 1). The held-out split is used for validation.
history = model.fit(
    [x_train, y_train[:, :-1]],
    y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:, 1:],
    epochs=10,
    callbacks=[es],
    batch_size=128,
    validation_data=(
        [x_test, y_test[:, :-1]],
        y_test.reshape(y_test.shape[0], y_test.shape[1], 1)[:, 1:],
    ),
)

**Visualize the model learning**

In [None]:
# Plot training loss and validation loss per epoch on one set of axes.
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

**Next, let’s build the dictionary to convert the index to word for target and source vocabulary:**

In [None]:
# index -> word lookups for decoding headlines and articles back to text,
# plus the word -> index map for the target side (used for 'sostok'/'eostok').
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

In [None]:
# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(max_text_len,latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs) 
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2) 

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

**The function below implements the inference (decoding) process**

In [None]:
def decode_sequence(input_seq):
    """Greedily decode a headline for one encoded article.

    The article is encoded once with ``encoder_model``. ``decoder_model`` is
    then called one token at a time, starting from ``'sostok'`` and always
    feeding back the most probable token together with the updated LSTM
    states.

    Decoding stops when any of these happens:

    * the sampled token is ``'eostok'``;
    * the sampled index has no entry in ``reverse_target_word_index`` (for
      example index 0, the padding id) - this used to raise ``KeyError``;
    * ``max_headlines_len - 1`` words have been produced.

    Parameters
    ----------
    input_seq : numpy.ndarray
        Passed unchanged to ``encoder_model.predict``; the caller in this
        notebook passes a single article reshaped to ``(1, max_text_len)``.

    Returns
    -------
    str
        The decoded words, each preceded by one space (``''`` if no word was
        produced). Marker tokens are not included.
    """
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Length-1 target sequence holding the start word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_tokens = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # .get() instead of [] so an unmapped index ends decoding cleanly.
        sampled_token = reverse_target_word_index.get(sampled_token_index)

        if sampled_token is None or sampled_token == 'eostok':
            break

        decoded_tokens.append(sampled_token)
        if len(decoded_tokens) >= (max_headlines_len - 1):
            break

        # Feed the chosen token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(' ' + token for token in decoded_tokens)

**Let us define the functions that convert an integer sequence back to words, for the headlines (summaries) as well as the news articles:**


In [None]:
def seq2summary(input_seq):
    """Turn a padded target index sequence back into headline text.

    Index 0 and the indices of 'sostok' and 'eostok' are skipped. Every kept
    word is followed by a single space, so a non-empty result ends in a space.
    """
    pieces = []
    for token_id in input_seq:
        if (token_id == 0
                or token_id == target_word_index['sostok']
                or token_id == target_word_index['eostok']):
            continue
        pieces.append(reverse_target_word_index[token_id] + ' ')
    return ''.join(pieces)

def seq2text(input_seq):
    """Turn a padded source index sequence back into article text.

    Index 0 is skipped. Every kept word is followed by a single space, so a
    non-empty result ends in a space.
    """
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

**Run the model over the data to see the results**

In [None]:
# Show article, reference headline and predicted headline for the SAME sample.
# The prediction previously indexed x_train with a stray global `i` left over
# from an earlier loop, which is unrelated to sample 0 and out of range here.
sample_idx = 0
print("Review: ", seq2text(x_train[sample_idx]))
print("Original summary: ", seq2summary(y_train[sample_idx]))
print("predicted summary: ", decode_sequence(x_train[sample_idx].reshape(1, max_text_len)))