In [1]:
!unzip /content/train.csv.zip

Archive:  /content/train.csv.zip
  inflating: train.csv               


In [4]:
import pandas as pd
import string
import re
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input, Concatenate, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
import tensorflow.keras.utils as ku
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
PUNCTUATION = r'[\,\;\?\.\!\:\"\(\)]'

#Training data preparation
# df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train_split.csv')
df=pd.read_csv("/content/train.csv").head(30000)

df, df2 = train_test_split(df, test_size = 0.25, random_state = 42)
text = df['abstract'].tolist()

#Preprocessing: remove stopwords and punctuation, stemming
def preprocess_data(text):
    stopwords_english = stopwords.words('english')
    stemmer = PorterStemmer()
    
    texts_clean = []
    for word in text:
        if (word not in stopwords_english and  
                word not in PUNCTUATION):
            stem_word = stemmer.stem(word)  # stemming word
            texts_clean.append(stem_word)

    return texts_clean
df['content_clean'] = preprocess_data(text)
df['title'] = df['title'].apply(lambda x : 'sostok '+ x + ' eostok')

#Test data preparation
#df2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test_split.csv')
text2 = df2['abstract'].tolist()
df2['content_clean'] = preprocess_data(text2)
df2['title'] = df2['title'].apply(lambda x : 'sostok '+ x + ' eostok')


#Tokenization for contents
x_tokenizer = Tokenizer() 
x_tokenizer.fit_on_texts(list(df['content_clean']))

#Check rare words (occur less than 5 times)
thresh = 5
cnt = 0
tot_cnt = 0

for key, value in x_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    if value < thresh:
        cnt = cnt + 1
    
print("% of rare words in vocabulary: ", (cnt / tot_cnt) * 100)

#Tokenization with frequent words only
x_tokenizer = Tokenizer(num_words = tot_cnt - cnt) 
x_tokenizer.fit_on_texts(list(df['content_clean']))

# Convert content sequences to integer sequences for training_content and test_content
train_seq = x_tokenizer.texts_to_sequences(df['content_clean']) 
test_seq = x_tokenizer.texts_to_sequences(df2['content_clean'])

# Pad zero upto maximum length
max_content_len = 15
train_content = pad_sequences(train_seq,  maxlen=max_content_len, padding='post')
test_content = pad_sequences(test_seq, maxlen=max_content_len, padding='post')

# Size of content vocabulary (+1 for padding token)
content_voc = x_tokenizer.num_words + 1
print("Size of vocabulary in X = {}".format(content_voc))

#Tokenization for titles
y_tokenizer = Tokenizer()   
y_tokenizer.fit_on_texts(list(df['title']))

#Check rare words
thresh = 5

cnt = 0
tot_cnt = 0

for key, value in y_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    if value < thresh:
        cnt = cnt + 1
    
print("% of rare words in vocabulary:",(cnt / tot_cnt) * 100)

#Tokenization with reqular words only
y_tokenizer = Tokenizer(num_words=tot_cnt-cnt) 
y_tokenizer.fit_on_texts(list(df['title']))

# Convert title sequences to integer sequences for training_title and test_title
train_title_seq = y_tokenizer.texts_to_sequences(df['title']) 
test_title_seq = y_tokenizer.texts_to_sequences(df2['title']) 

# Pad zero upto maximum length
max_title_len = 15
train_title = pad_sequences(train_title_seq, maxlen=max_title_len, padding='post')
test_title = pad_sequences(test_title_seq, maxlen=max_title_len, padding='post')

# Size of title vocabulary
title_voc = y_tokenizer.num_words + 1
print("Size of vocabulary in Y = {}".format(title_voc))
y_tokenizer.word_counts['sostok'],len(train_title)  



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


% of rare words in vocabulary:  66.57861681690557
Size of vocabulary in X = 18094
% of rare words in vocabulary: 74.93270340053837
Size of vocabulary in Y = 4564


(22500, 22500)

In [5]:

##Encoder - decoder layers
latent_dim = 300
embedding_dim = 200

# Encoder
encoder_inputs = Input(shape=(max_content_len, ))

# Embedding layer
enc_emb = Embedding(content_voc, embedding_dim,
                    trainable=True)(encoder_inputs)

# Encoder LSTM 1
encoder_lstm1 = LSTM(latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_output1, state_h1, state_c1) = encoder_lstm1(enc_emb)

# Encoder LSTM 2
encoder_lstm2 = LSTM(latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_output2, state_h2, state_c2) = encoder_lstm2(encoder_output1)

# Encoder LSTM 3
encoder_lstm3 = LSTM(latent_dim, return_state=True,
                     return_sequences=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_outputs, state_h, state_c) = encoder_lstm3(encoder_output2)

# Set up the decoder, using encoder_states as the initial state
decoder_inputs = Input(shape=(None, ))

# Embedding layer
dec_emb_layer = Embedding(title_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM
decoder_lstm = LSTM(latent_dim, return_sequences=True,
                    return_state=True, dropout=0.4,
                    recurrent_dropout=0.2)
(decoder_outputs, decoder_fwd_state, decoder_back_state) = \
    decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Dense layer
decoder_dense = TimeDistributed(Dense(title_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()






Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 15, 200)      3618800     ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 15, 300),    601200      ['embedding[0][0]']              
                                 (None, 300),                                                     
                                 (None, 300)]                                                     
                                                                                              

In [6]:
#Training
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

history = model.fit(
    [train_content, train_title[:, :-1]],
    train_title.reshape(train_title.shape[0], train_title.shape[1], 1)[:, 1:],
    epochs=20,
    callbacks=[es],
    batch_size=128,
    validation_data=([test_content, test_title[:, :-1]],
                     test_title.reshape(test_title.shape[0], test_title.shape[1], 1)[:
                     , 1:]),
    )


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 18: early stopping


In [7]:
##Make predictions
reverse_target_word_index = y_tokenizer.index_word
reverse_source_word_index = x_tokenizer.index_word
target_word_index = y_tokenizer.word_index

# Inference Models

# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs,
                      state_h, state_c])

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim, ))
decoder_state_input_c = Input(shape=(latent_dim, ))
decoder_hidden_state_input = Input(shape=(max_content_len, latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2 = dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2,
        initial_state=[decoder_state_input_h, decoder_state_input_c])

# A dense softmax layer to generate probability distribution over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final decoder model
decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input,
                      decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs2] + [state_h2, state_c2])

#Decode function that predicts the title: start and end tokens (sostok&eostok)
def decode_sequence(input_seq):

    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        (output_tokens, h, c) = decoder_model.predict([target_seq]
                + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        # Exit condition: either hit max length or find the stop word.
        if sampled_token == 'eostok' or len(decoded_sentence.split()) \
            >= max_title_len - 1:
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        (e_h, e_c) = (h, c)

    return decoded_sentence

# To convert sequence to original titles
def seq2summary(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0 and i != target_word_index['sostok'] and i \
            != target_word_index['eostok']:
            newString = newString + reverse_target_word_index[i] + ' '

    return newString


In [9]:
for i in range(len(test_content)):
  print("Predicted title:",seq2summary(test_content[i]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Predicted title: finite transmission of data poisson the specific finite zero process of data scalable 
Predicted title: of weak finance risk to generalized properties modifies multilayer epidemic the google ols costs i 
Predicted title: tedi graphical two configuration two spectral a of agent configuration two probabilities phylogenetic migraine 
Predicted title: long spiking an implications of particle survey hst community from product quadratic 
Predicted title: patterns testing adaptive projects averaged by analysis detector elementary using admm 
Predicted title: transmitter superhedging tail number multivariate when monoids digital attribute superhedging only 
Predicted title: smoothable model economic imputation nucleons solvation promotion solvation core forest cycle semidefinite objects nonparametric 
Predicted title: with relative random controllers dynamics case clustering on criticality methods clustering cult

In [16]:
 pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [17]:
from rouge import Rouge 
ROUGE = Rouge()

In [37]:
references=[]
candidates=[]
for i in range(0,100):
  reference = seq2summary(test_title[i])
  candidate = decode_sequence(test_content[i].reshape(1,max_content_len))
  references.append(reference)
  candidates.append(candidate)




In [40]:
scores = ROUGE.get_scores(candidates, references,avg=True)
print(scores)

{'rouge-1': {'r': 0.10397968697968697, 'p': 0.15338261738261738, 'f': 0.11877049475733667}, 'rouge-2': {'r': 0.01547111222111222, 'p': 0.021261904761904767, 'f': 0.017028125634369833}, 'rouge-l': {'r': 0.09701290376290377, 'p': 0.14241100566100562, 'f': 0.11048391846812883}}
