In [1]:
import tensorflow as tf

In [2]:
import collections


import numpy as np


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed,LSTM, Activation, RepeatVector, Bidirectional,Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

## Data

In [3]:
import pandas as pd

In [6]:
data=pd.read_csv("../../News_Category.csv")

In [7]:
data.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [8]:
titles = data['headline'].to_list()

In [9]:
titles[0]

'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV'

In [10]:
len(titles)

200853

In [None]:
titles=random.shuffle(title, random)

In [108]:
titles=titles[:1000]

In [109]:
import collections

In [110]:
titles_counter = collections.Counter([word for sentence in titles for word in sentence.split()])

print('{} Words.'.format(len([word for sentence in titles for word in sentence.split()])))
print('{} unique words.'.format(len(titles_counter)))
print('10 Most common words in the titles:')
print('"' + '" "'.join(list(zip(*titles_counter.most_common(20)))[0]) + '"')

14588 Words.
4097 unique words.
10 Most common words in the titles:
"New" "The" "York" "Times" "-" "to" "in" "a" "of" "the" "Trump" "and" "for" "on" "Is" "With" "Trump’s" "at" "as" "by"


In [111]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    x_tk = Tokenizer()
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

In [112]:
def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    if length is None:
        # Find the length of the longest sequence/sentence
        length = max([len(seq) for seq in x])
    
    return pad_sequences(sequences=x, maxlen=length, padding='post')

In [113]:
def preprocess(x):
    """
    Preprocess x and y
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_x = pad(preprocess_x)

    return preprocess_x, x_tk

preproc_titles, titles_tokenizer = preprocess(titles)
    
max_title_sequence_length = preproc_titles.shape[1]

titles_vocab_size = len(titles_tokenizer.word_index)


print('Data Preprocessed')
print("Max Title length:", max_title_sequence_length)
print("Title vocabulary size:", titles_vocab_size)


Data Preprocessed
Max Title length: 21
Title vocabulary size: 3696


In [119]:
# preproc_titles[0].shape

In [120]:
# np.expand_dims(preproc_titles, -1)

In [121]:
#hyperParams
num_words = 3700
maxlen = 21
embed_dim = 50
batch_size = 16

## Encoder

In [116]:
encoder_inputs = Input(shape=(maxlen,), name='Encoder-Input')
emb_layer = Embedding(num_words, embed_dim,input_length = maxlen, name='Body-Word-Embedding', mask_zero=False)
x = emb_layer(encoder_inputs)
state_h = Bidirectional(LSTM(128, activation='relu', name='Encoder-Last-LSTM'))(x)
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs)

In [117]:
decoded = RepeatVector(maxlen)(seq2seq_encoder_out)
decoder_lstm = Bidirectional(LSTM(128, return_sequences=True, name='Decoder-LSTM-before'))
decoder_lstm_output = decoder_lstm(decoded)
decoder_dense = Dense(num_words, activation='softmax', name='Final-Output-Dense-before')
decoder_outputs = decoder_dense(decoder_lstm_output)


In [118]:
seq2seq_Model = Model(encoder_inputs, decoder_outputs)
seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')
history = seq2seq_Model.fit(preproc_titles, np.expand_dims(preproc_titles, -1),
          batch_size=batch_size,
          epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [126]:
sentences = ["Corona is bitch"]
seq,seq_tokenizer = tokenize(sentences)
pad_seq=pad_sequences(seq, maxlen=21, padding='post')
sentence_vec = encoder_model.predict(pad_seq)[0]

In [127]:
sentence_vec

array([1.5400781e+03, 2.1195332e+03, 1.9452211e+03, 1.4468572e+03,
       2.7062571e+03, 1.6349536e+03, 2.0579385e+03, 1.5932351e+03,
       2.4487239e+03, 1.9050825e+03, 2.0784609e+00, 2.1536016e+03,
       1.6867900e+03, 0.0000000e+00, 1.9955588e+03, 3.0597051e+03,
       5.0063408e-04, 0.0000000e+00, 1.7180950e+03, 1.8202684e+03,
       2.0634824e+03, 9.0168103e+02, 3.8030848e-01, 1.0607266e+03,
       2.8046865e+03, 2.2180254e+03, 2.0974800e+03, 1.9343760e+03,
       0.0000000e+00, 2.2577852e+03, 3.3668691e+03, 1.3126469e+03,
       1.8436913e+03, 0.0000000e+00, 7.0035736e+02, 1.2686010e+03,
       1.7521382e+03, 2.5970186e+03, 3.8825687e+02, 1.7115757e+03,
       2.7330707e+02, 2.0190470e+03, 0.0000000e+00, 2.0208607e+03,
       2.3512793e+03, 1.9183210e+03, 2.4099807e+03, 0.0000000e+00,
       3.5299006e+03, 5.8409735e+02, 0.0000000e+00, 9.5661932e+02,
       8.4618499e-05, 2.2348738e+03, 2.7969570e+03, 2.7983208e+03,
       1.9414551e+03, 0.0000000e+00, 6.0455139e+02, 4.8234296e