In [1]:
import tensorflow as tf

In [2]:
import collections


import numpy as np
import random


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed,LSTM, Activation, RepeatVector, Bidirectional,Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

## Data

In [8]:
import pandas as pd

In [9]:
data=pd.read_csv("News_Category.csv")

In [10]:
data.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [11]:
titles = data['headline'].to_list()

In [12]:
titles[0]

'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV'

In [13]:
len(titles)

200853

In [14]:
random.shuffle(titles)

In [15]:
titles[0]

'Laura Linney Gives Birth to a Baby Boy at Age 49 and the Question Remains, Should You Ever Give Up Your Dream to Conceive?'

In [16]:
titles=titles[:20000]

In [17]:
import collections

In [18]:
titles_counter = collections.Counter([word for sentence in titles for word in sentence.split()])

print('{} Words.'.format(len([word for sentence in titles for word in sentence.split()])))
print('{} unique words.'.format(len(titles_counter)))
print('10 Most common words in the titles:')
print('"' + '" "'.join(list(zip(*titles_counter.most_common(20)))[0]) + '"')

191011 Words.
31724 unique words.
10 Most common words in the titles:
"The" "To" "A" "In" "Of" "For" "Is" "And" "On" "the" "With" "to" "Your" "You" "How" "of" "Trump" "New" "(PHOTOS)" "This"


In [19]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    x_tk = Tokenizer()
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

In [20]:
def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    if length is None:
        # Find the length of the longest sequence/sentence
        length = max([len(seq) for seq in x])
    
    return pad_sequences(sequences=x, maxlen=length, padding='post')

In [21]:
def preprocess(x):
    """
    Preprocess x and y
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_x = pad(preprocess_x)

    return preprocess_x, x_tk

preproc_titles, titles_tokenizer = preprocess(titles)
    
max_title_sequence_length = preproc_titles.shape[1]

titles_vocab_size = len(titles_tokenizer.word_index)


print('Data Preprocessed')
print("Max Title length:", max_title_sequence_length)
print("Title vocabulary size:", titles_vocab_size)


Data Preprocessed
Max Title length: 33
Title vocabulary size: 24526


In [110]:
# preproc_titles[0].shape

In [111]:
# np.expand_dims(preproc_titles, -1)

In [120]:
#hyperParams
num_words = 25000
maxlen = 53
embed_dim = 50
batch_size = 16

## Encoder

In [127]:
encoder_inputs = Input(shape=(maxlen,), name='Encoder-Input')
emb_layer = Embedding(num_words, embed_dim,input_length = maxlen, name='Body-Word-Embedding', mask_zero=False)
x = emb_layer(encoder_inputs)
state_h = Bidirectional(LSTM(128, activation='relu', name='Encoder-Last-LSTM'),merge_mode='sum')(x)
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs)

In [128]:
decoded = RepeatVector(maxlen)(seq2seq_encoder_out)
decoder_lstm = Bidirectional(LSTM(128, return_sequences=True, name='Decoder-LSTM-before'),merge_mode='sum')
decoder_lstm_output = decoder_lstm(decoded)
decoder_dense = Dense(num_words, activation='softmax', name='Final-Output-Dense-before')
decoder_outputs = decoder_dense(decoder_lstm_output)


In [135]:
tf.debugging.set_log_device_placement(True)
seq2seq_Model = Model(encoder_inputs, decoder_outputs)
seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')
history = seq2seq_Model.fit(preproc_titles, np.expand_dims(preproc_titles, -1),
          batch_size=batch_size,
          epochs=10)

Train on 20000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [132]:
sentences = ["Corona is bitch"]
seq,seq_tokenizer = tokenize(sentences)
pad_seq=pad_sequences(seq, maxlen=53, padding='post')
sentence_vec = encoder_model.predict(pad_seq)[0]

In [133]:
sentence_vec

array([1.7154497e+10, 2.3128285e+00, 1.8948966e+10, 0.0000000e+00,
       2.4477364e+10, 6.9981921e+09, 9.0288293e+02, 1.8066031e+10,
       0.0000000e+00, 8.0409114e+09, 3.0993208e+03, 1.2019243e-10,
       2.4211485e+10, 1.2311431e+10, 2.6360735e+03, 9.5063368e+09,
       2.6514760e-01, 1.6312970e+10, 1.3306758e+10, 8.8617349e+09,
       0.0000000e+00, 0.0000000e+00, 1.8690505e+10, 2.4993655e+10,
       0.0000000e+00, 0.0000000e+00, 2.7861638e+03, 1.1276770e+03,
       3.7062963e+09, 1.6781406e+10, 1.8797748e+10, 2.4847573e+10,
       1.1200934e+10, 4.8212398e+09, 3.1813740e+03, 2.2980510e+10,
       1.7623846e+03, 0.0000000e+00, 6.9290004e+09, 3.3853710e+10,
       1.9593732e+10, 2.1315430e+03, 3.0775521e+09, 0.0000000e+00,
       1.2079585e+10, 5.9122246e+08, 0.0000000e+00, 2.3526877e+10,
       6.3832576e+09, 2.0302156e+10, 1.8359994e+10, 1.7828168e+10,
       9.4540595e+09, 1.2882727e+10, 6.5908582e+09, 2.9732536e+10,
       1.6303486e+10, 0.0000000e+00, 1.6761657e+00, 2.1479334e

In [134]:
sentence_vec.shape

(128,)

## Save the trained model

In [None]:
seq2seq_Model.save('saveModel/LSTM_EncoderDecoderTrained') 
encoder_model.save('saveModel/LSTM_EncoderTrained') 

## Loading Models

In [5]:
Model = tf.keras.models.load_model('saveModel/LSTM_EncoderDecoderTrained')
Model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder-Input (InputLayer)   [(None, 53)]              0         
_________________________________________________________________
Encoder-Model (Model)        (None, 128)               1433296   
_________________________________________________________________
repeat_vector_4 (RepeatVecto (None, 53, 128)           0         
_________________________________________________________________
bidirectional_8 (Bidirection (None, 53, 128)           263168    
_________________________________________________________________
Final-Output-Dense-before (D (None, 53, 25000)         3225000   
Total params: 4,921,464
Trainable params: 4,921,464
Non-trainable params: 0
_________________________________________________________________


In [6]:
encoder = tf.keras.models.load_model('saveModel/LSTM_EncoderTrained')
encoder.summary()

Model: "Encoder-Model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder-Input (InputLayer)   [(None, 53)]              0         
_________________________________________________________________
Body-Word-Embedding (Embeddi (None, 53, 50)            1250000   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 128)               183296    
Total params: 1,433,296
Trainable params: 1,433,296
Non-trainable params: 0
_________________________________________________________________


## Using models for inference

In [22]:
sentences = ["Corona is bitch"]
seq,seq_tokenizer = tokenize(sentences)
pad_seq=pad_sequences(seq, maxlen=53, padding='post')
sentence_vec = encoder.predict(pad_seq)[0]
sentence_vec

array([1.7154497e+10, 2.3128285e+00, 1.8948966e+10, 0.0000000e+00,
       2.4477364e+10, 6.9981921e+09, 9.0288293e+02, 1.8066031e+10,
       0.0000000e+00, 8.0409114e+09, 3.0993208e+03, 1.2019243e-10,
       2.4211485e+10, 1.2311431e+10, 2.6360735e+03, 9.5063368e+09,
       2.6514760e-01, 1.6312970e+10, 1.3306758e+10, 8.8617349e+09,
       0.0000000e+00, 0.0000000e+00, 1.8690505e+10, 2.4993655e+10,
       0.0000000e+00, 0.0000000e+00, 2.7861638e+03, 1.1276770e+03,
       3.7062963e+09, 1.6781406e+10, 1.8797748e+10, 2.4847573e+10,
       1.1200934e+10, 4.8212398e+09, 3.1813740e+03, 2.2980510e+10,
       1.7623846e+03, 0.0000000e+00, 6.9290004e+09, 3.3853710e+10,
       1.9593732e+10, 2.1315430e+03, 3.0775521e+09, 0.0000000e+00,
       1.2079585e+10, 5.9122246e+08, 0.0000000e+00, 2.3526877e+10,
       6.3832576e+09, 2.0302156e+10, 1.8359994e+10, 1.7828168e+10,
       9.4540595e+09, 1.2882727e+10, 6.5908582e+09, 2.9732536e+10,
       1.6303486e+10, 0.0000000e+00, 1.6761657e+00, 2.1479334e

In [27]:
Articles_100=pd.read_csv('10Category10Articles.csv')

In [28]:
Titles_100 = Articles_100['headline'].to_list()

In [39]:
len(Titles_100)

100

In [45]:
Titles_100[0]

'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV'

## Latents for the 10 Articles of 10 categories

In [37]:
ArticleLatents=[]
for sentence in Titles_100:
    seq,seq_tokenizer = tokenize(sentences)
    pad_seq=pad_sequences(seq, maxlen=53, padding='post')
    sentence_vec = encoder.predict(pad_seq)[0]
    ArticleLatents.append(sentence_vec)

In [64]:
len(ArticleLatents)

100

## Dictionary mapping between article to latent

In [80]:
Article2Latent = {}

In [81]:
def mapArticle2Latent(Articles, Latents):
    for i in range(len(Articles)):
        Article2Latent[Articles[i]]=Latents[i]
    

In [82]:
mapArticle2Latent(Titles_100,ArticleLatents)

## K means

In [93]:
LatentArray=np.stack( ArticleLatents, axis=0 )

In [97]:
LatentArray.shape

(100, 128)

In [104]:
from sklearn.cluster import KMeans

km = KMeans(
    n_clusters=10, init='random',
    n_init=10, max_iter=300, 
    tol=0.001, random_state=0
)
Clusters = km.fit_predict(LatentArray)

  return self.fit(X, sample_weight=sample_weight).labels_


In [105]:
km.cluster_centers_.shape

(10, 128)

## Cluster to Article Mapping!

In [106]:
km.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [107]:
km.labels_.shape

(100,)