# In this notebook, we will create an LSTM encoder from scratch!

## Import Packages

In [None]:
#Data Handling Libraries
import collections
import pandas as pd
import numpy as np
import random

#Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed,LSTM, Activation, RepeatVector, Bidirectional,Embedding,LayerNormalization, Lambda, dot, Activation, concatenate 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras import layers 
from tensorflow.keras.layers import Layer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.backend as K


## Import Google Drive!

In [None]:
# Mount Google Drive into the Colab filesystem; every dataset and model path
# used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import Articles, i.e. the Dataset

In [None]:
# Load the news-category CSV from the mounted Drive into a DataFrame.
data=pd.read_csv("/content/drive/My Drive/Colab Notebooks/UBS_Pitch/News_Category.csv")

In [None]:
# Keep only the rows that actually have a headline (drops NaN/None headlines).
data = data.dropna(subset=['headline'])

In [None]:
# The headline column, as a plain Python list, is the only text used downstream.
headlines = data['headline'].tolist()

In [None]:
# Peek at the first headline in the dataset's original order.
headlines[0]

'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV'

In [None]:
# Number of headlines left after removing null rows.
len(headlines)

200847

In [None]:
# Shuffle in place to break the dataset's original ordering. A dedicated,
# seeded generator makes the order identical after every kernel restart
# without touching the global `random` state.
random.Random(42).shuffle(headlines) #breaking order

In [None]:
# Peek at the first headline again to confirm the order changed.
headlines[0]

'Top 13 Wedding Trends for 2013'

In [None]:
# Count whitespace-separated tokens across all headlines (case-sensitive,
# punctuation kept) to get a rough feel for the raw vocabulary.
headlines_counter = collections.Counter(word for sentence in headlines for word in sentence.split())

# How many of the most frequent tokens to list; the heading below is built
# from this value so the label can never disagree with what is printed.
top_n = 20

# The total token count is the sum of all per-word counts, so the corpus does
# not have to be flattened a second time.
print('{} Words.'.format(sum(headlines_counter.values())))
print('{} unique words.'.format(len(headlines_counter)))
print('{} Most common words in the titles:'.format(top_n))
print('"' + '" "'.join(word for word, _ in headlines_counter.most_common(top_n)) + '"')

1915849 Words.
119488 unique words.
10 Most common words in the titles:
"The" "To" "A" "In" "Of" "For" "Is" "And" "On" "With" "the" "to" "You" "Your" "How" "of" "Trump" "(PHOTOS)" "New" "and"


## Functions for preprocessing!

#### Tokenization via Tokenizer function from TF

In [None]:
def tokenize(x):
    """
    Fit a fresh Keras Tokenizer on x and encode x with it.
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (x encoded as lists of integer word ids, the fitted tokenizer)
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

#### Pad every headline's token list to the same length with the pad_sequences function from TF

In [None]:
def pad(x, length=None):
    """
    Right-pad (padding='post') every sequence in x to a common length.
    :param x: List of sequences.
    :param length: Target length. If None, the length of the longest sequence in x is used.
    :return: Padded numpy array of sequences
    """
    # Fall back to the longest sequence when the caller gives no target length.
    target_length = length if length is not None else max(map(len, x))
    return pad_sequences(sequences=x, maxlen=target_length, padding='post')

#### The preprocess function uses the two functions above to preprocess all headlines. It also gives us:
1. max headline length (which is used in the NN pipeline)
2. Total vocab size (which is used in the NN pipeline)

In [None]:
def preprocess(x):
    """
    Tokenize x and pad the result to the length of its longest sequence.
    :param x: List of sentences
    :return: Tuple of (padded array of token ids, tokenizer fitted on x)
    """
    sequences, tokenizer = tokenize(x)
    return pad(sequences), tokenizer

# Tokenize and pad every training headline; keep the fitted tokenizer around.
preproc_headlines, headlines_tokenizer = preprocess(headlines)
    
# pad() gives every row the same width, so axis 1 is the padded headline length.
max_headline_sequence_length = preproc_headlines.shape[1]

# Number of distinct words the tokenizer indexed while fitting on the headlines.
headlines_vocab_size = len(headlines_tokenizer.word_index)


print('Data Preprocessed')
print("Max Headline length:", max_headline_sequence_length)
print("Headline vocabulary size:", headlines_vocab_size)


Data Preprocessed
Max Headline length: 53
Headline vocabulary size: 75639


## Hyperparams for the NN pipeline

In [None]:
# Derive the data-dependent sizes from the preprocessing results instead of
# hard-coding them, so they stay correct if the dataset changes.
# +1: the Keras Tokenizer never assigns index 0 to a word (it is used as the
# padding value), so the embedding needs vocab_size + 1 rows.
num_words = headlines_vocab_size + 1
# Every padded headline has this many token positions.
maxlen = max_headline_sequence_length
# Width of each word-embedding vector.
embed_dim = 128
# Mini-batch size used for training.
batch_size = 16

## LSTM Encoder_Decoder Architecture

#### Attention Module

In [None]:
class attention(Layer):
    """Attention pooling that collapses axis 1 of its input.

    build() sizes the bias from input_shape[1] and call() sums over axis 1, so
    the layer appears to expect a 3-D (batch, timesteps, features) input and to
    return a (batch, features) tensor -- TODO confirm against the intended caller.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring weights: one projection vector and one bias per step."""
        feature_dim = input_shape[-1]
        step_count = input_shape[1]
        self.W = self.add_weight(name="att_weight", shape=(feature_dim, 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(step_count, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        """Return the softmax-weighted sum of x along axis 1."""
        # One tanh-squashed score per step; drop the trailing size-1 axis.
        scores = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # Normalize the scores and restore the trailing axis so they broadcast over x.
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        return K.sum(x * weights, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and last input dimensions."""
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """The layer has no constructor arguments of its own, so the base config is enough."""
        return super().get_config()

#### Encoder

###### max headline length (maxlen, should be the input size)
##### Total vocab size - num_words
##### Note: num_words = Total vocab size + 1

In [None]:
# Encoder input: one padded headline of `maxlen` token ids.
encoder_inputs = Input(shape=(maxlen,), name='Encoder-Input')
# Token ids -> embed_dim-wide vectors; mask_zero=False means padding positions are not masked.
emb_layer = Embedding(num_words, embed_dim,input_length = maxlen, name='Body-Word-Embedding', mask_zero=False)
x = emb_layer(encoder_inputs)

# Three stacked bidirectional LSTMs, each followed by layer normalization.
# Each inner LSTM gets its own name (they previously all shared
# 'Encoder-Last-LSTM'), so the layers can be told apart in summaries and
# saved weights.
state_h = Bidirectional(LSTM(128, activation='relu', name='Encoder-LSTM-1', return_sequences=True))(x)
norm = LayerNormalization(axis=1)(state_h)
state_h = Bidirectional(LSTM(128, activation='relu', name='Encoder-LSTM-2',return_sequences=True))(norm)
norm = LayerNormalization(axis=1)(state_h)
# Only the last layer drops the time axis (return_sequences=False), producing
# a single fixed-size latent vector per headline.
state_h = Bidirectional(LSTM(128, activation='relu', name='Encoder-Last-LSTM',return_sequences=False))(norm)
norm = LayerNormalization(axis=1)(state_h)

# NOTE: the previous `att_out = attention()(norm)` line was removed. Its result
# was never connected to any model, and `norm` is already 2-D here while the
# attention layer pools over axis 1 of a sequence. The trained architecture is
# unchanged.
encoder_model = Model(inputs=encoder_inputs, outputs=norm, name='Encoder-Model')
# Re-apply the encoder as a single block so the decoder can be stacked on top of it.
seq2seq_encoder_out = encoder_model(encoder_inputs)

#### Decoder

In [None]:
# Repeat the encoder's latent vector once per output position so the decoder
# LSTM receives a sequence of length maxlen.
decoded = RepeatVector(maxlen)(seq2seq_encoder_out)
# Bidirectional decoder; merge_mode='sum' adds the forward and backward outputs
# instead of concatenating them.
decoder_lstm = Bidirectional(LSTM(128, return_sequences=True, name='Decoder-LSTM-before'),merge_mode='sum')
decoder_lstm_output = decoder_lstm(decoded)
norm = layers.LayerNormalization(axis=1)(decoder_lstm_output)
# Per-position softmax over the whole vocabulary (num_words classes).
decoder_dense = Dense(num_words, activation='softmax', name='Final-Output-Dense-before')
decoder_outputs = decoder_dense(norm)


## Define NN using above blocks and Start the training!

### Training takes a long time! You can use the trained models saved in the saveModel folder instead.

In [None]:
# Log which device each op is placed on (useful to confirm the GPU is used).
tf.debugging.set_log_device_placement(True)
# Autoencoder: headline token ids in, per-position vocabulary distribution out.
seq2seq_Model = Model(encoder_inputs, decoder_outputs)
# summary() prints the table itself and returns None, so it is not wrapped in print().
seq2seq_Model.summary()
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001), loss='sparse_categorical_crossentropy')
# Train the model to reconstruct its own input. The targets are the same padded
# headlines (`preproc_headlines`; the previous `preproc_titles` was never
# defined) with a trailing axis added for the sparse categorical loss.
history = seq2seq_Model.fit(preproc_headlines, np.expand_dims(preproc_headlines, -1),
          batch_size=batch_size,
          epochs=10)

## Save the trained model

In [None]:
# Persist both the full autoencoder and the stand-alone encoder to Drive;
# only the encoder is reloaded below for inference.
seq2seq_Model.save('/content/drive/My Drive/Colab Notebooks/UBS_Pitch/saveModel/LSTM_EncoderDecoderTrained') 
encoder_model.save('/content/drive/My Drive/Colab Notebooks/UBS_Pitch/saveModel/LSTM_EncoderTrained') 

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /content/drive/My Drive/Colab Notebooks/UBS_Pitch/saveModel/LSTM_EncoderDecoderTrained/assets
INFO:tensorflow:Assets written to: /content/drive/My Drive/Colab Notebooks/UBS_Pitch/saveModel/LSTM_EncoderTrained/assets


## Load the saved Encoder model

In [None]:
# Reload the stand-alone encoder from the path it was saved to above and
# print its layer table.
encoder = tf.keras.models.load_model('/content/drive/My Drive/Colab Notebooks/UBS_Pitch/saveModel/LSTM_EncoderTrained')
encoder.summary()

Model: "Encoder-Model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder-Input (InputLayer)   [(None, 33)]              0         
_________________________________________________________________
Body-Word-Embedding (Embeddi (None, 33, 128)           3553664   
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 33, 256)           263168    
_________________________________________________________________
layer_normalization_12 (Laye (None, 33, 256)           66        
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 33, 256)           394240    
_________________________________________________________________
layer_normalization_13 (Laye (None, 33, 256)           66        
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 256)             

## Using models for inference

In [None]:
# Load the inference set of headlines.
# NOTE(review): this path uses 'MyDrive' while every other cell uses 'My Drive' --
# confirm both resolve to the same folder in the target Colab runtime.
Headlines_Inference=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/UBS_Pitch/41category500ArticlesEach.csv')

In [None]:
# Replace the DataFrame with just its headline column as a plain list.
# NOTE: the name now refers to a list, so re-running this cell without
# re-running the read_csv cell above will fail.
Headlines_Inference = Headlines_Inference['headline'].to_list()

In [None]:
# Number of headlines that will be encoded.
len(Headlines_Inference)

20500

In [None]:
from tqdm import tqdm

In [None]:
# Encode every inference headline into a latent vector with the trained encoder.
#
# The headlines must be mapped to ids with the tokenizer that was fitted on the
# training headlines. Fitting a new tokenizer on each sentence (as before)
# produces ids unrelated to the learned embedding; and because a single string
# was passed, it was tokenized character by character, with only the first
# character's latent kept.
inference_sequences = headlines_tokenizer.texts_to_sequences(Headlines_Inference)

# Fail early with a clear message if the loaded encoder was trained with a
# smaller vocabulary than the tokenizer in memory (e.g. a stale saved model).
encoder_vocab_size = encoder.get_layer('Body-Word-Embedding').input_dim
if len(headlines_tokenizer.word_index) + 1 > encoder_vocab_size:
    raise ValueError(
        'headlines_tokenizer has more words than the loaded encoder embedding '
        '({} > {}); re-train/re-save the encoder with this tokenizer.'.format(
            len(headlines_tokenizer.word_index) + 1, encoder_vocab_size))

# Pad/truncate to whatever sequence length the loaded encoder was built with
# rather than a hard-coded number.
encoder_maxlen = encoder.input_shape[1]
inference_padded = pad_sequences(inference_sequences, maxlen=encoder_maxlen, padding='post')

# A single batched predict call replaces one call per headline; the result is
# one latent row per headline, in the same order as Headlines_Inference.
ArticleLatents = encoder.predict(inference_padded, batch_size=256, verbose=1)

100%|██████████| 20500/20500 [1:19:47<00:00,  4.28it/s]


In [None]:
# One row per headline, one column per latent dimension.
articleLatentDF = pd.DataFrame(ArticleLatents)
# NOTE: written with the default to_csv settings, so the DataFrame row index is
# saved as an extra first column of the CSV.
articleLatentDF.to_csv('/content/drive/My Drive/Colab Notebooks/UBS_Pitch/articleToLatent.csv')

None


## Cluster Purity
### Now that we have a latent vector for each headline, we can use K-means to cluster the headlines. We sweep a range of K values to pick the one that clusters the articles best. The ideal K would be the number of categories, and it turns out that with such a K the clusters are at least 85% pure.

### Purity is measured per cluster. Take cluster 1, for example: if 85% of the articles available in that cluster belong to a certain category, then we can name the cluster after that category and say it is 85% pure.


In [None]:
from sklearn.cluster import KMeans
from collections import Counter

# Ground-truth categories, in the same row order as the latent vectors.
file = ("/content/drive/My Drive/Colab Notebooks/UBS_Pitch/41category500ArticlesEach.csv")
dataset = pd.read_csv(file, delimiter=',')
n_unique_cat = dataset['category'].nunique()
category = dataset['category'].values
ctr = Counter(category)
print(ctr)
print(n_unique_cat)
print(category.shape)

embeddings = pd.read_csv("/content/drive/My Drive/Colab Notebooks/UBS_Pitch/articleToLatent.csv")
# The latent CSV may carry the saved DataFrame index as an "Unnamed: 0" column.
# It must not be used as a feature: it is just the row number, and its scale
# would dominate the distance computation. Dropping it by name keeps this cell
# correct whether or not the file was written with an index.
embeddings = embeddings[[col for col in embeddings.columns if not str(col).startswith('Unnamed')]]
print(embeddings.shape)
print("Embedding loading done")

# Labels are matched to categories by position, so the two files must align.
if len(embeddings) != len(category):
  raise ValueError('Got {} latent rows but {} categories; the files are not aligned.'.format(
      len(embeddings), len(category)))

# Sweep K from the number of categories up to (but excluding) four times that.
for n_cluster in range(n_unique_cat,4*n_unique_cat):
  print("Num clust ", n_cluster)
  # Fixed seed so the purity numbers are reproducible between runs.
  kmeans = KMeans(n_clusters=n_cluster, random_state=0).fit(embeddings)

  labels = kmeans.labels_
  print(Counter(labels))

  # Group the true categories by assigned cluster. zip() walks every article
  # (the previous index loop stopped one short and skipped the last one).
  cluster_list = [[] for _ in range(n_cluster)]
  for label, article_category in zip(labels, category):
    cluster_list[label].append(article_category)

  # Purity of a cluster = share of its articles that belong to its most common category.
  for members in cluster_list:
    if not members:
      # Nothing to rank; avoids indexing an empty most_common() result.
      print([])
      print()
      continue
    ctr = Counter(members)
    top_two = ctr.most_common(2)
    print(top_two)
    cls, ct = top_two[0]
    print(cls,ct*100/float(len(members)))
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
MEDIA 100.0

[('DIVORCE', 233), ('WEDDINGS', 96)]
DIVORCE 70.82066869300913

[('FOOD & DRINK', 231), ('MONEY', 111)]
FOOD & DRINK 67.54385964912281

[('ARTS', 261), ('WELLNESS', 48)]
ARTS 84.46601941747574

[('COMEDY', 337)]
COMEDY 100.0

[('HEALTHY LIVING', 296), ('THE WORLDPOST', 29)]
HEALTHY LIVING 91.07692307692308

[('GREEN', 286), ('TASTE', 46)]
GREEN 86.144578313253

[('WORLDPOST', 304), ('GOOD NEWS', 11)]
WORLDPOST 96.5079365079365

[('BLACK VOICES', 330), ('WEIRD NEWS', 10)]
BLACK VOICES 97.05882352941177

[('SPORTS', 332)]
SPORTS 100.0

[('STYLE', 336)]
STYLE 100.0

[('MONEY', 344)]
MONEY 100.0

[('IMPACT', 334)]
IMPACT 100.0

[('ARTS & CULTURE', 203), ('PARENTS', 138)]
ARTS & CULTURE 59.530791788856305

[('HOME & LIVING', 302), ('PARENTING', 15)]
HOME & LIVING 95.26813880126183

[('POLITICS', 184), ('WEIRD NEWS', 152)]
POLITICS 54.76190476190476

[('QUEER VOICES', 319), ('SPORTS', 14)]
QUEER VOICES 95.795795795

KeyboardInterrupt: ignored

## Create a DataFrame with the article-to-cluster mapping, which will be used downstream to find the trend

In [None]:
#Read the embeddings and do a Kmeans with k=41
n_cluster=41
embeddings = pd.read_csv("/content/drive/My Drive/Colab Notebooks/UBS_Pitch/articleToLatent.csv")
# Drop the saved row-index column ("Unnamed: 0") if the CSV has one: it is the
# row number, not a latent dimension, and would otherwise dominate K-means.
embeddings = embeddings[[col for col in embeddings.columns if not str(col).startswith('Unnamed')]]
print(embeddings.shape)
print("Embedding loading done")
print("Num clust ", n_cluster)
#Kmeans (fixed seed so the article -> cluster mapping is reproducible)
kmeans = KMeans(n_clusters=n_cluster, random_state=0).fit(embeddings)
ClusterIDs=kmeans.labels_
articles=pd.read_csv("/content/drive/My Drive/Colab Notebooks/UBS_Pitch/41category500ArticlesEachWithAuthorNames.csv")
# Cluster ids are attached by row position, so both files must have the same
# number of rows (and, presumably, the same row order -- verify against how the
# two CSVs were produced).
if len(articles) != len(ClusterIDs):
    raise ValueError('Got {} cluster ids for {} articles; the files are not aligned.'.format(
        len(ClusterIDs), len(articles)))
articles['Cluster ID']=ClusterIDs.tolist()
# Keep only the columns the downstream consumer needs.
articlesClusterID=articles[["category","headline","authors","date","Cluster ID"]]
articlesClusterID.to_csv("/content/drive/My Drive/Colab Notebooks/UBS_Pitch/articlesToClusterID.csv")

(20500, 257)
Embedding loading done
Num clust  41


## Pass articlesToClusterID.csv to the downstream. Done