In [None]:
import tensorflow as tf
import pandas as pd
import keras
import os

# Data Processing, Loading

In [None]:
import os
 
def getListOfFiles(dirName):
    """Recursively collect the paths of every file below ``dirName``.

    Paths are built with a literal "/" separator rather than ``os.sep``
    because later cells recover the category with ``elem.split("/")[1]``.

    Directory entries are visited in sorted order. ``os.listdir`` returns
    entries in arbitrary order, which would make the row order of the
    resulting DataFrame (and therefore the seeded train/test split) differ
    from one machine to another.

    Args:
        dirName: Path of the directory to walk.

    Returns:
        A list of file paths (sub-directories themselves are not included).
    """
    allFiles = []
    for entry in sorted(os.listdir(dirName)):
        fullPath = dirName + "/" + entry
        if os.path.isdir(fullPath):
            # extend() mutates in place instead of rebuilding the list
            # on every recursion step.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)

    return allFiles

<h4> Convert the data into a pandas dataframe </h4>

In [None]:
import pandas as pd


dirName = 'data'

# Get the list of all files in the directory tree at the given path
listOfFiles = getListOfFiles(dirName)

# Collect one [title, description, category] record per file and build the
# DataFrame once at the end. Appending to a DataFrame inside the loop copies
# the whole frame each iteration, and DataFrame.append no longer exists in
# pandas 2.x.
records = []
for elem in listOfFiles:
    # The context manager closes the handle even if reading fails.
    with open(elem, "r") as news_file:
        sampleNews = news_file.read().split("\n")

    # Line 0 is the title; the remaining lines form the body. They are joined
    # with a space so the last word of one line cannot fuse with the first
    # word of the next.
    newsDesc = " ".join(sampleNews[1:])

    # Paths look like "<dirName>/<category>/<file>", so element 1 of the
    # "/"-split is the category folder name.
    records.append([sampleNews[0], newsDesc, elem.split("/")[1]])

df = pd.DataFrame(records, columns=["Title", "Description", "Category"])

In [None]:
# Renumber the rows 0..n-1 and throw the previous index away instead of
# keeping it as an extra column.
df = df.reset_index(drop=True)

# Show the body text of the first article.
df.loc[0, "Description"]

In [None]:
# Preview the first rows of the assembled DataFrame.
df.head()

<h4> Convert the Labels to Integers </h4>

In [None]:
# Distinct category names, in order of first appearance in the frame.
categories = df["Category"].unique().tolist()

# Give each category name a consecutive integer id starting at 0.
cat_dict = {name: idx for idx, name in enumerate(categories)}
print(cat_dict)

# Report how many articles belong to each category.
for name in categories:
    n_rows = len(df[df["Category"] == name])
    print(name + ": " + str(n_rows))

# Replace the category names by their integer ids.
df["Category"] = df["Category"].map(cat_dict, na_action='ignore')

<h4> Removing Special Characters </h4>

In [None]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is passed explicitly: without it, recent pandas versions treat
# the pattern as a literal string and the call silently changes nothing.
df['Description'] = df['Description'].str.replace("[^a-zA-Z#]", " ", regex=True)

# Drop rows that contain missing values in any column.
df=df.dropna()

# Show the first cleaned article body.
df["Description"].head()[0]

# Data Cleaning

<h4> 80-20 Train-Test Split </h4>

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the articles; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["Description"],
    df["Category"],
    test_size=0.20,
    random_state=42,
)

# Per-category article counts in the training portion.
train_sum = 0
for name in categories:
    n_in_class = len(y_train[y_train == cat_dict[name]])
    print(name + ": " + str(n_in_class))
    train_sum += n_in_class
print("Total Training Data: " + str(train_sum) + "\n")

# Per-category article counts in the held-out portion.
test_sum = 0
for name in categories:
    n_in_class = len(y_test[y_test == cat_dict[name]])
    print(name + ": " + str(n_in_class))
    test_sum += n_in_class
print("Total Test Data: " + str(test_sum))

### Tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Split every article body into a list of word tokens.
X_train_notcleaned = list(map(word_tokenize, X_train))
X_test_notcleaned = list(map(word_tokenize, X_test))

# Keep only the underlying label arrays; the pandas index is not needed later.
y_train = y_train.values
y_test = y_test.values

print(X_train_notcleaned[0])

<h4> Removal of Stop Words and Words with Length &lt; 3 </h4>

In [None]:
def remove_stop_words(s, stop_words=None):
    """Drop stop words and tokens shorter than three characters.

    Args:
        s: Iterable of string tokens (one tokenized document).
        stop_words: Optional iterable of lowercase stop words. When omitted,
            NLTK's English stop-word list is loaded.

    Returns:
        A new list with the surviving tokens in their original order and
        original casing. The stop-word comparison is case-insensitive.
    """
    if stop_words is None:
        # Imported here so the function does not depend on a later cell
        # having imported `stopwords` into the global namespace.
        from nltk.corpus import stopwords
        stop_words = stopwords.words("english")

    # Build the lookup once per call: the stop-word list is loaded a single
    # time instead of once per token, and set membership replaces a list scan.
    stop_words = frozenset(stop_words)

    return [w for w in s if len(w) >= 3 and w.lower() not in stop_words]

In [None]:
from nltk.corpus import stopwords
import multiprocessing as mp
import data_clean


# Rebind both names to empty lists; the raw text from the train/test split is
# no longer needed now that the tokenized *_notcleaned copies exist.
X_train,X_test=[],[]

# Four worker processes clean the documents in parallel. The pool is left
# open on purpose: the next cell reuses it for the test set and closes it.
# NOTE(review): the worker function comes from the imported data_clean module
# rather than the remove_stop_words defined in this notebook -- presumably so
# the worker processes can import it; confirm.
pool = mp.Pool(processes=4)
X_train = pool.map(data_clean.remove_stop_words, X_train_notcleaned)

# Spot-check the first cleaned training document.
print(X_train[0])

In [None]:
# Clean the test documents with the pool created in the previous cell.
try:
    X_test = pool.map(data_clean.remove_stop_words, X_test_notcleaned)
finally:
    # Always release the workers, even if cleaning fails: close() stops the
    # pool from accepting new tasks and join() waits for the worker processes
    # to exit so none are left behind.
    pool.close()
    pool.join()

 <h4> <p> Creating the Vocabulary and word2index </p> </h4>

In [None]:
# Vocabulary: every distinct lowercased token seen in the training documents.
words = {w.lower() for s in X_train for w in s}

# Ids 0 and 1 are reserved, so real words start at 2. The vocabulary is
# sorted before numbering because set iteration order for strings can change
# between interpreter sessions; without sorting, a word could receive a
# different id after a kernel restart and no longer line up with weights
# saved from an earlier run.
word2index = {w: i + 2 for i, w in enumerate(sorted(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs

## Preparation of Training and Test Data before fitting into the model

<h4> Importing the GloVe word embeddings</h4>

<p> You can download the word embeddings at https://nlp.stanford.edu/projects/glove/ </p>
<br><b>Note:</b> Choose the 6B download and use the 300-dimensional file (<code>glove.6B.300d.txt</code>).

In [None]:
import numpy as np

# Map each GloVe word to its embedding vector.
embeddings_index = dict()

# Each line of the file is "<word> <v1> <v2> ...". The context manager
# guarantees the file is closed even if a line fails to parse.
with open('glove.6B.300d.txt', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

<h3> Convert all words to lowercase then to integers, then pad the sentences </h3>

In [None]:
EMB_DIM = 300
num_words = len(word2index) + 1
print("Number of Words:" + str(num_words))

# Turn every document into a list of vocabulary ids. Tokens are lowercased
# before the lookup; anything missing from the vocabulary maps to -OOV-.
oov_id = word2index['-OOV-']
train_sentences_X = [
    [word2index.get(tok.lower(), oov_id) for tok in sent]
    for sent in X_train
]
test_sentences_X = [
    [word2index.get(tok.lower(), oov_id) for tok in sent]
    for sent in X_test
]

# Every sequence is padded (or truncated) to the longest training document.
MAX_LENGTH = max(len(sent) for sent in train_sentences_X)
print("Max Length: " + str(MAX_LENGTH))

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# padding='post' appends the padding id after the real tokens.
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')

# Labels are deliberately left as integer ids (no one-hot encoding); the
# models are compiled with sparse_categorical_crossentropy.
print(train_sentences_X[0])
print(y_train[0])

<p> <h3>Creation of Pre-Trained Word Embeddings to be used for the embedding layer </h3> </p>

In [None]:
# One row per vocabulary id. Rows stay all-zero for ids with no GloVe vector
# (this includes the -PAD- and -OOV- entries unless GloVe happens to contain
# those exact strings).
embedding_matrix = np.zeros((num_words, EMB_DIM))

for word, i in word2index.items():
    # Valid row indices are 0..num_words-1, so the guard must also reject
    # i == num_words; the previous `>` comparison let that index through to
    # an out-of-range assignment.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Bidirectional LSTMs with Attention Mechanism

From the paper <i>Attention Is All You Need</i>, we have the following equation:

\begin{align}
\mathrm{Attention}(Q,K,V) &= \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) V
\end{align}

In [None]:
from keras.layers import merge
from keras.layers.core import *
from keras.layers.recurrent import LSTM
# When True, one attention distribution over time steps is shared by every
# feature; when False, each feature gets its own distribution.
SINGLE_ATTENTION_VECTOR = False
TIME_STEPS=MAX_LENGTH

def attention_3d_block(inputs):
    """Re-weight a sequence tensor with learned per-time-step attention.

    Args:
        inputs: Keras tensor of shape (batch_size, time_steps, input_dim),
            where time_steps must equal ``TIME_STEPS``.

    Returns:
        A tensor of the same shape as ``inputs``: the element-wise product of
        ``inputs`` and the attention weights. The weights are exposed by the
        layer named ``'attention_vec'``.
    """
    # Imported locally so the function does not rely on star-imports or on
    # names that are only imported by a later cell.
    from keras import backend as K
    from keras.layers import Dense, Lambda, Permute, RepeatVector, multiply

    # Take the feature width from the tensor itself. It was previously fixed
    # to EMB_DIM, which is wrong whenever the input is not the raw embedding
    # (e.g. a bidirectional LSTM output) and breaks the shared-vector branch.
    input_dim = int(inputs.shape[2])

    # (batch, input_dim, time_steps): the softmax of the Dense layer below
    # then normalises across time steps for each feature.
    a = Permute((2, 1))(inputs)
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        # Average the per-feature distributions into one and repeat it so the
        # shape again matches the input.
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    # Back to (batch, time_steps, input_dim).
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = multiply([inputs, a_probs])
    return output_attention_mul

### Tensorboard callback for visualizations of loss and accuracy

In [None]:
from keras.callbacks import TensorBoard

class TrainValTensorBoard(TensorBoard):
    """TensorBoard callback that logs training and validation metrics to
    separate sub-directories of ``log_dir``.

    Validation metrics are written under the same tag names as the training
    metrics (the ``val_`` prefix is stripped), so both runs can be plotted on
    the same figure.
    """

    def __init__(self, log_dir='./Graph', **kwargs):
        """Set up the two log locations.

        Args:
            log_dir: Root directory; 'training' and 'validation'
                sub-directories are used beneath it.
            **kwargs: Forwarded unchanged to ``TensorBoard.__init__``.
        """
        # Make the original `TensorBoard` log to a subdirectory 'training'
        training_log_dir = os.path.join(log_dir, 'training')
        super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs)

        # Log the validation metrics to a separate subdirectory
        self.val_log_dir = os.path.join(log_dir, 'validation')

    def set_model(self, model):
        """Create the validation writer, then defer to the base class."""
        # Setup writer for validation metrics
        self.val_writer = tf.summary.FileWriter(self.val_log_dir)
        super(TrainValTensorBoard, self).set_model(model)

    def on_epoch_end(self, epoch, logs=None):
        """Write ``val_*`` metrics with the validation writer and pass the
        remaining metrics to the base class."""
        # Pop the validation logs and handle them separately with
        # `self.val_writer`. Also rename the keys so that they can
        # be plotted on the same figure with the training metrics
        logs = logs or {}
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if k.startswith('val_')}
        for name, value in val_logs.items():
            summary = tf.Summary()
            summary_value = summary.value.add()
            # float() accepts both NumPy scalars and plain Python floats;
            # `.item()` only exists on the former and raised AttributeError
            # when a plain float was logged.
            summary_value.simple_value = float(value)
            summary_value.tag = name
            self.val_writer.add_summary(summary, epoch)
        self.val_writer.flush()

        # Pass the remaining logs to `TensorBoard.on_epoch_end`
        logs = {k: v for k, v in logs.items() if not k.startswith('val_')}
        super(TrainValTensorBoard, self).on_epoch_end(epoch, logs)

    def on_train_end(self, logs=None):
        """Finish the base-class logging and close the validation writer."""
        super(TrainValTensorBoard, self).on_train_end(logs)
        self.val_writer.close()

### Bidirectional Long Short-Term Memory without the Attention Mechanism

In [None]:
from keras.models import Sequential,load_model
from keras.layers import Dense, CuDNNLSTM,LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation,Dropout
from keras.optimizers import Adam,SGD
from keras import regularizers
from keras.initializers import Constant
import numpy as np
from keras.utils import plot_model


# Baseline classifier: frozen pre-trained embeddings followed by two stacked
# bidirectional LSTMs and a softmax over the categories.
model_lstm = Sequential()

# The embedding weights start from embedding_matrix and are not trained.
embedding_layer = Embedding(num_words, EMB_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_LENGTH,
                            trainable=False)
#embedding_layer=Embedding(num_words, 300,mask_zero=True)
inputs = InputLayer(input_shape=(MAX_LENGTH, ))
model_lstm.add(inputs)
model_lstm.add(embedding_layer)

# The first recurrent layer returns the full sequence so the second one can
# consume it; the second returns only its final output.
model_lstm.add(Bidirectional(LSTM(256, return_sequences=True)))
model_lstm.add(Dropout(0.3))
model_lstm.add(Bidirectional(LSTM(256)))
model_lstm.add(Dropout(0.3))

# One output unit per distinct label; labels are integer ids, hence the
# sparse variant of the cross-entropy loss.
model_lstm.add(Dense(len(np.unique(y_train)), activation="softmax"))
model_lstm.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(0.0001),
              metrics=["accuracy"])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_lstm.summary()
plot_model(model_lstm, to_file='model_lstm.png')


In [None]:
from keras.callbacks import ModelCheckpoint,EarlyStopping

# Stop when validation accuracy has not improved by at least 0.003 for two
# epochs in a row.
es_valacc = EarlyStopping(monitor='val_acc', mode='max', verbose=1,
                          patience=2, min_delta=0.003)
# Stop as soon as the training loss fails to improve by at least 0.003.
es_loss = EarlyStopping(monitor='loss', mode='min', verbose=1,
                        min_delta=0.003)

lstm_callbacks = [
    es_loss,
    es_valacc,
    TrainValTensorBoard(write_graph=True, log_dir='./Graph_LSTM'),
]

# The held-out split doubles as the validation set during training.
history_lstm = model_lstm.fit(
    train_sentences_X,
    y_train,
    validation_data=(test_sentences_X, y_test),
    batch_size=32,
    epochs=15,
    callbacks=lstm_callbacks,
)

In [None]:
#model_lstm.save("LSTM.h5")
# Evaluate the trained baseline on the held-out split.
scores = model_lstm.evaluate(test_sentences_X,y_test)
# Print the metric names first so the values printed next can be matched
# to them.
print(model_lstm.metrics_names)
print(scores)

In [None]:
from keras.models import model_from_json

# Persist the trained weights and the architecture as two separate files.
model_lstm.save_weights('lstm_weights.h5')
with open('lstm_architecture.json', 'w') as arch_out:
    arch_out.write(model_lstm.to_json())

# Discard the in-memory model and rebuild it purely from what is on disk.
del model_lstm
with open('lstm_architecture.json', 'r') as arch_in:
    model_lstm = model_from_json(arch_in.read())
model_lstm.load_weights('lstm_weights.h5')

# The JSON holds only the architecture, so the model is compiled again
# before it is evaluated.
model_lstm.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(0.0001),
    metrics=["accuracy"],
)

# Re-evaluate to confirm the round trip through disk.
scores = model_lstm.evaluate(test_sentences_X, y_test)
print(model_lstm.metrics_names)
print(scores)

In [None]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
# Accuracy per epoch: training curve against the held-out curve.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(history_lstm.history['acc'])
acc_ax.plot(history_lstm.history['val_acc'])
acc_ax.set_title('Model accuracy')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.legend(['Train', 'Test'], loc='upper left')
acc_ax.set_yticks(np.arange(0, 1, step=0.1))
acc_fig.savefig("acc_lstm.png")

# Loss per epoch: training curve against the held-out curve.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history_lstm.history['loss'])
loss_ax.plot(history_lstm.history['val_loss'])
loss_ax.set_title('Model loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(['Train', 'Test'], loc='upper left')
loss_fig.savefig("loss_lstm.png")

## Bidirectional Long Short-Term Memory with the Attention Mechanism

In [19]:
from keras.layers import merge,concatenate,add,dot,multiply, Dense,CuDNNLSTM, LSTM, InputLayer, Bidirectional, Embedding, Activation,Dropout
from keras.layers.core import *
from keras.models import *
from keras.optimizers import Adam
from keras import regularizers
from keras.initializers import Constant
from keras.utils import plot_model

# Attention classifier: frozen pre-trained embeddings, two bidirectional
# LSTMs that both return full sequences, the attention block, then a softmax
# over the flattened, re-weighted sequence.
inputs = Input(shape=(MAX_LENGTH,))

# The embedding weights start from embedding_matrix and are not trained.
embedding_layer = Embedding(num_words, EMB_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_LENGTH,
                            trainable=False)
attention_mul = embedding_layer(inputs)
attention_mul = Bidirectional(LSTM(256, return_sequences=True, activation="tanh"))(attention_mul)
attention_mul = Dropout(0.5)(attention_mul)
attention_mul = Bidirectional(LSTM(256, return_sequences=True, activation="tanh"))(attention_mul)

# Re-weight every time step by its learned attention weight.
attention_mul = attention_3d_block(attention_mul)
attention_mul = Dropout(0.5)(attention_mul)

# Collapse (time_steps, features) into one vector for the classifier head.
attention_mul = Flatten()(attention_mul)

# One output unit per distinct label; labels are integer ids, hence the
# sparse variant of the cross-entropy loss.
output = Dense(len(np.unique(y_train)), activation="softmax")(attention_mul)
model = Model(inputs=[inputs], outputs=output)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(0.0001),
              metrics=["accuracy"])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
plot_model(model, to_file='model_attention.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1749)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1749, 300)    7512900     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 1749, 512)    1140736     embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 1749, 512)    0           bidirectional_1[0][0]            
__________________________________________________________________________________________________
bidirectio

In [21]:
from keras.callbacks import ModelCheckpoint,EarlyStopping

# Stop when validation accuracy has not improved by at least 0.003 for two
# epochs in a row.
es_valacc = EarlyStopping(monitor='val_acc', mode='max', verbose=1,
                          patience=2, min_delta=0.003)
# Stop as soon as the training loss stops decreasing.
es_loss = EarlyStopping(monitor='loss', mode='min', verbose=1)

attention_callbacks = [
    es_loss,
    es_valacc,
    TrainValTensorBoard(write_graph=True, log_dir='./Graph'),
]

# The held-out split doubles as the validation set during training.
history = model.fit(
    train_sentences_X,
    y_train,
    validation_data=(test_sentences_X, y_test),
    batch_size=20,
    epochs=15,
    callbacks=attention_callbacks,
)

Train on 1780 samples, validate on 445 samples
Epoch 1/15
  40/1780 [..............................] - ETA: 49:04 - loss: 1.6094 - acc: 0.2750

KeyboardInterrupt: 

In [None]:
from keras.models import model_from_json

# Persist the trained weights and the architecture as two separate files.
model.save_weights('attention_weights.h5')
with open('attention_architecture.json', 'w') as arch_out:
    arch_out.write(model.to_json())

# Discard the in-memory model and rebuild it purely from what is on disk.
del model
with open('attention_architecture.json', 'r') as arch_in:
    model = model_from_json(arch_in.read())
model.load_weights('attention_weights.h5')

# The JSON holds only the architecture, so the model is compiled again
# before it is evaluated.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(0.0001),
    metrics=["accuracy"],
)

# Re-evaluate to confirm the round trip through disk.
scores = model.evaluate(test_sentences_X, y_test)
print(model.metrics_names)
print(scores)

In [None]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
# Accuracy per epoch: training curve against the held-out curve.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(history.history['acc'])
acc_ax.plot(history.history['val_acc'])
acc_ax.set_title('Model accuracy')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.legend(['Train', 'Test'], loc='upper left')
acc_ax.set_yticks(np.arange(0, 1, step=0.1))
acc_fig.savefig("acc_attention.png")

# Loss per epoch: training curve against the held-out curve.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'])
loss_ax.plot(history.history['val_loss'])
loss_ax.set_title('Model loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(['Train', 'Test'], loc='upper left')
loss_fig.savefig("loss_attention.png")

### Working Example

In [None]:
from attention_utils import get_activations, get_data_recurrent
import re

attention_vectors = []

input_text="Oscar-nominated director John Singleton is in intensive care after suffering a stroke. known for movies including Boyz N The Hood and 2 Fast 2 Furious, became unwell on Wednesday, his family said. Following news of the stroke, friends and colleagues sent their best wishes. Rapper Snoop Dogg shared a picture of the pair together on Instagram, writing: Pray 4 my brother. " 

# Apply the same cleaning as the training data: every character that is not
# an ASCII letter or '#' becomes a space. str.replace() treats its first
# argument as a literal string, so it never matched anything here; re.sub()
# applies the pattern as a regular expression.
input_text = re.sub("[^a-zA-Z#]", " ", input_text)
input_text = word_tokenize(input_text)
print(input_text)

# Drop stop words and tokens shorter than three characters. The stop-word
# list is loaded once into a set rather than once per token.
english_stop_words = set(stopwords.words("english"))
input_text_cleaned = [
    w for w in input_text
    if w.lower() not in english_stop_words and len(w) >= 3
]
print(input_text_cleaned)

# Map tokens to vocabulary ids; anything unseen in training becomes -OOV-.
oov_id = word2index['-OOV-']
input_text_index = [word2index.get(w.lower(), oov_id) for w in input_text_cleaned]
print(input_text_index)

# Pad to the length the model was built for.
input_text_pad = pad_sequences([input_text_index], maxlen=MAX_LENGTH, padding='post')
print(input_text_pad)

print(model.predict(input_text_pad))

In [None]:
# Read the output of the 'attention_vec' layer for the example and average it
# over the feature axis, leaving one weight per time step.
attention_vector = np.mean(get_activations(model,input_text_pad,print_shape_only=True,layer_name='attention_vec')[0], axis=2).squeeze()

# The weights come from a softmax over time steps, so they should sum to 1.
# abs() makes the check two-sided; without it any sum below 1 (even 0)
# passed the assertion.
assert abs(np.sum(attention_vector) - 1.0) < 1e-5

# Positions of the ten largest attention weights, in ascending weight order.
words_index = sorted(range(len(attention_vector)), key=lambda i: attention_vector[i])[-10:]
print(words_index)

# Map those positions back to words. Positions beyond the end of the cleaned
# text fall in the padding and are skipped.
top_words = dict()
for x in words_index:
    if x < len(input_text_cleaned):
        top_words[input_text_cleaned[x]] = attention_vector[x]
print(top_words)
