In [1]:
import tensorflow as tf
import pandas as pd
import keras
import os

Using TensorFlow backend.


# Data Processing, Loading

In [2]:
import os
 
def getListOfFiles(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = dirName+"/"+entry
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
                
    return allFiles        

<h4> Convert the data into a pandas dataframe </h4>

In [3]:
import pandas as pd


dirName = 'data';
    
# Get the list of all files in directory tree at given path
listOfFiles = getListOfFiles(dirName)
    

df=pd.DataFrame(columns=["Title","Description","Category"])
# Print the files    

for elem in listOfFiles:
    file1 = open(elem,"r") 
    sampleNews=file1.read().split("\n")
    newsDesc=""
    for strline in sampleNews[1:len(sampleNews)]:
        newsDesc+=strline
    """
    print("Category: "+elem.split("/")[1])
    print("Title:"+sampleNews[0])
    print("Description:"+newsDesc[0:50])
    """
    dfsample=pd.DataFrame(columns=["Title","Description","Category"],data=[[sampleNews[0],newsDesc,elem.split("/")[1]]])
    df=df.append(dfsample)

In [4]:
df=df.reset_index()
df=df.drop(list(df)[0], axis=1)
df["Description"].head()[0]

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL\'s existing customers for high-spee

In [5]:
df.head()

Unnamed: 0,Title,Description,Category
0,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarner...,business
1,Dollar gains on Greenspan speech,The dollar has hit its highest level against t...,business
2,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuko...,business
3,High fuel prices hit BA's profits,British Airways has blamed high fuel prices fo...,business
4,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Domec...,business


<h4> Convert the Labels to Integers </h4>

In [6]:
categories=df["Category"].unique().tolist()
cat_dict=dict(zip(categories, range(0,len((categories)))))
print(cat_dict)
for x in categories:
    print(x+": "+str(len(df[df["Category"]==x])))
df["Category"]=df['Category'].map(cat_dict, na_action='ignore')

{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}
business: 510
entertainment: 386
politics: 417
sport: 511
tech: 401


<h4> Removing Special Characters </h4>

In [7]:
df['Description'] = df['Description'].str.replace("[^a-zA-Z#]", " ")
df=df.dropna()
df["Description"].head()[0]

'Quarterly profits at US media giant TimeWarner jumped     to      bn       m  for the three months to December  from     m year earlier The firm  which is now one of the biggest investors in Google  benefited from sales of high speed internet connections and higher advert sales  TimeWarner said fourth quarter sales rose    to      bn from      bn  Its profits were buoyed by one off gains which offset a profit dip at Warner Bros  and less users for AOL Time Warner said on Friday that it now owns    of search engine Google  But its own internet business  AOL  had has mixed fortunes  It lost         subscribers in the fourth quarter profits were lower than in the preceding three quarters  However  the company said AOL s underlying profit before exceptional items rose    on the back of stronger internet advertising revenues  It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL s existing customers for high speed 

# Data Cleaning

<h4> 80-20 Train-Test Split </h4>

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df["Description"], df["Category"], test_size=0.20, random_state=42)

train_sum=0
for x in categories:
    print(x+": "+str(len(y_train[y_train==cat_dict[x]])))
    train_sum+=len(y_train[y_train==cat_dict[x]])
print("Total Training Data: "+str(train_sum)+"\n")

test_sum=0
for x in categories:
    print(x+": "+str(len(y_test[y_test==cat_dict[x]])))
    test_sum+=len(y_test[y_test==cat_dict[x]])
print("Total Test Data: "+str(test_sum))

business: 395
entertainment: 314
politics: 341
sport: 409
tech: 321
Total Training Data: 1780

business: 115
entertainment: 72
politics: 76
sport: 102
tech: 80
Total Test Data: 445


### Tokenization

In [9]:
import nltk
from nltk.tokenize import word_tokenize

X_train_notcleaned=[word_tokenize(i) for i in X_train]
y_train=y_train.values
X_test_notcleaned=[word_tokenize(i) for i in X_test]
y_test=y_test.values

print(X_train_notcleaned[0])

['Ashley', 'Cole', 'has', 'refused', 'to', 'blame', 'Robin', 'van', 'Persie', 'for', 'leaving', 'Arsenal', 'with', 'no', 'fully', 'fit', 'strikers', 'for', 'the', 'FA', 'Cup', 'fifth', 'round', 'replay', 'at', 'Sheffield', 'United', 'Van', 'Persie', 'is', 'suspended', 'alongside', 'Dennis', 'Bergkamp', 'and', 'Jose', 'Antonio', 'Reyes', 'after', 'being', 'sent', 'off', 'at', 'Southampton', 'when', 'Arsenal', 'had', 'a', 'numerical', 'advantage', 'Thierry', 'Henry', 'is', 'ruled', 'out', 'with', 'an', 'Achilles', 'tendon', 'injury', 'but', 'Cole', 'said', 'No', 'one', 'is', 'putting', 'the', 'blame', 'on', 'Robin', 'It', 's', 'just', 'something', 'that', 'happens', 'on', 'the', 'spur', 'of', 'the', 'moment', 'Cole', 'added', 'I', 've', 'done', 'it', 'before', 'and', 'I', 'hope', 'they', 'didn', 't', 'blame', 'me', 'for', 'anything', 'Of', 'course', 'he', 'll', 'learn', 'I', 've', 'been', 'sent', 'off', 'a', 'couple', 'of', 'times', 'now', 'and', 'it', 's', 'just', 'one', 'of', 'those', 

<h4> Removing of Stop Words and Words with length <3 </h4>

In [10]:
from nltk.corpus import stopwords

X_train,X_test=[],[]
for s in X_train_notcleaned:
    s_cleaned = []
    for w in s:
        if w.lower() not in stopwords.words("english") and len(w)>=3:
            s_cleaned.append(w)
 
    X_train.append(s_cleaned)
for s in X_test_notcleaned:
    s_cleaned = []
    for w in s:
        if w.lower() not in stopwords.words("english") and len(w)>=3:
            s_cleaned.append(w)
 
    X_test.append(s_cleaned)
print(X_train[0])

['Ashley', 'Cole', 'refused', 'blame', 'Robin', 'van', 'Persie', 'leaving', 'Arsenal', 'fully', 'fit', 'strikers', 'Cup', 'fifth', 'round', 'replay', 'Sheffield', 'United', 'Van', 'Persie', 'suspended', 'alongside', 'Dennis', 'Bergkamp', 'Jose', 'Antonio', 'Reyes', 'sent', 'Southampton', 'Arsenal', 'numerical', 'advantage', 'Thierry', 'Henry', 'ruled', 'Achilles', 'tendon', 'injury', 'Cole', 'said', 'one', 'putting', 'blame', 'Robin', 'something', 'happens', 'spur', 'moment', 'Cole', 'added', 'done', 'hope', 'blame', 'anything', 'course', 'learn', 'sent', 'couple', 'times', 'one', 'things', 'bit', 'crazy', 'one', 'two', 'seconds', 'Freddie', 'Ljungberg', 'likely', 'used', 'emergency', 'striking', 'role', 'partnered', 'either', 'Arturo', 'Lupoli', 'Quincy', 'Owusu', 'Abeyie', 'Jeremie', 'Aliadiere', 'Gunners', 'boss', 'Arsene', 'Wenger', 'said', 'Freddie', 'option', 'need', 'second', 'striker', 'decide', 'whether', 'Aliadiere', 'Quincy', 'Lupoli', 'start', 'front', 'three', 'involved', 

 <h4> <p> Creating the Vocabulary and word2index </p> </h4>

In [11]:
words = set([])
for s in X_train:
    for w in s:
        words.add(w.lower())
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs

## Preparation of Training and Test Data before fitting into the model

<h4> Importing the GloVe word embeddings</h4>

<p> You can download the word embeddings at https://nlp.stanford.edu/projects/glove/ </p>
<br><b>Note:</b> Choose the 6B and 300D data

In [12]:
import numpy as np

embeddings_index = dict()
f = open('glove.6B.300d.txt',encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

<h3> Convert all words to lowercase then to integers, then pad the sentences </h3>

In [13]:
train_sentences_X, test_sentences_X = [], []

EMB_DIM=300
num_words=len(word2index)+1
print("Number of Words:"+str(num_words))

for s in X_train:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)
    
for s in X_test:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)


MAX_LENGTH = len(max(train_sentences_X, key=len))
print("Max Length: "+str(MAX_LENGTH))  # 271

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
#y_train=to_categorical(y_train)
#y_test=to_categorical(y_test)
print(train_sentences_X[0])
print(y_train[0])

Number of Words:25043
Max Length: 1749
[20939 15731  4872 ...     0     0     0]
3


<p> <h3>Creation of Pre-Trained Word Embeddings to be used for the embedding layer </h3> </p>

In [14]:
embedding_matrix=np.zeros((num_words,EMB_DIM))
#print(word2index)
for word,i in word2index.items():
    if i>num_words:
        continue
    embedding_vector=embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]=embedding_vector

## Bidirectional LSTMs with Attention Mechanism

From the paper of <i> Attention is All You Need </i>, we have the follow equation:

\begin{align}
Attention(Q,K,V) &  = Softmax(\frac{QK^T}{\sqrt d_k}) V 
\end{align}

In [15]:
from keras.layers import merge
from keras.layers.core import *
from keras.layers.recurrent import LSTM
SINGLE_ATTENTION_VECTOR = False
TIME_STEPS=MAX_LENGTH

def attention_3d_block(inputs):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = EMB_DIM
    a = Permute((2, 1))(inputs)
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = multiply([inputs, a_probs])
    return output_attention_mul

### Tensorboard callback for visualizations of loss and accuracy

In [16]:
from keras.callbacks import TensorBoard

class TrainValTensorBoard(TensorBoard):
    def __init__(self, log_dir='./Graph', **kwargs):
        # Make the original `TensorBoard` log to a subdirectory 'training'
        training_log_dir = os.path.join(log_dir, 'training')
        super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs)

        # Log the validation metrics to a separate subdirectory
        self.val_log_dir = os.path.join(log_dir, 'validation')

    def set_model(self, model):
        # Setup writer for validation metrics
        self.val_writer = tf.summary.FileWriter(self.val_log_dir)
        super(TrainValTensorBoard, self).set_model(model)

    def on_epoch_end(self, epoch, logs=None):
        # Pop the validation logs and handle them separately with
        # `self.val_writer`. Also rename the keys so that they can
        # be plotted on the same figure with the training metrics
        logs = logs or {}
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if k.startswith('val_')}
        for name, value in val_logs.items():
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value.item()
            summary_value.tag = name
            self.val_writer.add_summary(summary, epoch)
        self.val_writer.flush()

        # Pass the remaining logs to `TensorBoard.on_epoch_end`
        logs = {k: v for k, v in logs.items() if not k.startswith('val_')}
        super(TrainValTensorBoard, self).on_epoch_end(epoch, logs)

    def on_train_end(self, logs=None):
        super(TrainValTensorBoard, self).on_train_end(logs)
        self.val_writer.close()

### Bidirectional Long-Short Term Memory without the Attention Mechanisms

In [17]:
from keras.models import Sequential,load_model
from keras.layers import Dense, CuDNNLSTM,LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation,Dropout
from keras.optimizers import Adam,SGD
from keras import regularizers
from keras.initializers import Constant
import numpy as np


model_lstm=Sequential() 
embedding_layer=Embedding(num_words,EMB_DIM,embeddings_initializer=Constant(embedding_matrix),input_length=MAX_LENGTH,trainable=False,mask_zero=True)
#embedding_layer=Embedding(num_words, 300,mask_zero=True)
inputs=InputLayer(input_shape=(MAX_LENGTH, ))
model_lstm.add(inputs)
model_lstm.add(embedding_layer)
model_lstm.add(Bidirectional(LSTM(128,activation="tanh")))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(len(np.unique(y_train)),activation="softmax"))
model_lstm.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(0.0001),    
              metrics=["accuracy"])
print(model_lstm.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1749, 300)         7512900   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               439296    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 1285      
Total params: 7,953,481
Trainable params: 440,581
Non-trainable params: 7,512,900
_________________________________________________________________
None


In [None]:
from keras.callbacks import ModelCheckpoint,EarlyStopping

es_valacc = EarlyStopping(monitor='val_acc', mode='max',verbose=1,patience=2,min_delta=0.003)
es_loss = EarlyStopping(monitor='loss', mode='min',verbose=1,min_delta=0.003)
history_lstm=model_lstm.fit(train_sentences_X, y_train, validation_data=(test_sentences_X,y_test), batch_size=64,epochs=15,callbacks=[es_loss,es_valacc,TrainValTensorBoard(write_graph=True,log_dir='./Graph_LSTM')])

Train on 1780 samples, validate on 445 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15


In [None]:
model_lstm.save("LSTM.h5")
scores = model_lstm.evaluate(test_sentences_X,y_test)
print(model_lstm.metrics_names)
print(scores)

In [None]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
plt.figure()
plt.plot(history_lstm.history['acc'])
plt.plot(history_lstm.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.yticks(np.arange(0,1,step=0.1))
plt.savefig("acc_lstm.png")

plt.figure( )
# Plot training & validation loss values
plt.plot(history_lstm.history['loss'])
plt.plot(history_lstm.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.savefig("loss_lstm.png")

## Bidirectional Long-Short  Term Memory with Attention Mechanisms

In [None]:
from keras.layers import merge,concatenate,add,dot,multiply, Dense, LSTM, InputLayer, Bidirectional, Embedding, Activation,Dropout
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *
from keras.optimizers import Adam
from keras import regularizers
from keras.initializers import Constant

inputs = Input(shape=(MAX_LENGTH,))
embedding_layer=Embedding(num_words,EMB_DIM,embeddings_initializer=Constant(embedding_matrix),input_length=MAX_LENGTH,trainable=False)
attention_mul = embedding_layer(inputs)
attention_mul = Bidirectional(LSTM(150,activation="tanh",return_sequences=True))(attention_mul)
attention_mul = attention_3d_block(attention_mul)
attention_mul = Flatten()(attention_mul)
attention_mul=Dropout(0.3)(attention_mul)
output = Dense(len(np.unique(y_train)),activation="softmax")(attention_mul)
model = Model(inputs=[inputs], outputs=output)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(0.0001),    
              metrics=["accuracy"])
print(model.summary())

In [None]:
from keras.callbacks import ModelCheckpoint,EarlyStopping

es_valacc = EarlyStopping(monitor='val_acc', mode='max',verbose=1,patience=2,min_delta=0.003)
es_loss = EarlyStopping(monitor='loss', mode='min',verbose=1,min_delta=0.003)
history=model.fit(train_sentences_X, y_train, validation_data=(test_sentences_X,y_test), batch_size=32,epochs=15,callbacks=[es_loss,es_valacc,TrainValTensorBoard(write_graph=True,,log_dir='./Graph')])

In [None]:
model.save("Attention.h5")
scores = model.evaluate(test_sentences_X,y_test)
print(model.metrics_names)   # acc: 99.09751977804825   
print(scores)   # acc: 99.09751977804825

In [None]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
plt.figure()
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.yticks(np.arange(0,1,step=0.1))
plt.savefig("acc_attention.png")

plt.figure( )
# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.savefig("loss_attention.png")

### Working Example

In [None]:
from attention_utils import get_activations, get_data_recurrent
attention_vectors = []

input_text="Oscar-nominated director John Singleton is in intensive care after suffering a stroke. known for movies including Boyz N The Hood and 2 Fast 2 Furious, became unwell on Wednesday, his family said. Following news of the stroke, friends and colleagues sent their best wishes. Rapper Snoop Dogg shared a picture of the pair together on Instagram, writing: Pray 4 my brother. " 
input_text=input_text.replace("[^a-zA-Z#]", " ")
input_text=word_tokenize(input_text)
print(input_text)
input_text_cleaned=[]
for w in input_text:
    if w.lower() not in stopwords.words("english") and len(w)>=3:
        input_text_cleaned.append(w)
print(input_text_cleaned)
input_text_index=[]
for w in input_text_cleaned:
    try:
        input_text_index.append(word2index[w.lower()])
    except KeyError:
        input_text_index.append(word2index['-OOV-'])
print(input_text_index)
input_text_pad=pad_sequences([input_text_index], maxlen=MAX_LENGTH, padding='post')
print(input_text_pad)

print(model.predict(input_text_pad))

In [None]:
attention_vector = np.mean(get_activations(model,input_text_pad,print_shape_only=True,layer_name='attention_vec')[0], axis=2).squeeze()
assert (np.sum(attention_vector) - 1.0) < 1e-5

words_index=sorted(range(len(attention_vector)), key=lambda i: attention_vector[i])[-10:]
print(words_index)
top_words=dict()
for x in words_index:
    if x<len(input_text_cleaned):
        top_words[input_text_cleaned[x]]=attention_vector[x]
print(top_words)
