In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

import tensorflow as tf
from keras import backend as K

from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras.layers import Embedding, Input, Dense, LSTM, GRU, Bidirectional, TimeDistributed
from tensorflow.keras.models import Sequential, Model
import tensorflow.keras.initializers as initializers
import tensorflow.keras.regularizers as regularizers
import tensorflow.keras.constraints as constraints

In [2]:
# Fixed document geometry used when the model inputs are declared.
# MAX_SENTENCES is the sentence axis of the document-level Input.
MAX_SENTENCES = 50
# MAX_WORDS_PER_SENTENCE is the token axis of both Inputs and the Embedding's input_length.
MAX_WORDS_PER_SENTENCE = 50

In [3]:
def read_dataset(data_path):
    """Load an encoded document dataset from a text file.

    Every non-blank line describes one document as fields separated by the
    literal marker ``<fff>``::

        <label><fff><doc_id><fff><sentence 1><fff><sentence 2>...

    where each sentence is a whitespace-separated list of integer token ids.
    Blank (or whitespace-only) lines are ignored.

    Args:
        data_path: Path of the encoded text file.

    Returns:
        A ``(data, labels)`` tuple of NumPy arrays. ``data`` is ``np.array``
        applied to one list of per-sentence token-id lists per document;
        ``labels`` holds the integer label of each document, in file order.

    Raises:
        ValueError: If a label, document id or token is not an integer.
        IndexError: If a non-blank line has no document-id field.
    """
    with open(data_path) as f:
        d_lines = f.read().splitlines()

    data = []
    labels = []
    for line in d_lines:
        # A stray empty line carries no document; skip it instead of
        # crashing on int('').
        if not line.strip():
            continue

        features = line.split('<fff>')
        label = int(features[0])
        # The document id is not part of the result, but it is still parsed
        # so that a malformed line fails loudly rather than being accepted.
        int(features[1])
        sentences = features[2:]

        labels.append(label)
        # One list of token ids per sentence of the document.
        data.append([[int(token) for token in sent.split()] for sent in sentences])

    return np.array(data), np.array(labels)

In [4]:
# Load the pre-encoded train / test splits (token-id documents + integer labels).
X_train, y_train = read_dataset('../input/attention-v2/20news-train-encoded.txt')
X_test, y_test = read_dataset('../input/attention-v2/20news-test-encoded.txt')

# One-hot encode both splits against the SAME ordered class list, taken from
# the training labels. Encoding each split independently would silently shift
# or drop columns whenever one split is missing a class, so train and test
# columns would no longer refer to the same label.
label_classes = np.unique(y_train)
# Explicit float dtype: the default dtype of get_dummies differs between
# pandas versions, and the labels are consumed by a float loss.
y_train = pd.get_dummies(
    pd.Categorical(y_train, categories=label_classes), dtype=np.float32
).values
y_test = pd.get_dummies(
    pd.Categorical(y_test, categories=label_classes), dtype=np.float32
).values

In [5]:
def dot_product(x, kernel):
    """Backend-aware ``K.dot`` of ``x`` with ``kernel``.

    On the TensorFlow backend the kernel is given a trailing axis before the
    product and that axis is squeezed away from the result afterwards; on any
    other backend the plain ``K.dot(x, kernel)`` is returned.

    Args:
        x: Input tensor.
        kernel: Weight tensor.

    Returns:
        The backend tensor holding the product.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    # TensorFlow path: append an axis, multiply, then drop that axis again.
    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)

In [6]:
class AttentionWithContext(tf.keras.layers.Layer):
    """Attention pooling over axis 1 using a learned context vector.

    For a rank-3 input ``x`` (``build`` asserts the rank) the layer computes::

        uit = tanh(dot_product(x, W) + b)        # b only when bias=True
        ait = dot_product(uit, u)
        a   = exp(ait) * mask                    # mask only when provided
        a   = a / (sum(a, axis=1) + epsilon)
        out = sum(x * a[..., None], axis=1)

    The output shape is ``(input_shape[0], input_shape[-1])``. An incoming
    mask is consumed here and not propagated to following layers.

    The layer implements ``get_config`` so that models containing it can be
    serialized and re-created.

    Args:
        W_regularizer: Regularizer identifier/instance for ``W`` (or None).
        u_regularizer: Regularizer identifier/instance for ``u`` (or None).
        b_regularizer: Regularizer identifier/instance for ``b`` (or None).
        W_constraint: Constraint identifier/instance for ``W`` (or None).
        u_constraint: Constraint identifier/instance for ``u`` (or None).
        b_constraint: Constraint identifier/instance for ``b`` (or None).
        bias: Whether to add the learned bias ``b`` before the tanh.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base Layer first so that its attribute tracking is
        # set up before any attribute is assigned on this instance.
        super(AttentionWithContext, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create ``W`` (dim x dim), optional ``b`` (dim,) and ``u`` (dim,).

        ``dim`` is the size of the last input axis.
        """
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        # Context vector that scores every time step.
        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        """Return None: the mask is not passed on to the next layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # Apply the mask after the exp; the weights are re-normalized below,
        # so masked steps end up with zero weight.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # The sum can be almost zero (e.g. early in training), which would
        # produce NaNs; a small epsilon keeps the division well defined.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Broadcast the per-step weights over the feature axis.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Axis 1 is reduced away: (first input dim, last input dim)."""
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(AttentionWithContext, self).get_config()
        config.update({
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        })
        return config

In [7]:
# vocab_size is the number of lines in the raw vocabulary file
# (presumably one vocabulary entry per line — TODO confirm against the file).
with open('../input/attention-set/vocab-raw.txt', 'rb') as f:
    # Opened in binary mode: the lines are only counted, never decoded.
    vocab_size = len(f.read().splitlines())

In [8]:
# Hyper-parameters of the hierarchical attention model, named so they can be
# tuned in one place instead of being repeated as magic numbers.
EMBEDDING_DIM = 300   # size of each learned word vector
LSTM_UNITS = 100      # units per direction in both bidirectional LSTMs
DENSE_UNITS = 200     # width of the projection fed to each attention layer
NUM_CLASSES = 20      # softmax width; must equal the one-hot label width

# ---- Word level: token ids of one sentence -> one sentence vector ----
# vocab_size + 2 presumably reserves two extra ids (e.g. padding / unknown)
# — TODO confirm against the encoder that produced the dataset.
embedding_layer = Embedding(
    input_dim=vocab_size + 2,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_WORDS_PER_SENTENCE,
)

word_input = Input(shape=(MAX_WORDS_PER_SENTENCE, ), dtype='int32')
word_sequence = embedding_layer(word_input)
word_lstm = Bidirectional(LSTM(units=LSTM_UNITS, return_sequences=True))(word_sequence)
word_dense = TimeDistributed(Dense(DENSE_UNITS))(word_lstm)
word_att = AttentionWithContext()(word_dense)
wordEncoder = Model(word_input, word_att)

# ---- Sentence level: sentence vectors of one document -> class scores ----
sent_input = Input(
    shape=(MAX_SENTENCES, MAX_WORDS_PER_SENTENCE), dtype='int32')
# The word encoder is applied to every sentence of the document.
sent_encoder = TimeDistributed(wordEncoder)(sent_input)
sent_lstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(sent_encoder)
sent_dense = TimeDistributed(Dense(DENSE_UNITS))(sent_lstm)
sent_att = AttentionWithContext()(sent_dense)
preds = Dense(NUM_CLASSES, activation='softmax')(sent_att)
model = Model(sent_input, preds)

model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer='adam',
              metrics=['acc'])

2022-10-28 13:05:17.131978: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-28 13:05:17.140760: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-28 13:05:17.141425: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-28 13:05:17.142606: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [9]:
# Keras takes the `validation_split` fraction from the END of the arrays,
# before any shuffling. If the training file happens to be ordered (e.g. by
# label), the validation set would then cover only the last classes, so the
# samples are permuted once, with a fixed seed, before fitting.
shuffle_rng = np.random.default_rng(42)
train_order = shuffle_rng.permutation(len(X_train))

# Keep the History object: it holds the per-epoch metrics, and binding it
# avoids a bare object repr as the cell output.
history = model.fit(X_train[train_order], y_train[train_order],
                    validation_split=0.1,
                    epochs=20, batch_size=100)

2022-10-28 13:05:20.517804: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20


2022-10-28 13:05:26.515026: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe296472910>

In [10]:
# Score the trained model on the test split; the displayed list holds the
# loss followed by the compiled 'acc' metric.
model.evaluate(x=X_test, y=y_test)



[2.3221399784088135, 0.74349445104599]