In [24]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from bs4 import BeautifulSoup
from keras.initializers import Constant
from keras import backend as K
from keras.models import Model
from keras import initializers
from keras.layers import Dense, Input , Layer
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical
from nltk import tokenize

In [4]:
class HAN(Layer):
    def __init__(self, attention_dim):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(HAN, self).__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable__weights = [self.W, self.b, self.u]
        super(HAN, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        # size of x :[batch_size, sel_len, attention_dim]
        # size of u :[batch_size, attention_dim]
        # uit = tanh(xW+b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))

        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

In [5]:
data = pd.read_csv("/content/drive/My Drive/Deep_Learning/labeledTrainData.tsv" , sep='\t')
data.head(5)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [35]:
data.shape

(25000, 3)

In [6]:
maxlen = 100
max_sentences = 15
max_words = 20000
embedding_dim = 100
validation_split = 0.2

In [7]:
def remove_html(str_a):
    p = re.compile(r'<.*?>')
    return p.sub('', str_a)

def replace_non_ascii(str_a):
    return re.sub(r'[^\x00-\x7f]', r'', str_a)

def clean_str(string):
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

In [8]:
reviews = []
texts = []
labels = []

for idx in range(data.review.shape[0]):
    text = BeautifulSoup(data.review[idx], features="html5lib")
    text = clean_str(text.get_text().encode('ascii', 'ignore').decode('utf-8'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    labels.append(data.sentiment[idx])

  text = BeautifulSoup(data.review[idx], features="html5lib")


In [9]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index
print(len(word_index))

80568


In [10]:
dt = np.zeros(shape=(len(texts) , max_sentences , maxlen) , dtype='int32')

In [11]:
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < max_sentences:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < maxlen and tokenizer.word_index[word] < max_words:
                    dt[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

In [12]:
print(dt.shape)

(25000, 15, 100)


In [13]:
labels = to_categorical(np.asarray(labels))
print(labels.shape)

(25000, 2)


In [14]:
indices = np.arange(dt.shape[0])
np.random.shuffle(indices)
dt = dt[indices]
labels = labels[indices]
nb_validation_samples = int(validation_split * dt.shape[0])

x_train = dt[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = dt[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [15]:
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

[ 9991. 10009.]
[2509. 2491.]


In [16]:
embeddings_index = {}
f = open(os.path.join('/content/drive/My Drive/Deep_Learning/glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [17]:
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [66]:
!pip install tensorflow==2.10.0

Collecting tensorflow==2.10.0
  Downloading tensorflow-2.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m578.0/578.0 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.10.0)
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting keras<2.11,>=2.10.0 (from tensorflow==2.10.0)
  Downloading keras-2.10.0-py2.py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-preprocessing>=1.1.1 (from tensorflow==2.10.0)
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<3.20,>=3.9.2 (from tensorflow==2.10.0)
  Downloading protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.wh

In [19]:
embedding_layer = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                            input_length=maxlen, trainable=True, mask_zero=True)

sentence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)

word_encoder = Bidirectional(GRU(50, return_sequences=True))(embedded_sequences)
word_attn = HAN(100)(word_encoder)
sentenceEncoder = Model(sentence_input, word_attn)

review_input = Input(shape=(max_sentences, maxlen), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)

sentence_encoder = Bidirectional(GRU(50, return_sequences=True))(tf.transpose(review_encoder, perm=[0, 2, 1]))
sentence_attn = HAN(100)(sentence_encoder)




In [21]:
preds = Dense(2, activation='softmax')(sentence_attn)
model = Model(review_input, preds)

In [23]:
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 15, 100)]         0         
                                                                 
 time_distributed_1 (TimeDis  (None, 15, 100)          8112700   
 tributed)                                                       
                                                                 
 tf.compat.v1.transpose (TFO  (None, 100, 15)          0         
 pLambda)                                                        
                                                                 
 bidirectional_3 (Bidirectio  (None, 100, 100)         20100     
 nal)                                                            
                                                                 
 han_2 (HAN)                 (None, 100)               10200     
                                                           

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=64)

model fitting - Hierachical attention network
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5