<a href="https://colab.research.google.com/github/MathBorgess/data_science_studies/blob/main/deep_learning/recurrent/sentimental_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

imdb = tf.keras.datasets.imdb

train_data, test_data = imdb.load_data()

In [2]:
import numpy as np
word_index = {k: (v+3) for k,v in imdb.get_word_index().items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for key, value in word_index.items()])

train_dataset_texts = []
train_dataset_labels = []
for index in range(len(train_data[0])):
    train_dataset_texts.append(' '.join([ reverse_word_index.get(i, '?') for i in train_data[0][index]]))
    train_dataset_labels.append(train_data[1][index])

test_dataset_texts = []
test_dataset_labels = []
for index in range(len(test_data[0])):
    test_dataset_texts.append(' '.join([ reverse_word_index.get(i, '?') for i in test_data[0][index]]))
    test_dataset_labels.append(test_data[1][index])


train_dataset = tf.data.Dataset.from_tensor_slices((train_dataset_texts, train_dataset_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_dataset_texts, test_dataset_labels))

In [3]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [4]:
vocab_size = 10000

encoder = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size)
encoder.adapt(train_dataset.map(lambda text, label: text))

2025-01-30 21:38:22.712429: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Os tratamentos, encoding e embedding do dataset são os mesmos e foram descritos na CNN

In [16]:
class SentimentalClassifier(tf.keras.Model):
    def __init__(self, encoder, dense_units):
        super(SentimentalClassifier, self).__init__()
        self.encoder = encoder
        self.embedding = tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()),
                                                    output_dim=dense_units,
                                                    mask_zero=True)
        # It performs average pooling across the temporal dimension of the input data,
        # reducing the spatial dimensionality of the data while preserving important features.
        # self.averager = tf.keras.layers.GlobalAveragePooling1D()
        self.gru_one = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(dense_units, return_state=True, return_sequences=True))
        self.gru_two = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(dense_units, return_state=True, return_sequences=True))
        self.dense = tf.keras.layers.Dense(dense_units, activation='relu')
        self.dropout = tf.keras.layers.Dropout(0.5)
        self.outputer = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs, return_state=False, state=None):
        vector = self.encoder(inputs)
        vector = self.embedding(vector)
        y_ = self.gru_one(vector, initial_state=state)[0]
        y, forward_state, backward_state = self.gru_two(y_)
        y = self.dense(y)
        y = self.dropout(y)

        output = self.outputer(tf.keras.layers.GlobalAveragePooling1D()(y))
        #the second tf.concat param is the axis of concat, 0 will only append, 1 will append line by line and so
        if return_state:
          return tf.squeeze(output, axis=-1), tf.concat([forward_state, backward_state], 1)
        else:
          return tf.squeeze(output, axis=-1),

usa uma arquitetura baseada em GRU bidirecional para análise de sentimentos. Ele recebe texto codificado, aplica embedding, processa a sequência com duas camadas GRU bidirecionais, passa por uma camada densa com ReLU e dropout, e gera a previsão final com sigmoid.

In [17]:
model = SentimentalClassifier(encoder=encoder, dense_units=64)
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))

sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
model(np.array([sample_text]))

(<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.50136924], dtype=float32)>,)

In [18]:
model.fit(train_dataset, epochs=5,
            validation_data=test_dataset,
            validation_steps=30)

Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m625s[0m 2s/step - loss: 0.6289 - val_loss: 0.5865
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m346s[0m 886ms/step - loss: 0.5534 - val_loss: 0.5750
Epoch 3/5
[1m118/391[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m3:42[0m 817ms/step - loss: 0.5428

KeyboardInterrupt: 

In [29]:
model.evaluate(test_dataset)



0.6608366966247559