# Lab 6: Análisis de Sentimientos

In [2]:
from tensorflow.keras.datasets import imdb

In [3]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
import tensorflow as tf
import numpy as np

In [4]:
# Load the IMDB review dataset; num_words=50000 caps the vocabulary size
# (it has to agree with the Embedding input_dim used by the model).
print('Cargando los datos...')
train_split, test_split = imdb.load_data(num_words=50000)
X_train, y_train = train_split
X_test, y_test = test_split
print('Datos cargados')

Cargando los datos...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Datos cargados


In [5]:
# Sanity check: print the shape of each split (features and labels).
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(25000,)
(25000,)
(25000,)
(25000,)


In [6]:
# Reviews have different lengths; show the token count of the first ten.
for review in X_train[:10]:
    print(len(review))

218
189
141
550
147
43
123
562
233
130


In [7]:
# Show the labels of the first ten training reviews.
for label in y_train[:10]:
    print(label)

1
0
0
1
0
0
1
0
1
0


### Preprocesamiento
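Dejar todas las críticas con una longitud uniforme: las más largas se truncan y las más cortas se rellenan con ceros hasta `MAX_LEN`. De esta manera los datos no están tan sesgados y el modelo recibe secuencias del mismo tamaño.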

In [8]:
# Inspect the distribution of review lengths (in tokens) to choose a cut-off:
# compute the 80th, 90th, 95th and 99th percentiles in a single call.
lengths = np.array(list(map(len, X_train)))
p80, p90, p95, p99 = np.percentile(lengths, [80, 90, 95, 99])
print(p80, p90, p95, p99)


331.0 467.0 610.0 926.0


In [9]:
# Use the 80th-percentile review length as the fixed sequence length.
MAX_LEN = int(p80)

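Agregar características (features) adicionales que puedan ayudar a que el modelo funcione mejor: aquí se derivan de la longitud de cada crítica (logaritmo de la longitud y longitud relativa a `MAX_LEN`).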

In [12]:
def make_len_feats(seqs: np.ndarray, max_len: int) -> np.ndarray:
    """Build two length-based features for each sequence.

    Each sequence length is clipped to ``max_len`` and turned into:
      * column 0: ``log1p(clipped_length)``
      * column 1: ``clipped_length / max_len`` (in ``[0, 1]``)

    Args:
        seqs: Iterable of variable-length sequences (anything supporting ``len``).
            Must be the raw, un-padded sequences.
        max_len: Positive length cap, also used as the normalisation constant.

    Returns:
        A ``float32`` array of shape ``(len(seqs), 2)``.

    Raises:
        ValueError: If ``max_len`` is not strictly positive.
    """
    if max_len <= 0:
        # A zero/negative cap would silently produce nan/inf features.
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")
    lengths = np.array([min(len(s), max_len) for s in seqs], dtype=np.float32)
    # Cast the divisor so the result stays float32 even when max_len is a
    # NumPy integer scalar (which would otherwise promote to float64).
    rel_lengths = lengths / np.float32(max_len)
    return np.stack([np.log1p(lengths), rel_lengths], axis=1)  # shape (N, 2)

In [13]:
# Length features for both splits. This must run on the raw (un-padded)
# sequences: once X_train / X_test are padded every row has length MAX_LEN.
num_train, num_test = (
    make_len_feats(split, MAX_LEN) for split in (X_train, X_test)
)


In [14]:
# Pad / truncate every review to exactly MAX_LEN tokens.
# 'post' is set explicitly for both padding and truncating, so zeros are
# appended at the end and over-long reviews keep their first MAX_LEN tokens.
# NOTE(review): Keras documents 'pre' as the default for both and 0 as the
# default fill value — confirm against the installed version.
pad_kwargs = dict(maxlen=MAX_LEN, padding='post', truncating='post', value=0)
X_train = sequence.pad_sequences(X_train, **pad_kwargs)
X_test = sequence.pad_sequences(X_test, **pad_kwargs)


### Modelo

In [16]:
# Two-input model: padded token ids ('seq') plus two length features ('len_features').

# --- Sequence branch -------------------------------------------------------
seq_in = tf.keras.Input(shape=(MAX_LEN, ), dtype='int32', name='seq')
# input_dim must match the num_words passed to imdb.load_data (50000);
# mask_zero=True treats token id 0 (the padding value) as masked.
x = tf.keras.layers.Embedding(input_dim=50000, output_dim=128, mask_zero=True, name='emb')(seq_in)
x = tf.keras.layers.SpatialDropout1D(0.2)(x)
x = tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2)(x)

# Summarise the per-timestep LSTM outputs with BOTH max and average pooling.
# Fix: x_max previously used GlobalAveragePooling1D, so the concat below
# just duplicated the average-pooled vector.
# NOTE(review): GlobalMaxPooling1D may not honour the Embedding mask (Keras can
# warn about this), so padded timesteps could take part in the max — confirm
# this is acceptable.
x_max = tf.keras.layers.GlobalMaxPooling1D()(x)
x_avg = tf.keras.layers.GlobalAveragePooling1D()(x)
x_seq = tf.keras.layers.Concatenate(name='pool_concat')([x_max, x_avg])

# --- Numeric branch: the two length-derived features -----------------------
num_in = tf.keras.Input(shape=(2, ), dtype='float32', name='len_features')
n = tf.keras.layers.LayerNormalization()(num_in)

# --- Fusion and classification head ----------------------------------------
h = tf.keras.layers.Concatenate()([x_seq, n])
inputs = [seq_in, num_in]

h = tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-5))(h)
h = tf.keras.layers.Dropout(0.3)(h)
# Single sigmoid unit: binary sentiment output.
out = tf.keras.layers.Dense(1, activation='sigmoid')(h)

model = tf.keras.Model(inputs=inputs, outputs=out)




In [None]:
# Compile for binary classification. The metric named 'auprc' (area under the
# precision-recall curve) is the one later monitored as 'val_auprc'.
adam = tf.keras.optimizers.Adam(learning_rate=3e-4)
tracked_metrics = [
    'accuracy',
    tf.keras.metrics.AUC(name='auc'),
    tf.keras.metrics.AUC(name='auprc', curve='PR'),
]
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=tracked_metrics)
model.summary()

In [18]:
# Enable on-demand memory growth on every visible GPU (no-op without GPUs).
# NOTE(review): this cell runs after the model has been built. TensorFlow
# documents that memory growth must be set before GPUs are initialised, so
# the RuntimeError branch below is likely what executes — consider moving
# this cell right after the imports.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # Currently, memory growth needs to be the same across all GPUs.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    print(err)

### Entrenamiento

In [None]:
# Both callbacks watch the validation PR-AUC ('val_auprc', higher is better).
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_auprc', mode='max', patience=2, restore_best_weights=True,
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_auprc', mode='max', factor=0.5, patience=1, min_lr=1e-5,
)
callbacks = [early_stop, reduce_lr]

# Inputs are keyed by the names given to the two Input layers.
train_inputs = {'seq': X_train, 'len_features': num_train}
history = model.fit(
    train_inputs,
    y_train,
    validation_split=0.2,
    epochs=15,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)

### Evaluación

In [None]:
# Score the trained model on the test split; inputs are keyed by Input-layer name.
test_inputs = {'seq': X_test, 'len_features': num_test}
model.evaluate(test_inputs, y_test, verbose=1)