# Lab 6 : Análisis de Sentimientos

In [72]:
from tensorflow.keras.datasets import imdb

In [73]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
import tensorflow as tf
import numpy as np

In [74]:
print('Cargando los datos...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=50000)
print('Datos cargados')

Cargando los datos...
Datos cargados


In [75]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(25000,)
(25000,)
(25000,)
(25000,)


In [76]:
for i in range(0,10):
    print(len(X_train[i]))

218
189
141
550
147
43
123
562
233
130


In [77]:
for i in range(0,10):
    print(y_train[i])

1
0
0
1
0
0
1
0
1
0


### Preprocesamiento
Dejar todas las críticas en una longitud uniforme. De esta manera los datos no están tan sesgados.

In [78]:
# decide percentiles
lengths = np.array([len(seq) for seq in X_train])
p90 = np.percentile(lengths, 90)
p95 = np.percentile(lengths, 95)
p99 = np.percentile(lengths, 99)
p80 = np.percentile(lengths, 80)
print(p80, p90, p95, p99)


331.0 467.0 610.0 926.0


In [79]:
MAX_LEN  = 331
UNITS = 128

Agregar features o características que puedan servir para que el modelo funcione mejor. 

In [80]:
def make_len_feats(seqs: np.ndarray, max_len: int) -> np.stack:
    lengths = np.array([min(len(s), max_len) for s in seqs], dtype=np.float32)
    return np.stack([np.log1p(lengths), lengths / max_len], axis=1)  # shape (N,2)

In [81]:
num_train = make_len_feats(X_train, MAX_LEN)
num_test  = make_len_feats(X_test,  MAX_LEN)


In [82]:
# fill in
X_train = sequence.pad_sequences(X_train, maxlen=MAX_LEN, padding='post', truncating='post', value=0) # not sure about default
X_test = sequence.pad_sequences(X_test, maxlen=MAX_LEN, padding='post', truncating='post', value=0)


### Modelo

In [None]:
seq_in = tf.keras.Input(shape=(MAX_LEN, ), dtype='int32', name='seq')
x = tf.keras.layers.Embedding(input_dim=50000, output_dim=UNITS, mask_zero=True, name='emb')(seq_in)
x = tf.keras.layers.SpatialDropout1D(0.25)(x)
x = tf.keras.layers.LSTM(UNITS, return_sequences=True, dropout=0.2)(x)

x_max = tf.keras.layers.GlobalMaxPooling1D()(x)
x_avg = tf.keras.layers.GlobalAveragePooling1D()(x)
x_seq = tf.keras.layers.Concatenate(name='pool_concat')([x_max, x_avg])
num_in = tf.keras.Input(shape=(2, ), dtype='float32', name='len_features')
n = tf.keras.layers.LayerNormalization()(num_in)
h = tf.keras.layers.Concatenate()([x_seq, n])
inputs = [seq_in, num_in]

h = tf.keras.layers.Dense(UNITS, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-5))(h)
h = tf.keras.layers.Dropout(0.25)(h)
out = tf.keras.layers.Dense(1, activation='sigmoid')(h)

model = tf.keras.Model(inputs=inputs, outputs=out)






In [84]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc'), tf.keras.metrics.AUC(name='auprc', curve='PR')],
)
model.summary()

In [85]:
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("Is GPU available (legacy):", tf.test.is_gpu_available())


Num GPUs Available: 0
Built with CUDA: False
Is GPU available (legacy): False


In [None]:
tf.keras.mixed_precision.set_global_policy('mixed_float16')

### Training

In [86]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_auprc', mode='max', patience=2, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_auprc', mode='max', factor=0.5, patience=1, min_lr=1e-5),
]
history = model.fit(
        {'seq': X_train, 'len_features': num_train},
        y_train,
        validation_split=0.2,   # take 20% of training set
        epochs=15,
        batch_size=64,
        callbacks=callbacks,
        verbose=1,
    )

Epoch 1/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 340ms/step - accuracy: 0.7666 - auc: 0.8655 - auprc: 0.8665 - loss: 0.4535 - val_accuracy: 0.8822 - val_auc: 0.9497 - val_auprc: 0.9500 - val_loss: 0.2939 - learning_rate: 5.0000e-04
Epoch 2/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 253ms/step - accuracy: 0.9222 - auc: 0.9726 - auprc: 0.9712 - loss: 0.2088 - val_accuracy: 0.8882 - val_auc: 0.9523 - val_auprc: 0.9517 - val_loss: 0.2979 - learning_rate: 5.0000e-04
Epoch 3/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 271ms/step - accuracy: 0.9571 - auc: 0.9893 - auprc: 0.9885 - loss: 0.1233 - val_accuracy: 0.8782 - val_auc: 0.9501 - val_auprc: 0.9479 - val_loss: 0.3330 - learning_rate: 5.0000e-04
Epoch 4/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 344ms/step - accuracy: 0.9829 - auc: 0.9969 - auprc: 0.9969 - loss: 0.0579 - val_accuracy: 0.8696 - val_auc: 0.9357 - val_auprc: 0.9287 -

### Evaluate

In [87]:
model.evaluate({'seq': X_test, 'len_features': num_test}, y_test, verbose=1)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 53ms/step - accuracy: 0.8606 - auc: 0.9379 - auprc: 0.9339 - loss: 0.3431


[0.3430615961551666,
 0.8605999946594238,
 0.9378583431243896,
 0.9338710904121399]

In [88]:
hist = history.history
for k in ['auprc','val_auprc','loss','val_loss']:
    print(k, [round(v, 4) for v in hist[k]])

auprc [0.8665, 0.9712, 0.9885, 0.9969]
val_auprc [0.95, 0.9517, 0.9479, 0.9287]
loss [0.4535, 0.2088, 0.1233, 0.0579]
val_loss [0.2939, 0.2979, 0.333, 0.4959]


In [89]:
hist = history.history
print("last train:", hist['accuracy'][-1], hist['loss'][-1])
print("last val:",   hist['val_accuracy'][-1], hist['val_loss'][-1])

last train: 0.9828500151634216 0.05787217989563942
last val: 0.8695999979972839 0.4959242641925812
