# Lab 6: Análisis de Sentimientos

In [39]:
from tensorflow.keras.datasets import imdb

In [40]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
import tensorflow as tf
import numpy as np

In [41]:
# Vocabulary cap handed to the IMDB loader as `num_words`.
NUM_WORDS = 50000

print('Cargando los datos...')
train_split, test_split = imdb.load_data(num_words=NUM_WORDS)
X_train, y_train = train_split
X_test, y_test = test_split
print('Datos cargados')

Cargando los datos...
Datos cargados


In [42]:
# Report the shape of each of the four arrays returned by the loader.
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

(25000,)
(25000,)
(25000,)
(25000,)


In [43]:
# Token count of the first ten training reviews, one value per line.
print(*(len(X_train[idx]) for idx in range(10)), sep='\n')

218
189
141
550
147
43
123
562
233
130


In [44]:
# Label of the first ten training reviews, one value per line.
print(*(y_train[idx] for idx in range(10)), sep='\n')

1
0
0
1
0
0
1
0
1
0


### Preprocesamiento
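Dejar todas las críticas con una longitud uniforme (truncando las más largas y rellenando las más cortas), ya que el modelo necesita entradas de tamaño fijo. De esta manera la distribución de longitudes no queda tan sesgada por las críticas muy largas.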

In [45]:
# Inspect the distribution of review lengths to choose a padding length.
lengths = np.array(list(map(len, X_train)))
# One vectorised call computes all four percentiles at once.
p80, p90, p95, p99 = np.percentile(lengths, [80, 90, 95, 99])
print(p80, p90, p95, p99)


331.0 467.0 610.0 926.0


In [46]:
# Sequence length used for padding/truncation; 331 is the 80th percentile of
# the training review lengths printed by the percentile cell.
MAX_LEN = 331
# Width shared by the embedding, the LSTM and the hidden Dense layer.
UNITS = 128

Agregar características (features) numéricas derivadas de la longitud de cada crítica, que puedan servir para que el modelo funcione mejor.

In [47]:
def make_len_feats(seqs: np.ndarray, max_len: int) -> np.ndarray:
    """Build two length-based features for every sequence.

    Each sequence length is clipped to ``max_len``; the clipped length is then
    expressed on a log scale and as a fraction of ``max_len``.

    Args:
        seqs: Collection of variable-length sequences (anything supporting
            ``len``). Must be the raw, un-padded sequences, otherwise every
            length equals the padded length.
        max_len: Positive clipping length, also used as the normaliser.

    Returns:
        ``float32`` array of shape ``(N, 2)`` whose columns are
        ``[log1p(clipped_len), clipped_len / max_len]``.

    Raises:
        ValueError: If ``max_len`` is not strictly positive.
    """
    if max_len <= 0:
        # A zero or negative normaliser would silently produce inf/nan features.
        raise ValueError(f"max_len must be positive, got {max_len}")
    lengths = np.array([min(len(s), max_len) for s in seqs], dtype=np.float32)
    # Cast the divisor so a NumPy float64 scalar cannot promote the result.
    ratio = lengths / np.float32(max_len)
    return np.stack([np.log1p(lengths), ratio], axis=1)  # shape (N, 2)

In [48]:
# Length features have to be derived from the raw, still variable-length
# reviews, i.e. before X_train / X_test are overwritten by the padding cell.
num_train, num_test = (make_len_feats(split, MAX_LEN) for split in (X_train, X_test))


In [49]:
# Bring every review to exactly MAX_LEN tokens: shorter ones are padded at the
# end and longer ones are cut at the end.
pad_options = dict(maxlen=MAX_LEN, padding='post', truncating='post')
X_train = sequence.pad_sequences(X_train, **pad_options)
X_test = sequence.pad_sequences(X_test, **pad_options)


### Modelo

In [None]:
# Seed Python, NumPy and TensorFlow so that weight initialisation and dropout
# are repeatable after a kernel restart.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Must match the `num_words` cap used when loading the IMDB data.
VOCAB_SIZE = 50000

# --- Sequence branch: token ids -> embedding -> LSTM -> pooled summary ---
seq_in = tf.keras.Input(shape=(MAX_LEN,), dtype='int32', name='seq')
x = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=UNITS, mask_zero=True, name='emb')(seq_in)
x = tf.keras.layers.SpatialDropout1D(0.3)(x)
x = tf.keras.layers.LSTM(UNITS, return_sequences=True, dropout=0.2)(x)

# Summarise the per-timestep LSTM outputs with both max and average pooling.
x_max = tf.keras.layers.GlobalMaxPooling1D()(x)
x_avg = tf.keras.layers.GlobalAveragePooling1D()(x)
x_seq = tf.keras.layers.Concatenate(name='pool_concat')([x_max, x_avg])

# --- Numeric branch: the two length features, normalised ---
num_in = tf.keras.Input(shape=(2,), dtype='float32', name='len_features')
n = tf.keras.layers.LayerNormalization()(num_in)

# Ablation: to train without the length features, use `h = x_seq` and
# `inputs = seq_in` instead of the two lines below.
h = tf.keras.layers.Concatenate()([x_seq, n])
inputs = [seq_in, num_in]

# --- Classification head ---
h = tf.keras.layers.Dense(UNITS, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(h)
h = tf.keras.layers.Dropout(0.3)(h)
# Keep the output in float32 even if a mixed-precision policy is active when
# this cell runs; with the default float32 policy this changes nothing.
out = tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')(h)

model = tf.keras.Model(inputs=inputs, outputs=out)






In [51]:
# Adam with a reduced learning rate; 'auprc' is the metric whose validation
# value the training callbacks monitor.
adam = tf.keras.optimizers.Adam(learning_rate=5e-4)
tracked_metrics = [
    'accuracy',
    tf.keras.metrics.AUC(name='auc'),
    tf.keras.metrics.AUC(name='auprc', curve='PR'),
]

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=tracked_metrics)
model.summary()

In [52]:
# Query the device list once; tf.test.is_gpu_available() is deprecated in
# favour of tf.config.list_physical_devices('GPU').
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpus))
print("Built with CUDA:", tf.test.is_built_with_cuda())
# Label kept unchanged so the output stays comparable with earlier runs.
print("Is GPU available (legacy):", bool(gpus))


Num GPUs Available: 0
Built with CUDA: False
Is GPU available (legacy): False


In [53]:
# Mixed precision is only worthwhile on a GPU; on a CPU-only machine float16
# math is expected to be slower, so the policy is left at its default there.
# NOTE(review): the global policy is the default for layers created afterwards,
# so this cell should run before the model-building cell — confirm cell order.
if tf.config.list_physical_devices('GPU'):
    tf.keras.mixed_precision.set_global_policy('mixed_float16')

### Training

In [54]:
# Both callbacks watch the validation PR-AUC, which is to be maximised.
monitor_kwargs = dict(monitor='val_auprc', mode='max')
# Stop after 2 epochs without improvement and roll back to the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True, **monitor_kwargs)
# Halve the learning rate after 1 stagnant epoch, never going below 1e-5.
lr_decay = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=1, min_lr=1e-5, **monitor_kwargs)
callbacks = [early_stop, lr_decay]

# Inputs are keyed by the names given to the two Input layers.
train_inputs = {'seq': X_train, 'len_features': num_train}

history = model.fit(
    train_inputs,
    y_train,
    validation_split=0.2,  # take 20% of the training set for validation
    epochs=8,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)




Epoch 1/8
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 854ms/step - accuracy: 0.7706 - auc: 0.8635 - auprc: 0.8613 - loss: 0.4607 - val_accuracy: 0.8784 - val_auc: 0.9478 - val_auprc: 0.9473 - val_loss: 0.2981 - learning_rate: 5.0000e-04
Epoch 2/8
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 746ms/step - accuracy: 0.9151 - auc: 0.9688 - auprc: 0.9685 - loss: 0.2239 - val_accuracy: 0.8480 - val_auc: 0.9526 - val_auprc: 0.9515 - val_loss: 0.3622 - learning_rate: 5.0000e-04
Epoch 3/8
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 663ms/step - accuracy: 0.9530 - auc: 0.9871 - auprc: 0.9862 - loss: 0.1362 - val_accuracy: 0.8814 - val_auc: 0.9513 - val_auprc: 0.9489 - val_loss: 0.3068 - learning_rate: 5.0000e-04
Epoch 4/8
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2511s[0m 8s/step - accuracy: 0.9773 - auc: 0.9956 - auprc: 0.9952 - loss: 0.0733 - val_accuracy: 0.8802 - val_auc: 0.9439 - val_auprc: 0.9312 - val

### Evaluate

In [55]:
# Score the model on the test split; the returned list (loss followed by the
# compiled metrics) is shown as the cell output.
test_inputs = {'seq': X_test, 'len_features': num_test}
model.evaluate(test_inputs, y_test, verbose=1)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2751s[0m 4s/step - accuracy: 0.8285 - auc: 0.9387 - auprc: 0.9354 - loss: 0.3903


[0.3903196454048157,
 0.8285199999809265,
 0.9387007355690002,
 0.9354233145713806]

In [56]:
# Per-epoch training and validation curves, rounded to four decimals.
hist = history.history
for metric_name in ('auprc', 'val_auprc', 'loss', 'val_loss'):
    rounded = [round(value, 4) for value in hist[metric_name]]
    print(metric_name, rounded)

auprc [0.8613, 0.9685, 0.9862, 0.9952]
val_auprc [0.9473, 0.9515, 0.9489, 0.9312]
loss [0.4607, 0.2239, 0.1362, 0.0733]
val_loss [0.2981, 0.3622, 0.3068, 0.4359]


In [57]:
# Metrics of the last epoch that ran. Because EarlyStopping uses
# restore_best_weights=True, these need not describe the weights the model
# ends up with.
hist = history.history
final_train = (hist['accuracy'][-1], hist['loss'][-1])
final_val = (hist['val_accuracy'][-1], hist['val_loss'][-1])
print("last train:", *final_train)
print("last val:", *final_val)

last train: 0.9773499965667725 0.0733182281255722
last val: 0.8802000284194946 0.4359225630760193
