# Laboratorio 4
### Mejorando el Análisis de Sentimientos con LSTM y Características Adicionales

### Importación de datos

In [19]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.regularizers import l2
import ssl

In [20]:
max_features = 50_000 # Palabras más frecuentes

In [21]:
ssl._create_default_https_context = ssl._create_unverified_context
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)

In [22]:
word_index = imdb.get_word_index()
word_index_inv = {v: k for k, v in word_index.items()}

In [23]:
for w in X_train[10]:
    print(word_index_inv.get(w - 3, '?'), end=' ')

? french horror cinema has seen something of a revival over the last couple of years with great films such as inside and switchblade romance bursting on to the scene maléfique preceded the revival just slightly but stands head and shoulders over most modern horror titles and is surely one of the best french horror films ever made maléfique was obviously shot on a low budget but this is made up for in far more ways than one by the originality of the film and this in turn is complimented by the excellent writing and acting that ensure the film is a winner the plot focuses on two main ideas prison and black magic the central character is a man named carrère sent to prison for fraud he is put in a cell with three others the quietly insane lassalle body building transvestite marcus and his retarded boyfriend daisy after a short while in the cell together they stumble upon a hiding place in the wall that contains an old journal after translating part of it they soon realise its magical power

### Pre-procesamiento

In [24]:
# Se seleccionan algunas palabras positivas y negativas vistas en las reseñas
positive_words = ['good', 'great', 'excellent', 'amazing', 'love', 'wonderful', 'brilliant', 'loved', 'recommend', 'lovely', 'memorable']
negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'horrible', 'poor', 'claustrophobic', 'frightening', 'boring', 'lame']

# Convertir las listas de palabras a índices
positive_indices = [word_index[word] for word in positive_words if word in word_index and word_index[word] < max_features]
negative_indices = [word_index[word] for word in negative_words if word in word_index and word_index[word] < max_features]

#Función para extraer características de las secuencias
def extract_features(sequences):
    features = []
    for seq in sequences:
        length = len(seq)
        pos_count = sum(1 for word in seq if word in positive_indices)
        neg_count = sum(1 for word in seq if word in negative_indices)
        
        pos_ratio = pos_count / length if length > 0 else 0
        neg_ratio = neg_count / length if length > 0 else 0
        
        features.append([length, pos_ratio, neg_ratio])
    return np.array(features)

train_features = extract_features(X_train)
test_features = extract_features(X_test)

In [25]:
train_features

array([[2.18000000e+02, 4.58715596e-03, 0.00000000e+00],
       [1.89000000e+02, 5.29100529e-03, 0.00000000e+00],
       [1.41000000e+02, 7.09219858e-03, 7.09219858e-03],
       ...,
       [1.84000000e+02, 2.71739130e-02, 0.00000000e+00],
       [1.50000000e+02, 1.33333333e-02, 1.33333333e-02],
       [1.53000000e+02, 6.53594771e-03, 0.00000000e+00]])

In [26]:
len(X_train)

25000

In [27]:
len(X_test)

25000

In [28]:
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

#### Longitud uniforme

In [29]:
maxlen = 500 # Número máximo de palabras por comentario

In [30]:

X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

#### Unión de features con el texto

In [31]:
# Unir las características extraídas con las secuencias de palabras
X_train = np.concatenate([X_train, train_features], axis=1)
X_test = np.concatenate([X_test, test_features], axis=1)

### Modelo

In [32]:
modelo = Sequential()
modelo.add(Embedding(20000, 128))
modelo.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
modelo.add(LSTM(56, dropout=0.2, recurrent_dropout=0.2))
modelo.add(Dense(1, activation='sigmoid'))

In [33]:
modelo.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])