### Introducción

Este cuaderno cubre el proceso de desarrollo de un modelo de NLP (LSTM) para predecir el sentimiento de los tweets relacionados con el cambio climático. Cubrirá cuatro fases principales:

- Modelado y entrenamiento: donde se construirá el modelo.
- Optimización: donde se registrarán y compararán las métricas adquiridas para diferentes combinaciones de las fases anteriores.

### Importación de las funciones y librerías pertinentes

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
import numpy  as np
import pandas as pd

### Carga de los datos

In [None]:
# Characters that belong to the stringified-list syntax (quotes and brackets);
# they are deleted from every token in a single translate pass.
_LIST_SYNTAX_CHARS = str.maketrans('', '', "'[]")


def _parse_token_list(serialized):
    """Split a stringified token list on ', ' and strip quotes/brackets from each piece."""
    return [piece.translate(_LIST_SYNTAX_CHARS) for piece in serialized.split(', ')]


# Load the preprocessed tweets, keeping only the token column and the label.
df = pd.read_csv('preprocessed_tweets_lstm_rnn.csv')[
    ['RawContent', 'Sentiment']
]
# 'RawContent' is stored as text in the CSV; rebuild a list of tokens per row.
df['RawContent'] = df['RawContent'].apply(_parse_token_list)

In [None]:
# Drop the neutral tweets (label 1) so the task becomes binary.
neutral_mask = df['Sentiment'] == 1
df = df[~neutral_mask]

# Relabel the remaining class 2 as 1, leaving the labels as {0, 1}.
# NOTE(review): presumably 0 = negative and 2 = positive -- confirm against the
# preprocessing step that produced the CSV.
relabel_mask = df['Sentiment'] == 2
df.loc[relabel_mask, 'Sentiment'] = 1

### Entrenamiento del modelo

In [None]:
# Model inputs (token lists) and targets (sentiment labels).
texts, labels = df['RawContent'], df['Sentiment']

In [None]:
# Tokenization: learn Word2Vec embeddings from the tweets and convert every
# tweet into a fixed-length sequence of vocabulary indices.

word2vec_model = Word2Vec(texts, vector_size=100, window=5, min_count=2)
# NOTE(review): the constructor is already given the corpus, so this explicit
# train() call looks like an additional training pass -- confirm it is intended.
word2vec_model.train(texts, total_examples=len(texts), epochs=5)

# Shift every vocabulary index by one so that index 0 stays free for words that
# are not in the vocabulary; row 0 of the embedding matrix is all zeros.
word_indices = {
    token: position + 1
    for token, position in word2vec_model.wv.key_to_index.items()
}
vector_dim = word2vec_model.wv.vectors.shape[1]
zero_row = np.zeros((1, vector_dim))
word_vectors = np.vstack([zero_row, word2vec_model.wv.vectors])

# Apply the Word2Vec vocabulary to the texts; unknown words map to index 0.
sequences = [
    [word_indices.get(token, 0) for token in tweet]
    for tweet in texts
]

# Pad the sequences so they all share the same length: one position longer
# than the longest tweet.
max_length = max(map(len, sequences)) + 1
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [None]:
# One-hot encode the labels so they match the 2-unit softmax output layer and
# the categorical cross-entropy loss used when the model is compiled.
labels = to_categorical(labels)

In [None]:
# Split into training and test sets: 20% is held out for testing, and the
# fixed random_state keeps the partition the same between runs.
split_parts = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=17,
)
tweets_train, tweets_test, labels_train, labels_test = split_parts

In [None]:
# Empty table that collects one row of metrics per hyperparameter combination.
result_columns = ['learning_rate', 'number_of_neurons', 'loss', 'accuracy']
results = pd.DataFrame(columns=result_columns)

In [None]:
# Hyperparameter grid: every learning rate is tried with every LSTM width.
learning_rates = [0.05, 0.01, 0.005, 0.001]
neurons = [32, 64, 128, 256]

# Take the embedding geometry from the pretrained matrix itself so the layer
# can never disagree with the Word2Vec configuration.
vocab_size, embedding_dim = word_vectors.shape

for n in neurons:
    for lr in learning_rates:

        print(f'Learning rate: {lr}, Number of neurons: {n}')

        # Release the graph/state of the previous iteration's model; without
        # this, memory keeps growing while the loop builds model after model.
        tf.keras.backend.clear_session()

        # Sequential model: a stack of layers where each one receives the
        # output of the previous layer as its input.
        model = Sequential()

        # Frozen embedding layer initialised with the Word2Vec vectors.
        model.add(Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            embeddings_initializer=Constant(word_vectors),
            trainable=False)
        )
        # Three stacked LSTMs; only the last one collapses the sequence into a
        # single vector for the classifier head.
        model.add(LSTM(n, return_sequences=True))
        model.add(LSTM(n*2, return_sequences=True))
        model.add(LSTM(n))
        model.add(Dense(2, activation='softmax'))

        # Stop after 4 epochs without improvement and roll back to the best
        # epoch, so the evaluated weights are the best ones seen rather than
        # the ones from 4 epochs later.
        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=4,
            restore_best_weights=True,
        )

        model.compile(
            optimizer=Adam(learning_rate=lr),
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        # Early stopping is driven by a slice of the *training* data. Using
        # the test set as validation data would leak it into the stopping
        # decision and make the metrics reported below optimistic.
        model.fit(tweets_train, labels_train,
                  batch_size=64, epochs=100,
                  verbose=1,
                  validation_split=0.1, callbacks=[early_stop])

        # The test set is only touched here, for the final score of this
        # hyperparameter combination.
        loss, accuracy = model.evaluate(tweets_test, labels_test)

        new_row = pd.DataFrame({'learning_rate': [lr], 'number_of_neurons': [n],
                                'loss': [loss], 'accuracy': [accuracy]})

        # Concatenating with an empty frame is deprecated in recent pandas, so
        # the first row simply becomes the table.
        if results.empty:
            results = new_row
        else:
            results = pd.concat([results, new_row], ignore_index=True)

        # Persist after every combination so partial results survive an
        # interrupted run.
        results.to_csv('results_lstm_binary.csv')