# Importing necessary packages

In [1]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import gensim.downloader as api

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, log_loss
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder



# General Functions

In [2]:
def word_vector(df_input, lemmatizer, word_vectors, vocabulary, col_sentences):
    """
    Función para preprocesar las palabras de entrada y obtener una lista con las matrices de embeddings
    de las palabras de cada registro.
    
    Parameters
    ----------
    df_input : dataframe
    dataframe de entrada con todos los textos.
    lemmatizer : object
    objeto del lematizador de NLTK.
    word_vectors : object
    objecto con los word2vec del vocabulario de Gensim.
    vocabulary : list
    lista con las palabras existentes en el vocabulario de Gensim.
    col_sentences : str
    columna del dataframe donde están las frases.
    
    Returns
    -------
    
    X : list
    Lista de listas en las que cada registro tiene la lista con los arrays de los embeddings de
    las palabras de esa frase. 
    Es decir, X[0] tiene una lista donde cada elemento corresponde a los embeddings de una palabra.
    Así, por ejemplo, X[0][2] será un vector de dimensión 100 donde aparece el vector de embeddings de
    la tercera palabra de la primera frase.
    """
    
    X = []
    
    for text in df_input[col_sentences]:
        # Tokenizo cada frase
        words = re.findall(r'\w+', text.lower(),flags = re.UNICODE) # Paso a minusculas todo
        # Eliminación de las stop_words
        words = [word for word in words if word not in stopwords.words('english')]
        # Elimino guiones y otros simbolos raros 
        words = [word for word in words if not word.isdigit()] # Elimino numeros
        # Stemming
        words = [lemmatizer.lemmatize(w) for w in words]
        # Eliminar palabras que no estén en el vocabulario
        words = [word for word in words if word in vocabulary]
        # Word2Vec
        words_embeddings = [word_vectors[x] for x in words]
        # Guardo la frase final
        X.append(words_embeddings) # lo guardo como un numpy array
        
    return X

In [3]:
def create_RNN(x_train, K, n_lstm=8, loss='categorical_crossentropy', optimizer='adam'):
    """
    Función para crear la RNN. Como parámetro de entrada sólo necesita la matriz de features para 
    especificar la dimensionalidad de entrada de la NN.
    
    Parameters
    ----------
    x_input : array
    Matriz de features de entrada.
    K: int
    Clases de salida
    n_lstm : int, optional
    Number of lstm used. The default is 8.
    loss : string, optional
    loss metric. The default is 'categorical_crossentropy'.
    optimizer : string, optional
    optimizer. The default is 'adam'.
    
    Returns
    -------
    model : object
    Trained model
    """
    
    # Begin sequence
    model = tf.keras.Sequential()
    # Add a LSTM layer with 8 internal units.
    model.add(LSTM(n_lstm, input_shape=x_train.shape[-2:]))
    # Output
    model.add(Dense(K, activation='sigmoid'))
    # Compile model
    model.compile(loss=loss, optimizer=optimizer)
    
    return model

# I. Load files

In [4]:
tf.random.set_seed(42)
path='Data_Sources/NLP_Classifier_TrainData.csv'
df_raw = pd.read_csv(path, encoding = 'latin-1', header=None, skipinitialspace=True, skiprows=1)
df_transformed = df_raw.iloc[:, 3:]
df_transformed.columns = ['text', 'category']

In [5]:
df_transformed['text'][10]

'Three people died from the heat wave so far'

In [6]:
df_transformed['category'].value_counts()/len(df_transformed)

0    0.57034
1    0.42966
Name: category, dtype: float64

## Shuffle input

In [7]:
df_transformed = df_transformed.sample(frac=1)

## Load Word2Vec

In [8]:
word_vectors = api.load('glove-wiki-gigaword-100')
vocabulary = [x for x in word_vectors.key_to_index]

## Set lemmatizer

In [9]:
lemmatizer = WordNetLemmatizer()

## X/y split

In [10]:
X = pd.DataFrame(df_transformed['text'])
y = pd.to_numeric(df_transformed['category'])

# II. Preprocess
## Obtain X variable and prepare y

In [11]:
X = word_vector(X, lemmatizer, word_vectors, vocabulary, col_sentences='text')

## Train/Test split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Obtain tensorr: [N_SENTENCES x SEQ_LENGTH x EMBEDDING_FEATURES]

In [13]:
SEQ_LENGTH = np.int(np.round(np.percentile([len(x) for x in X], 99, interpolation = 'midpoint')))
data_train = pad_sequences(X_train, maxlen=SEQ_LENGTH, padding="post", truncating="post")
data_test = pad_sequences(X_test, maxlen=SEQ_LENGTH, padding="post", truncating="post")

# III. Train model
## Params

In [24]:
K = 1
batch_size = 50
epochs = 5

## Create RNN

In [25]:
model = create_RNN(x_train = data_train, K = K, n_lstm = 50, loss = 'binary_crossentropy', optimizer = 'adam')
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 30,251
Trainable params: 30,251
Non-trainable params: 0
_________________________________________________________________


## Fit Model

In [26]:
model.fit(data_train, y_train, epochs = epochs, batch_size = batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fb29c1105e0>

In [27]:
y_train

7202    0
1630    0
4677    0
3240    0
2967    0
       ..
2963    0
475     1
7116    1
7222    1
1416    1
Name: category, Length: 5709, dtype: int64

In [28]:
data_train

array([[[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  1,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       ...,

       [[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  1,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0

In [29]:
df_transformed['category']

3502    1
5068    1
3722    0
6064    1
3641    1
       ..
4186    0
6068    1
7124    1
1746    0
1203    0
Name: category, Length: 7613, dtype: int64

## Save model

In [30]:
model.save('Models/model_nlp_disaster.h5')

# IV. Evaluate
## Obtain predictions

In [31]:
y_pred = model.predict(data_test)

## Round predictions

In [33]:
#y_pred = y_pred.round()
#y_pred = [x[0] for x in y_pred]
#y_test = list(y_test.values)

AttributeError: 'list' object has no attribute 'round'

## Evaluate results

In [34]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", cm)
print("Precision: ", np.round(precision_score(y_test, y_pred, average='macro'), 4))
print("Recall: ", np.round(recall_score(y_test, y_pred, average='macro'), 4))
print("f1_score: ", np.round(f1_score(y_test, y_pred, average='macro'), 4))

Confusion Matrix:  [[951 108]
 [318 527]]
Precision:  0.7897
Recall:  0.7608
f1_score:  0.7646
