# Importing packages

In [1]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import gensim.downloader as api

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, log_loss
from sklearn.preprocessing import LabelEncoder



# I. Load Data
## Load files

In [2]:
tf.random.set_seed(42)
path_files = './Data_Sources'
df_raw = pd.read_csv(path_files+'/all-data.csv', encoding= 'latin-1', header=None)
df_raw.columns = ['category', 'text']
#Shuffle input
df_raw = df_raw.sample(frac=1)

## Load Word2Vec

In [3]:
word_vectors = api.load('glove-wiki-gigaword-100')
vocabulary = [x for x in word_vectors.key_to_index]
lemmatizer = WordNetLemmatizer()

## Check embeddings of one word

In [4]:
vector = word_vectors['computer']
print(vector)

[-1.6298e-01  3.0141e-01  5.7978e-01  6.6548e-02  4.5835e-01 -1.5329e-01
  4.3258e-01 -8.9215e-01  5.7747e-01  3.6375e-01  5.6524e-01 -5.6281e-01
  3.5659e-01 -3.6096e-01 -9.9662e-02  5.2753e-01  3.8839e-01  9.6185e-01
  1.8841e-01  3.0741e-01 -8.7842e-01 -3.2442e-01  1.1202e+00  7.5126e-02
  4.2661e-01 -6.0651e-01 -1.3893e-01  4.7862e-02 -4.5158e-01  9.3723e-02
  1.7463e-01  1.0962e+00 -1.0044e+00  6.3889e-02  3.8002e-01  2.1109e-01
 -6.6247e-01 -4.0736e-01  8.9442e-01 -6.0974e-01 -1.8577e-01 -1.9913e-01
 -6.9226e-01 -3.1806e-01 -7.8565e-01  2.3831e-01  1.2992e-01  8.7721e-02
  4.3205e-01 -2.2662e-01  3.1549e-01 -3.1748e-01 -2.4632e-03  1.6615e-01
  4.2358e-01 -1.8087e+00 -3.6699e-01  2.3949e-01  2.5458e+00  3.6111e-01
  3.9486e-02  4.8607e-01 -3.6974e-01  5.7282e-02 -4.9317e-01  2.2765e-01
  7.9966e-01  2.1428e-01  6.9811e-01  1.1262e+00 -1.3526e-01  7.1972e-01
 -9.9605e-04 -2.6842e-01 -8.3038e-01  2.1780e-01  3.4355e-01  3.7731e-01
 -4.0251e-01  3.3124e-01  1.2576e+00 -2.7196e-01 -8

In [5]:
def word_vector(df_input, lemmatizer, word_vectors, vocabulary, col_sentences):
    """
    Función para preprocesar las palabras de entrada y obtener una lista con las matrices de embeddings
    de las palabras de cada registro.
    
    Parameters
    ----------
    df_input : dataframe
    dataframe de entrada con todos los textos.
    lemmatizer : object
    objeto del lematizador de NLTK.
    word_vectors : object
    objecto con los word2vec del vocabnbulario de Gensim.
    vocabulary : list
    lista con las palabras existentes en el vocabulario de Gensim.
    col_sentences : str
    columna del dataframe donde están las frases.
    
    Returns
    -------
    X : list
    Lista de listas en las que cada registro tiene la lista con los arrays de los embeddings de
    las palabras de esa frase. 
    Es decir, X[0] tiene una lista donde cada elemento corresponde a los embeddings de una palabra. 
    Así, por ejemplo, X[0][2] será un vector de dimensión 100 donde aparece el vector de embeddings
    de la tercera palabra de la primera frase.
    """
    
    X = []
    for text in df_input[col_sentences]:
        # Tokenizo cada frase + Paso a minusculas todo
        words = re.findall(r'\w+', text.lower(),flags = re.UNICODE)
        # Eliminación de las stopwords
        words = [word for word in words if word not in stopwords.words('english')]
        # Elimino numeros
        words = [word for word in words if not word.isdigit()]
        #Lemmatization
        words = [lemmatizer.lemmatize(w) for w in words]
        #Eliminar palabras que no esten en el vocabulario
        words = [word for word in words if word in vocabulary]
        #Word2Vec
        words_embeddings = [word_vectors[x] for x in words]
        #Guardo la frase final
        X.append(words_embeddings) # lo guardo como un numpy array
    
    return X

# II. Preprocess
## Obtain X and Y variables

In [6]:
X = word_vector(df_raw, lemmatizer, word_vectors, vocabulary, col_sentences="text")

## Label Encoding

In [8]:
lb = LabelEncoder()
df_raw['category'] = lb.fit_transform(df_raw['category'])
y = df_raw['category']

## One-hot encode output

In [9]:
y = to_categorical(y)

## Train/Test split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Obtain tensor: [N_SENTENCES x SEQ_LENGTH x EMBEDDING_FEATURES]

In [11]:
SEQ_LENGTH = np.int(np.round(np.percentile([len(x) for x in X], 99, interpolation='midpoint')))

data_train = pad_sequences(X_train, maxlen=SEQ_LENGTH, padding='post', truncating='post')
data_test = pad_sequences(X_test, maxlen=SEQ_LENGTH, padding='post', truncating='post')

In [13]:
def create_RNN(X_train, K, n_lstm=8, loss='categorical_crossentropy', optimizer='adam'):
    """
    Función para crear la RNN. Como parámetro de entrada sólo necesita la matriz de features
    para especificar la dimensionalidad de entrada de la NN.
    
    Parameters
    ----------
    x_input : array
    Matriz de features de entrada.
    K: int
    Clases de salida
    n_lstm : int, optional
    Number of lstm used. The default is 8.
    loss : string, optional
    Métrica de perdida. El default es 'categorical_crossentropy'.
    optimizer : string, optional
    Optimizador. El default es 'adam'.
    """
    #Begin sequence
    model = tf.keras.Sequential()
    
    #Add a LSTM layer with 8 internal units
    model.add(LSTM(n_lstm, input_shape=X_train.shape[-2:]))
    
    #Output
    model.add(Dense(K, activation='softmax'))
    
    #Compile model
    model.compile(loss=loss, optimizer=optimizer)
    
    return model

# III. Train model
## Params

In [14]:
K = y_train.shape[1] # N classes
batch_size = 200
epochs = 50

## Create RNN

In [15]:
model = create_RNN(X_train = data_train, K = K, n_lstm=50)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50)                30200     
_________________________________________________________________
dense (Dense)                (None, 3)                 153       
Total params: 30,353
Trainable params: 30,353
Non-trainable params: 0
_________________________________________________________________


## Save model

In [16]:
model.save('Models/model_nlp_reviews.h5')

# IV. Evaluate
## Obtain predictions

In [17]:
y_pred = model.predict(data_test)

## Obtain original values (not one-hot encoded)

In [18]:
y_test = [np.argmax(x) for x in y_test]
y_pred = [np.argmax(x) for x in y_pred]

## Evaluate results

In [19]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", cm)
print("Precision: ", np.round(precision_score(y_test, y_pred, average='macro'), 4))
print("Recall: ", np.round(recall_score(y_test, y_pred, average='macro'), 4))
print("f1_score: ", np.round(f1_score(y_test, y_pred, average='macro'), 4))

Confusion Matrix:  [[104  15  35]
 [591  62  69]
 [268  24  44]]
Precision:  0.3397
Recall:  0.2974
f1_score:  0.1729
