# Introducción
En aquest notebook s'analitza l'efecte de la mida de l'embedding sobre el model base, amb les dades preprocessades segons la configuració que ha resultat ser la millor.
A més, es compara el resultat de balancejar i no balancejar les classes amb el model final aconseguit.

# Llibreries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense, Dropout
import matplotlib.pyplot as plt

from jiahao_funcs2 import *

# Data and preprocess

In [None]:
# Disabled setup cell: the shell commands below are wrapped in a string literal,
# so nothing is executed. As written, they would install gdown, download a file
# from Google Drive by id, and extract nlu_ATIS_data.tar.gz.
"""!pip install gdown
!gdown "https://drive.google.com/uc?id=1u2wzXvsuscLeFHwXcDwMDaNDy0u_99-t"
!tar -zxf nlu_ATIS_data.tar.gz"""

In [None]:
# Load the raw CSV splits. The training file is parsed once and split by
# position: the first N_TRAIN_ROWS rows are the training set and the last
# N_VAL_ROWS rows are the validation set.
# NOTE: the two slices are disjoint and cover the whole file only when
# train.csv has exactly N_TRAIN_ROWS + N_VAL_ROWS (4978) rows.
N_TRAIN_ROWS = 4078
N_VAL_ROWS = 900

full_train_data = pd.read_csv('./data/train.csv', header=None)
val_data = full_train_data.tail(N_VAL_ROWS)
# .copy() keeps the training frame independent of the full frame, as it was
# when it came from its own read_csv call.
train_data = full_train_data.head(N_TRAIN_ROWS).copy()
test_data = pd.read_csv('./data/test.csv', header=None)

print('-------------- Dataset original --------------')
print('Training size:', len(train_data))
print('Validation dataset size:', len(val_data))
print('Test dataset size:', len(test_data))


In [None]:
# Run the project preprocessing helper over the three splits (num_words=300)
# and print a summary of the dictionary it returns.
data_intent_recognition = preprocess_intent_recognition(
    train_data,
    val_data,
    test_data,
    num_words=300,
)

print('-------------- Dataset preprocessed --------------')
for summary_label, summary_key in (
    ('Vocab size:', 'vocab_size'),
    ('Maxlen:', 'maxlen'),
    ('Num classes:', 'num_classes'),
):
    print(summary_label, data_intent_recognition[summary_key])

# Shapes of the X / y entries of each split, in train, val, test order.
for split_name in ('train', 'val', 'test'):
    print(
        data_intent_recognition[f'{split_name}_X'].shape,
        data_intent_recognition[f'{split_name}_y'].shape,
    )

# Embeddings

In [None]:
def model_build(model, num_classes):
    """Append the classification head used for the embedding-size comparison.

    Adds, in order, a global max pooling layer, a 128-unit ReLU dense layer
    and a softmax output layer with ``num_classes`` units.

    Args:
        model: Object exposing ``add`` (presumably a Keras ``Sequential`` that
            already holds the embedding layer -- confirm against
            ``provar_embeddings``).
        num_classes: Number of units of the softmax output layer.

    Returns:
        The same ``model`` object, modified in place.
    """
    # Imported explicitly: the notebook's import cell does not list this
    # layer, so the name would otherwise resolve only if the star import from
    # jiahao_funcs2 happens to export it.
    from tensorflow.keras.layers import GlobalMaxPooling1D

    model.add(GlobalMaxPooling1D(data_format='channels_last'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    return model
# Evaluate the listed embedding sizes with the head defined by model_build
# (presumably one training per size and run -- confirm in provar_embeddings).
results = provar_embeddings(
    model_build,
    preprocessed_data=data_intent_recognition,
    embedding_dims=[32, 64, 128, 256, 384, 512],
    batch_size=32,
    epochs=30,
    patience=5,
    runs=5,
)

# Balanceo de clases
Usar el parámetro `class_weight` de `model.fit`, p. ej. `model.fit(..., class_weight=class_weights)`.

In [None]:
# Derive per-class weights from the training labels with the project helper
# and list them rounded to two decimals.
train_encoded_labels = data_intent_recognition['train_y']
class_weights = calculate_class_weights(train_encoded_labels)

print('Class weights:')
for cls_id, cls_weight in class_weights.items():
    rounded_weight = np.round(cls_weight, 2)
    print(f'{cls_id}: {rounded_weight}')

In [None]:
# Definition of the best model found so far
def crear_model_rnn_con_dropout(num_classes, vocab_size, maxlen, embedding_dim=256, dropout_rate=0.1):
    """Build the bidirectional-GRU classifier with configurable dropout.

    Layer stack, in order: Embedding -> Bidirectional(GRU(32)) -> Dropout ->
    Dense(128, relu) -> Dropout -> Dense(num_classes, softmax).

    Args:
        num_classes: Number of units of the softmax output layer.
        vocab_size: ``input_dim`` of the embedding layer.
        maxlen: Not used in the body; kept in the signature.
        embedding_dim: ``output_dim`` of the embedding layer.
        dropout_rate: Rate shared by both dropout layers (the one after the
            RNN and the one before the output layer).

    Returns:
        The ``Sequential`` model. It is NOT compiled: no ``compile`` call is
        made here.
    """
    # NOTE(review): a compile step (Adam, categorical cross-entropy, macro F1
    # metric) existed only as commented-out code, so compilation is presumably
    # done by the caller -- confirm.
    layer_stack = [
        # Token embedding
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        # Recurrent encoder: GRU with 32 units, wrapped as bidirectional
        Bidirectional(GRU(32, return_sequences=False)),
        # Dropout right after the recurrent encoder
        Dropout(dropout_rate),
        # Hidden dense layer
        Dense(128, activation='relu'),
        # Dropout right before the output layer
        Dropout(dropout_rate),
        # Output layer
        Dense(num_classes, activation='softmax'),
    ]
    return Sequential(layer_stack)


In [None]:
# Compare training without class weights (None) against training with the
# computed class_weights, using the RNN model factory.
# NOTE: this rebinds `results`, which earlier held the provar_embeddings output.
results, test_evaluations = probar_class_weights(
    crear_model_rnn_con_dropout,
    preprocessed_data=data_intent_recognition,
    class_weights_list=[None, class_weights],
    batch_size=32,
    epochs=30,
    patience=5,
    runs=5,
)