# Preparación del dataset y parámetros del modelo

In [0]:
# Command-line flags: hyper-parameters, data locations and runtime options.
# Each row is (definer, flag name, default value, help text); the loop below
# registers the flags in exactly the order they are listed.
_FLAG_SPECS = [
    (tf.flags.DEFINE_float, "learning_rate", 0.05, "Learning rate for the optimizer."),
    (tf.flags.DEFINE_float, "max_grad_norm", 5.0, "Clip gradients to this norm."),
    (tf.flags.DEFINE_integer, "evaluation_interval", 1, "Evaluate and print results every x epochs"),
    (tf.flags.DEFINE_integer, "batch_size", 128, "Batch size for training."),
    (tf.flags.DEFINE_integer, "epochs", 800, "Number of epochs to train for."),
    (tf.flags.DEFINE_integer, "embedding_size", 20, "Embedding size for embedding matrices."),
    (tf.flags.DEFINE_integer, "sentence_len", 50, "Maximum len of sentence."),
    (tf.flags.DEFINE_string, "task", "Sentihood", "Sentihood"),
    (tf.flags.DEFINE_integer, "random_state", 67, "Random state."),
    (tf.flags.DEFINE_string, "data_dir", "data/sentihood/", "Directory containing Sentihood data"),
    (tf.flags.DEFINE_string, "opt", "ftrl", "Optimizer [ftrl]"),
    (tf.flags.DEFINE_string, "embedding_file_path", None, "Embedding file path [None]"),
    (tf.flags.DEFINE_boolean, "update_embeddings", False, "Update embeddings [False]"),
    (tf.flags.DEFINE_boolean, "case_folding", True, "Case folding [True]"),
    (tf.flags.DEFINE_integer, "n_cpus", 6, "N CPUs [6]"),
    (tf.flags.DEFINE_integer, "n_keys", 7, "Number of keys [7]"),
    (tf.flags.DEFINE_integer, "n_tied", 2, "Number of tied keys [2]"),
    (tf.flags.DEFINE_float, "entnet_input_keep_prob", 0.8, "entnet input keep prob [0.8]"),
    (tf.flags.DEFINE_float, "entnet_output_keep_prob", 1.0, "entnet output keep prob [1.0]"),
    (tf.flags.DEFINE_float, "entnet_state_keep_prob", 1.0, "entnet state keep prob [1.0]"),
    (tf.flags.DEFINE_float, "final_layer_keep_prob", 0.8, "final layer keep prob [0.8]"),
    (tf.flags.DEFINE_float, "l2_final_layer", 1e-3, "Lambda L2 final layer [1e-3]"),
]
for _define_flag, _flag_name, _flag_default, _flag_help in _FLAG_SPECS:
    _define_flag(_flag_name, _flag_default, _flag_help)

**logger** se usa para registrar mensajes de seguimiento y depuración<br> 
**assert** verifica condiciones que deben cumplirse; si alguna no se cumple, se lanza un `AssertionError`

In [0]:
# Root logger: emit DEBUG and above through a stream handler with timestamped records.
# All statements sit at the cell's top level so the cell parses on its own
# (a column-0 first line followed by indented lines is an IndentationError).
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

# Record how the run was launched and with which configuration.
logger.info(" ".join(sys.argv))
logger.info("Started Task: %s", FLAGS.task)

# NOTE(review): `__flags` is a private attribute of the flags object; confirm it
# still exposes the flag values on the installed TensorFlow version.
logger.info(pp.pformat(FLAGS.__flags))

# Use the same thread budget for intra-op and inter-op parallelism.
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=FLAGS.n_cpus,
    inter_op_parallelism_threads=FLAGS.n_cpus,
)

# Aspect name -> integer id, passed to the data loader.
aspect2idx = {
    'general': 0,
    'price': 1,
    'transit-location': 2,
    'safety': 3,
}

# Sanity checks on the flag values: at least two keys, and exactly two tied keys.
assert FLAGS.n_keys >= 2
assert FLAGS.n_tied == 2

**load_task** está en **data_utils_sentihood.py** y se encarga de leer y tokenizar las reseñas de los archivos de entrenamiento, validación y prueba. **FLAGS.data_dir** es la ubicación de los archivos y **aspect2idx** asigna un id a cada aspecto: <br>
'general': 0, 'price': 1, 'transit-location': 2, 'safety': 3

In [0]:
    # Open the TensorFlow session used by the rest of the pipeline.
    with tf.Session(config=session_conf) as sess:

        # Seed NumPy's global RNG from the --random_state flag.
        np.random.seed(FLAGS.random_state)

        # load_task returns three (examples, aspect_idx) pairs: train, validation, test.
        train_split, val_split, test_split = load_task(FLAGS.data_dir, aspect2idx)
        train, train_aspect_idx = train_split
        val, val_aspect_idx = val_split
        test, test_aspect_idx = test_split

        # Optional case folding, applied to every split in the same order as before.
        if FLAGS.case_folding:
            train, val, test = [lower_case(split) for split in (train, val, test)]

        # Concatenation of all splits, used for corpus-wide statistics.
        data = train + val + test

Se obtiene el tamaño máximo de las oraciones, objetivos (entidades) y aspectos para establecer el tamaño de las cadenas de memorias y los embeddings de los objetivos (entidades) y aspectos.

In [0]:
        # Longest field-1 entry over all examples (the sentence, judging by the
        # variable name), capped by the --sentence_len flag.
        longest_sentence = max(len(example[1]) for example in data)
        max_sentence_len = min(FLAGS.sentence_len, longest_sentence)
        logger.info('Max sentence len: %d', max_sentence_len)

        # Targets are fixed at length one.
        max_target_len = 1
        # Longest field-3 entry over all examples; the pipeline requires it to be exactly 2.
        max_aspect_len = max(len(example[3]) for example in data)
        assert max_aspect_len == 2
        logger.info('Max target size: %d', max_target_len)
        logger.info('Max aspect size: %d', max_aspect_len)

Obteniendo embeddings 

In [0]:
        # An embedding file is mandatory: the vocabulary is built from it.
        assert FLAGS.embedding_file_path is not None

        word_vocab = EmbeddingVocabulary(in_file=FLAGS.embedding_file_path)
        # Processor bound to this vocabulary and to the maximum sentence length.
        word_vocab_processor = EmbeddingVocabularyProcessor(
            max_document_length=max_sentence_len, vocabulary=word_vocab)

        embedding_mat = word_vocab.embeddings
        # The embedding width comes from the loaded matrix (its second dimension).
        embedding_size = word_vocab.embeddings.shape[1]

Generando IDs de las etiquetas

In [0]:
        # Label vocabulary plus the processor that turns labels into ids.
        label_vocab = LabelVocabulary()
        label_vocab_processor = LabelVocabularyProcessor(vocabulary=label_vocab, min_frequency=0)

        # Ids of the three classes, looked up in the same order as before.
        positive_idx, negative_idx, none_idx = [
            label_vocab.get(label_name) for label_name in ('Positive', 'Negative', 'None')
        ]

Transformando los datasets en vectores 

In [0]:
        # Arguments shared by every vectorize_data call, in positional order.
        vectorize_args = (
            max_sentence_len,
            max_target_len,
            max_aspect_len,
            word_vocab_processor,
            label_vocab_processor,
        )

        # Each call returns six aligned outputs: sentences, targets, location
        # indicators, aspects, labels and ids.
        (train_sentences, train_targets, train_loc_indicators,
         train_aspects, train_labels, train_ids) = vectorize_data(train, *vectorize_args)

        (val_sentences, val_targets, val_loc_indicators,
         val_aspects, val_labels, val_ids) = vectorize_data(val, *vectorize_args)

        (test_sentences, test_targets, test_loc_indicators,
         test_aspects, test_labels, test_ids) = vectorize_data(test, *vectorize_args)

        # Vocabulary ids of the two location placeholders, truncated to the target length.
        target_terms = word_vocab_processor.transform(
            [['location1'], ['location2']])[:, :max_target_len]

Imprimiendo informacion de los dataset

In [0]:
        # Sizes handed to the model later on.
        sentence_len = max_sentence_len
        vocab_size = len(word_vocab)
        answer_size = len(label_vocab)

        # (message prefix, array) pairs, logged in the order listed.
        shape_report = [
            ("Training sentences shape ", train_sentences),
            ("Training targets shape ", train_targets),
            ("Training aspects shape ", train_aspects),
            ("Validation sentences shape ", val_sentences),
            ("Validation targets shape ", val_targets),
            ("Validation aspects shape ", val_aspects),
            ("Test sentences shape ", test_sentences),
            ("Test targets shape ", test_targets),
            ("Test aspects shape ", test_aspects),
        ]
        for prefix, array in shape_report:
            logger.info(prefix + str(array.shape))

        # Number of examples per split (first dimension of the sentence arrays).
        n_train, n_val, n_test = [
            array.shape[0] for array in (train_sentences, val_sentences, test_sentences)
        ]

        logger.info("Training Size %d", n_train)
        logger.info("Validation Size %d", n_val)
        logger.info("Testing Size %d", n_test)

Dividiendo el dataset de entrenamiento por su categoría (positivo, negativo, ninguno)

In [0]:
        # Row indices of the training examples belonging to each class.
        train_positive_idx = np.where(train_labels == positive_idx)[0]
        train_negative_idx = np.where(train_labels == negative_idx)[0]
        train_none_idx = np.where(train_labels == none_idx)[0]

        # The four training arrays that are sliced per class, in a fixed order.
        train_fields = (train_sentences, train_targets, train_aspects, train_labels)

        (train_positive_sentences, train_positive_targets,
         train_positive_aspects, train_positive_labels) = [
            field[train_positive_idx] for field in train_fields]

        (train_negative_sentences, train_negative_targets,
         train_negative_aspects, train_negative_labels) = [
            field[train_negative_idx] for field in train_fields]

        (train_none_sentences, train_none_targets,
         train_none_aspects, train_none_labels) = [
            field[train_none_idx] for field in train_fields]

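Verificando el orden esperado de los tamaños de las categorías: ninguno > positivo > negativo (la categoría negativa, la más pequeña, se usa como referencia para el down-sampling)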

In [0]:
        # Expected class-size ordering: none > positive > negative.
        assert len(train_none_idx) > len(train_positive_idx) > len(train_negative_idx)

Tamaños de los grupos formados al dividir el dataset

In [0]:
        # Number of training examples per class.
        n_positive_train, n_negative_train, n_none_train = map(
            len, (train_positive_idx, train_negative_idx, train_none_idx))
        # Down-sampling: from here on the training size is the negative-class size.
        n_train = n_negative_train

        logger.info("Positive training Size %d", n_positive_train)
        logger.info("Negative training Size %d", n_negative_train)
        logger.info("None training Size %d", n_none_train)

Eligiendo un optimizador

In [0]:
        # Build the optimizer selected by --opt ('adam' or 'ftrl').
        if FLAGS.opt == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=FLAGS.learning_rate,
                # No "epsilon" flag is declared with the other flags, so reading
                # FLAGS.epsilon directly raises AttributeError; fall back to
                # AdamOptimizer's own default when the flag is absent.
                epsilon=getattr(FLAGS, 'epsilon', 1e-8),
            )
        elif FLAGS.opt == 'ftrl':
            optimizer = tf.train.FtrlOptimizer(
                learning_rate=FLAGS.learning_rate
            )
        else:
            # Fail here with a clear message rather than with a NameError on
            # `optimizer` when the model is built.
            raise ValueError(
                "Unsupported optimizer %r; expected 'adam' or 'ftrl'." % FLAGS.opt)

Creando el modelo<br>
**answer_size** es el tamaño de la salida, un vector de tamaño 3 (positivo, negativo, ninguno)<br>
**embedding_mat** es la matriz de embeddings del vocabulario<br>
**tied_keys** son los ids en el vocabulario de los objetivos (entidades `location1` y `location2`)

In [0]:
        # Positional constructor arguments, in call order.
        model_args = (
            batch_size,
            vocab_size,
            max_target_len,
            max_aspect_len,
            sentence_len,
            answer_size,
            embedding_size,
        )
        # Keyword options: session, embeddings, memory keys, regularisation and optimisation.
        model_options = dict(
            session=sess,
            embedding_mat=word_vocab.embeddings,
            update_embeddings=FLAGS.update_embeddings,
            n_keys=FLAGS.n_keys,
            tied_keys=target_terms,
            l2_final_layer=FLAGS.l2_final_layer,
            max_grad_norm=FLAGS.max_grad_norm,
            optimizer=optimizer,
            global_step=global_step,
        )
        model = Delayed_EntNet_Sentihood(*model_args, **model_options)

Entrenando el modelo con lotes de cada categoría del dataset de entrenamiento

In [0]:
            # For every (start, end) window in `batches`, run three training steps:
            # the negative slice [start:end], then a random window of positive
            # examples, then a random window of "none" examples.
            # `batches`, `total_cost`, `total_training_instances` and `batch_size`
            # are defined outside this excerpt.
            for start, end in batches:
                # Negative class: the [start:end] slice of the negative examples.
                sentences = train_negative_sentences[start:end]
                targets = train_negative_targets[start:end]
                aspects = train_negative_aspects[start:end]
                answers = train_negative_labels[start:end]
                # The four trailing arguments are the keep-probability flags.
                cost_t = model.fit(sentences, targets, aspects, answers,
                                   FLAGS.entnet_input_keep_prob,
                                   FLAGS.entnet_output_keep_prob,
                                   FLAGS.entnet_state_keep_prob,
                                   FLAGS.final_layer_keep_prob)
                total_cost += cost_t
                total_training_instances += len(train_negative_sentences[start:end])

                # Positive class: a window of `batch_size` consecutive examples at a
                # random offset.
                # NOTE(review): assumes `random` is the stdlib module (inclusive upper
                # bound) and that n_positive_train >= batch_size — confirm. The visible
                # setup seeds only np.random, so these draws may not be reproducible.
                positive_start = random.randint(0, n_positive_train - batch_size)
                positive_end = positive_start + batch_size
                sentences = train_positive_sentences[positive_start:positive_end]
                targets = train_positive_targets[positive_start:positive_end]
                aspects = train_positive_aspects[positive_start:positive_end]
                answers = train_positive_labels[positive_start:positive_end]
                cost_t = model.fit(sentences, targets, aspects, answers, 
                                   FLAGS.entnet_input_keep_prob,
                                   FLAGS.entnet_output_keep_prob,
                                   FLAGS.entnet_state_keep_prob,
                                   FLAGS.final_layer_keep_prob)
                total_cost += cost_t
                total_training_instances += len(train_positive_sentences[positive_start:positive_end])

                # "None" class: same random-window sampling as the positive class.
                none_start = random.randint(0, n_none_train - batch_size)
                none_end = none_start + batch_size
                sentences = train_none_sentences[none_start:none_end]
                targets = train_none_targets[none_start:none_end]
                aspects = train_none_aspects[none_start:none_end]
                answers = train_none_labels[none_start:none_end]
                # NOTE(review): the accumulation of this step's cost is not visible
                # in this excerpt — confirm it follows.
                cost_t = model.fit(sentences, targets, aspects, answers, 
                                   FLAGS.entnet_input_keep_prob,
                                   FLAGS.entnet_output_keep_prob,
                                   FLAGS.entnet_state_keep_prob,
                                   FLAGS.final_layer_keep_prob)