# Char-Word-2-Vec

This tutorial shows an implementation of the standard Word2Vec algorithm with one variation: characters are used as the interface to the model (i.e. for both input and supervision).

This feature brings several advantages, two of which are worth mentioning:
1. it avoids the need for a dictionary of words, which is often not available or too big;
2. exploiting characters allows the capture of morphological similarities between words (which, even for humans, is the first signal of the meaning of a word, especially if unknown).

In order to use characters, this model applies two main variants to the standard model (we will use the CBOW model):
1. the lookup table, which stores the embeddings of words, is substituted by a recurrent neural network over characters; in this way, the goal of the model is to learn the weights of this NN and not the lookup table;
2. the softmax layer over words is substituted with a decoder into characters and a softmax layer over characters (by far less computationally expensive, making the use of approximations like NCE unnecessary).

The entire code to build and run the model is provided in the file `CharWord2Vec.py`. Here, we provide the description of how to build the model.

Using a pattern often exploited in Google TensorFlow tutorials, we divide the model into two classes:
1. a `Config` class, which stores all the hyperparameters of the model and some useful variables;
2. a `Model` class, which builds the structure of the network.

In [1]:
import tensorflow as tf
from utils import rnn, data

In [3]:
class Config:
    """Hyperparameters and derived sizes read by the model-building code."""

    # ---- characters -------------------------------------------------------
    char_vocab_size = 256  # number of entries in the character vocabulary
    char_embed_size = 50  # width of one character embedding
    word_max_len = 15  # number of character slots per word
    # Reserved character ids (names suggest padding, start-of-word,
    # end-of-word and unknown character -- confirm against the data pipeline).
    _PAD, _GO, _EOW, _UNK = 0, 1, 2, 3

    # ---- context window ---------------------------------------------------
    # Full window laid out as [window//2 words, center word, window//2 words].
    window_size = 11
    if window_size % 2 == 0:
        # An even window has no center word, so widen it by one.
        window_size += 1
    context_size = window_size - 1  # the window without its center word

    # ---- batching ---------------------------------------------------------
    # A batch is a set of unique words built online, so that no word embedding
    # is computed twice.
    batch_num_words = 1024
    batch_size = batch_num_words - context_size
    shuffling = False

    # ---- networks ---------------------------------------------------------
    cell_type = "LSTM"
    encoder_rnn_size = 500  # hidden units of the encoder RNN
    decoder_rnn_size = 1000  # hidden units of the decoder RNN
    encoder_num_layers = 1
    embedding_size = None

    # ---- optimisation -----------------------------------------------------
    learning_rate = 0.001
    beta = 0.0
    dropout = 0.5
    epochs = 25

In [2]:
class Model:
    """Graph of the character-based CBOW model.

    Words are encoded from their characters, the encodings of the words
    surrounding each target are summed into a context vector, and a decoder
    predicts the characters of the target word from that vector.
    """

    def __init__(self, config, is_training, is_eval=False):
        """Build the graph.

        Args:
            config: object exposing the hyperparameters read below
                (for instance the ``Config`` class).
            is_training: forwarded to the encoder and the decoder; the
                training op is only created when this is true.
            is_eval: when true, construction stops right after the word
                encoder, so only ``x``, ``y``, ``output`` and
                ``word_embedding`` are defined.
        """
        half_context = config.context_size // 2

        # Character ids of the words in the batch, one row per word.
        self.x = tf.placeholder(dtype=tf.int32, shape=[None, config.word_max_len])
        # Length of a word = number of strictly positive ids in its row.
        is_char = tf.where(self.x > 0, tf.ones_like(self.x), tf.zeros_like(self.x))
        chars_length = tf.reduce_sum(is_char, axis=1)

        # Target words: the batch minus half a context at each end.
        self.y = self.x[half_context:-half_context, :]

        # Characters embeddings
        embedding = tf.get_variable(
            "embedding",
            [config.char_vocab_size, config.char_embed_size],
            dtype=tf.float32)
        char_vectors = tf.nn.embedding_lookup(embedding, self.x)

        # Words embeddings
        self.output = rnn.morph_encoder(
            char_vectors,
            chars_length,
            config.encoder_rnn_size,
            is_training=is_training)
        self.word_embedding = self.output

        if is_eval:
            return

        # Contexts embeddings: gather the embeddings selected by the index
        # table and sum them over axis 1.
        # NOTE(review): presumably one row of context positions per target
        # word -- confirm against data.create_indices_for_context2vec.
        context_indices = data.create_indices_for_context2vec(
            config.window_size, config.batch_num_words, skip_center=True)
        context_indices = tf.cast(context_indices, tf.int32)
        gathered = tf.gather(self.output, indices=context_indices, name="gather1")
        self.contexts = tf.reduce_sum(gathered, axis=1)

        # Decoding phase: labels are the target characters from position 1 on;
        # positions whose label id is not positive get a zero loss weight.
        labels = self.y[:, 1:]
        label_mask = tf.where(labels > 0, tf.ones_like(labels), tf.zeros_like(labels))
        logits = rnn.deconding(
            initial_state=self.contexts,
            y=self.y,
            decod_size=config.decoder_rnn_size,
            embedding=embedding,
            vocab_size=config.char_vocab_size,
            batch_size=config.batch_size,
            word_max_len=config.word_max_len,
            dropout=config.dropout,
            is_training=is_training)

        # Loss
        self.predictions = tf.argmax(logits, axis=2)
        loss_weights = tf.cast(label_mask, tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(
            logits,
            labels,
            loss_weights,
            average_across_timesteps=True,
            average_across_batch=True)

        # The optimizer is only attached to training graphs.
        if is_training:
            self.train_op = tf.train.AdamOptimizer(config.learning_rate).minimize(self.cost)

In the `Model` class, we exploit two utility functions defined in the utils script `rnn.py`:
1. `rnn.morph_encoder`, which is the RNN for computing the word embedding using characters
2. `rnn.deconding`, which is the RNN for decoding the sequence of characters of the target word given a vectorial representation of the context around it.


In [4]:
def cell(size, type, dropout=None, proj=None, is_training=True):
    """Build a recurrent cell, optionally wrapped with dropout and a projection.

    Args:
        size: number of units of the recurrent cell.
        type: kind of cell, either ``"LSTM"`` or ``"GRU"``. The name shadows
            the builtin but is kept so keyword callers keep working.
        dropout: keep probability applied to the cell inputs; ``None``
            disables dropout.
        proj: output size of an optional output projection; a falsy value
            disables it.
        is_training: dropout is only applied when this is true.

    Returns:
        The configured cell.

    Raises:
        ValueError: if ``type`` is neither ``"LSTM"`` nor ``"GRU"``.
    """
    if type == "LSTM":
        rnn_cell = tf.contrib.rnn.BasicLSTMCell(size)
    elif type == "GRU":
        rnn_cell = tf.contrib.rnn.GRUCell(size)
    else:
        # Fail here rather than handing ``None`` to the wrappers or the caller.
        raise ValueError(
            "Unsupported cell type: %r (expected 'LSTM' or 'GRU')" % (type,))

    if dropout is not None and is_training:
        # Only the inputs are dropped; outputs and state are kept untouched.
        rnn_cell = tf.nn.rnn_cell.DropoutWrapper(
            rnn_cell,
            input_keep_prob=dropout,
            output_keep_prob=1.0,
            state_keep_prob=1.0)
    if proj:
        rnn_cell = tf.contrib.rnn.OutputProjectionWrapper(rnn_cell, proj)
    return rnn_cell

def morph_encoder(chars, chars_length, size, cell_type="LSTM", dropout=None, is_training=True):
    """Compute the morphological embeddings of a batch of words.

    A bidirectional RNN is run over the characters of each word, and the final
    hidden states of the two directions are concatenated.

    Args:
        chars: embedded characters of the words, used as the RNN inputs
            (presumably [num_words, word_len, char_embed_size] -- confirm
            with the caller).
        chars_length: length of each word, passed as ``sequence_length``.
        size: number of units of each directional cell.
        cell_type: ``"LSTM"`` or ``"GRU"``, forwarded to ``cell``.
        dropout: forwarded to ``cell``; ``None`` disables dropout.
        is_training: forwarded to ``cell``.

    Returns:
        The forward and backward final hidden states concatenated on axis 1.
    """
    with tf.variable_scope("MorphologicEncoder"):
        with tf.variable_scope("fw"):
            char_rnn_cell_fw = cell(size, cell_type, dropout, is_training=is_training)
        with tf.variable_scope("bw"):
            char_rnn_cell_bw = cell(size, cell_type, dropout, is_training=is_training)
        _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=char_rnn_cell_fw,
            cell_bw=char_rnn_cell_bw,
            inputs=chars,
            sequence_length=chars_length,
            dtype=tf.float32)

    # An LSTM final state is an LSTMStateTuple(c, h), of which only the hidden
    # part is wanted; a GRU final state already is the hidden state tensor.
    if isinstance(fw_state, tf.contrib.rnn.LSTMStateTuple):
        fw_state, bw_state = fw_state.h, bw_state.h

    return tf.concat((fw_state, bw_state), axis=1)

In [4]:
def deconding(initial_state,decod_size, vocab_size, embedding, y, word_max_len, batch_size, cell_type="LSTM", dropout=1.0, is_training=True):
    """Decode the characters of the target words from their context vectors.

    The first ``word_max_len - 1`` characters of each target word are embedded
    and fed as decoder inputs; every decoder output is projected onto the
    character vocabulary.

    Args:
        initial_state: context vectors used as the initial hidden state of the
            decoder (its last dimension presumably has to equal
            ``decod_size`` -- confirm with the caller).
        decod_size: number of units of the decoder cell.
        vocab_size: size of the character vocabulary.
        embedding: character embedding table used for the decoder inputs.
        y: character ids of the target words.
        word_max_len: number of character positions per word.
        batch_size: number of target words, used to reshape the logits.
        cell_type: ``"LSTM"`` or ``"GRU"``, forwarded to ``cell``.
        dropout: forwarded to ``cell``.
        is_training: forwarded to ``cell``.

    Returns:
        Logits of shape ``[batch_size, word_max_len - 1, vocab_size]``.
    """
    if cell_type == "LSTM":
        # LSTM state is a (c, h) pair: start from the context as hidden state
        # and from an all-zero cell state.
        initial_decoder_state = tf.contrib.rnn.LSTMStateTuple(
            c=tf.zeros_like(initial_state),
            h=initial_state)
    else:
        # Other cells (GRU) carry a single state tensor.
        initial_decoder_state = initial_state
    decoder_cell_fw = cell(decod_size, cell_type, dropout=dropout, is_training=is_training)

    # static_rnn expects one input tensor per time step.
    decoder_inputs = tf.nn.embedding_lookup(embedding, y[:, :(word_max_len - 1)])
    decoder_inputs = tf.unstack(decoder_inputs, axis=1)
    final_outputs, _ = tf.nn.static_rnn(cell=decoder_cell_fw,
                                        dtype=tf.float32,
                                        inputs=decoder_inputs,
                                        initial_state=initial_decoder_state)

    W = tf.get_variable("softmax_w", [decod_size, vocab_size], dtype=tf.float32)
    b = tf.get_variable("softmax_b", [vocab_size], dtype=tf.float32)

    # Flatten (word, step) pairs so that a single matmul projects them all.
    output = tf.reshape(tf.stack(axis=1, values=final_outputs), [-1, decod_size])

    logits = tf.matmul(output, W) + b
    logits = tf.reshape(logits, [batch_size, word_max_len - 1, vocab_size])

    return logits