<a href="https://colab.research.google.com/github/Jikhan-Jeong/2020_Keras_Deep_Learning/blob/master/Jan_13%2C_20_Keras_Text_one_Hot_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Jan 13, 20 Keras Text one-Hot Embedding
* Name: Jikhan Jeong
* Ref: https://github.com/fchollet/deep-learning-with-python-notebooks
* Ref: https://nbviewer.jupyter.org/github/fchollet/deep-learning-with-python-notebooks/blob/master/6.1-one-hot-encoding-of-words-or-characters.ipynb
----
* By using the built-in function in Keras, it is relatively easy to apply one-hot encoding in Keras

In [4]:
# Colab line magic requesting the TensorFlow 1.x runtime; it is placed before
# the keras import below.
%tensorflow_version 1.x
import keras
# Bare last expression: the notebook displays the installed Keras version.
keras.__version__

'2.2.5'

# One-Hot in Word Level

In [0]:
import numpy as np

# Toy corpus: every entry is one "sample". Here a sample is a single
# sentence, but it could equally be a whole document.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Step 1 - vocabulary. Tokens come from a plain whitespace `split`, so
# punctuation stays attached ('mat.' is its own token); a real pipeline would
# strip punctuation and special characters first.
token_index = {}
for sample in samples:
    for word in sample.split():
        # A word seen for the first time receives the next free id. Ids start
        # at 1, which leaves index 0 unassigned.
        token_index.setdefault(word, len(token_index) + 1)

# Step 2 - vectorize. Only the first `max_length` words of a sample are kept.
max_length = 10

# One slot per (sample, word position, token id). The last axis is one wider
# than the largest id because ids are 1-based.
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        index = token_index.get(word)
        results[i, j, index] = 1.

# One-Hot in Character Level

In [0]:
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable  # All printable ASCII characters.
# Character -> integer id, starting at 1 so that index 0 stays unassigned.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

# Only the first `max_length` characters of each sample are encoded.
max_length = 50


def one_hot_characters(texts, char_index, max_length):
    """One-hot encode `texts` character by character.

    Args:
        texts: sequence of strings to encode.
        char_index: dict mapping a character to its (positive) integer id.
        max_length: number of leading characters of each text to encode.

    Returns:
        A float array of shape
        (len(texts), max_length, max(char_index.values()) + 1). Position
        [i, j, k] is 1 when character j of text i has id k. Positions past
        the end of a text, and characters missing from `char_index`, are
        left as all-zero vectors.
    """
    # `default=0` keeps an empty index from raising in max().
    depth = max(char_index.values(), default=0) + 1
    encoded = np.zeros((len(texts), max_length, depth))
    for i, text in enumerate(texts):
        for j, character in enumerate(text[:max_length]):
            index = char_index.get(character)
            if index is None:
                # Unknown character (e.g. non-ASCII). Indexing with None would
                # act as np.newaxis and set the whole vector to 1, so skip it.
                continue
            encoded[i, j, index] = 1.
    return encoded


results = one_hot_characters(samples, token_index, max_length)

# One-Hot Using Built-in function
* from keras.preprocessing.text import Tokenizer

In [0]:
from keras.preprocessing.text import Tokenizer

In [0]:
# Two-sentence toy corpus fed to the Keras Tokenizer in the cells below.
samples = [
    'The cat sat on the mat.',
    'The dog ate my homework.',
]

###  A tokenizer, configured to only take into account the top-1000 most common words
* 1. tokenizer = Tokenizer(num_words=)
* 2. tokenizer.fit_on_texts(data)
* 3. tokenizer.texts_to_sequences(data) # index matrix
* 4. tokenizer.texts_to_matrix(data, mode='binary') # dummy

In [0]:
# Vocabulary cap passed to the tokenizer as `num_words`, so that only the most
# frequent words are taken into account when texts are vectorized.
# NOTE(review): the Keras docs state that only the `num_words - 1` most common
# words are kept - confirm for the installed version.
max_words = 1000
tokenizer = Tokenizer(num_words=max_words)

### This builds the word index

In [0]:
# Fit the tokenizer on `samples`: this builds its word index, which is read
# back later through `tokenizer.word_index`.
tokenizer.fit_on_texts(samples)

###  This turns strings into lists of integer indices.

In [14]:
# Convert each sample string into a list of integer word indices
# (one inner list per sample), then show the result.
sequences = tokenizer.texts_to_sequences(samples)
print(
    'lists of integer indices: ',
    sequences,
)

lists of integer indices:  [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]


### The one-hot binary representations.
* tokenizer.texts_to_matrix(data, mode='binary')

In [13]:
# Vectorize the samples in one call: with mode='binary' the tokenizer returns
# a matrix with one row per sample. Other vectorization modes besides this
# binary one are supported as well.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
print(
    'binary result of sample sentence with one-hot encoding: ',
    one_hot_results,
)

binary result of sample sentence with one-hot encoding:  [[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [11]:
# Retrieve the word -> index mapping the tokenizer computed while fitting,
# report how many distinct tokens it holds, and show the mapping itself.
word_index = tokenizer.word_index
unique_token_count = len(word_index)
print('Found %s unique tokens.' % unique_token_count)
print(word_index)

Found 9 unique tokens.
{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}
