In [3]:
"""
Deep learning for text and sequences. This chapter covers:
    * Preprocessing text data into useful representations.
    * Working with recurrent neural networks.
    * Using 1D convnets for sequence processing.
    
    Like all other neural networks, deep-learning models don't take raw text as input:
    they only work with numeric tensors.
    Vectorizing text is the process of transforming text into numeric tensors. This can be done by:
        * Segmenting text into words, and transforming each word into a vector.
        * Segmenting text into characters, and transforming each character into a vector.
        * Extracting n-grams of words or characters, and transforming each n-gram into a vector.
            N-grams are overlapping groups of multiple consecutive words or characters.
"""

# One-hot encoding of words and characters

import numpy as np 

# Word-level one-hot encoding (toy example).
#
# Toy corpus: one entry per sample. The sentence matches the one used by the
# other encoding cells in this notebook.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build the vocabulary: every distinct whitespace-separated token gets its own
# integer index. Tokens are used verbatim, so case and punctuation are kept
# ('The' and 'the' are different entries, and so is 'mat.' with its period).
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            # Indices start at 1, so index 0 is never assigned to any word.
            token_index[word] = len(token_index) + 1

# Only the first `max_legth` words of each sample are encoded.
# NOTE(review): the spelling of this name is left untouched because other
# cells may read it.
max_legth = 10

# results[i, j, k] == 1 means "word j of sample i has vocabulary index k".
# The last axis has max index + 1 slots because indices are 1-based.
results = np.zeros(shape=(len(samples),
                          max_legth,
                          max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    # Slice the word list before enumerating instead of materialising the
    # full enumerate() and slicing afterwards.
    for j, word in enumerate(sample.split()[:max_legth]):
        # Direct lookup rather than .get(): a missing word would make .get()
        # return None, and NumPy treats a None index as `newaxis`, which
        # would silently set the entire row to 1 instead of failing.
        index = token_index[word]
        results[i, j, index] = 1



In [5]:
# Character-level one-hot encoding (toy example)

import string 

# Toy corpus: one entry per sample.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# The alphabet to encode: every character in string.printable.
characters = string.printable
# Map each character to a unique integer index starting at 1 (index 0 is
# left unassigned). The mapping must be character -> index, because the
# encoding loop below looks indices up by character.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

# Only the first `max_legth` characters of each sample are encoded.
# NOTE(review): the spelling of this name is left untouched because other
# cells may read it.
max_legth = 50

# results[i, j, k] == 1 means "character j of sample i has index k".
results = np.zeros((len(samples), max_legth, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    # Truncate so a sample longer than max_legth cannot index past axis 1.
    # The loop variable is `character` (singular) so it does not overwrite
    # the `characters` alphabet defined above.
    for j, character in enumerate(sample[:max_legth]):
        index = token_index.get(character)
        # A character outside the alphabet has no index. Skip it and leave
        # its row all zeros: indexing with None would act as NumPy's
        # `newaxis` and set the whole row to 1.
        if index is not None:
            results[i, j, index] = 1

In [7]:
# Using Keras for word-level one-hot encoding

from tensorflow.keras.preprocessing.text import Tokenizer

# Two toy sentences to encode.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create the tokenizer with a vocabulary cap of 1000 (see the Keras Tokenizer
# documentation for the exact semantics of `num_words`), then let it build
# its word index from the samples.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

# The word -> index mapping the tokenizer built while fitting.
word_index = tokenizer.word_index

# Each sample as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# One row per sample, produced directly by the tokenizer in 'binary' mode.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

print('Found %s unique tokens.' % len(word_index))



Found 9 unique tokens.


In [9]:
# word-level one-hot encoding with hashing trick (toy example)

# Cell-local import: a stable hash is needed below. The built-in hash() is
# salted per interpreter process for str, so indices derived from it would
# change on every kernel restart.
import zlib

# Toy corpus: the same sentences used by the other encoding cells.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Size of the hashing space: every word is mapped to an index in
# [0, dismensionatily). Distinct words can collide on the same index; the
# more distinct words relative to this size, the more likely that becomes.
dismensionatily = 1000
# Only the first `max_lenght` words of each sample are encoded. This is this
# cell's own limit and must not be confused with the similarly named
# `max_legth` set by earlier cells.
max_lenght = 10

# Allocate with this cell's own limit so the array's second axis matches the
# truncation applied in the loop below.
results = np.zeros((len(samples), max_lenght, dismensionatily))

for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_lenght]):
        # crc32 returns a non-negative int that is identical across runs,
        # so no abs() is needed and the encoding is reproducible.
        index = zlib.crc32(word.encode('utf-8')) % dismensionatily
        results[i, j, index] = 1
