# Sparse word vectorization

In [1]:
import numpy as np

## Word level

In [40]:
# Toy corpus: each string is one sample, encoded word by word below.
samples = [
    'The cat sat on a mat',
    'The dog ate my homework',
]

# Vocabulary: every distinct whitespace-separated token gets a positive
# integer. Numbering starts at 1, so column 0 of the encoding is never set.
token_index = {}
for sample in samples:
    for word in sample.split():
        # The argument is evaluated before insertion, so a new word receives
        # the next unused index; an already-known word is left untouched.
        token_index.setdefault(word, len(token_index) + 1)

# Number of word positions kept per sample.
max_len = 10

# results[i, j, k] == 1 exactly when word j of sample i has vocabulary index k.
results = np.zeros((len(samples), max_len, len(token_index) + 1))
for i, sample in enumerate(samples):
    # Keep only the first max_len words: without the slice, a longer sample
    # would index past axis 1 and raise IndexError.
    for j, word in enumerate(sample.split()[:max_len]):
        idx = token_index[word]
        results[i, j, idx] = 1

## Character level

In [41]:
import string

# string.printable is the printable subset of ASCII (digits, letters,
# punctuation and whitespace) -- not every ASCII character.
characters = string.printable
# Indices start at 1, so column 0 of the encoding is never set.
token_index = {char: k for k, char in enumerate(characters, start=1)}

# Number of character positions kept per sample.
max_len = 50
results = np.zeros((len(samples), max_len, len(token_index) + 1))

for i, sample in enumerate(samples):
    # Keep only the first max_len characters: without the slice, a longer
    # sample would index past axis 1 and raise IndexError.
    for j, letter in enumerate(sample[:max_len]):
        idx = token_index.get(letter)
        if idx is None:
            # Character outside string.printable (e.g. accented letters):
            # leave this position all-zero instead of raising KeyError.
            continue
        results[i, j, idx] = 1

In [27]:
# Bare expression: the notebook displays the current value of `results`
# (NumPy abbreviates large arrays with `...`).
results

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

## Hashing trick

In [74]:
import zlib

# Size of the hashed feature space; distinct words may collide when the
# vocabulary is large relative to dims.
dims = 1000
# Number of word positions kept per sample.
max_length = 10

# Allocate with max_length (defined in this cell) rather than a `max_len`
# left over from an earlier cell, which silently changed the tensor's shape.
results = np.zeros((len(samples), max_length, dims))

for i, sample in enumerate(samples):
    # Keep only the first max_length words so axis 1 is never overrun.
    for j, word in enumerate(sample.split()[:max_length]):
        # Built-in hash() on str is salted per interpreter process, so its
        # value changes between runs. CRC32 of the UTF-8 bytes is stable and
        # already non-negative, so abs() is not needed.
        index = zlib.crc32(word.encode('utf-8')) % dims
        results[i, j, index] = 1.


In [75]:
# Bare expression: the notebook displays the current value of `results`
# (NumPy abbreviates large arrays with `...`).
results

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])