- 원-핫 인코딩은 토큰을 벡터로 변환하는 가장 일반적이고 기본적인 방법입니다. 
- 모든 단어에 고유한 정수 인덱스를 부여하고 이 정수 인덱스 i를 크기가 N(어휘 사전의 크기)인 이진 벡터로 변환합니다. 
- 이 벡터는 i번째 원소만 1이고 나머지는 모두 0입니다.

- 물론 원-핫 인코딩은 문자 수준에서도 적용할 수 있습니다.


In [8]:
import numpy as np
# Word-level one-hot encoding built by hand.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Give every distinct whitespace-separated token its own integer index.
# Indices start at 1, so index 0 is never assigned to any word.
# NOTE: tokens are case-sensitive and keep attached punctuation
# ('The' and 'the' are different entries; 'mat.' keeps its period).
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

# Only the first max_length words of each sample are encoded.
max_length = 10

# results[i, j, k] == 1 when the j-th word of sample i has index k.
# Indices run 1..len(token_index), so the last axis needs one extra slot
# for the unassigned index 0. Using len() instead of max() keeps this
# working when `samples` is empty (max() of an empty sequence raises).
results = np.zeros((len(samples), max_length, len(token_index) + 1))
for i, sample in enumerate(samples):
    # Slice the word list first so no intermediate list of pairs is built.
    for j, word in enumerate(sample.split()[:max_length]):
        # Direct lookup on purpose: dict.get() would return None for a
        # missing word, and NumPy treats None as np.newaxis, which would
        # silently set the entire row to 1 instead of failing.
        results[i, j, token_index[word]] = 1.

In [12]:
# Display the one-hot matrix of the second sample (index 1).
results[1]

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [10]:
# Display the word -> integer index mapping.
token_index

{'The': 1,
 'ate': 8,
 'cat': 2,
 'dog': 7,
 'homework.': 10,
 'mat.': 6,
 'my': 9,
 'on': 4,
 'sat': 3,
 'the': 5}

In [27]:
from typing import Sequence
from tensorflow.keras.preprocessing.text import Tokenizer

# Word-level encoding of the same two sentences using the Keras Tokenizer.
samples = [
    'The cat sat on the mat.',
    'The dog ate my homework.',
]

# Tokenizer configured with num_words=20.
tokenizer = Tokenizer(num_words=20)

# Build the word index from the sample texts, then keep a handle on it.
tokenizer.fit_on_texts(samples)
word_index = tokenizer.word_index

# Convert each text into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# One-hot binary vector representation of the texts.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

print(word_index)
print(sequences)

{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}
[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]


영어 5개 문장으로 구성된 텍스트를 가져와서 다음을 수행하세요.
- 수작업으로 벡터화
- keras를 사용하여 벡터화

In [4]:
# Five English sentences used as the corpus for the exercise.
samples = [
    'Goal setting is the secret to a compelling future.',
    'To know how much there is to know is the beginning of learning to live.',
    'Learn as if you will live forever, live like you will die tomorrow.',
    'When you change your thoughts, remember to also change your world.',
    'I never dreamed about success. I worked for it.',
]

In [6]:
import numpy as np

# Hand-rolled word-level one-hot encoding of `samples`
# (`samples` is read here but not defined in this cell).

# Give every distinct whitespace-separated token its own integer index.
# Indices start at 1, so index 0 is never assigned to any word.
# NOTE: tokens are case-sensitive and keep attached punctuation
# (e.g. 'To' and 'to', or 'live' and 'live.', are separate entries).
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

# Only the first max_length words of each sample are encoded.
max_length = 100

# results[i, j, k] == 1 when the j-th word of sample i has index k.
# Indices run 1..len(token_index), so the last axis needs one extra slot
# for the unassigned index 0. Using len() instead of max() keeps this
# working when `samples` is empty (max() of an empty sequence raises).
results = np.zeros((len(samples), max_length, len(token_index) + 1))
for i, sample in enumerate(samples):
    # Slice the word list first so no intermediate list of pairs is built.
    for j, word in enumerate(sample.split()[:max_length]):
        # Direct lookup on purpose: dict.get() would return None for a
        # missing word, and NumPy treats None as np.newaxis, which would
        # silently set the entire row to 1 instead of failing.
        results[i, j, token_index[word]] = 1.

In [9]:
# Print the word -> integer index mapping.
print(token_index)

{'Goal': 1, 'setting': 2, 'is': 3, 'the': 4, 'secret': 5, 'to': 6, 'a': 7, 'compelling': 8, 'future.': 9, 'To': 10, 'know': 11, 'how': 12, 'much': 13, 'there': 14, 'beginning': 15, 'of': 16, 'learning': 17, 'live.': 18, 'Learn': 19, 'as': 20, 'if': 21, 'you': 22, 'will': 23, 'live': 24, 'forever,': 25, 'like': 26, 'die': 27, 'tomorrow.': 28, 'When': 29, 'change': 30, 'your': 31, 'thoughts,': 32, 'remember': 33, 'also': 34, 'world.': 35, 'I': 36, 'never': 37, 'dreamed': 38, 'about': 39, 'success.': 40, 'worked': 41, 'for': 42, 'it.': 43}


In [11]:
# Display the one-hot matrix of the first sample (index 0).
results[0]

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
from typing import Sequence
from tensorflow.keras.preprocessing.text import Tokenizer

# Vectorize `samples` with the Keras Tokenizer, configured with num_words=100.
tokenizer = Tokenizer(num_words=100)

# Build the word index from the sample texts.
tokenizer.fit_on_texts(samples)

# One-hot binary vector representation of the texts.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

print(one_hot_results)

[[0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0.]
 [0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.