# 2-6 정수 인코딩

In [5]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Sample corpus for the integer-encoding walkthrough. Adjacent string literals
# are joined at compile time, so the value is still one single-line string.
raw_text = (
    "A barber is a person. "
    "a barber is good person. "
    "a barber is huge person. "
    "he Knew A Secret! "
    "The Secret He Kept is huge secret. "
    "Huge secret. "
    "His barber kept his word. "
    "a barber kept his word. "
    "His barber kept his secret. "
    "But keeping and keeping such a huge secret to himself was driving the barber crazy. "
    "the barber went up a huge mountain."
)
print(raw_text)

A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain.


In [6]:
sentences = sent_tokenize(raw_text)  # split the raw text into sentences

In [9]:
# English stopwords, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# For every sentence, keep the tokens that are longer than 2 characters and
# are not stopwords.
# NOTE: the stopword test is case-sensitive and tokens are not lowercased, so
# capitalised tokens such as 'The', 'His' and 'But' are kept while lowercase
# 'the' is dropped.
preprocessed_sentences = [
    [
        token
        for token in word_tokenize(sentence)
        if len(token) > 2 and token not in stop_words
    ]
    for sentence in sentences
]

In [10]:
# Show the filtered tokens: one inner list per sentence.
preprocessed_sentences

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person'],
 ['Knew', 'Secret'],
 ['The', 'Secret', 'Kept', 'huge', 'secret'],
 ['Huge', 'secret'],
 ['His', 'barber', 'kept', 'word'],
 ['barber', 'kept', 'word'],
 ['His', 'barber', 'kept', 'secret'],
 ['But', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
 ['barber', 'went', 'huge', 'mountain']]

## Counter

In [13]:
from collections import Counter
from itertools import chain

# Flatten the per-sentence token lists into one list of words.
# chain.from_iterable walks each inner list once (linear time), whereas
# sum(lists, []) builds a new, ever longer list on every addition.
all_word_list = list(chain.from_iterable(preprocessed_sentences))

In [16]:
# Print the flattened token list covering all sentences.
print(all_word_list)

['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'Knew', 'Secret', 'The', 'Secret', 'Kept', 'huge', 'secret', 'Huge', 'secret', 'His', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'His', 'barber', 'kept', 'secret', 'But', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge', 'mountain']


In [23]:
# Build the word -> count frequency table from the flattened token list,
# then print the table followed by its type on the next line.
vocab = Counter(all_word_list)
print(vocab, type(vocab), sep="\n")

Counter({'barber': 8, 'huge': 4, 'secret': 4, 'person': 3, 'kept': 3, 'Secret': 2, 'His': 2, 'word': 2, 'keeping': 2, 'good': 1, 'Knew': 1, 'The': 1, 'Kept': 1, 'Huge': 1, 'But': 1, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})
<class 'collections.Counter'>


In [25]:
# Show the 5 most frequent words with their counts.
vocab.most_common(5)

[('barber', 8), ('huge', 4), ('secret', 4), ('person', 3), ('kept', 3)]

## FreqDist
- Counter와 같은 기능 (NLTK의 FreqDist는 collections.Counter를 상속한 클래스)

In [26]:
from nltk import FreqDist

In [28]:
# Rebind vocab to an NLTK FreqDist built from the same flattened token list.
vocab = FreqDist(all_word_list)

In [31]:
# Print the FreqDist summary and, on the next line, its type.
print(vocab, type(vocab), sep="\n")

<FreqDist with 19 samples and 40 outcomes>
<class 'nltk.probability.FreqDist'>


In [33]:
# Show the 5 most frequent words with their counts.
vocab.most_common(5)

[('barber', 8), ('huge', 4), ('secret', 4), ('person', 3), ('kept', 3)]

## Keras
- 단어 빈도 계산 가능
- 빈도 기준 단어에 인덱스 부여 가능

In [37]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [38]:
tokenizer = Tokenizer()

# Build the vocabulary from the already-tokenized sentences (lists of words).
tokenizer.fit_on_texts(preprocessed_sentences)

In [41]:
# Mapping of word -> integer index.
# The more frequent a word is, the smaller its index.
tokenizer.word_index

{'barber': 1,
 'secret': 2,
 'huge': 3,
 'kept': 4,
 'person': 5,
 'his': 6,
 'word': 7,
 'keeping': 8,
 'good': 9,
 'knew': 10,
 'the': 11,
 'but': 12,
 'driving': 13,
 'crazy': 14,
 'went': 15,
 'mountain': 16}

In [44]:
# Mapping of word -> occurrence count as tallied by the tokenizer.
# In the recorded output the keys are lowercase, so e.g. 'Huge' and 'huge'
# are counted together (huge: 5, secret: 6).
tokenizer.word_counts

OrderedDict([('barber', 8),
             ('person', 3),
             ('good', 1),
             ('huge', 5),
             ('knew', 1),
             ('secret', 6),
             ('the', 1),
             ('kept', 4),
             ('his', 2),
             ('word', 2),
             ('but', 1),
             ('keeping', 2),
             ('driving', 1),
             ('crazy', 1),
             ('went', 1),
             ('mountain', 1)])

In [46]:
# Replace every word in every sentence with its integer index.
tokenizer.texts_to_sequences(preprocessed_sentences)

[[1, 5],
 [1, 9, 5],
 [1, 3, 5],
 [10, 2],
 [11, 2, 4, 3, 2],
 [3, 2],
 [6, 1, 4, 7],
 [1, 4, 7],
 [6, 1, 4, 2],
 [12, 8, 8, 3, 2, 13, 1, 14],
 [1, 15, 3, 16]]

In [47]:
# Restrict encoding to the 5 most frequent words. One extra slot is added
# because index 0 is reserved for padding.
top_k = 5
tokenizer = Tokenizer(num_words=top_k + 1)
tokenizer.fit_on_texts(preprocessed_sentences)

In [54]:
# word_index and word_counts still list every word, not only the top 5;
# the two mappings are printed separated by an empty line.
print(tokenizer.word_index, tokenizer.word_counts, sep="\n\n")

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'his': 6, 'word': 7, 'keeping': 8, 'good': 9, 'knew': 10, 'the': 11, 'but': 12, 'driving': 13, 'crazy': 14, 'went': 15, 'mountain': 16}

OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('the', 1), ('kept', 4), ('his', 2), ('word', 2), ('but', 1), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])


In [56]:
# The num_words limit takes effect at this step: words outside the top 5
# are dropped from the encoded sequences.
tokenizer.texts_to_sequences(preprocessed_sentences)

[[1, 5],
 [1, 5],
 [1, 3, 5],
 [2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4],
 [1, 4],
 [1, 4, 2],
 [3, 2, 1],
 [1, 3]]

In [57]:
# OOV (out of vocabulary): keep the 5 most frequent words and map every other
# word to the 'OOV' token. Two extra slots: one for padding, one for OOV.
top_k = 5
tokenizer = Tokenizer(num_words=top_k + 2, oov_token='OOV')
tokenizer.fit_on_texts(preprocessed_sentences)

In [59]:
# Look up the index assigned to the OOV token.
tokenizer.word_index['OOV']

1

In [61]:
# Words outside the 5 most frequent ones are encoded as OOV (index 1).
tokenizer.texts_to_sequences(preprocessed_sentences)

[[2, 6],
 [2, 1, 6],
 [2, 4, 6],
 [1, 3],
 [1, 3, 5, 4, 3],
 [4, 3],
 [1, 2, 5, 1],
 [2, 5, 1],
 [1, 2, 5, 3],
 [1, 1, 1, 4, 3, 1, 2, 1],
 [2, 1, 4, 1]]

# 2-7 패딩


In [62]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [70]:
# Fresh tokenizer without num_words / oov_token for the padding examples.
tokenizer = Tokenizer()

tokenizer.fit_on_texts(preprocessed_sentences)

In [71]:
# Integer-encode each sentence; the resulting lists have different lengths.
encoded = tokenizer.texts_to_sequences(preprocessed_sentences)

In [73]:
# Default call: zeros are inserted at the front, and every row is extended to
# the length of the longest sequence (8 in the recorded output).
padded = pad_sequences(encoded)
padded

array([[ 0,  0,  0,  0,  0,  0,  1,  5],
       [ 0,  0,  0,  0,  0,  1,  9,  5],
       [ 0,  0,  0,  0,  0,  1,  3,  5],
       [ 0,  0,  0,  0,  0,  0, 10,  2],
       [ 0,  0,  0, 11,  2,  4,  3,  2],
       [ 0,  0,  0,  0,  0,  0,  3,  2],
       [ 0,  0,  0,  0,  6,  1,  4,  7],
       [ 0,  0,  0,  0,  0,  1,  4,  7],
       [ 0,  0,  0,  0,  6,  1,  4,  2],
       [12,  8,  8,  3,  2, 13,  1, 14],
       [ 0,  0,  0,  0,  1, 15,  3, 16]])

In [75]:
# padding='post': zeros are appended at the end instead of the front.
padded = pad_sequences(encoded, padding='post')
padded

array([[ 1,  5,  0,  0,  0,  0,  0,  0],
       [ 1,  9,  5,  0,  0,  0,  0,  0],
       [ 1,  3,  5,  0,  0,  0,  0,  0],
       [10,  2,  0,  0,  0,  0,  0,  0],
       [11,  2,  4,  3,  2,  0,  0,  0],
       [ 3,  2,  0,  0,  0,  0,  0,  0],
       [ 6,  1,  4,  7,  0,  0,  0,  0],
       [ 1,  4,  7,  0,  0,  0,  0,  0],
       [ 6,  1,  4,  2,  0,  0,  0,  0],
       [12,  8,  8,  3,  2, 13,  1, 14],
       [ 1, 15,  3, 16,  0,  0,  0,  0]])

In [77]:
# With maxlen set and no truncating argument, a sequence longer than maxlen
# loses its leading words (the 8-token sentence keeps only its last 5).
padded = pad_sequences(encoded, padding='post', maxlen=5) 
padded

array([[ 1,  5,  0,  0,  0],
       [ 1,  9,  5,  0,  0],
       [ 1,  3,  5,  0,  0],
       [10,  2,  0,  0,  0],
       [11,  2,  4,  3,  2],
       [ 3,  2,  0,  0,  0],
       [ 6,  1,  4,  7,  0],
       [ 1,  4,  7,  0,  0],
       [ 6,  1,  4,  2,  0],
       [ 3,  2, 13,  1, 14],
       [ 1, 15,  3, 16,  0]])

In [78]:
padded = pad_sequences(encoded, padding='post', maxlen=5, 
                       truncating='post') # maxlen으로 인해 잘리는 문장의 경우, 뒷단어가 짤림
padded

array([[ 1,  5,  0,  0,  0],
       [ 1,  9,  5,  0,  0],
       [ 1,  3,  5,  0,  0],
       [10,  2,  0,  0,  0],
       [11,  2,  4,  3,  2],
       [ 3,  2,  0,  0,  0],
       [ 6,  1,  4,  7,  0],
       [ 1,  4,  7,  0,  0],
       [ 6,  1,  4,  2,  0],
       [12,  8,  8,  3,  2],
       [ 1, 15,  3, 16,  0]])

# 2-8 One-hot encoding

In [80]:
# Whitespace-separated Korean sample sentence used to build the vocabulary.
text = "나랑 점심 먹으러 갈래 점심 메뉴는 햄버거 갈래 갈래 햄버거 최고야"

In [81]:
# Fit a new tokenizer on the single sample text (passed as a one-item list).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

In [82]:
# Integer-encode a shorter text made of words from the fitted vocabulary.
# The result is a list containing one sequence, because one text is passed.
sub_text = "점심 먹으러 갈래 메뉴는 햄버거 최고야"
encoded = tokenizer.texts_to_sequences([sub_text])

In [84]:
from tensorflow.keras.utils import to_categorical

In [85]:
# One-hot encode the integer sequence.
# num_classes is fixed to the vocabulary size + 1 (index 0 is never assigned
# to a word) rather than inferred from the largest index present in `encoded`,
# so the vector width does not depend on which words sub_text contains.
one_hot = to_categorical(encoded, num_classes=len(tokenizer.word_index) + 1)

In [86]:
# Show the one-hot array: one row per word of sub_text.
one_hot

array([[[0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1.]]], dtype=float32)