In [6]:
import numpy as np
import tensorflow as tf
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding
from tensorflow.keras.preprocessing.text import text_to_word_sequence
# Tokenize a single sentence: split on spaces after filtering out the listed
# punctuation, tab and newline characters (the trailing period disappears).
text = "해보지 않으면 해낼 수 없다."
result = text_to_word_sequence(
    text,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    split=' ',
)
print(f"원문: {text}")
print(f"토큰화: {result}")

원문: 해보지 않으면 해낼 수 없다.
토큰화: ['해보지', '않으면', '해낼', '수', '없다']


In [9]:
# Three sample sentences from which a word vocabulary is built.
docs = [
    '먼저 텍스트의 각 단어를 나누어 토큰화 합니다.',
    '텍스트의 단어로 토큰화 해야 딥러닝에서 인식됩니다',
    '토큰화 한 결과는 딥러닝에서 사용 할 수 있습니다',
]

# Fit a fresh tokenizer so the statistics inspected in the following cells
# (word_counts, document_count, word_index) describe only these sentences.
token = Tokenizer()
token.fit_on_texts(docs)

In [10]:
# Occurrence count of every word seen by fit_on_texts, listed in first-seen order.
token.word_counts

OrderedDict([('먼저', 1),
             ('텍스트의', 2),
             ('각', 1),
             ('단어를', 1),
             ('나누어', 1),
             ('토큰화', 3),
             ('합니다', 1),
             ('단어로', 1),
             ('해야', 1),
             ('딥러닝에서', 2),
             ('인식됩니다', 1),
             ('한', 1),
             ('결과는', 1),
             ('사용', 1),
             ('할', 1),
             ('수', 1),
             ('있습니다', 1)])

In [11]:
# Number of documents (sentences) the tokenizer was fitted on.
token.document_count

3

In [12]:
# Word -> integer index mapping. Indices start at 1 and the most frequent
# words receive the smallest indices.
token.word_index

{'토큰화': 1,
 '텍스트의': 2,
 '딥러닝에서': 3,
 '먼저': 4,
 '각': 5,
 '단어를': 6,
 '나누어': 7,
 '합니다': 8,
 '단어로': 9,
 '해야': 10,
 '인식됩니다': 11,
 '한': 12,
 '결과는': 13,
 '사용': 14,
 '할': 15,
 '수': 16,
 '있습니다': 17}

In [13]:
# Demonstrate one-hot encoding of a single tokenized sentence.
text = '오랫동안 꿈꾸는 자는 그 꿈을 닮아간다'

# Build a vocabulary from this one sentence only.
token = Tokenizer()
token.fit_on_texts([text])

print('문장의 토큰화:',token.word_index)

# Replace every word by its integer index.
x = token.texts_to_sequences([text])
print('문장의 숫자화:', x)

# Import from the TensorFlow-bundled Keras, like every other import in this
# notebook, instead of the standalone `keras` package, so that a separately
# installed (and possibly mismatched) Keras is never picked up.
from tensorflow.keras.utils import to_categorical

# Word indices start at 1, so one extra class is needed for the unused index 0.
word_size = len(token.word_index) + 1
x = to_categorical(x, num_classes = word_size)
print("문장의 원-핫 인코딩:\n", x)

문장의 토큰화: {'오랫동안': 1, '꿈꾸는': 2, '자는': 3, '그': 4, '꿈을': 5, '닮아간다': 6}
문장의 숫자화: [[1, 2, 3, 4, 5, 6]]
문장의 원-핫 인코딩:
 [[[0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 1.]]]


In [14]:
# Ten short movie-review sentences. The first five carry label 1 and the last
# five carry label 0 (presumably positive / negative sentiment).
docs = [
    "너무 재밌네요",
    "최고에요",
    "참 잘 만든 영화예요",
    "추천하고 싶은 영화입니다",
    "한번 더 보고싶네요",
    "글쎄요",
    "별로예요",
    "생각보다 지루하네요",
    "연기가 어색해요",
    "재미없어요",
]
classes = array([1] * 5 + [0] * 5)

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense,Flatten,Embedding

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation
# and training are repeatable across Restart & Run All. Without this the
# accuracy printed below changes on every run.
tf.keras.utils.set_random_seed(42)

# Build the vocabulary from the review sentences and map each to word indices.
token = Tokenizer()
token.fit_on_texts(docs)
x = token.texts_to_sequences(docs)
print(x)

# Left-pad every sequence with zeros to length 4, the token count of the
# longest review, so all inputs share one shape.
padded_x = pad_sequences(x, maxlen=4)
print(padded_x)

# +1 because word indices start at 1 and index 0 is taken by the padding value.
word_size = len(token.word_index) + 1

# Embedding (8-dimensional vector per word index) -> flatten -> dense classifier
# with a single sigmoid output for the binary label.
model = Sequential()
model.add(Embedding(word_size,8))
model.add(Flatten())
model.add(Dense(512,activation = 'relu'))
model.add(Dense(1,activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(padded_x, classes ,epochs = 20)

# NOTE: this accuracy is measured on the very data the model was trained on,
# so it only shows how well the ten sentences were memorised.
model.evaluate(padded_x,classes)[1]

[[1, 2], [3], [4, 5, 6, 7], [8, 9, 10], [11, 12, 13], [14], [15], [16, 17], [18, 19], [20]]
[[ 0  0  1  2]
 [ 0  0  0  3]
 [ 4  5  6  7]
 [ 0  8  9 10]
 [ 0 11 12 13]
 [ 0  0  0 14]
 [ 0  0  0 15]
 [ 0  0 16 17]
 [ 0  0 18 19]
 [ 0  0  0 20]]
Epoch 1/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 590ms/step - accuracy: 0.4000 - loss: 0.6941
Epoch 2/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 26ms/step - accuracy: 0.7000 - loss: 0.6896
Epoch 3/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 24ms/step - accuracy: 0.8000 - loss: 0.6854
Epoch 4/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 23ms/step - accuracy: 0.8000 - loss: 0.6815
Epoch 5/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 24ms/step - accuracy: 0.8000 - loss: 0.6777
Epoch 6/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 22ms/step - accuracy: 0.9000 - loss: 0.6738
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

0.8999999761581421

In [43]:
# Sentence to classify with the trained model.
predict = "화난다"

x = token.texts_to_sequences([predict])

# The tokenizer was created without an OOV token, so words it never saw during
# fitting are silently dropped. If nothing remains, the model receives only
# padding zeros and its score carries no information about the sentence, so
# say so rather than presenting that number as a real prediction.
if not x[0]:
    print(f"warning: no word of {predict!r} is in the training vocabulary; "
          "the prediction will be based on padding only")

# Must use the same length (4) the training sequences were padded to.
padded_x_test = pad_sequences(x, maxlen=4)

In [44]:
# Output of the final sigmoid unit: a value between 0 and 1, where values
# nearer 1 lean toward label 1 and values nearer 0 toward label 0.
model.predict(padded_x_test)

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step


array([[0.4298278]], dtype=float32)