## 임베딩 : embedding

- 텍스트를 유사도 기준으로 수치화 ==> 밀집행렬
- 토큰화 실행 후 커진 희소행렬을 밀집행렬 ==> 축소

In [70]:
from tensorflow.keras.layers import Embedding,Input,Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [71]:
sentences=['Today is a sunny day', 
           'Today is a rainy day',
           'Is it sunny today?']

In [72]:
# 데이터셋 사전 관련 상수
NUM_SIZE=10000
OOV='<UNK>'

tokenizer=Tokenizer(num_words=NUM_SIZE,oov_token=OOV)

In [73]:
# sentences 데이터 기반 단어 사전 생성
tokenizer.fit_on_texts(sentences)

In [74]:
tokenizer.word_counts, tokenizer.word_index

(OrderedDict([('today', 3),
              ('is', 3),
              ('a', 2),
              ('sunny', 2),
              ('day', 2),
              ('rainy', 1),
              ('it', 1)]),
 {'<UNK>': 1,
  'today': 2,
  'is': 3,
  'a': 4,
  'sunny': 5,
  'day': 6,
  'rainy': 7,
  'it': 8})

In [75]:
# 전체 단어 갯수
word_index=len(tokenizer.word_index)
word_index

8

In [76]:
# 문장 ==> 숫자로 변환
# sentences=['Today is a sunny day', 
        #    'Today is a rainy day',
        #    'Is it sunny today?']
rets=tokenizer.texts_to_sequences(sentences)
rets

[[2, 3, 4, 5, 6], [2, 3, 4, 7, 6], [3, 8, 5, 2]]

In [77]:
# 전체 문장의 길이 통일 ==> 패딩
lengths=[len(x) for x in rets]
TOKEN_LENTH=max(lengths)

In [78]:
# 전체문장의 길이 통일 ==> 패딩

rets2=pad_sequences(rets,maxlen=TOKEN_LENTH,padding='post',truncating='post')
rets2, rets2.shape, rets2.ndim

(array([[2, 3, 4, 5, 6],
        [2, 3, 4, 7, 6],
        [3, 8, 5, 2, 0]]),
 (3, 5),
 2)

### 모델에 적용-----

In [79]:
model = Sequential()

In [87]:
from tensorflow.keras.layers import SimpleRNN, RNN, LSTM, GRU
# 5개의 토큰이 하나의 문장=> 타입스텝프
# 단어 수만큼의 컬럼을 가짐 ==> 2개로 변환
# 토큰별 단어 수 만큼 행 1x9 ==> 1x2
#                (input_dim,   output_dim, input_length)
# model.add(Embedding(word_index+1,  2,   input_length=5))
model.add(Input(shape=(3,5)))
model.add(SimpleRNN(4))
model.add(Dense(1))

ValueError: Input 0 of layer "simple_rnn_7" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 1)

In [81]:
# 파라미터
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_3 (SimpleRNN)    (None, 4)                 40        
                                                                 
 dense_2 (Dense)             (None, 1)                 5         
                                                                 
Total params: 45
Trainable params: 45
Non-trainable params: 0
_________________________________________________________________


In [82]:
model.compile(loss='mse')

In [83]:
out=model.predict(rets2)

ValueError: in user code:

    File "c:\Users\y2kjd\anaconda3\lib\site-packages\keras\engine\training.py", line 2137, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\y2kjd\anaconda3\lib\site-packages\keras\engine\training.py", line 2123, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\y2kjd\anaconda3\lib\site-packages\keras\engine\training.py", line 2111, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\y2kjd\anaconda3\lib\site-packages\keras\engine\training.py", line 2079, in predict_step
        return self(x, training=False)
    File "c:\Users\y2kjd\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\y2kjd\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_4" is incompatible with the layer: expected shape=(None, 3, 5), found shape=(None, 5)


In [None]:
rets2.shape

(3, 5)

In [None]:
rets2

array([[2, 3, 4, 5, 6],
       [2, 3, 4, 7, 6],
       [3, 8, 5, 2, 0]])

In [None]:
# Embedding 후 결과
out.shape

(3, 5, 2, 2)

In [None]:
out

array([[[[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]]],


       [[[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]]],


       [[[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        [[ 0.00276004, -0.02673268],
         [ 0.00276004, -0.02673268]],

        