<a href="https://colab.research.google.com/github/Kimminsu-ds/Deep-Learning-NLP-using-Tensorflow/blob/main/03_03_Embedding_layer_VS_Pre_trained_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 러닝스푼즈 - TensorFlow를 활용한 딥러닝 자연어처리

## Embedding layer

In [7]:
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Embedding

### 데이터 세팅

In [9]:
# Toy corpus: three arbitrary short documents.
corp = [
    "bitty bought a bit of butter",
    "but the bit of butter was a bit bitter",
    "so she bought some better butter to make the bitter butter better",
]
# Keep the individual document names available as well.
sample_text_1, sample_text_2, sample_text_3 = corp

# Number of documents in the corpus.
no_docs = len(corp)
print(no_docs)

3


### 전처리

In [10]:
# Use a vocabulary size of 50 and map every word of each document to an integer.
# one_hot hashes words, so two different words may end up with the same integer.
vocab_size = 50
encod_corp = []

for i, doc in enumerate(corp):
  # Encode each document once and reuse the result for both storing and printing;
  # pass vocab_size so the encoding always agrees with the Embedding layer size.
  encoded_doc = one_hot(doc, vocab_size)
  encod_corp.append(encoded_doc)
  print("The encoding for document", i+1, "is :", encoded_doc)

The encoding for document 1 is : [32, 42, 47, 35, 18, 39]
The encoding for document 2 is : [1, 8, 35, 18, 39, 9, 47, 35, 49]
The encoding for document 3 is : [31, 45, 42, 8, 17, 39, 38, 47, 8, 49, 39, 17]


### Embedding layer 생성

In [11]:
# A model made of a single untrained Embedding layer:
# vocab_size rows in the lookup table, 128 values per row.
model = Sequential([Embedding(vocab_size, 128)])

In [13]:
# Store the lookup table: the first weight array of the model's first layer
embeddings = model.layers[0].get_weights()[0]

# Check the size of the lookup table
print(embeddings.shape)

(50, 128)


In [14]:
# Inspect the embedding vector stored in row 34 of the lookup table,
# i.e. the vector used for any word whose integer id is 34
embeddings[34]

array([-0.00106465, -0.03616438,  0.04885193,  0.00111686,  0.01090791,
       -0.02718016, -0.02023548, -0.02842462,  0.04583872,  0.01529602,
       -0.03879865, -0.01007236,  0.02253092,  0.02837094, -0.03026508,
        0.00520658,  0.03759216, -0.00694891, -0.00319543, -0.04416354,
       -0.01524875, -0.03944087,  0.04476741, -0.04458553,  0.04166938,
       -0.01489206,  0.01805771,  0.0451639 ,  0.00862771, -0.00648687,
        0.04708358,  0.00443799,  0.00688799,  0.01316089, -0.01066319,
       -0.02673641,  0.00018796,  0.03697864, -0.03489616, -0.04102333,
        0.02532159, -0.01965993,  0.03988898,  0.03812202, -0.00188378,
        0.00405959, -0.04233438,  0.00783805, -0.03105285,  0.00668001,
        0.03278694,  0.02390155, -0.0181102 ,  0.04256245, -0.0397527 ,
        0.03467287,  0.00592735, -0.0140148 , -0.02215563, -0.00746739,
       -0.02655622, -0.0365473 ,  0.04495398,  0.01361546, -0.00520309,
        0.01315745, -0.04091704,  0.03914733,  0.03469608, -0.02

In [15]:
# Embed the first document: indexing the lookup table with the document's list of
# word ids picks one row per word, so the result has shape (number of words, 128)
embeddings[encod_corp[0]].shape

(6, 128)

In [16]:
# Embed the second document and show the looked-up rows themselves;
# words that share an integer id get identical rows
embeddings[encod_corp[1]]

array([[-0.032639  ,  0.02935718,  0.0272929 , ...,  0.00415765,
        -0.01161697,  0.00469721],
       [-0.03930248, -0.02292795, -0.00444122, ...,  0.03335805,
        -0.04427975, -0.04972593],
       [-0.0305411 , -0.03209617, -0.02797174, ...,  0.00707245,
         0.01082123, -0.00390488],
       ...,
       [ 0.00033738, -0.01362173, -0.04912856, ...,  0.01989211,
        -0.00812566, -0.00683923],
       [-0.0305411 , -0.03209617, -0.02797174, ...,  0.00707245,
         0.01082123, -0.00390488],
       [ 0.03835047,  0.02263499,  0.03161981, ..., -0.02635964,
         0.02252717,  0.02017839]], dtype=float32)

In [17]:
# Embed the third document and check the resulting shape (one row per word)
embeddings[encod_corp[2]].shape

(12, 128)

## 감성 분류하기(Embedding layer)

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [19]:
# Tiny sentiment data set: seven short texts ...
sentences = [
    'nice great best amazing',
    'stop lies',
    'pitiful nerd',
    'excellent work',
    'supreme quality',
    'bad',
    'highly respectable',
]
# ... and one binary label per text, in the same order.
y_train = [1, 0, 0, 1, 1, 0, 1]

In [20]:
# Build the word -> integer id mapping from the training sentences.
t = Tokenizer()
t.fit_on_texts(sentences)
# One more than the number of distinct words: the ids in word_index start at 1,
# which leaves id 0 free (0 is what the padded X_train further down uses as filler).
vocab_size = len(t.word_index) + 1

print(vocab_size)

16


In [21]:
# Convert every sentence into the list of integer ids of its words.
X_encoded = t.texts_to_sequences(sentences)
print(X_encoded)

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [22]:
# Length of the longest encoded sentence; used as the padding length.
max_len = max(map(len, X_encoded))
print(max_len)

4


In [23]:
# Labels as a NumPy array so they can be passed to model.fit.
y_train = np.array(y_train)

# Right-pad ("post") every sequence up to max_len so that all rows have equal length.
X_train = pad_sequences(X_encoded, maxlen=max_len, padding="post")
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

# Binary sentiment classifier whose embedding vectors are learned from scratch.
model = Sequential()
# Declare the input shape with an explicit Input layer instead of Embedding's
# deprecated `input_length` argument, the same way the pre-trained model
# further down in this notebook does.
model.add(Input(shape=(max_len, ), dtype="int32"))
model.add(Embedding(vocab_size, 4)) # every embedding vector is 4-dimensional
model.add(Flatten()) # flatten the per-word vectors into one vector to feed Dense
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer = "adam", loss="binary_crossentropy", metrics=["acc"])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 1s - loss: 0.6952 - acc: 0.4286
Epoch 2/100
1/1 - 0s - loss: 0.6936 - acc: 0.4286
Epoch 3/100
1/1 - 0s - loss: 0.6921 - acc: 0.4286
Epoch 4/100
1/1 - 0s - loss: 0.6906 - acc: 0.4286
Epoch 5/100
1/1 - 0s - loss: 0.6891 - acc: 0.5714
Epoch 6/100
1/1 - 0s - loss: 0.6876 - acc: 0.7143
Epoch 7/100
1/1 - 0s - loss: 0.6861 - acc: 0.7143
Epoch 8/100
1/1 - 0s - loss: 0.6846 - acc: 0.7143
Epoch 9/100
1/1 - 0s - loss: 0.6831 - acc: 0.7143
Epoch 10/100
1/1 - 0s - loss: 0.6816 - acc: 0.7143
Epoch 11/100
1/1 - 0s - loss: 0.6801 - acc: 0.7143
Epoch 12/100
1/1 - 0s - loss: 0.6786 - acc: 0.7143
Epoch 13/100
1/1 - 0s - loss: 0.6770 - acc: 0.7143
Epoch 14/100
1/1 - 0s - loss: 0.6755 - acc: 0.8571
Epoch 15/100
1/1 - 0s - loss: 0.6740 - acc: 0.8571
Epoch 16/100
1/1 - 0s - loss: 0.6725 - acc: 0.8571
Epoch 17/100
1/1 - 0s - loss: 0.6709 - acc: 0.8571
Epoch 18/100
1/1 - 0s - loss: 0.6694 - acc: 0.8571
Epoch 19/100
1/1 - 0s - loss: 0.6679 - acc: 0.8571
Epoch 20/100
1/1 - 0s - loss: 0.6663 - a

<keras.callbacks.History at 0x7f7a18d429d0>

## 감성 분류하기(Pre-trained embedding)

In [25]:
# Show the padded training sequences again; the same X_train is reused for the
# pre-trained embedding model in this section.
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [26]:
import numpy as np
import gensim

In [27]:
# 현재 위치에 구글의 사전 훈련된 Word2Vec을 다운로드
!wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2021-10-19 13:28:47--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.160.184
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.160.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2021-10-19 13:29:07 (80.7 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [28]:
# Load Google's pre-trained Word2Vec vectors from the downloaded archive
# (binary=True: the file is in the binary word2vec format).
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [29]:
print(word2vec_model.vectors.shape) # check the size of the model's vector table (rows, columns)

(3000000, 300)


In [30]:
# Zero-filled matrix with one row per vocabulary id and 300 columns;
# rows are overwritten later with the pre-trained vectors.
embedding_matrix = np.zeros(shape=(vocab_size, 300))
embedding_matrix.shape

(16, 300)

In [31]:
def get_vector(word):
  """Return the vector stored for `word` in word2vec_model, or None if the word is not in it."""
  return word2vec_model[word] if word in word2vec_model else None

In [32]:
# Walk over the (word, integer id) pairs of the training vocabulary and copy each
# word's pre-trained vector into the matrix row given by its id.
for word, i in t.word_index.items():
    temp = get_vector(word)
    if temp is None:
        # No pre-trained vector for this word: leave its row untouched.
        continue
    embedding_matrix[i] = temp

In [33]:
# Pre-trained vector of the word "nice", for comparison with the embedding matrix row.
print(word2vec_model['nice'])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [34]:
# Integer id that the tokenizer assigned to "nice".
print("단어 nice의 정수 인덱스:", t.word_index['nice'])

단어 nice의 정수 인덱스: 1


In [35]:
# Row 1 of the embedding matrix; it should match the pre-trained vector of "nice" (id 1).
print(embedding_matrix[1])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [36]:
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

# Binary sentiment classifier on top of the frozen pre-trained Word2Vec vectors.
model = Sequential()
model.add(Input(shape=(max_len, ), dtype="int32"))

# Load the pre-trained matrix through the documented `embeddings_initializer`
# argument instead of the legacy `weights=[...]` constructor kwarg, which not every
# Keras release accepts.
# trainable=False keeps the pre-trained vectors frozen; setting it to True would
# continue training them on top of the pre-trained values.
e = Embedding(vocab_size, 300, embeddings_initializer=Constant(embedding_matrix), trainable=False)

model.add(e)
model.add(Flatten())  # flatten the per-word vectors into one vector to feed Dense
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6957 - acc: 0.5714
Epoch 2/100
1/1 - 0s - loss: 0.6765 - acc: 0.5714
Epoch 3/100
1/1 - 0s - loss: 0.6579 - acc: 0.5714
Epoch 4/100
1/1 - 0s - loss: 0.6398 - acc: 0.8571
Epoch 5/100
1/1 - 0s - loss: 0.6223 - acc: 0.8571
Epoch 6/100
1/1 - 0s - loss: 0.6054 - acc: 0.8571
Epoch 7/100
1/1 - 0s - loss: 0.5890 - acc: 0.8571
Epoch 8/100
1/1 - 0s - loss: 0.5732 - acc: 1.0000
Epoch 9/100
1/1 - 0s - loss: 0.5580 - acc: 1.0000
Epoch 10/100
1/1 - 0s - loss: 0.5433 - acc: 1.0000
Epoch 11/100
1/1 - 0s - loss: 0.5291 - acc: 1.0000
Epoch 12/100
1/1 - 0s - loss: 0.5154 - acc: 1.0000
Epoch 13/100
1/1 - 0s - loss: 0.5022 - acc: 1.0000
Epoch 14/100
1/1 - 0s - loss: 0.4895 - acc: 1.0000
Epoch 15/100
1/1 - 0s - loss: 0.4773 - acc: 1.0000
Epoch 16/100
1/1 - 0s - loss: 0.4655 - acc: 1.0000
Epoch 17/100
1/1 - 0s - loss: 0.4541 - acc: 1.0000
Epoch 18/100
1/1 - 0s - loss: 0.4431 - acc: 1.0000
Epoch 19/100
1/1 - 0s - loss: 0.4325 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.4222 - a

<keras.callbacks.History at 0x7f79dac94450>

## TensorFlow Hub로부터 Pre-trained embedding 사용하기

In [37]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub

In [38]:
# Load the IMDB reviews as (text, label) pairs. All three subsets are slices of the
# data set's 'train' split: first 80% / next 10% / last 10%.
(train_data, validation_data, test_data), info = tfds.load(
    'imdb_reviews',
    split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
    with_info=True,
    as_supervised=True,
)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteOV6GOS/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteOV6GOS/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteOV6GOS/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [39]:
# Take the first batch of 10 (review text, label) pairs and print the first review.
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
print(train_examples_batch[0])

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)


In [40]:
# Labels of the same 10 reviews (presumably 0 = negative, 1 = positive; confirm against the data set docs).
train_labels_batch

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0])>

In [41]:
# TF Hub handle of the pre-trained text-embedding module.
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
# Wrap the module as a Keras layer taking raw strings; trainable=True lets its
# weights be updated while the classifier is trained.
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)
# Sanity check: embed the first three reviews (one fixed-size vector per review).
hub_layer(train_examples_batch[:3])

<tf.Tensor: shape=(3, 20), dtype=float32, numpy=
array([[ 1.765786  , -3.882232  ,  3.9134233 , -1.5557289 , -3.3362343 ,
        -1.7357955 , -1.9954445 ,  1.2989551 ,  5.081598  , -1.1041286 ,
        -2.0503852 , -0.72675157, -0.65675956,  0.24436149, -3.7208383 ,
         2.0954835 ,  2.2969332 , -2.0689783 , -2.9489717 , -1.1315987 ],
       [ 1.8804485 , -2.5852382 ,  3.4066997 ,  1.0982676 , -4.056685  ,
        -4.891284  , -2.785554  ,  1.3874227 ,  3.8476458 , -0.9256538 ,
        -1.896706  ,  1.2113281 ,  0.11474707,  0.76209456, -4.8791065 ,
         2.906149  ,  4.7087674 , -2.3652055 , -3.5015898 , -1.6390051 ],
       [ 0.71152234, -0.6353217 ,  1.7385626 , -1.1168286 , -0.5451594 ,
        -1.1808156 ,  0.09504455,  1.4653089 ,  0.66059524,  0.79308075,
        -2.2268345 ,  0.07446612, -1.4075904 , -0.70645386, -1.907037  ,
         1.4419787 ,  1.9551861 , -0.42660055, -2.8022065 ,  0.43727064]],
      dtype=float32)>

In [42]:
# Classifier: hub text embedding -> 16-unit ReLU hidden layer -> sigmoid output.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense_2 (Dense)              (None, 16)                336       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Train for 20 epochs in batches of 512, validating on the held-out validation slice.
# NOTE(review): no random seed is set, so the shuffle order (and therefore the exact
# metrics) will differ from run to run.
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics=["acc"])
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=20,
                    validation_data = validation_data.batch(512),
                    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [44]:
# Evaluate on the held-out test slice, then print every metric with three decimals.
results = model.evaluate(test_data.batch(512), verbose=2)
for name, value in zip(model.metrics_names, results):
  print(f"{name}: {value:.3f}")

5/5 - 0s - loss: 0.2800 - acc: 0.8824
loss: 0.280
acc: 0.882
