In [1]:
v = Embedding(20000, 128, input_length = 500)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [3]:
sentences = ['nice great best amazing', 'stop lies', 'pitiful nerd', 'excellent work', 'supreme quality', 'bad', 'highly respectable']
y_train = [1, 0, 0, 1, 1, 0, 1]

In [4]:
t = Tokenizer()
t.fit_on_texts(sentences)
vocab_size = len(t.word_index)+1

print(vocab_size)

16


In [5]:
X_encoded = t.texts_to_sequences(sentences)
print(X_encoded)

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [6]:
max_len = max(len(l) for l in X_encoded)
print(max_len)

4


In [7]:
X_train = pad_sequences(X_encoded, maxlen=max_len, padding='post')
y_train = np.array(y_train)
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

model = Sequential()
model.add(Embedding(vocab_size, 4, input_length = max_len))

model.add(Flatten())
model.add(Dense(1, activation = 'sigmoid'))

In [15]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
model.fit(X_train, y_train, epochs =100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6991 - acc: 0.5714
Epoch 2/100
1/1 - 0s - loss: 0.6971 - acc: 0.5714
Epoch 3/100
1/1 - 0s - loss: 0.6951 - acc: 0.5714
Epoch 4/100
1/1 - 0s - loss: 0.6931 - acc: 0.5714
Epoch 5/100
1/1 - 0s - loss: 0.6912 - acc: 0.5714
Epoch 6/100
1/1 - 0s - loss: 0.6892 - acc: 0.5714
Epoch 7/100
1/1 - 0s - loss: 0.6872 - acc: 0.5714
Epoch 8/100
1/1 - 0s - loss: 0.6852 - acc: 0.5714
Epoch 9/100
1/1 - 0s - loss: 0.6832 - acc: 0.5714
Epoch 10/100
1/1 - 0s - loss: 0.6813 - acc: 0.7143
Epoch 11/100
1/1 - 0s - loss: 0.6793 - acc: 0.7143
Epoch 12/100
1/1 - 0s - loss: 0.6773 - acc: 0.7143
Epoch 13/100
1/1 - 0s - loss: 0.6754 - acc: 0.7143
Epoch 14/100
1/1 - 0s - loss: 0.6734 - acc: 0.7143
Epoch 15/100
1/1 - 0s - loss: 0.6715 - acc: 0.7143
Epoch 16/100
1/1 - 0s - loss: 0.6695 - acc: 0.7143
Epoch 17/100
1/1 - 0s - loss: 0.6676 - acc: 0.8571
Epoch 18/100
1/1 - 0s - loss: 0.6656 - acc: 0.8571
Epoch 19/100
1/1 - 0s - loss: 0.6636 - acc: 0.8571
Epoch 20/100
1/1 - 0s - loss: 0.6617 - a

&lt;tensorflow.python.keras.callbacks.History at 0x2789d4146c8&gt;

# 사전 훈련된 워드 임베딩 사용하기 

In [18]:
print(X_train)
print(y_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]
[1 0 0 1 1 0 1]


# 사전 훈련된 GloVe 사용하기

In [19]:
n = 0
f = open('glove.6B.100d.txt', encoding="utf8")

for line in f:
    word_vector = line.split()
    print(word_vector)
    word = word_vector[0]
    print(word)
    n = n+1
    if n==2:
        break
f.close()

[&#39;the&#39;, &#39;-0.038194&#39;, &#39;-0.24487&#39;, &#39;0.72812&#39;, &#39;-0.39961&#39;, &#39;0.083172&#39;, &#39;0.043953&#39;, &#39;-0.39141&#39;, &#39;0.3344&#39;, &#39;-0.57545&#39;, &#39;0.087459&#39;, &#39;0.28787&#39;, &#39;-0.06731&#39;, &#39;0.30906&#39;, &#39;-0.26384&#39;, &#39;-0.13231&#39;, &#39;-0.20757&#39;, &#39;0.33395&#39;, &#39;-0.33848&#39;, &#39;-0.31743&#39;, &#39;-0.48336&#39;, &#39;0.1464&#39;, &#39;-0.37304&#39;, &#39;0.34577&#39;, &#39;0.052041&#39;, &#39;0.44946&#39;, &#39;-0.46971&#39;, &#39;0.02628&#39;, &#39;-0.54155&#39;, &#39;-0.15518&#39;, &#39;-0.14107&#39;, &#39;-0.039722&#39;, &#39;0.28277&#39;, &#39;0.14393&#39;, &#39;0.23464&#39;, &#39;-0.31021&#39;, &#39;0.086173&#39;, &#39;0.20397&#39;, &#39;0.52624&#39;, &#39;0.17164&#39;, &#39;-0.082378&#39;, &#39;-0.71787&#39;, &#39;-0.41531&#39;, &#39;0.20335&#39;, &#39;-0.12763&#39;, &#39;0.41367&#39;, &#39;0.55187&#39;, &#39;0.57908&#39;, &#39;-0.33477&#39;, &#39;-0.36559&#39;, &#39;-0.54857&#39;, &#

In [20]:
print(type(word_vector))
print(len(word_vector))

&lt;class &#39;list&#39;&gt;
101


In [22]:
import numpy as np
embedding_dict = dict()

f = open('glove.6B.100d.txt', encoding="utf8")

for line in f:
    word_vector = line.split()
    word = word_vector[0]
    word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
    embedding_dict[word] = word_vector_arr
f.close()
print('%s개의 Embedding vector가 있습니다.' % len(embedding_dict))

400000개의 Embedding vector가 있습니다.


In [23]:
print(embedding_dict['respectable'])
print(len(embedding_dict['respectable']))

[-0.049773   0.19903    0.10585    0.1391    -0.32395    0.44053
  0.3947    -0.22805   -0.25793    0.49768    0.15384   -0.08831
  0.0782    -0.8299    -0.037788   0.16772   -0.45197   -0.17085
  0.74756    0.98256    0.81872    0.28507    0.16178   -0.48626
 -0.006265  -0.92469   -0.30625   -0.067318  -0.046762  -0.76291
 -0.0025264 -0.018795   0.12882   -0.52457    0.3586     0.43119
 -0.89477   -0.057421  -0.53724    0.25587    0.55195    0.44698
 -0.24252    0.29946    0.25776   -0.8717     0.68426   -0.05688
 -0.1848    -0.59352   -0.11227   -0.57692   -0.013593   0.18488
 -0.32507   -0.90171    0.17672    0.075601   0.54896   -0.21488
 -0.54018   -0.45882   -0.79536    0.26331    0.18879   -0.16363
  0.3975     0.1099     0.1164    -0.083499   0.50159    0.35802
  0.25677    0.088546   0.42108    0.28674   -0.71285   -0.82915
  0.15297   -0.82712    0.022112   1.067     -0.31776    0.1211
 -0.069755  -0.61327    0.27308   -0.42638   -0.085084  -0.17694
 -0.0090944  0.1109     0.

In [24]:
embedding_matrix = np.zeros((vocab_size, 100))

np.shape(embedding_matrix)

(16, 100)

In [26]:
print(t.word_index.items())

dict_items([(&#39;nice&#39;, 1), (&#39;great&#39;, 2), (&#39;best&#39;, 3), (&#39;amazing&#39;, 4), (&#39;stop&#39;, 5), (&#39;lies&#39;, 6), (&#39;pitiful&#39;, 7), (&#39;nerd&#39;, 8), (&#39;excellent&#39;, 9), (&#39;work&#39;, 10), (&#39;supreme&#39;, 11), (&#39;quality&#39;, 12), (&#39;bad&#39;, 13), (&#39;highly&#39;, 14), (&#39;respectable&#39;, 15)])


In [27]:
for word, i in t.word_index.items():
    temp = embedding_dict.get(word)
    if temp is not None:
        embedding_dict[i] = temp

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

model = Sequential()
e = Embedding(vocab_size, 100, weights = [embedding_matrix], input_length=max_len, trainable = False)

In [30]:
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6931 - acc: 0.4286
Epoch 2/100
1/1 - 0s - loss: 0.6931 - acc: 0.5714
Epoch 3/100
1/1 - 0s - loss: 0.6930 - acc: 0.5714
Epoch 4/100
1/1 - 0s - loss: 0.6929 - acc: 0.5714
Epoch 5/100
1/1 - 0s - loss: 0.6929 - acc: 0.5714
Epoch 6/100
1/1 - 0s - loss: 0.6928 - acc: 0.5714
Epoch 7/100
1/1 - 0s - loss: 0.6927 - acc: 0.5714
Epoch 8/100
1/1 - 0s - loss: 0.6927 - acc: 0.5714
Epoch 9/100
1/1 - 0s - loss: 0.6926 - acc: 0.5714
Epoch 10/100
1/1 - 0s - loss: 0.6925 - acc: 0.5714
Epoch 11/100
1/1 - 0s - loss: 0.6924 - acc: 0.5714
Epoch 12/100
1/1 - 0s - loss: 0.6924 - acc: 0.5714
Epoch 13/100
1/1 - 0s - loss: 0.6923 - acc: 0.5714
Epoch 14/100
1/1 - 0s - loss: 0.6922 - acc: 0.5714
Epoch 15/100
1/1 - 0s - loss: 0.6922 - acc: 0.5714
Epoch 16/100
1/1 - 0s - loss: 0.6921 - acc: 0.5714
Epoch 17/100
1/1 - 0s - loss: 0.6920 - acc: 0.5714
Epoch 18/100
1/1 - 0s - loss: 0.6920 - acc: 0.5714
Epoch 19/100
1/1 - 0s - loss: 0.6919 - acc: 0.5714
Epoch 20/100
1/1 - 0s - loss: 0.6918 - a

&lt;tensorflow.python.keras.callbacks.History at 0x278b13d22c8&gt;

# 사전 훈련된 Word2Vec 사용하기

In [32]:
import numpy as np
import gensim

In [35]:
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [36]:
print(word2vec_model.vectors.shape)

(3000000, 300)


In [37]:
embedding_matrix = np.zeros((vocab_size, 300))
np.shape(embedding_matrix)

(16, 300)

In [38]:
def get_vector(word):
    if word in word2vec_model:
        return word2vec_model[word]
    else:
        return None
        

In [39]:
for word, i in t.word_index.items():
    temp = get_vector(word)
    if temp is not None:
        embedding_matrix[i] = temp

In [40]:
print(word2vec_model['nice'])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [41]:
print('단어 nice의 정수 인덱스 : ', t.word_index['nice'])

단어 nice의 정수 인덱스 :  1


In [42]:
print(embedding_matrix[1])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

model = Sequential()
e = Embedding(vocab_size, 300, weights = [embedding_matrix], input_length=max_len, trainable = False)

model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.7456 - acc: 0.2857
Epoch 2/100
1/1 - 0s - loss: 0.7252 - acc: 0.5714
Epoch 3/100
1/1 - 0s - loss: 0.7053 - acc: 0.5714
Epoch 4/100
1/1 - 0s - loss: 0.6860 - acc: 0.5714
Epoch 5/100
1/1 - 0s - loss: 0.6672 - acc: 0.5714
Epoch 6/100
1/1 - 0s - loss: 0.6490 - acc: 0.7143
Epoch 7/100
1/1 - 0s - loss: 0.6314 - acc: 0.8571
Epoch 8/100
1/1 - 0s - loss: 0.6144 - acc: 0.8571
Epoch 9/100
1/1 - 0s - loss: 0.5979 - acc: 0.8571
Epoch 10/100
1/1 - 0s - loss: 0.5820 - acc: 0.8571
Epoch 11/100
1/1 - 0s - loss: 0.5666 - acc: 1.0000
Epoch 12/100
1/1 - 0s - loss: 0.5518 - acc: 1.0000
Epoch 13/100
1/1 - 0s - loss: 0.5375 - acc: 1.0000
Epoch 14/100
1/1 - 0s - loss: 0.5237 - acc: 1.0000
Epoch 15/100
1/1 - 0s - loss: 0.5104 - acc: 1.0000
Epoch 16/100
1/1 - 0s - loss: 0.4976 - acc: 1.0000
Epoch 17/100
1/1 - 0s - loss: 0.4852 - acc: 1.0000
Epoch 18/100
1/1 - 0s - loss: 0.4733 - acc: 1.0000
Epoch 19/100
1/1 - 0s - loss: 0.4618 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.4507 - a

&lt;tensorflow.python.keras.callbacks.History at 0x279c14954c8&gt;