In [20]:
import re
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

vocab_size = 10000
max_len = 500

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)


In [21]:
import gensim
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [22]:
import numpy as np
print('모델의 크기(shape) :',word2vec_model.vectors.shape) # 모델의 크기 확인
embedding_matrix = np.zeros((vocab_size, 300))
print('임베딩 행렬의 크기(shape) :',np.shape(embedding_matrix))

모델의 크기(shape) : (3000000, 300)
임베딩 행렬의 크기(shape) : (10000, 300)


In [None]:
word_index = imdb.get_word_index()

for word, index in word_index.items():
    if index < vocab_size and word in word2vec_model:
        embedding_matrix[index] = word2vec_model[word]

In [24]:
from keras.layers import Flatten, Input

In [None]:
model = Sequential()
model.add(Input(shape=(max_len,), dtype='int32'))
# 해결: Word2Vec을 초기값으로 사용하고 미세조정
model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], 
                   trainable=True))   # ← 임베딩도 함께 학습!
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('word2.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

Epoch 1/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - acc: 0.6156 - loss: 0.6363
Epoch 1: val_acc improved from -inf to 0.83000, saving model to word2.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 39ms/step - acc: 0.6159 - loss: 0.6359 - val_acc: 0.8300 - val_loss: 0.3863
Epoch 2/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - acc: 0.9618 - loss: 0.1328
Epoch 2: val_acc improved from 0.83000 to 0.85400, saving model to word2.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - acc: 0.9618 - loss: 0.1328 - val_acc: 0.8540 - val_loss: 0.3454
Epoch 3/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - acc: 0.9958 - loss: 0.0455
Epoch 3: val_acc improved from 0.85400 to 0.86080, saving model to word2.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - acc: 0.9958 - loss: 0.0455 - val_acc: 0.8608 - val_loss: 0.3477
Epoch 4/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - acc: 0.9987 - loss: 0.0195
Epoch 4: val_acc improved from 0.86080 to 0.86140, saving model to word2.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - acc: 0.9987 - loss: 0.0195 - val_acc: 0.8614 - val_loss: 0.3588
Epoch 5/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - acc: 0.9999 - loss: 0.0097
Epoch 5: val_acc did not improve from 0.86140
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 40ms/step - acc: 0.9999 - loss: 0.0097 - val_acc: 0.8606 - val_loss: 0.3720
Epoch 6/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - acc: 1.0000 - loss: 0.0058
Epoch 6: val_acc improved from 0.86140 to 0.86200, saving model to word2.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - acc: 1.0000 - loss: 0.0058 - val_acc: 0.8620 - val_loss: 0.3838
Epoch 6: early stopping


In [26]:
from keras.models import load_model
loaded_model = load_model('word2.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - acc: 0.8602 - loss: 0.3728

 테스트 정확도: 0.8600
