In [None]:
import os
import config
from dataloader.loader import Loader
from preprocessing.utils import Preprocess, remove_empty_docs
from dataloader.embeddings import GloVe
from model.cnn_document_model import DocumentModel, TrainingParameters
from keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

In [None]:
# Create the directory that will hold the trained model artifacts.
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of os.path.exists() + os.makedirs().
os.makedirs(os.path.join(config.MODEL_DIR, 'imdb'), exist_ok=True)

# Training parameters for the transfer-learning run.
# The three artifacts must live in three distinct files:
#   - model_file_path         : model weights (target of the checkpoint callback)
#   - model_hyper_parameters  : model hyper-parameters (written via _save_model)
#   - model_train_parameters  : training metadata (presumably written by
#                               train_params.save() - confirm in TrainingParameters)
# Previously the last two pointed at the same .json file, so whichever save ran
# last would clobber the other; the training metadata now gets its own file.
train_params = TrainingParameters('imdb_transfer_tanh_activation',
                                  model_file_path = config.MODEL_DIR+ '/imdb/naver_transfer_model.hdf5',
                                  model_hyper_parameters = config.MODEL_DIR+ '/imdb/naver_transfer_model.json',
                                  model_train_parameters = config.MODEL_DIR+ '/imdb/naver_transfer_model_meta.json',
                                  num_epochs=1000,
                                  batch_size=128)

In [None]:
# Load the IMDB training split and report its shape.
train_df = Loader.load_imdb_data(directory='train')
# For quick experiments a 5% subsample can be used instead:
# train_df = train_df.sample(frac=0.05, random_state=train_params.seed)
print(f'train_df.shape : {train_df.shape}')

# Load the IMDB test split and report its shape.
test_df = Loader.load_imdb_data(directory='test')
print(f'test_df.shape : {test_df.shape}')

# Pull the review texts and sentiment labels out of the training frame and
# pass them through remove_empty_docs, which returns the filtered
# (corpus, target) pair.
corpus, target = remove_empty_docs(
    train_df['review'].tolist(),
    train_df['sentiment'].tolist(),
)
print(f'corpus size : {len(corpus)}')
print(f'target size : {len(target)}')

In [None]:
# Class-level setting on Preprocess, assigned before the preprocessor is built.
# NOTE(review): presumably the number of sentences kept per document - confirm
# in preprocessing.utils.
Preprocess.NUM_SENTENCES = 20

# Convert the training corpus into index sequences.
preprocessor = Preprocess(corpus=corpus)
corpus_to_seq = preprocessor.fit()

In [None]:
# Sanity check: number of converted training documents and the length of the
# first converted document.
print(f'corpus_to_seq size : {len(corpus_to_seq)}')
print(f'corpus_to_seq[0] size : {len(corpus_to_seq[0])}')

In [None]:
# Convert the test set into index sequences.
# Texts and labels are extracted from the test frame and filtered through
# remove_empty_docs in one step; the already-fitted preprocessor is then
# applied with transform() (it is not re-fitted on the test data).
test_corpus, test_target = remove_empty_docs(
    test_df['review'].tolist(),
    test_df['sentiment'].tolist(),
)
test_corpus_to_seq = preprocessor.transform(test_corpus)

In [None]:
# Sanity check: number of converted test documents and the length of the
# first converted document.
print(f'test_corpus_to_seq size : {len(test_corpus_to_seq)}')
print(f'test_corpus_to_seq[0] size : {len(test_corpus_to_seq[0])}')

In [None]:
# Prepare the training and test sets as NumPy arrays, one split per line
# (features first, labels second).
x_train, y_train = np.array(corpus_to_seq), np.array(target)
x_test, y_test = np.array(test_corpus_to_seq), np.array(test_target)

# Report the resulting array shapes.
print(f'x_train.shape : {x_train.shape}')
print(f'y_train.shape : {y_train.shape}')
print(f'x_test.shape : {x_test.shape}')
print(f'y_test.shape : {y_test.shape}')

In [None]:
# Initialise the GloVe embeddings (original note: uses the pretrained
# glove.6B.50d.txt vectors).
# NOTE(review): the argument 50 is presumably the embedding dimension - confirm
# in dataloader.embeddings.
glove = GloVe(50)
# Fetch the embedding matrix for the vocabulary held by the fitted preprocessor.
initial_embeddings = glove.get_embedding(preprocessor.word_index)
print(f'initial_embeddings.shape : {initial_embeddings.shape}')

In [None]:
# Spot check: show a single row (index 2) of the embedding matrix.
print(initial_embeddings[2])

In [None]:
# Locations of the pretrained Amazon-review model artifacts:
# hyper-parameters (.json) and weights (.hdf5).
model_json_path = os.path.join(config.MODEL_DIR, 'amazonreviews/model_06.json')
model_hdf5_path = os.path.join(config.MODEL_DIR, 'amazonreviews/model_06.hdf5')

# Rebuild the model from its hyper-parameters, then load its trained weights.
amazon_review_model = DocumentModel.load_model(model_json_path)
amazon_review_model.load_model_weights(model_hdf5_path)

In [None]:
# Extract the weight matrix of the pretrained model's embedding layer
# (the first entry returned by get_weights()).
learned_embeddings = (
    amazon_review_model.get_classification_model()
    .get_layer('imdb_embedding')
    .get_weights()[0]
)
print(f'learned_embeddings size : {len(learned_embeddings)}')

# Update the existing GloVe object with the learned embedding matrix, passing
# both the current vocabulary and the pretrained model's vocabulary.
glove.update_embeddings(
    preprocessor.word_index,
    np.array(learned_embeddings),
    amazon_review_model.word_index,
)

# Re-fetch the (now updated) embeddings for the current vocabulary.
# Note: this rebinds initial_embeddings from the earlier GloVe-only lookup.
initial_embeddings = glove.get_embedding(preprocessor.word_index)

In [None]:
# Build the target document model for transfer learning.
naver_model = DocumentModel(
    vocab_size=preprocessor.get_vocab_size(),
    word_index=preprocessor.word_index,
    num_sentences=Preprocess.NUM_SENTENCES,
    embedding_weights=initial_embeddings,
    embedding_regularizer_l2=0.0,
    conv_activation='tanh',
    # The embedding layer is trained; both convolution layers
    # (word-level and sentence-level) are not.
    train_embedding=True,
    learn_word_conv=False,
    learn_sent_conv=False,
    hidden_dims=64,
    input_dropout=0.1,
    hidden_layer_kernel_regularizer=0.01,
    final_layer_kernel_regularizer=0.01,
)

# Copy the pretrained weights, layer by layer, from the Amazon-review model
# into the layers of the same name in the new model.
for layer_name in ('word_conv', 'sentence_conv', 'hidden_0', 'final'):
    source_layer = amazon_review_model.get_classification_model().get_layer(layer_name)
    target_layer = naver_model.get_classification_model().get_layer(layer_name)
    target_layer.set_weights(weights=source_layer.get_weights())

In [None]:
# Compile the classifier: binary cross-entropy loss, RMSprop optimizer,
# accuracy reported as a metric.
naver_model.get_classification_model().compile(loss="binary_crossentropy",
                                              optimizer='rmsprop',
                                              metrics=["accuracy"])

# Callback 1 - checkpoint: write the weights to model_file_path, keeping only
# the best result seen so far (save_best_only) and weights only.
checkpointer = ModelCheckpoint(filepath=train_params.model_file_path,
                                verbose=1,
                                save_best_only=True,
                                save_weights_only=True)

# Callback 2 - early stopping: stop once the monitored quantity has not
# improved for 2 consecutive epochs.
early_stop = EarlyStopping(patience=2)

# Train. 1% of the training data is held out for validation.
# early_stop must be passed in `callbacks`; it was previously created but never
# registered, so training always ran for the full num_epochs.
history = naver_model.get_classification_model().fit(x_train,
                                          y_train,
                                          batch_size=train_params.batch_size,
                                          epochs=train_params.num_epochs,
                                          verbose=2,
                                          validation_split=0.01,
                                          callbacks=[checkpointer, early_stop])

# Persist the model hyper-parameters and the training parameters.
naver_model._save_model(train_params.model_hyper_parameters)
train_params.save()

In [None]:
# Training accuracy recorded in the History object returned by fit().
history.history['accuracy']

In [None]:
# Training loss recorded in the History object returned by fit().
history.history['loss']

In [None]:
# Validation accuracy (on the validation_split hold-out) recorded by fit().
history.history['val_accuracy']

In [None]:
# Validation loss (on the validation_split hold-out) recorded by fit().
history.history['val_loss']

In [None]:
# Evaluate the trained model on the test set.
# Evaluation uses a batch 10x the size of the training batch.
eval_batch_size = train_params.batch_size * 10
naver_model.get_classification_model().evaluate(
    x_test,
    y_test,
    batch_size=eval_batch_size,
    verbose=2,
)

In [None]:
# Embedding matrix of the model after training (first entry of the embedding
# layer's weights).
learned_embeddings = (
    naver_model.get_classification_model()
    .get_layer('imdb_embedding')
    .get_weights()[0]
)

# For every vocabulary word, measure how far its embedding moved during
# training: the Euclidean (L2) norm of the difference between the initial
# and the learned vector.
embd_change = {
    word: np.linalg.norm(initial_embeddings[idx] - learned_embeddings[idx])
    for word, idx in preprocessor.word_index.items()
}

# Rank words from largest to smallest change and show the top 100.
embd_change = sorted(embd_change.items(), key=lambda pair: pair[1], reverse=True)
embd_change[:100]