In [1]:
import os
import tensorflow as tf
from tokenization_bert import BertTokenizer
from modeling_tf_bert import TFBertForSequenceClassification

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
import numpy as np
import re
import wget

# if you have gpu
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [2]:
# 시각화

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string], '')
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

In [3]:
ckpt_path = 'bert_ckpt'
vocab_path = os.path.join(ckpt_path, 'bert-base-multilingual-uncased-vocab.txt')
config_path = os.path.join(ckpt_path, 'config.json')
model_path = os.path.join(ckpt_path, 'tf_model.h5')

# Vocab 파일 불러오기
if os.path.isfile(vocab_path):
    print("vocab exists")
    tokenizer = BertTokenizer(vocab_path) #토크나이저 불러오기
else:
    print("vocab does not exists")
    wget.download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", ckpt_path)
    
#BERT Config파일 불러오기
if os.path.isfile(config_path):
    print("Config model exists")
else:
    print("Config model does not exists")
    wget.download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", ckpt_path)
    
#BERT 모델 불러오기, huggingface의 저장된 로컬 모델을 불러오려면 아래와 같은 방법으로 접근해야 가능.
if os.path.isfile(model_path):
    print("Pretrained model exists")
    model = TFBertForSequenceClassification.from_pretrained(ckpt_path) # 모델 학습 불러오기
else:
    print("Pretrained model does not exists")
    wget.download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5", ckpt_path)
    os.rename(os.path.join(ckpt_path, 'bert-base-multilingual-uncased-config.json'), config_path) # 'bert-base-multilingual-uncased-config.json' -> config.json
    os.rename(os.path.join(ckpt_path, 'bert-base-multilingual-uncased-tf_model.h5'), model_path) # 'bert-base-multilingual-uncased-tf_model.h5' -> tf_model.h5

tokenizer = BertTokenizer(vocab_path) #토크나이저 불러오기
model = TFBertForSequenceClassification.from_pretrained(ckpt_path) # 모델 학습 불러오기
    
# Load dataset, tokenizer, model from pretrained model/vocabulary

# bert_checkpoint_path = "larva-kor-plus-base-cased-pytorch/"
# tokenizer = BertTokenizer.from_pretrained(bert_checkpoint_path)
# model = TFBertForSequenceClassification.from_pretrained(bert_checkpoint_path, from_pt=True)

In [33]:
#random seed 고정

tf.random.set_seed(0)
np.random.seed(0)

BATCH_SIZE = 256
NUM_EPOCHS = 20
VALID_SPLIT = 0.2
MAX_LEN = 50
DATA_OUT_PATH = "data_out/"
model_name = "tf2_bert"

In [5]:
# Special Tokens
print(tokenizer.all_special_tokens, "\n", tokenizer.all_special_ids)

# Test Tokenizers
kor_encode = tokenizer.encode("안녕하세요, 반갑습니다")
eng_encode = tokenizer.encode("Hello world")

kor_decode = tokenizer.decode(kor_encode)
eng_decode = tokenizer.decode(eng_encode)

print(kor_encode)
print(eng_encode)
print(kor_decode)
print(eng_decode)

['[SEP]', '[CLS]', '[UNK]', '[MASK]', '[PAD]'] 
 [102, 101, 100, 103, 0]
[101, 1174, 26646, 49345, 13045, 35132, 25169, 47024, 117, 1170, 26646, 11376, 17360, 13212, 79427, 102]
[101, 29155, 10228, 102]
[CLS] 안녕하세요, 반갑습니다 [SEP]
[CLS] hello world [SEP]


In [7]:
# 데이터 전처리 준비

DATA_PATH = 'data_in/'
DATA_TRAIN_PATH = DATA_PATH + "ratings_train.txt"
DATA_TEST_PATH = DATA_PATH + "ratings_test.txt"

train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\t', quoting = 3)
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [8]:
# 텍스트 전처리

def clean_text(sent):
    sent_clean = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", sent)
    return sent_clean

In [9]:
# train_data = train_data[:1000] # for test

train_data_sents = []
train_data_labels = []

for train_sent, train_label in zip(train_data["document"], train_data["label"]):
    try:
        token_sent = tokenizer.encode(clean_text(train_sent))
        train_data_sents.append(token_sent)
        train_data_labels.append(train_label)
    except Exception as e:
        print(e)
        print(train_sent)
        pass

train_data_sent_pads = np.asarray(pad_sequences(train_data_sents, maxlen=MAX_LEN, padding='post'), dtype=np.int32) # convert into numpy
train_data_labels = np.asarray(train_data_labels, dtype=np.int32) #레이블 토크나이징 리스트

print("num sents, labels {}, {}".format(len(train_data_sent_pads), len(train_data_labels)))

expected string or bytes-like object
nan
expected string or bytes-like object
nan
expected string or bytes-like object
nan
expected string or bytes-like object
nan
expected string or bytes-like object
nan
num sents, labels 149995, 149995


In [10]:
print(train_data_sent_pads[0])

[  101  1174 25539 23236 29234 13045 87550 97082 25539  1176 25539 24937
 13045 16801 72197 47024  1169 70724 22585 13926   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [11]:
# 학습 준비하기
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [12]:
# model.save_pretrained("bert_checkpoint/") # 모델 프리트레인 저장하기
model.summary() #모델 파라메터 수 체크

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  167356416 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 167,357,954
Trainable params: 167,357,954
Non-trainable params: 0
_________________________________________________________________


In [None]:
# overfitting을 막기 위한 ealrystop 추가
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=5)
# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)
# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)\

checkpoint_path = DATA_OUT_PATH + model_name + '/weights.{epoch:02d}-{val_accuracy:.2f}.h5'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create path if exists
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
    
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

# 학습과 eval 시작
history = model.fit(train_data_sent_pads, train_data_labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
                    validation_split = VALID_SPLIT, callbacks=[earlystop_callback, cp_callback], validation_steps=2)

#steps_for_epoch

print(history.history)

data_out/tf2_bert -- Folder already exists 

Train on 119996 samples, validate on 29999 samples
Epoch 1/20
Epoch 00001: val_accuracy improved from -inf to 0.84766, saving model to data_out/tf2_bert/weights.01-0.85.h5
Epoch 2/20
Epoch 00002: val_accuracy did not improve from 0.84766
Epoch 3/20

In [None]:
plot_graphs(history, 'accuracy')

In [None]:
plot_graphs(history, 'loss')

# Test 데이터

In [None]:
test_data = pd.read_csv(DATA_TEST_PATH, header = 0, delimiter = '\t', quoting = 3)
test_data.head()

In [None]:
test_data_sents = []
test_data_labels = []

for test_sent, test_label in zip(test_data["document"], test_data["label"]):
    try:
        token_sent = tokenizer.encode(clean_text(test_sent))
        test_data_sents.append(token_sent)
        test_data_labels.append(test_label)
    except Exception as e:
        print(e)
        print(test_sent)
        pass

test_data_sent_pad = np.asarray(pad_sequences(test_data_sents, maxlen=MAX_LEN, padding='post'), dtype=np.int32) # convert into numpy
test_data_labels = np.asarray(test_data_labels, dtype=np.int32) #레이블 토크나이징 리스트

print("num sents, labels {}, {}".format(len(test_data_sent_pad), len(test_data_labels)))

In [None]:
results = model.evaluate(test_data_sent_pad, test_data_labels)
print("test loss, test acc: ", results)