In [1]:
import os
import wget

import numpy as np
import tensorflow as tf

from tokenization_openai import OpenAIGPTTokenizer
from modeling_tf_openai import TFOpenAIGPTLMHeadModel
from tokenization_utils import PreTrainedTokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk.tokenize import sent_tokenize

In [2]:
# Local directory holding the OpenAI GPT tokenizer files, config and TF weights.
ckpt_path = 'gpt_ckpt'
vocab_path = os.path.join(ckpt_path, 'vocab.json')
merges_path = os.path.join(ckpt_path, 'merges.txt')
config_path = os.path.join(ckpt_path, 'config.json')
model_path = os.path.join(ckpt_path, 'tf_model.h5')

# Create the checkpoint directory on first use.
os.makedirs(ckpt_path, exist_ok=True)


def _download_if_missing(url, target_path):
    """Download ``url`` into ``ckpt_path`` and move it to ``target_path``.

    Does nothing when ``target_path`` already exists, so only the files that
    are actually missing are fetched.
    """
    if os.path.isfile(target_path):
        return
    wget.download(url, ckpt_path)
    # The download lands in ckpt_path under its remote file name; rename it to
    # the local name used by this notebook.
    os.rename(os.path.join(ckpt_path, os.path.basename(url)), target_path)


# Tokenizer files: the vocabulary and the merges file are both required.
if os.path.isfile(vocab_path) and os.path.isfile(merges_path):
    print("vocab exists")
else:
    print("vocab does not exists")
    _download_if_missing("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", vocab_path)
    _download_if_missing("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", merges_path)

# Load the tokenizer whether the files were already present or just downloaded,
# so `tokenizer` is always defined for the following cells.
tokenizer = OpenAIGPTTokenizer.from_pretrained(ckpt_path)

# GPT config file.
if os.path.isfile(config_path):
    print("Config model exists")
else:
    print("Config model does not exists")
    _download_if_missing("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json", config_path)

# GPT pretrained weights. Loading a locally stored Hugging Face model is done by
# pointing from_pretrained() at the directory that contains config.json and tf_model.h5.
if os.path.isfile(model_path):
    print("Pretrained model exists")
else:
    print("Pretrained model does not exists")
    _download_if_missing("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5", model_path)

# Load the model on both paths so `model` is always defined for the following cells.
model = TFOpenAIGPTLMHeadModel.from_pretrained(ckpt_path)


ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


vocab exists
gpt_ckpt/vocab.json
Config model exists
Pretrained model exists


In [3]:
# Register '_' as the tokenizer's padding token; the data-preparation cell calls
# tokenizer.encode(..., pad_to_max_length=True), which presumably pads with this token.
# NOTE(review): loss_function / accuracy_function mask label id 0 — confirm that '_'
# maps to id 0 in this vocabulary, otherwise padded positions are not masked.
tokenizer.pad_token = '_'

In [4]:
# Read one training sentence per line. The context manager closes the file, and
# only the trailing newline is stripped, so a final line without a newline
# keeps its last character.
with open('train.txt') as train_file:
    sents = [line.rstrip('\n') for line in train_file]

input_data = list()
output_data = list()

for s in sents:
    # Every sentence is encoded to exactly 21 ids (max_length=21 with padding enabled).
    tokens = tokenizer.encode(s, max_length=21, add_special_tokens=True, pad_to_max_length=True)
    # Shift by one position: the model sees tokens[:-1] and is trained to
    # predict tokens[1:] (next-token targets), giving 20 ids on each side.
    input_data.append(tokens[:-1])
    output_data.append(tokens[1:])

input_data = np.array(input_data, dtype=np.int64)
output_data = np.array(output_data, dtype=np.int64)

In [5]:
# Cross-entropy computed on raw logits with reduction disabled, so the
# per-element losses can be masked by loss_function before being reduced.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)

# Stateful accuracy metric that accuracy_function updates on every call.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def loss_function(real, pred):
    """Cross-entropy averaged over the positions whose label is not 0.

    Args:
        real: integer label ids; positions equal to 0 are excluded.
        pred: logits scored against ``real`` by ``loss_object``.

    Returns:
        Scalar tensor: the summed loss of the unmasked positions divided by
        the number of unmasked positions (0 when every position is masked).
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # Normalise by the number of unmasked positions. A plain reduce_mean would
    # also divide by the masked positions and shrink the loss as the amount of
    # masking grows.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

def accuracy_function(real, pred):
    """Accuracy over the positions whose label is not 0.

    Args:
        real: integer label ids; positions equal to 0 are excluded.
        pred: logits whose last axis is compared (via argmax) against ``real``.

    Returns:
        Scalar tensor with the current value of the ``train_accuracy`` metric
        after it has been updated with this batch.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=pred.dtype)

    # Exclude masked positions through sample_weight. Zeroing their logits
    # instead makes argmax return 0, which equals the masked label 0, so every
    # masked position would be counted as a correct prediction.
    acc = train_accuracy(real, pred, sample_weight=mask)

    return tf.reduce_mean(acc)

In [6]:
# Train with Adam (learning rate 1e-4), using the loss and accuracy functions
# defined in this notebook.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=loss_function,
    metrics=[accuracy_function],
)

In [7]:
# Early stopping (to guard against overfitting) is currently disabled. To enable
# it, create the callback below and pass `callbacks=[earlystop_callback]` to fit():
# earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=1)
#   min_delta: the threshold that triggers termination (accuracy must improve by at least 0.0001)
#   patience: number of epochs without improvement that is tolerated
#             (patience=1: stop once there has been one epoch without improvement)

history = model.fit(
    input_data,
    output_data,
    batch_size=16,
    epochs=1,
    validation_split=0.1,
)

# Alternative kept for reference: fitting on a `dataset` object instead of arrays.
# history = model.fit(dataset, epochs=2)

Train on 172 samples, validate on 20 samples


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




In [11]:
# Greedy generation: repeatedly feed the text so far through the model and
# append the highest-scoring token for the last position.
text = 'when'
gen_count = 10

for _ in range(gen_count):
    # Re-encode the whole text and add a leading batch axis of size 1.
    token_ids = tokenizer.encode(text, add_special_tokens=True)
    input_ids = tf.expand_dims(tf.constant(token_ids), axis=0)

    outputs = model(input_ids)
    scores = outputs[0]  # first model output; argmax over its last axis picks a token id per position

    best_ids = tf.argmax(scores, axis=-1).numpy()[0]
    text = text + ' ' + tokenizer.decode([best_ids[-1]])

print(text)

when i was a girl , i had a great deal
