# 플랫폼 업로드를 쉽게 하기 위한 로컬 개발 코드
- T3Q.ai(T3Q.cep + T3Q.dl): 빅데이터/인공지능 통합 플랫폼
- 플랫폼 업로드를 쉽게 하기 위하여 로컬에서 아래의 코드(파일 1)를 개발한다.
- 파일 1(파일명): 1_local_platform_text_generation.ipynb

### 전처리 객체 또는 학습모델 객체
- 전처리 객체나 학습모델 객체는 meta_data 폴더 아래에 저장한다.

### 데이터셋(학습 데이터/테스트 데이터)
- 학습과 테스트에 사용되는 데이터를 나누어 관리한다.
- 학습 데이터: dataset 폴더 아래에 저장하거나 dataset.zip 파일 형태로 저장한다.
- 테스트 데이터: test_dataset 폴더 아래에 저장하거나 test_dataset.zip 파일 형태로 저장한다.

### 로컬 개발 워크플로우(workflow)  
- 로컬 개발 워크플로우를 다음의 4단계로 분리한다.

1. 데이터셋 준비(Data Setup)
- 로컬 저장소에서 전처리 및 학습에 필요한 학습 데이터셋을 준비한다.

2. 데이터 전처리(Data Preprocessing)
- 데이터셋의 분석 및 정규화(Normalization) 등의 전처리를 수행한다.
- 데이터를 모델 학습에 사용할 수 있도록 가공한다.
- 추론과정에서 필요한 경우, 데이터 전처리에 사용된 객체를 meta_data 폴더 아래에 저장한다.

3. 학습 모델 훈련(Train Model)
- 데이터를 훈련에 사용할 수 있도록 가공한 뒤에 학습 모델을 구성한다. 
- 학습 모델을 준비된 데이터셋으로 훈련시킨다.
- 정확도(Accuracy)나 손실(Loss) 등 학습 모델의 성능을 검증한다.
- 학습 모델의 성능 검증 후, 학습 모델을 배포한다.
- 배포할 학습 모델을 meta_data 폴더 아래에 저장한다.

4. 추론(Inference)
- 저장된 전처리 객체나 학습 모델 객체를 준비한다.
- 추론에 필요한 테스트 데이터셋을 준비한다.
- 배포된 학습 모델을 통해 테스트 데이터에 대한 추론을 진행한다. 

In [1]:
from IPython.display import Image
#Image(filename='./T3Q.ai.jpg')

# 인공지능 통합플랫폼(T3Q.ai) 프로세스를 이해하고 인공지능 쉽게 하기

In [2]:
# 파일명: text_generation_preprocess.py

'''
from text_generation_preprocess_sub import exec_process
'''
import logging

logging.basicConfig(level=logging.INFO)

def process_for_train(pm):
    """Run the training-time preprocessing step.

    All of the work is delegated to ``exec_process``; this wrapper only adds
    a closing log line.

    Args:
        pm: Object forwarded untouched to ``exec_process``.
    """
    exec_process(pm)
    end_message = '[hunmin log] the end line of the function [process_for_train]'
    logging.info(end_message)
    
    
def init_svc(im, rule):
    """Return the service parameters for the preprocessing step.

    Neither argument is used; a new empty dict is returned on every call.
    """
    return dict()


def transform(df, params, batch_id):
    """Log the incoming data and hand it back unchanged.

    Args:
        df: Input object; it must expose a ``shape`` attribute because the
            shape is logged.
        params: Not used by this function.
        batch_id: Not used by this function.

    Returns:
        The very same ``df`` object that was passed in.
    """
    logging.info(f'[hunmin log] df : {df}')
    logging.info(f'[hunmin log] df.shape : {df.shape}')
    logging.info(f'[hunmin log] type(df) : {type(df)}')
    logging.info('[hunmin log] the end line of the function [transform]')

    return df

In [3]:
# 파일명: text_generation_preprocess_sub.py

import os
import numpy as np
import pandas as pd
import zipfile
import logging


def exec_process(pm):
    """Extract ``dataset.zip`` from ``pm.source_path`` into ``pm.target_path``.

    The contents of the source directory are logged before extraction and the
    contents of the target directory are logged afterwards, both through
    ``list_files_directories``.

    Args:
        pm: Object exposing ``source_path`` (directory that contains
            ``dataset.zip``) and ``target_path`` (directory to extract into).

    Raises:
        FileNotFoundError: If ``dataset.zip`` does not exist in
            ``pm.source_path`` (raised when the archive is opened).
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    logging.info('[hunmin log]  the start line of the function [exec_process]')

    logging.info('[hunmin log] pm.source_path : {}'.format(pm.source_path))

    # Show what is available before extracting.
    list_files_directories(pm.source_path)

    # Unpack <source_path>/dataset.zip into <target_path>. The context manager
    # closes the archive even when extraction fails part-way through.
    my_zip_path = os.path.join(pm.source_path, 'dataset.zip')
    with zipfile.ZipFile(my_zip_path) as archive:
        archive.extractall(pm.target_path)

    # Show what ended up in the target directory.
    list_files_directories(pm.target_path)

    logging.info('[hunmin log]  the finish line of the function [exec_process]')



# 저장 파일 확인
def list_files_directories(path):
    """Log the names of the entries found directly inside ``path``.

    Args:
        path: Directory to inspect; passed as-is to ``os.listdir``.
    """
    entries = os.listdir(path)
    logging.info(f'[hunmin log] Files and directories in {path} :')
    logging.info(f'[hunmin log] dir_list : {entries}')
    
    

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [4]:
# 파일명: text_generation_train.py

'''
from text_generation_train_sub import exec_train, exec_init_svc, exec_inference
'''
import logging


def train(tm):
    """Train the model by delegating to ``exec_train``.

    Args:
        tm: Object forwarded untouched to ``exec_train``.
    """
    exec_train(tm)
    end_message = '[hunmin log] the end line of the function [train]'
    logging.info(end_message)


def init_svc(im):
    """Prepare the inference-time parameters.

    Args:
        im: Object forwarded untouched to ``exec_init_svc``.

    Returns:
        A new dict holding the same key/value pairs that ``exec_init_svc``
        returned.
    """
    loaded = exec_init_svc(im)
    logging.info('[hunmin log] the end line of the function [init_svc]')

    # Shallow copy so the caller gets its own dict object.
    return {**loaded}


def inference(df, params, batch_id):
    """Run inference by delegating to ``exec_inference``.

    Args:
        df: Input data, forwarded untouched.
        params: Parameters dict, forwarded untouched.
        batch_id: Batch identifier, forwarded untouched.

    Returns:
        A new dict holding the same key/value pairs that ``exec_inference``
        returned.
    """
    outcome = exec_inference(df, params, batch_id)
    logging.info('[hunmin log] the end line of the function [inference]')

    # Shallow copy so the caller gets its own dict object.
    return {**outcome}


In [5]:
# 파일명: text_generation_train_sub.py

# Imports
import tensorflow as tf
import numpy as np
import os
import pickle
import logging

# Record which TensorFlow version is in use.
logging.info('[hunmin log] tensorflow ver : {}'.format(tf.__version__))

# Index of the GPU that should be visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# NOTE: `gpus` is a module-level name that exec_train reads later to choose
# the batch size and whether to use a distribution strategy.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus, 'GPU')
        logging.info('[hunmin log] gpu set complete')
        logging.info(f'[hunmin log] num of gpu: {len(gpus)}')

    except RuntimeError as err:
        # The failure is only logged; execution continues afterwards.
        logging.info('[hunmin log] gpu set failed')
        logging.info(err)
        
        
def exec_train(tm):
    """Train the character-level text-generation model and checkpoint it.

    Steps:
      1. Read ``dataset/shakespeare.txt`` located under ``tm.train_data_path``.
      2. Build the character vocabulary, pickle it to ``vocabulary.p`` under
         ``tm.model_path`` (it is needed again at inference time) and turn the
         text into shuffled, batched (input, target) id sequences.
      3. Build and compile ``MyModel`` -- inside a ``MirroredStrategy`` scope
         when two or more GPUs are listed in the module-level ``gpus`` -- and
         fit it with a ``ModelCheckpoint`` callback that writes weights to
         ``<tm.model_path>/training_checkpoints/last_ckpt``.

    Args:
        tm: Object exposing ``train_data_path`` and ``model_path``.
    """
    logging.info('[hunmin log] the start line of the function [exec_train]')

    logging.info('[hunmin log] tm.train_data_path : {}'.format(tm.train_data_path))

    # Show what is available in the training-data directory.
    list_files_directories(tm.train_data_path)

    ###########################################################################
    ## 1. Data setup
    ###########################################################################
    logging.info('[hunmin log] data load')

    path_to_file = os.path.join(tm.train_data_path, 'dataset/shakespeare.txt')
    logging.info('[hunmin log] file path : {}'.format(path_to_file))

    # Read raw bytes and decode explicitly (no newline translation); the
    # context manager makes sure the file handle is closed.
    with open(path_to_file, 'rb') as text_file:
        text = text_file.read().decode(encoding='utf-8')
    logging.info('[hunmin log] loaded data check (text[:100]) : {}'.format(text[:100]))

    ###########################################################################
    ## 2. Data preprocessing
    ###########################################################################
    vocab = sorted(set(text))

    # Persist the vocabulary so inference can rebuild the same lookup layers.
    with open(os.path.join(tm.model_path, 'vocabulary.p'), 'wb') as f:
        pickle.dump(vocab, f)

    # Map characters to integer ids.
    ids_from_chars = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)
    all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
    ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

    # Chunks of seq_length + 1 ids; split_input_target turns each chunk into
    # an (input, target) pair shifted by one position.
    seq_length = 100
    sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
    dataset = sequences.map(split_input_target)

    # 64 examples per visible GPU, or 64 in total when no GPU is listed.
    BATCH_SIZE = 64 * max(len(gpus), 1)
    BUFFER_SIZE = 10000

    dataset = (dataset
            .shuffle(BUFFER_SIZE)
            .batch(BATCH_SIZE, drop_remainder=True)
            .prefetch(tf.data.experimental.AUTOTUNE))

    ###########################################################################
    ## 3. Train model
    ###########################################################################

    # The embedding dimension
    embedding_dim = 256
    # Number of RNN units
    rnn_units = 1024

    def build_compiled_model():
        """Create MyModel and compile it; shared by both device setups."""
        new_model = MyModel(
                    # Be sure the vocabulary size matches the `StringLookup` layers.
                    vocab_size=len(ids_from_chars.get_vocabulary()),
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units)

        # The model outputs logits over the next character, so a sparse
        # multi-class cross-entropy loss on logits is used.
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
        new_model.compile(optimizer='adam', loss=loss)
        return new_model

    if len(gpus) < 2:
        # Single GPU or CPU training.
        model = build_compiled_model()
    else:
        # Multi-GPU: the model has to be created inside the strategy scope.
        strategy = tf.distribute.MirroredStrategy()
        logging.info('[hunmin log] gpu devices num {}'.format(strategy.num_replicas_in_sync))
        with strategy.scope():
            model = build_compiled_model()

    # Directory where the checkpoints will be saved
    checkpoint_dir = os.path.join(tm.model_path, 'training_checkpoints')

    # Weights-only checkpoint written to a fixed prefix ("last_ckpt").
    checkpoint_prefix = os.path.join(checkpoint_dir, "last_ckpt")
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                                        filepath=checkpoint_prefix,
                                        save_weights_only=True)

    EPOCHS = 50
    # `history` is kept for the (currently disabled) platform visualization below.
    history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

    logging.info('[hunmin log] model.summary() : ')
    model.summary(print_fn=logging.info)

    ###########################################################################
    ## Platform visualization
    ###########################################################################
    # Disabled in this local notebook; presumably enabled when the code is
    # uploaded to the platform:
    #     plot_metrics(tm, history)

    # Show what was written to the model directory.
    list_files_directories(tm.model_path)

    logging.info('[hunmin log]  the finish line of the function [exec_train]')
    

def exec_init_svc(im):
    """Rebuild the trained model from ``im.model_path`` for text generation.

    Loads the pickled vocabulary, recreates ``MyModel`` with the same
    hyper-parameters that ``exec_train`` uses, restores the latest checkpoint
    from ``<im.model_path>/training_checkpoints`` and wraps the result in a
    ``OneStep`` sampler.

    Args:
        im: Object exposing ``model_path``.

    Returns:
        ``{'model': <OneStep instance>}``.

    Raises:
        FileNotFoundError: If ``vocabulary.p`` is missing or no checkpoint
            can be found in the checkpoint directory.
    """
    logging.info('[hunmin log] im.model_path : {}'.format(im.model_path))

    # Show what is available in the model directory.
    list_files_directories(im.model_path)

    ###########################################################################
    ## Prepare the trained model
    ###########################################################################
    # NOTE(review): pickle.load can execute arbitrary code; this is only safe
    # as long as model_path holds files written by exec_train -- confirm that
    # the directory is trusted in the deployment environment.
    with open(os.path.join(im.model_path, 'vocabulary.p'), 'rb') as f:
        vocab = pickle.load(f)

    # Rebuild the model; the sizes below must match the ones used for training
    # or the checkpoint weights will not fit.
    ids_from_chars = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)
    # The embedding dimension
    embedding_dim = 256
    # Number of RNN units
    rnn_units = 1024

    loaded_model = MyModel(
            # Be sure the vocabulary size matches the `StringLookup` layers.
            vocab_size=len(ids_from_chars.get_vocabulary()),
            embedding_dim=embedding_dim,
            rnn_units=rnn_units)
    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    loaded_model.compile(optimizer='adam', loss=loss)

    # Restore the most recent checkpoint. latest_checkpoint returns None when
    # nothing is found, so fail with a clear message instead of passing None
    # on to load_weights.
    checkpoint_dir = os.path.join(im.model_path, 'training_checkpoints')
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    if latest is None:
        raise FileNotFoundError(
            'no checkpoint found in {}'.format(checkpoint_dir))
    loaded_model.load_weights(latest)

    # Inverse lookup: integer ids back to characters.
    chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

    # Wrap the restored model in a sampler that predicts the text following
    # a given input string.
    loaded_one_step_model = OneStep(loaded_model, chars_from_ids, ids_from_chars)

    return {'model' : loaded_one_step_model}



def exec_inference(df, params, batch_id):
    """Generate 100 characters that continue the text stored in ``df``.

    Args:
        df: Object supporting ``.iloc[0, 0]``; only that single cell is read
            and used as the starting text.
        params: Dict whose ``'model'`` entry provides ``generate_one_step``.
        batch_id: Not used by this function.

    Returns:
        ``{'inference': <starting text followed by the generated characters>}``.
    """
    ###########################################################################
    ## 4. Inference
    ###########################################################################
    logging.info('[hunmin log] the start line of the function [exec_inference]')

    # Sampler prepared by exec_init_svc.
    one_step_model = params['model']

    seed_text = df.iloc[0, 0]
    next_input = tf.constant([seed_text])

    logging.info('[hunmin log] data predict')
    # No recurrent state yet for the first step.
    rnn_state = None
    pieces = [next_input]
    # Each step feeds the previously sampled character back in, 100 times.
    for _ in range(100):
        next_input, rnn_state = one_step_model.generate_one_step(next_input, states=rnn_state)
        pieces.append(next_input)

    # Concatenate seed and sampled characters, then decode to a Python str.
    generated = tf.strings.join(pieces)[0].numpy().decode("utf-8")
    logging.info(f'[hunmin log] inference : {generated}')

    result = {'inference': generated}
    logging.info(f'[hunmin log] result : {result}')

    return result


# 저장 파일 확인
def list_files_directories(path):
    """Log the names of the entries found directly inside ``path``.

    Args:
        path: Directory to inspect; passed as-is to ``os.listdir``.
    """
    entries = os.listdir(path)
    logging.info(f'[hunmin log] Files and directories in {path} :')
    logging.info(f'[hunmin log] dir_list : {entries}')


###########################################################################
## exec_train(tm) 호출 함수 
###########################################################################
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair shifted by one position.

    Args:
        sequence: Any object supporting slicing.

    Returns:
        A tuple ``(sequence[:-1], sequence[1:])``: everything but the last
        element, and everything but the first element.
    """
    return sequence[:-1], sequence[1:]

# 모델 객체 정의
class MyModel(tf.keras.Model):
    """Embedding -> GRU -> Dense model that predicts next-token logits.

    Args:
        vocab_size: Size of the embedding input space and of the output
            logits.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the GRU layer.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        # No positional arguments: the instance must not be passed to the
        # Keras Model constructor a second time.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns the full output sequence plus its final state so
        # the state can be threaded through step-by-step generation.
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Compute logits for ``inputs``.

        Args:
            inputs: Token ids fed to the embedding layer.
            states: Initial GRU state; when ``None`` the GRU's own initial
                state is used.
            return_state: When true, also return the final GRU state.
            training: Forwarded to every layer.

        Returns:
            The dense-layer output, or ``(output, states)`` when
            ``return_state`` is true.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

class OneStep(tf.keras.Model):
    """Sampler that produces one next character per call from a trained model.

    Args:
        model: Model called with ``inputs=``, ``states=`` and
            ``return_state=True``; it must return ``(logits, states)``.
        chars_from_ids: Lookup layer mapping token ids to characters.
        ids_from_chars: Lookup layer mapping characters to token ids.
        temperature: Divisor applied to the logits before sampling.
    """

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Build an additive mask holding -inf at the id of "[UNK]" and 0
        # everywhere else, so "[UNK]" can never be sampled.
        unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
        neg_inf_values = [float('-inf')] * len(unk_ids)
        # The dense shape matches the vocabulary size.
        vocab_len = len(ids_from_chars.get_vocabulary())
        mask_sparse = tf.SparseTensor(
            values=neg_inf_values,
            indices=unk_ids,
            dense_shape=[vocab_len])
        self.prediction_mask = tf.sparse.to_dense(mask_sparse)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for every string in ``inputs``.

        Args:
            inputs: String tensor holding the text generated so far.
            states: Model state from the previous step, or ``None``.

        Returns:
            ``(predicted_chars, states)``: the sampled characters and the
            model state to pass into the next call.
        """
        # Strings -> characters -> token ids (densified from ragged).
        chars = tf.strings.unicode_split(inputs, 'UTF-8')
        ids = self.ids_from_chars(chars).to_tensor()

        # logits.shape is [batch, char, next_char_logits]
        logits, states = self.model(inputs=ids, states=states,
                                    return_state=True)

        # Keep only the prediction for the last position, scale it by the
        # temperature, then add the mask that rules out "[UNK]".
        last_logits = logits[:, -1, :] / self.temperature
        last_logits = last_logits + self.prediction_mask

        # Draw one token id per batch row from the logits.
        sampled_ids = tf.squeeze(
            tf.random.categorical(last_logits, num_samples=1), axis=-1)

        # Token ids -> characters, returned together with the new state.
        return self.chars_from_ids(sampled_ids), states

# 시각화
def plot_metrics(tm, history):
    """Send the recorded training loss values to the platform.

    For every entry of ``history.history['loss']`` one metric dict is passed
    to ``tm.save_stat_metrics``. Accuracy is reported as a constant ``0``;
    the original author left the accuracy lookup commented out.

    Args:
        tm: Object exposing ``save_stat_metrics(metric_dict)``.
        history: Object whose ``history`` attribute is a dict with a
            ``'loss'`` sequence.
    """
    for step, loss in enumerate(history.history['loss']):
        tm.save_stat_metrics({'accuracy': 0, 'loss': loss, 'step': step})

    logging.info('[hunmin log] accuracy and loss curve plot for platform')
    

INFO:root:[hunmin log] tensorflow ver : 2.9.0
INFO:root:[hunmin log] gpu set complete
INFO:root:[hunmin log] num of gpu: 1


In [6]:
# PM 클래스: pm 객체
class PM:
    """Local stand-in for the ``pm`` object handed to preprocessing."""

    def __init__(self) -> None:
        # source_path: where dataset.zip is looked up;
        # target_path: where it is extracted to.
        self.source_path, self.target_path = './', './meta_data'

# TM 클래스: tm 객체
class TM:
    """Local stand-in for the ``tm`` object handed to training."""

    # Class-level dict, shared by every instance.
    param_info = dict()

    def __init__(self) -> None:
        # Training data and model artifacts live in the same local folder.
        meta_dir = './meta_data'
        self.train_data_path = meta_dir
        self.model_path = meta_dir

# IM 클래스: im 객체
class IM:
    """Local stand-in for the ``im`` object handed to service initialization."""

    def __init__(self) -> None:
        # Folder from which the saved vocabulary and checkpoints are loaded.
        self.model_path: str = './meta_data'


# pm object: paths used by the preprocessing step
pm = PM()
print('pm.source_path:', pm.source_path)
print('pm.target_path: ', pm.target_path)

# tm object: paths used by the training step
tm = TM()
print('tm.train_data_path: ', tm.train_data_path)
print('tm.model_path: ', tm.model_path)

# im object: path used when initializing the inference service
im = IM()
print('im.model_path: ', im.model_path)

# Inputs for inference(df, params, batch_id); params is replaced later by
# the result of init_svc(im)
params = {}
batch_id = 0

import io
import pandas as pd

# Single-cell DataFrame holding the text prompt the generated text continues
data = [['ROMEO : ']]
df = pd.DataFrame(data)
print('df: ', df)
print('df.dtypes:', df.dtypes)
# Bare expression: displayed as the cell result when run in a notebook
df.columns

pm.source_path: ./
pm.target_path:  ./meta_data
tm.train_data_path:  ./meta_data
tm.model_path:  ./meta_data
im.model_path:  ./meta_data
df:            0
0  ROMEO : 
df.dtypes: 0    object
dtype: object


RangeIndex(start=0, stop=1, step=1)

In [7]:
%%time
# Step 1: preprocessing (extracts dataset.zip into pm.target_path)
process_for_train(pm)

# Step 2: train the model and write the vocabulary and checkpoints
train(tm)

# Step 3: inference-time preprocessing; the returned frame is not used here
transform(df, params, batch_id)

# Step 4: load the trained model. NOTE(review): with all cells run in order,
# init_svc resolves to the one-argument version from the train cell, which
# was defined after the two-argument preprocessing version.
params = init_svc(im)

# Step 5: generate text from the prompt stored in df
inference(df, params, batch_id)

INFO:root:[hunmin log]  the start line of the function [exec_process]
INFO:root:[hunmin log] pm.source_path : ./
INFO:root:[hunmin log] Files and directories in ./ :
INFO:root:[hunmin log] dir_list : ['.ipynb_checkpoints', '0_local_text_generation.ipynb', '0_local_text_generation_requirement.txt', '1_local_platform_text_generation.ipynb', '2_1_1_platform_text_generation_preprocess.py', '2_1_2_platform_text_generation_preprocess_sub.py', '2_2_1_platform_text_generation_train.py', '2_2_2_platform_text_generation_train_sub.py', 'dataset.zip', 'LICENSE.txt', 'meta_data', 'README.txt', 'T3Q.ai_platform_text_generation', 'test_dataset.zip']
INFO:root:[hunmin log] Files and directories in ./meta_data :
INFO:root:[hunmin log] dir_list : ['dataset', 'test_dataset', 'training_checkpoints', 'vocabulary.p']
INFO:root:[hunmin log]  the finish line of the function [exec_process]
INFO:root:[hunmin log] the end line of the function [process_for_train]
INFO:root:[hunmin log] the start line of the funct

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


INFO:root:[hunmin log] model.summary() : 
INFO:root:Model: "my_model"
INFO:root:_________________________________________________________________
INFO:root: Layer (type)                Output Shape              Param #   
INFO:root: embedding (Embedding)       multiple                  16896     
INFO:root:                                                                 
INFO:root: gru (GRU)                   multiple                  3938304   
INFO:root:                                                                 
INFO:root: dense (Dense)               multiple                  67650     
INFO:root:                                                                 
INFO:root:Total params: 4,022,850
INFO:root:Trainable params: 4,022,850
INFO:root:Non-trainable params: 0
INFO:root:_________________________________________________________________
INFO:root:[hunmin log] Files and directories in ./meta_data :
INFO:root:[hunmin log] dir_list : ['dataset', 'test_dataset', 'training_checkp

Wall time: 10min 5s


{'inference': 'ROMEO : the world and all this profession\nIf that your son shall run by you shall repose\nAnd take it; if it '}