# MLP - Tensorflow
---
# Order
## 0. Library
## 1. Load Data
## 2. Clean Data
## 3. Make Vocab & Vectorize
## 4. Modeling
## 5. Submit
---

In [None]:
# Shell escape: print the NVIDIA driver's GPU status report before anything else runs.
!nvidia-smi

# 0. Library

In [None]:
# Standard library
import os

# 1. Load Data
from Closed import load_data
import pandas as pd
import numpy as np

# 2. Clean Data
import hanja
import re

# 3. Make Vocab & Vectorize
from tokenizers import ByteLevelBPETokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 4. Modeling
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
tf.compat.v1.enable_eager_execution()
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold

# Restrict this process to CUDA device 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Turn on memory growth for every GPU TensorFlow can see. With no GPU the
# loop body simply never runs. A RuntimeError from TensorFlow is printed
# rather than raised, so the notebook keeps going.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    print(e)



# 5. Submission
from Closed import get_token
from dacon_submit_api import dacon_submit_api 

# 6. Others
import warnings
# Install "ignore" filters so no warnings are shown for the rest of the notebook.
# NOTE(review): both calls register an "ignore" action with default arguments;
# one of them is probably redundant.
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')


# 1. Load Data

In [None]:
# Seed for the row shuffle. Without it every run orders `train` differently,
# so the seeded StratifiedKFold further down would still produce different
# folds from run to run.
SHUFFLE_SEED = 20211209

# load_data() returns the training frame, the test frame and the submission template.
train, test, sample = load_data()

# Shuffle the training rows reproducibly, rebuild a clean 0..n-1 index, and
# drop the original 'index' column from both frames.
train = (
    train.sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
    .drop('index', axis=1)
)
test = test.drop('index', axis=1)
display(train.head())

# 2. Clean Data

In [None]:
# Translate Chinese -> Korean
def replace_all(text, dic):
    """Apply every substitution in ``dic`` to ``text`` and return the result.

    Each key found in ``text`` is replaced by its value followed by a single
    space. Substitutions run in the dictionary's iteration order, and each
    one operates on the output of the previous one.

    Args:
        text: String to rewrite.
        dic: Mapping of substring -> replacement string.

    Returns:
        The rewritten string.
    """
    result = text
    for needle in dic:
        result = result.replace(needle, dic[needle] + ' ')
    return result

# Single-character Hanja abbreviations found in the titles, mapped to the
# Korean word each one is rewritten to. Insertion order is kept because
# replace_all applies the entries sequentially.
d = {
    "中": "중국",
    "美": "미국",
    "北": "북한",
    '日': "일본",
    '英': '영국',
    '行': '행',
    '靑': '청와대',
    '朴': '박',
    '銀': '은행',
    '與': '여당',
    '文': '문',
    '野': '야당',
    '獨': '독일',
    '伊': '이탈리아',
    '韓': '한국',
    '佛': '프랑스',
    '前': '전',
    '檢': '검찰',
    '軍': '군',
    '安': '안철수',
    '南': '남한',
    '亞': '아시아',
    '展': '전시회',
    '重': '차',
    '株': '주식',
    '詩': '시',
}

# Rewrite the 'title' column of both frames with the same mapping.
for frame in (train, test):
    frame['title'] = frame['title'].apply(lambda x: replace_all(x, d))

# 3. Make Vocab & Vectorize

In [None]:
# Write the training titles to disk, one per line, as the tokenizer's
# training corpus. The `with` block closes the file, so no explicit close()
# is needed. Iterating the column avoids building a Series per row the way
# iterrows() does.
with open('./train_text.txt', 'w', encoding = 'utf-8') as f:
    for title in train['title']:
        f.write(title + '\n')

# Train a byte-level BPE tokenizer on the training titles only.
vocab_size = 15000
tokenizer = ByteLevelBPETokenizer()
tokenizer.train('./train_text.txt', vocab_size = vocab_size, min_frequency = 2)

# Encode every title into its list of token ids.
train_tokens = [tokenizer.encode(title).ids for title in train['title']]
test_tokens = [tokenizer.encode(title).ids for title in test['title']]

In [None]:
# Multi-hot feature matrices: one row per title, one column per vocabulary id.
# float32 halves the memory of NumPy's default float64; the matrices only
# ever hold 0 and 1, so nothing is lost.
# NOTE(review): assumes Keras runs with its default float32 input dtype — confirm.
train_onehot = np.zeros(shape = (len(train_tokens), vocab_size), dtype = np.float32)
test_onehot = np.zeros(shape = (len(test_tokens), vocab_size), dtype = np.float32)

In [None]:
def _mark_tokens(matrix, token_lists):
    """Set matrix[row, token_id] = 1 for every token id in each row's list."""
    for row, token_ids in enumerate(token_lists):
        for token_id in token_ids:
            matrix[row, token_id] = 1


# Flag which vocabulary ids occur in each title (presence only, not counts).
_mark_tokens(train_onehot, train_tokens)
_mark_tokens(test_onehot, test_tokens)

# 4. Modeling

In [None]:
def fc(x, unit, dr):
    """Append one fully connected block to a Keras tensor.

    The block is Dense -> BatchNormalization -> ReLU -> Dropout.

    Args:
        x: Input Keras tensor.
        unit: Number of units in the Dense layer.
        dr: Dropout rate applied after the activation.

    Returns:
        The output tensor of the Dropout layer.
    """
    projected = Dense(unit)(x)
    normalized = BatchNormalization()(projected)
    activated = ReLU()(normalized)
    return Dropout(dr)(activated)

def recall(y_target, y_pred):
    """Recall metric: true positives / (true positives + false negatives).

    Both tensors are clipped to [0, 1] and rounded, so every element becomes
    a hard 0 (negative) or 1 (positive). Counts are then summed over all
    elements of the tensors.

    Args:
        y_target: Ground-truth tensor.
        y_pred: Prediction tensor.

    Returns:
        A single tensor value holding the recall.
    """
    # Binarize targets and predictions.
    actual = K.round(K.clip(y_target, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))

    # True positives: elements that are 1 in both tensors.
    true_positives = K.sum(actual * predicted)

    # True positives + false negatives = every element that is actually 1.
    actual_positives = K.sum(actual)

    # K.epsilon() keeps the division defined when there are no actual positives.
    return true_positives / (actual_positives + K.epsilon())

def precision(y_target, y_pred):
    """Precision metric: true positives / (true positives + false positives).

    Mirrors ``recall``: both tensors are clipped to [0, 1] and rounded to
    hard 0/1 values, then counts are summed over all elements.

    Args:
        y_target: Ground-truth tensor.
        y_pred: Prediction tensor.

    Returns:
        A single tensor value holding the precision.
    """
    y_target_yn = K.round(K.clip(y_target, 0, 1))
    y_pred_yn = K.round(K.clip(y_pred, 0, 1))

    # True positives: elements that are 1 in both tensors.
    count_true_positive = K.sum(y_target_yn * y_pred_yn)

    # True positives + false positives = every element predicted as 1.
    count_true_positive_false_positive = K.sum(y_pred_yn)

    # K.epsilon() keeps the division defined when nothing is predicted positive.
    return count_true_positive / (count_true_positive_false_positive + K.epsilon())


def f1score(y_target, y_pred):
    """F1 metric: harmonic mean of ``recall`` and ``precision``.

    ``precision`` is defined directly above because this function called it
    but no such name existed in the notebook, so the call raised NameError.

    Args:
        y_target: Ground-truth tensor.
        y_pred: Prediction tensor.

    Returns:
        A single tensor value holding the F1 score.
    """
    _recall = recall(y_target, y_pred)
    _precision = precision(y_target, y_pred)
    # K.epsilon() keeps the division defined when recall and precision are both 0.
    _f1score = (2 * _recall * _precision) / (_recall + _precision + K.epsilon())

    return _f1score

In [None]:
def create_model():
    """Build and compile the MLP classifier.

    The input has ``vocab_size`` features. It passes through five ``fc``
    blocks (2048, 512, 128, 32 and 8 units, each with dropout 0.2), then a
    7-unit Dense layer, BatchNormalization and Softmax.

    Returns:
        A compiled Keras ``Model`` using Adam(0.1), categorical
        cross-entropy loss and the accuracy metric.
    """
    inputs = Input(shape = (vocab_size,))

    # Stack of progressively narrower fully connected blocks.
    hidden = inputs
    for units in (2048, 512, 128, 32, 8):
        hidden = fc(hidden, units, 0.2)

    # Classification head: 7 logits, batch-normalized, then softmax.
    logits = Dense(7)(hidden)
    normalized_logits = BatchNormalization()(logits)
    outputs = Softmax()(normalized_logits)

    model = Model(inputs, outputs)
    model.compile(
        optimizer = tf.keras.optimizers.Adam(0.1),
        loss = tf.keras.losses.CategoricalCrossentropy(),
        metrics = ['accuracy'],
    )
    return model

In [None]:
# One-hot encode the integer topic labels for the categorical cross-entropy loss.
y_train = tf.keras.utils.to_categorical(train['topic_idx'])

preds = []
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 20211209)
for train_idx, valid_idx in skf.split(train.index, train['topic_idx']) :
    # Remove the previous fold's checkpoint so this fold cannot load stale weights.
    if os.path.exists("model.h5") :
        os.remove("model.h5")

    # Build fresh callbacks for every fold. ModelCheckpoint keeps its best
    # score for the lifetime of the object, so a shared instance would skip
    # saving in any fold that never beats an earlier fold's val_loss, and
    # load_weights below would then fail on the missing file.
    # 'mode' was misspelled as 'model' in the original call.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20, verbose = 0, min_delta=0.000001)
    early = EarlyStopping(monitor = 'val_loss', patience = 50, verbose = 0)
    mck = ModelCheckpoint(filepath='model.h5', monitor='val_loss', save_best_only=True, verbose = 0, mode = 'min')
    callbacks = [reduce_lr, early, mck]

    model = create_model()
    X_tr = train_onehot[train_idx]
    X_val = train_onehot[valid_idx]

    y_tr = y_train[train_idx]
    y_val = y_train[valid_idx]

    # The epoch count is only an upper bound; EarlyStopping ends training.
    history = model.fit(X_tr, y_tr,
              batch_size = 12200,
              epochs = 30000,
              validation_data = (X_val, y_val),
              callbacks = callbacks,
              verbose = 0)
    # Best validation accuracy seen during this fold.
    print(max(history.history['val_accuracy']))
    print('')

    # Restore the lowest-val_loss weights before predicting on the test set.
    model.load_weights('model.h5')

    pred = model.predict(test_onehot)
    preds.append(pred)

# 5. Submit

In [None]:
save_path = 'MLP-Tensorflow.csv'

# Average the per-fold class probabilities, then take the most probable
# class for each test row.
pred = np.mean(preds, axis = 0)
final_pred = pred.argmax(axis = 1)

# Write the predicted labels into the submission template and save it.
sample['topic_idx'] = final_pred
sample.to_csv(save_path, index = False)

In [None]:
# Fetch the API token from the local `Closed` helper, so the credential is
# not hard-coded in this notebook.
token = get_token()
# Upload the CSV written to `save_path`.
# NOTE(review): the three trailing positional arguments look like a
# competition id, a team/user name and a submission memo — confirm against
# the dacon_submit_api documentation.
result = dacon_submit_api.post_submission_file(
                                                save_path, 
                                                token, 
                                                235747, 
                                                'Jay Hong', 
                                                'MLP-tensorflow')