In [3]:
!pip install keras_bert
!pip install keras_radam
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_bert
  Using cached keras-bert-0.89.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-transformer==0.40.0 (from keras_bert)
  Using cached keras-transformer-0.40.0.tar.gz (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-pos-embd==0.13.0 (from keras-transformer==0.40.0->keras_bert)
  Using cached keras-pos-embd-0.13.0.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-multi-head==0.29.0 (from keras-transformer==0.40.0->keras_bert)
  Using cached keras-multi-head-0.29.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-layer-normalization==0.16.0 (from keras-transformer==0.40.0->keras_bert)
  Using cached keras-layer-normalization-0.16.0.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-position-wise-fee

In [4]:
import os
import tensorflow as tf

import pandas as pd
import numpy as np
import re
import pickle

import keras as keras
from keras.models import load_model
from keras import backend as K
from keras import Input, Model
from keras import optimizers

import codecs
from tqdm import tqdm
import shutil
import warnings
import tensorflow as tf
from keras_bert import load_trained_model_from_checkpoint, load_vocabulary
from keras_bert import Tokenizer
from keras_bert import AdamWarmup, calc_train_steps

from keras_radam import RAdam

In [5]:
# Mount Google Drive at /content/drive; the model and data paths used by the
# following cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Locations of the pretrained BERT assets on the mounted Drive.
# vocab_path is read line by line to build the token dictionary;
# config_path and checkpoint_path are passed to
# load_trained_model_from_checkpoint.
vocab_path = '/content/drive/MyDrive/model/bert/vocab.txt'
config_path = '/content/drive/MyDrive/model/bert/bert_config.json'
checkpoint_path = '/content/drive/MyDrive/model/bert/bert_model.ckpt'

In [7]:
import glob
import json
import random
import zipfile

# Unpack the archived dataset from Drive into the local 'data/xml' directory.
# The JSON samples are later globbed from 'data/xml/data/'.
with zipfile.ZipFile('/content/drive/MyDrive/data/xml/xml.zip', 'r') as zip_ref:
    zip_ref.extractall('data/xml')

In [8]:
def sample_sort_key(path):
    """Sort key that orders sample files by the integer in their file name.

    Paths matching ``processed_sample_<n>.json`` sort by ``n`` (numerically,
    so 2 comes before 10); anything else sorts after them, by name.
    """
    match = re.search(r'processed_sample_(\d+)\.json$', path)
    if match:
        return (0, int(match.group(1)), path)
    return (1, 0, path)


# The documents must be in a stable order: convert_data() pairs text_data[i]
# with metadata['keyword'][i] purely by position. The previous unseeded
# random.shuffle() of the file list made that pairing arbitrary and different
# on every run, so the files are sorted by their numeric sample id instead.
# NOTE(review): this assumes the sample id order matches the row order of
# metadata.csv - confirm against the data. Any shuffling for the train/test
# split should be applied to (text, label) pairs together.
json_files = sorted(
    glob.glob('data/xml/data/processed_sample_*.json'),
    key=sample_sort_key,
)

# Fixed token length used for tokenisation and for the BERT input layer.
SEQ_LEN = 512

# One string per sample file: every string-valued 'text' entry of the file's
# 'form' list, each followed by a single space.
text_data = []

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as file:
        data_dict = json.load(file)
    # Non-string 'text' values are skipped, exactly as before.
    fragments = [
        text_info['text']
        for text_info in data_dict['form']
        if isinstance(text_info['text'], str)
    ]
    text_data.append(''.join(fragment + ' ' for fragment in fragments))

In [9]:
# Spot-check one assembled document (the concatenated text of a single sample).
text_data[130]

'Introduction Enter the subtitle 01 enter the contents The Name\nOf The Rose\n(1986) Cinema and literature You can enter general information describing the page. Double-click the text box to enter the information. Please enter a brief summary about the topic here. Use the right page layout to suit your purposes. You can enter general information describing the page.\xa0\nDouble-click the text box to enter the information Please enter a brief summary about the topic here. Use the right page layout to suit your purposes. You can enter general information describing the page. Double-click the text box to enter the information. Source: Enter the source information. '

In [10]:
class inherit_Tokenizer(Tokenizer):
    """keras_bert ``Tokenizer`` with a custom character-level pre-tokeniser."""

    def _tokenize(self, text):
        """Split ``text`` into word-piece tokens.

        The text is lower-cased when the tokenizer is uncased. Punctuation and
        CJK characters are isolated with surrounding spaces, whitespace
        collapses to a plain space, and NUL, U+FFFD and control characters are
        dropped. Each whitespace-separated word is then word-piece tokenised.
        """
        if not self._cased:
            text = text.lower()

        pieces = []
        for ch in text:
            if self._is_punctuation(ch) or self._is_cjk_character(ch):
                pieces.append(' ' + ch + ' ')
            elif self._is_space(ch):
                pieces.append(' ')
            elif ord(ch) in (0, 0xfffd) or self._is_control(ch):
                # Characters that carry no content are removed.
                continue
            else:
                pieces.append(ch)

        tokens = []
        for word in ''.join(pieces).strip().split():
            tokens.extend(self._word_piece_tokenize(word))
        return tokens

In [11]:
# Build the token -> id map from the vocabulary file.
# The id is the vocabulary line number. The previous `len(token_dict)` id
# stopped advancing whenever two lines collapsed to the same key (possible
# after strip() and the "_" rewrite below), which shifted every later id away
# from its line number. If the same key occurs twice, the later line wins, as
# before.
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for token_id, line in enumerate(reader):
        token = line.strip()
        if "_" in token:
            # Tokens containing "_" are rewritten: every "_" is removed and
            # the remainder is prefixed with "##".
            token = "##" + token.replace("_", "")
        token_dict[token] = token_id

In [12]:
# Tokenizer instance shared by convert_data() and predict_keywords(),
# built on the vocabulary map loaded above.
tokenizer = inherit_Tokenizer(token_dict)

In [13]:
import pandas as pd

# Per-sample metadata table. Its 'keyword' column holds '|'-separated keyword
# strings (split on '|' by the cells below) and supplies the training labels.
metadata = pd.read_csv('/content/drive/MyDrive/data/metadata/metadata.csv')

In [14]:
# Map every distinct keyword to a column index, in first-seen order.
keyword_dict = {}

for keyword in metadata['keyword']:
    # Only string cells carry keywords; anything else is skipped.
    if not isinstance(keyword, str):
        continue
    for word in keyword.split('|'):
        # setdefault keeps the index assigned at first sight of the word.
        keyword_dict.setdefault(word, len(keyword_dict))

In [15]:
def get_one_hot_keyword(keyword, vocab=None):
    """Encode a '|'-separated keyword string as a multi-hot vector.

    Args:
        keyword: Keywords joined with '|'. Any non-string value (e.g. a
            missing cell) yields an all-zero vector.
        vocab: Optional mapping from keyword to column index. Defaults to the
            notebook-level ``keyword_dict`` so existing calls are unchanged.

    Returns:
        A float NumPy array of length ``len(vocab)`` with 1 at the index of
        every keyword present and 0 elsewhere.

    Raises:
        KeyError: If a keyword is not in the vocabulary.
    """
    if vocab is None:
        vocab = keyword_dict
    encoding = np.zeros(len(vocab))
    if isinstance(keyword, str):
        for word in keyword.split('|'):
            encoding[vocab[word]] = 1
    return encoding

In [16]:
def convert_data():
    """Tokenise every document and build the matching label matrix.

    Reads the notebook-level ``text_data``, ``metadata``, ``tokenizer`` and
    ``SEQ_LEN``. Document ``i`` is paired with ``metadata['keyword'][i]``, so
    both must be in the same order.

    Returns:
        A pair ``([token_ids, segment_ids], targets)``. ``token_ids`` is the
        array of encoded documents, ``segment_ids`` is an all-zero array of
        the same shape and dtype, and ``targets`` stacks the multi-hot
        keyword vectors.
    """
    indices, targets = [], []
    for i, text in enumerate(tqdm(text_data)):
        # The segment ids returned by the tokenizer are discarded; an all-zero
        # segment array is built below instead.
        ids, _segments = tokenizer.encode(text, max_len=SEQ_LEN)
        indices.append(ids)
        targets.append(get_one_hot_keyword(metadata['keyword'][i]))

    # The former zip()/zip(*) round-trip did nothing and raised ValueError on
    # an empty corpus, so it is gone.
    indices = np.array(indices)
    return [indices, np.zeros_like(indices)], np.array(targets)

In [17]:
# The first TRAIN_SIZE samples are used for training; the rest are held out.
TRAIN_SIZE = 3000

data_x, data_y = convert_data()

# data_x is [token_ids, segment_ids]; slice both arrays the same way.
train_x = [part[:TRAIN_SIZE] for part in data_x]
train_y = data_y[:TRAIN_SIZE]
test_x = [part[TRAIN_SIZE:] for part in data_x]
test_y = data_y[TRAIN_SIZE:]

100%|██████████| 3013/3013 [00:01<00:00, 1874.79it/s]


In [106]:
# NOTE(review): layer_num is not referenced by any visible cell.
layer_num = 12
# Load the pretrained BERT checkpoint with the input length fixed to SEQ_LEN.
# NOTE(review): get_bert_multilabel_model() picks its feature tensor with
# model.layers[-3], which depends on the layer layout these flags produce -
# confirm against the keras_bert docs before changing them.
model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=True,
    trainable=True,
    seq_len=SEQ_LEN,)

In [19]:
# Number of distinct keywords; used as the width of the sigmoid output layer.
CLASS_NUM = len(keyword_dict)

In [107]:
from tensorflow.keras import backend as K

def f1_score(y_true, y_pred):
    """F1 metric computed over all elements of the given tensors.

    Values are clipped to [0, 1] and rounded before counting, and
    ``K.epsilon()`` guards each division against a zero denominator.
    """
    def _count(tensor):
        # Number of elements that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()
    hits = _count(y_true * y_pred)
    actual = _count(y_true)
    predicted = _count(y_pred)

    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2 * (precision * recall) / (precision + recall + eps)

def dice_loss(y_true, y_pred):
    """Dice loss reduced over the last axis, with +1 smoothing.

    Computes ``1 - (2 * sum(y_true * y_pred) + 1) / (sum(y_true + y_pred) + 1)``,
    where both sums run along the last axis.
    """
    overlap = K.sum(y_true * y_pred, axis=-1)
    total = K.sum(y_true + y_pred, axis=-1)
    dice = (2 * overlap + 1) / (total + 1)
    return 1 - dice


In [109]:
import tensorflow_addons as tfa

def get_bert_multilabel_model(model):
    """Attach a multi-label classification head to ``model`` and compile it.

    The head is a 1024-unit ReLU layer followed by a ``CLASS_NUM``-unit
    sigmoid layer named 'real_output'. The result is compiled with
    RectifiedAdam, ``dice_loss``, and binary-accuracy plus ``f1_score``
    metrics.

    Args:
        model: Base model. Its first two inputs are reused, and the output of
            its third-from-last layer feeds the head.
            NOTE(review): presumably that layer is BERT's pooled sentence
            representation - confirm against the loaded model's summary.

    Returns:
        The compiled Keras model.
    """
    bert_inputs = model.inputs[:2]
    features = model.layers[-3].output

    hidden = keras.layers.Dense(1024, activation='relu')(features)

    output_layer = keras.layers.Dense(
        CLASS_NUM,
        activation='sigmoid',
        kernel_initializer=keras.initializers.TruncatedNormal(stddev=0.02),
        name='real_output',
    )
    outputs = output_layer(hidden)

    multilabel_model = keras.models.Model(bert_inputs, outputs)

    optimizer = tfa.optimizers.RectifiedAdam(learning_rate=0.00001, weight_decay=0.0025)
    binary_accuracy = tf.keras.metrics.BinaryAccuracy(threshold=0.5)
    multilabel_model.compile(
        optimizer=optimizer,
        loss=dice_loss,
        metrics=[binary_accuracy, f1_score],
    )

    return multilabel_model


In [110]:
# Build and compile the classifier on top of the loaded BERT model.
multilabel_model = get_bert_multilabel_model(model)

In [111]:
import tensorflow as tf

# Fetch the list of GPU devices visible to TensorFlow.
gpus = tf.config.experimental.list_physical_devices('GPU')

# Say so when there is no GPU; otherwise print each device with its index.
if not gpus:
    print('No GPUs detected.')
for i, gpu in enumerate(gpus):
    print(f'GPU {i}: {gpu}')


GPU 0: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [112]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# epochs=100 is only an upper bound; early stopping normally ends the run
# sooner. Validation uses the held-out slice (samples from index 3000 on).
history = multilabel_model.fit(
    train_x, train_y,
    epochs=100,
    batch_size=16,
    verbose = 1,
    validation_data=(test_x, test_y),
    shuffle=True,
    callbacks=[early_stop]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 15: early stopping


In [84]:
def predict_keywords(input_text, threshold=0.5):
    """Return the keywords the model scores at or above ``threshold``.

    Args:
        input_text: Raw text to classify; encoded to ``SEQ_LEN`` tokens.
        threshold: Minimum score for a keyword to be returned.

    Returns:
        List of keywords in ``keyword_dict`` iteration order.

    Side effects:
        Prints the highest score in the prediction.
    """
    token_ids, _ = tokenizer.encode(input_text, max_len=SEQ_LEN)
    segment_ids = np.zeros_like(token_ids)
    # Add a leading batch axis of size 1 to both model inputs.
    batch = [tf.expand_dims(token_ids, 0), tf.expand_dims(segment_ids, 0)]

    predictions = multilabel_model.predict(batch)
    scores = predictions[0]
    print(max(scores))

    selected = []
    for word, column in keyword_dict.items():
        if scores[column] >= threshold:
            selected.append(word)
    return selected


In [158]:

# Compare predicted and reference keywords for one randomly chosen sample.
# NOTE(review): randint(1, 3000) is inclusive at both ends, so index 0 is
# never drawn and index 3000 (the first held-out sample) can be - confirm the
# intended range.
i = random.randint(1, 3000)
pred = predict_keywords(text_data[i])
print(pred)
keyword = metadata['keyword'][i]
# A row without keywords is not a str. Normalise it to an empty list so the
# set intersection below does not raise TypeError.
keyword = keyword.split('|') if isinstance(keyword, str) else []
print(keyword)

# Keywords that are both predicted and present in the reference labels.
print(set(pred) & set(keyword))

0.9999987
['심플', '안내', '사진', '화이트', '강의', '과제', '교육', '학교', '학생', '깔끔', '심플한', '프레임', '이벤트', '디자인', '일러스트', '회사', '귀여운', '표', '홍보', '대학생', '마케팅', '라인', '비즈니스', '소개', '보고서', '도형', '모던', '제안서', '강조', '기업', '아이콘', '카드뉴스', '그래프', '발표', '피피티', '프레젠테이션', 'PPT', '파워포인트', '프리젠테이션', '조별과제', 'ppt', '기획서', '포트폴리오', '사업', '목록', '분할', '플랫한', '상세페이지']
['발표', '안내', '직장', '패션', '회사', '대학생', '보고서', '프로젝트', '프레젠테이션', 'ppt', 'PPT', '발표', '안내', '직장', '패션', '회사', '대학생', '보고서', '프로젝트', '프레젠테이션', 'ppt', 'PPT', '발표', '안내', '직장', '패션', '회사', '대학생', '보고서', '프로젝트', '프레젠테이션', 'ppt', 'PPT', '발표', '안내', '직장', '패션', '회사', '대학생', '보고서', '프로젝트', '프레젠테이션', 'ppt', 'PPT', '발표', '안내', '직장', '패션', '회사', '대학생', '보고서', '프로젝트', '프레젠테이션', 'ppt', 'PPT', '발표', '안내', '직장', '패션', '회사', '대학생', '보고서', '프로젝트', '프레젠테이션', 'ppt', 'PPT']
{'발표', '보고서', '프레젠테이션', '대학생', 'ppt', '회사', 'PPT', '안내'}


In [159]:
# Sanity check: shape of the training token-id matrix.
print(train_x[0].shape)

(3000, 512)
