In [1]:
import os
import tensorflow as tf

import pandas as pd
import numpy as np  
import re
import pickle

import keras as keras
from keras.models import load_model
from keras import backend as K
from keras import Input, Model
from keras import optimizers

import codecs
from tqdm import tqdm
import shutil
import warnings
import tensorflow as tf
from keras_bert import load_trained_model_from_checkpoint, load_vocabulary
from keras_bert import Tokenizer
from keras_bert import AdamWarmup, calc_train_steps

from keras_radam import RAdam

In [2]:
# Locations of the pretrained BERT artefacts (vocabulary, architecture config,
# TensorFlow checkpoint prefix) and of the project data directory.
BERT_DIR = 'bert'
vocab_path = BERT_DIR + '/vocab.txt'
config_path = BERT_DIR + '/bert_config.json'
checkpoint_path = BERT_DIR + '/bert_model.ckpt'
data_path = 'data'

In [3]:
import glob
import json
import random
import zipfile

# Unpack the sample archive under data_path so that extraction and the glob
# below always look at the same directory (previously the extraction target
# was a second, hard-coded 'data/text').
text_dir = data_path + '/text'
with zipfile.ZipFile('processed_sample.zip', 'r') as zip_ref:
    zip_ref.extractall(text_dir)

# glob order is filesystem-dependent: sort first, then shuffle with a fixed
# seed so a fresh kernel always produces the same document order.
# NOTE(review): convert_data() labels text_data[i] with metadata['keyword'][i].
# Shuffling the files here only keeps that pairing valid if metadata rows are
# not tied to file order -- confirm how documents map to metadata rows.
SHUFFLE_SEED = 42
json_files = sorted(glob.glob(text_dir + '/processed_sample_*.json'))
random.Random(SHUFFLE_SEED).shuffle(json_files)

# Maximum number of tokens per encoded document.
SEQ_LEN = 512

# One string per JSON file: every string-valued 'text' entry of the file's
# 'form' list, each followed by a single space (non-string entries are skipped).
text_data = []

for json_file in json_files:
    # Explicit UTF-8: the platform default encoding is locale-dependent.
    with open(json_file, 'r', encoding='utf-8') as file:
        data_dict = json.load(file)
    text_data.append(''.join(
        text_info['text'] + ' '
        for text_info in data_dict['form']
        if isinstance(text_info['text'], str)
    ))


In [4]:
# Spot-check: display the concatenated text of one arbitrarily chosen document.
text_data[130]

'Presentation enter a title Please enter the title text here. LIAM SMITH Lecture name '

In [5]:
class inherit_Tokenizer(Tokenizer):
    """Tokenizer with a custom ``_tokenize`` step.

    For an uncased tokenizer the text is only lower-cased here; no other
    normalization is applied before the text is split into words.
    NOTE(review): presumably this override exists to bypass extra Unicode
    normalization in the parent ``Tokenizer._tokenize`` -- confirm against
    the keras_bert implementation.
    """

    def _tokenize(self, text):
        """Split ``text`` into word-piece tokens.

        Punctuation and CJK characters are isolated as stand-alone words,
        whitespace separates words, and NUL, U+FFFD and control characters
        are dropped. Each resulting word is passed to
        ``_word_piece_tokenize`` and the pieces are returned as one list.
        """
        if not self._cased:
            text = text.lower()

        # Collect fragments and join once instead of growing a string.
        fragments = []
        for ch in text:
            if self._is_punctuation(ch) or self._is_cjk_character(ch):
                fragments.append(' ' + ch + ' ')
            elif self._is_space(ch):
                fragments.append(' ')
            elif ord(ch) in (0, 0xfffd) or self._is_control(ch):
                continue
            else:
                fragments.append(ch)

        tokens = []
        for word in ''.join(fragments).strip().split():
            tokens.extend(self._word_piece_tokenize(word))
        return tokens

In [6]:
# Build the token -> id lookup from the vocabulary file (one token per line).
# Each token is mapped to the size of the dictionary at the moment it is
# inserted.
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as vocab_file:
    for vocab_line in vocab_file:
        piece = vocab_line.strip()
        if "_" in piece:
            # Entries containing an underscore are rewritten: every underscore
            # is removed and the result is prefixed with "##".
            piece = "##" + piece.replace("_", "")
        token_dict[piece] = len(token_dict)

In [7]:
# Instantiate the custom tokenizer with the vocabulary mapping in token_dict.
tokenizer = inherit_Tokenizer(token_dict)

In [8]:
import pandas as pd

# Load the metadata table; its 'keyword' column is the label source used below.
# NOTE(review): presumably one row per document -- confirm the row/file mapping.
metadata = pd.read_csv(data_path + '/metadata/metadata.csv')

In [9]:
# Assign an integer id to every distinct keyword, in order of first appearance.
# A 'keyword' cell holds one or more keywords separated by '|'.
keyword_dict = {}

# Iterate over the column values directly: looking rows up with
# metadata['keyword'][i] is label-based and fails if the index is not 0..n-1.
for keyword in metadata['keyword']:
    # Non-string cells (presumably NaN for rows without keywords) are skipped.
    if not isinstance(keyword, str):
        continue
    for word in keyword.split('|'):
        # setdefault only inserts unseen keywords, so existing ids are kept.
        keyword_dict.setdefault(word, len(keyword_dict))

In [10]:
def get_one_hot_keyword(keyword):
    """Encode a '|'-separated keyword string as a 0/1 vector.

    Args:
        keyword: keyword cell value; anything that is not exactly a ``str``
            yields an all-zero vector.

    Returns:
        A float array of length ``len(keyword_dict)`` with a 1 at the index
        of every keyword present in ``keyword``.

    Raises:
        KeyError: if a keyword is missing from ``keyword_dict``.
    """
    encoding = np.zeros(len(keyword_dict))
    if type(keyword) is not str:
        return encoding
    # Resolve all positions first, then switch them on in one assignment.
    positions = [keyword_dict[label] for label in keyword.split('|')]
    encoding[positions] = 1
    return encoding

In [11]:
def convert_data():
    """Tokenize every document and build the matching label matrix.

    Reads the module-level ``text_data``, ``tokenizer``, ``metadata`` and
    ``SEQ_LEN``.

    Returns:
        ``([token_ids, segment_ids], targets)`` where ``token_ids`` is the
        array of encoded documents, ``segment_ids`` is an all-zero array of
        the same shape and dtype, and ``targets`` stacks the vectors returned
        by ``get_one_hot_keyword``. An empty ``text_data`` yields empty arrays
        instead of raising.
    """
    # NOTE(review): document i is labelled with metadata row i, but the file
    # list that produced text_data is shuffled when it is loaded -- confirm
    # that documents and metadata rows are actually aligned.
    indices, targets = [], []
    for i in tqdm(range(len(text_data))):
        # The segment ids returned by the tokenizer are not used; an all-zero
        # segment array is built below instead.
        ids, _ = tokenizer.encode(text_data[i], max_len=SEQ_LEN)
        indices.append(ids)
        targets.append(get_one_hot_keyword(metadata['keyword'][i]))

    indices = np.array(indices)
    return [indices, np.zeros_like(indices)], np.array(targets)

In [20]:
# Encode the whole corpus once, then split it by position.
# NOTE(review): only the first N_TRAIN rows are used for training and all
# remaining rows for validation -- this looks like a smoke-test split; confirm
# it is intended before drawing conclusions from the metrics.
N_TRAIN = 30
data_x, data_y = convert_data()
train_x, test_x = ([part[:N_TRAIN] for part in data_x],
                   [part[N_TRAIN:] for part in data_x])
train_y, test_y = data_y[:N_TRAIN], data_y[N_TRAIN:]

100%|████████████████████████████████████████████████████████████████████████████| 3014/3014 [00:01<00:00, 1542.01it/s]


In [13]:
# Load the pretrained BERT weights as a trainable Keras model whose input
# length is fixed to SEQ_LEN.
# NOTE(review): layer_num is not referenced anywhere in the visible notebook.
layer_num = 12
bert_load_options = dict(training=True, trainable=True, seq_len=SEQ_LEN)
model = load_trained_model_from_checkpoint(
    config_path, checkpoint_path, **bert_load_options
)

In [14]:
# Number of output units of the classifier: one per distinct keyword.
CLASS_NUM = len(keyword_dict)

In [15]:
import tensorflow_addons as tfa

def get_bert_multilabel_model(model, num_classes=None):
    """Attach a sigmoid multi-label head to a BERT model and compile it.

    Args:
        model: Keras model whose first two inputs are reused and whose
            ``layers[-3]`` output feeds the new classification layer.
        num_classes: number of output units. Defaults to the module-level
            ``CLASS_NUM`` when omitted.

    Returns:
        The compiled Keras model (RectifiedAdam optimizer, binary
        cross-entropy loss, per-label binary accuracy reported as
        ``accuracy``).
    """
    if num_classes is None:
        num_classes = CLASS_NUM

    inputs = model.inputs[:2]
    # NOTE(review): the layer is picked by position; presumably this is the
    # pooled sentence representation -- confirm with model.summary().
    dense = model.layers[-3].output

    outputs = keras.layers.Dense(
        num_classes,
        activation='sigmoid',
        kernel_initializer=keras.initializers.TruncatedNormal(stddev=0.02),
        name='real_output',
    )(dense)

    multilabel_model = keras.models.Model(inputs, outputs)
    multilabel_model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=0.00001, weight_decay=0.0025),
        loss='binary_crossentropy',
        # The plain 'accuracy' string is resolved from the output shape and,
        # for a multi-unit output, becomes categorical (argmax) accuracy,
        # which is meaningless for multi-label targets. BinaryAccuracy scores
        # each label independently; name='accuracy' keeps the history/log keys
        # ('accuracy', 'val_accuracy') unchanged.
        metrics=[keras.metrics.BinaryAccuracy(name='accuracy')])

    return multilabel_model



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.8.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [16]:
# Build and compile the multi-label classifier on top of the loaded BERT model.
multilabel_model = get_bert_multilabel_model(model)

In [17]:
import tensorflow as tf

# Fetch the list of GPU devices visible to TensorFlow.
gpus = tf.config.experimental.list_physical_devices('GPU')

# Report the situation: a single notice when there is no GPU, otherwise one
# line per detected device.
if not gpus:
    print('No GPUs detected.')
for idx, device in enumerate(gpus):
    print(f'GPU {idx}: {device}')


No GPUs detected.


In [21]:
# Fine-tune for two epochs, validating on the held-out rows after each epoch.
# NOTE(review): validation runs over every row outside the training slice,
# which may dominate the run time on a CPU-only machine.
history = multilabel_model.fit(
    train_x,
    train_y,
    validation_data=(test_x, test_y),
    epochs=2,
    batch_size=16,
    shuffle=True,
    verbose=1,
)

Epoch 1/2


KeyboardInterrupt: 