In [None]:
!pip install hanja
!pip install konlpy
!pip install transformers
!pip install tensorflow_addons
!pip install sentencepiece

In [None]:
import numpy as np
import pandas as pd
import os
import re
import json
import matplotlib.pyplot as plt
%matplotlib inline 
from wordcloud import WordCloud
import seaborn as sns
from tqdm import tqdm

import hanja
from hanja import hangul
from konlpy.tag import Okt
okt = Okt()



import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold
from tensorflow.keras.models import clone_model

In [None]:
import logging
import os
import unicodedata
from shutil import copyfile
 
from transformers import PreTrainedTokenizer



 
logger = logging.getLogger(__name__)

# Checkpoint identifiers for which the tables below carry an entry.
_KOBERT_CHECKPOINTS = (
    "monologg/kobert",
    "monologg/kobert-lm",
    "monologg/distilkobert",
)

# Common URL prefix under which every checkpoint's vocabulary files are listed.
_VOCAB_URL_ROOT = "https://s3.amazonaws.com/models.huggingface.co/bert"

# File names of the two vocabulary artefacts: the SentencePiece model and the
# plain-text token list.
VOCAB_FILES_NAMES = {
    "vocab_file": "tokenizer_78b3253a26.model",
    "vocab_txt": "vocab.txt",
}

# role -> checkpoint -> URL of that artefact.
PRETRAINED_VOCAB_FILES_MAP = {
    role: {
        checkpoint: "{}/{}/{}".format(_VOCAB_URL_ROOT, checkpoint, file_name)
        for checkpoint in _KOBERT_CHECKPOINTS
    }
    for role, file_name in VOCAB_FILES_NAMES.items()
}

# Every listed checkpoint is registered with a size of 512.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    checkpoint: 512 for checkpoint in _KOBERT_CHECKPOINTS
}

# Per-checkpoint default constructor arguments (a separate dict per checkpoint).
PRETRAINED_INIT_CONFIGURATION = {
    checkpoint: {"do_lower_case": False} for checkpoint in _KOBERT_CHECKPOINTS
}

# Marker character that the tokenizer strips or converts to a space.
SPIECE_UNDERLINE = u'▁'
 
 
class KoBertTokenizer(PreTrainedTokenizer):
    """
    SentencePiece based tokenizer for the KoBERT checkpoints.

    Peculiarities:
        - requires `SentencePiece <https://github.com/google/sentencepiece>`_
        - text is split into pieces by the SentencePiece model (``vocab_file``),
          while token <-> id conversion uses ``vocab_txt`` (one token per line,
          the zero-based line number being the id).
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
            self,
            vocab_file,
            vocab_txt,
            do_lower_case=False,
            remove_space=True,
            keep_accents=False,
            unk_token="[UNK]",
            sep_token="[SEP]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            mask_token="[MASK]",
            **kwargs):
        """
        Args:
            vocab_file: path to the SentencePiece ``.model`` file.
            vocab_txt: path to the text vocabulary, one token per line.
            do_lower_case: lower-case the text in :meth:`preprocess_text`.
            remove_space: collapse runs of whitespace in :meth:`preprocess_text`.
            keep_accents: when False, strip combining marks after NFKD normalisation.
            unk_token, sep_token, pad_token, cls_token, mask_token: special tokens
                forwarded to the base class.
            **kwargs: forwarded to the base class.

        Raises:
            ImportError: if ``sentencepiece`` is not installed.
        """
        # Build the vocabulary and load the SentencePiece model BEFORE calling the
        # base constructor: newer transformers releases query the vocabulary
        # (get_vocab / token-to-id conversion) while registering special tokens.
        self.token2idx = dict()
        self.idx2token = []
        with open(vocab_txt, 'r', encoding='utf-8') as f:
            for idx, token in enumerate(f):
                token = token.strip()
                self.token2idx[token] = idx
                self.idx2token.append(token)

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file
        self.vocab_txt = vocab_txt
        self.sp_model = self._load_sp_model(vocab_file)

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

    @staticmethod
    def _load_sp_model(vocab_file):
        """Import sentencepiece lazily and return a processor loaded from ``vocab_file``."""
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece "
                           "pip install sentencepiece")
            # Continuing would only fail later with an unbound `spm`; surface the real cause.
            raise
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(vocab_file)
        return sp_model

    @property
    def vocab_size(self):
        """Number of entries read from ``vocab_txt``."""
        return len(self.idx2token)

    def get_vocab(self):
        """Return the token -> id mapping, including tokens added after loading."""
        vocab = dict(self.token2idx)
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        # The SentencePiece processor is dropped from the pickled state and
        # re-created from `vocab_file` in __setstate__.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = self._load_sp_model(self.vocab_file)

    def preprocess_text(self, inputs):
        """Normalise whitespace, quotes, accents and case according to the instance flags."""
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize('NFKD', outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """
        Tokenize a string into SentencePiece pieces.

        When ``sample`` is True the sampling encoder is used instead of the
        deterministic one. ``return_unicode`` is accepted but not used.
        """
        text = self.preprocess_text(text)

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            # A piece such as "9," (digit followed by a comma) is re-encoded
            # without the comma, and the comma is emitted as its own piece.
            if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    # Re-encoding introduced a leading word-boundary marker that
                    # the original piece did not have; remove it again.
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab; unknown tokens map to the unk id. """
        return self.token2idx.get(token, self.token2idx[self.unk_token])

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.idx2token[index]

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        The resulting sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
                special tokens for the model
        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        Raises:
            ValueError: if ``already_has_special_tokens`` is True and ``token_ids_1`` is given.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if x in special_ids else 0 for x in token_ids_0]

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A BERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.

        Args:
            save_directory: existing directory to write into.
            filename_prefix: optional prefix; when given, both file names are
                written as ``<prefix>-<name>``. Accepted because the base class
                passes it when saving.
        Returns:
            ``(sentencepiece_model_path, vocab_txt_path)``, or ``None`` (after
            logging an error) when ``save_directory`` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        prefix = filename_prefix + "-" if filename_prefix else ""

        # 1. Save sentencepiece model
        out_vocab_model = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
            copyfile(self.vocab_file, out_vocab_model)

        # 2. Save vocab.txt
        index = 0
        out_vocab_txt = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_txt"])
        with open(out_vocab_txt, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return out_vocab_model, out_vocab_txt

In [None]:
# The training set is read from two CSV files, stacked row-wise and re-indexed.
train_parts = [pd.read_csv(csv_path) for csv_path in ('f.csv', 'g.csv')]
train = pd.concat(train_parts, axis = 0).reset_index(drop=True)

test = pd.read_csv("test_data.csv")
submission = pd.read_csv("sample_submission.csv")
topic_dict = pd.read_csv("topic_dict.csv")

# Path of the stop-word list; the code that reads it is currently commented out.
STOPWORDSPATH ="stopwords.txt"

In [None]:
## preprocessing
# Characters that clean_punc surrounds with spaces. The backslash before "×" is
# written as "\\" so the literal no longer relies on an invalid escape sequence;
# the resulting string is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
# Symbol -> ASCII-ish replacement applied by clean_punc before the spacing step.
# (The left curly quote used to be listed twice; the duplicate key is removed.)
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }

def clean_punc(text, punct, mapping):
    """
    Normalise symbols in ``text`` and isolate punctuation with spaces.

    Steps, in order:
      1. every key of ``mapping`` found in the text is replaced by its value;
      2. every character of ``punct`` is replaced by itself padded with one
         space on each side (a character listed twice is padded twice);
      3. a fixed set of stray sequences (zero-width space, ellipsis, BOM and
         two Devanagari words) is replaced or removed;
      4. leading and trailing whitespace is stripped.

    Returns the cleaned string.
    """
    stray_sequences = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}

    for source, target in mapping.items():
        text = text.replace(source, target)

    for symbol in punct:
        text = text.replace(symbol, ' ' + symbol + ' ')

    for source, target in stray_sequences.items():
        text = text.replace(source, target)

    return text.strip()


# Rewrite hanja in the headlines (hanja.translate, 'substitution' mode),
# replacing the `title` column of both frames.
train['title'] = train['title'].apply(lambda x : hanja.translate(x, 'substitution'))
test['title'] = test['title'].apply(lambda x : hanja.translate(x, 'substitution'))

# Run every translated headline through clean_punc.
cleaned_train_corpus = [clean_punc(sent, punct, punct_mapping) for sent in train['title']]
cleaned_test_corpus = [clean_punc(sent, punct, punct_mapping) for sent in test['title']]


# Newspaper hanja shorthand -> Korean expansion, applied in this order
# (the two-character "外人" must be handled before the bare "人").
_HANJA_SHORTHAND = (
    ("外人", "외국인"), ("日", "일본"), ("美", "미국"), ("北", "북한"), ("英", "영국"),
    ("中", "중국"), ("與", "여당"), ("靑", "청와대"), ("野", "야당"), ("伊", "이탈리아"),
    ("韓", "한국"), ("南", "한국"), ("獨", "독일"), ("佛", "프랑스"), ("檢", "검찰"),
    ("銀", "은행"), ("亞", "아시아"), ("人", "사람"), ("孫", "손혜원"), ("企", "기업"),
    ("前", "이전"), ("反", "반대"), ("安", "안철수"), ("展", "전시회"), ("故", "사망"),
    ("文", "문재인"), ("新", "새로운"), ("曺", "조국"), ("朴", "박근혜"), ("株", "주식"),
    ("男", "남자"), ("硏", "연구"), ("車", "자동차"), ("軍", "군대"), ("重", "중공업"),
)

# Terms that contain digits. They are applied BEFORE digits are stripped;
# otherwise they can never match.
_DIGIT_TERMS = (
    ("1보", ""),
    ("UFG20", "한미 합동 군사"),
    ("F35", "전투기"),
    ("B52", " 전투기"),
)

# Abbreviation / symbol -> expansion, applied in this order after digit removal.
# A longer term must come before any shorter term it contains
# (e.g. PSG before PS, IPO before PO, ITF/ITU before IT).
_TERM_EXPANSIONS = (
    ("→", "에서 "), ("…", " "),
    ("NYT", "뉴욕 타임스"), ("KAIST", "카이스트"), ("WMO", "세계 기상 기구"),
    ("KBL", "한국 프로 농구"), ("UAE", "아랍에미리트"), ("EU", "유럽 연합"),
    ("NBA", "농구 연맹"), ("CIA", "중앙정보국"), ("ECB", "유럽 중앙 은행"),
    ("AFC", "아시아 축구 연맹"), ("ITU", "국제전기통신연합"), ("MVP", "최우수 선수"),
    ("APEC", "아시아 태평양 경제협력체"), ("PSG", "파리 셍제르망"), ("IMO", "국제해사기구"),
    ("MLB", "프로 야구 리그 "), ("MOU", "양해각서"), ("FA", "자유계약선수제도"),
    ("EPL", "잉글랜드프리미어리그"), ("KBO", "한국야구위원회"), ("IPU", "국제 의회 연맹"),
    ("AG", "아시안게임"), ("PS", "포스트시즌"), ("IPO", "기업공개"), ("PO", "플레이오프"),
    ("TPP", " 환태평양 경제 동반자협정"), ("EAS", "동아시아 정상회의"), ("DC", ""),
    ("·", " 및 "), ("↑", "상승 "), ("↓", "하락 "),
    ("ITF", "국제태권도연맹 "), ("IS", "이슬람 "), ("W농구", "한국여자농구"),
    ("C팰리스", "크리스탈팰리스"), ("SLBM", "잠수함발사탄도미사일"), ("VNL", "배구네이션스리그"),
    ("LA타임스", "로스엔젤레스타임스"), ("V리그", "배구리그"), ("KOVO", "한국배구연맹"),
    ("ℓ", "리터"), ("SUN", "선동열"), ("WSJ", " 월스트리트 저널"), ("ERA", " 평균자책점"),
    ("IoT", " 사물인터넷"), ("QS", " 선발 3자책점 투구"), ("NL", "내셔널리그"),
    ("WP", "워싱턴포스트"), ("TK", "대구와 경북"), ("ACL", "아시아축구연맹 챔피언스리그"),
    ("IT", "정보기술"), ("AI", "인공지능"), ("TF", "태스크포스"), ("ML", "메이저리그"),
    ("FC", "축구 클럽"), ("㈜", ""), ("MS", "마이크로소프트"), ("SNS", "소셜 네트워크 서비스"),
    ("VR", "가상현실"), ("ELB", "주가 연계 파생상품"), ("CES", "국제전자제품박람회"),
    ("NPL", "부실채권"), ("MWC", "모바일 산업 박람회"), ("NSC", "국가안전보장회의"),
)

# Punctuation deleted outright at the start of the pipeline.
_CLEAN_TEXT_PUNCT_RE = re.compile(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]')
# Anything that is not a Latin letter, Hangul syllable/jamo or CJK ideograph.
_NON_LETTER_RE = re.compile(r'[^A-Za-z가-힣ㄱ-ㅎㅏ-ㅣ一-龥]')


def clean_text(texts):
    """
    Normalise a sequence of headlines and return the cleaned strings as a new list.

    For each element (coerced to ``str``):
      1. expand single-hanja shorthand into Korean words;
      2. delete a fixed set of punctuation characters;
      3. expand digit-bearing terms, then delete all remaining digits;
      4. expand abbreviations and arrow/dot symbols;
      5. lower-case, drop ``<...>`` tags, turn every remaining non-letter
         character into a space, collapse whitespace and strip the ends.

    Differences from the previous implementation:
      * the input sequence is no longer modified in place;
      * the final non-letter filter is a real regex substitution (it used to be
        a literal ``str.replace`` that never matched), so leftover symbols such
        as brackets, and digits introduced by an expansion, become spaces;
      * "IPO", "UFG20", "F35" and "B52" are applied before the rule that used
        to consume part of them, so they now actually match;
      * the second, unreachable "ERA" rule is dropped and the APEC expansion
        typo is corrected.

    Args:
        texts: iterable of headlines.
    Returns:
        list of cleaned strings, one per input element, in input order.
    """
    corpus = []
    for raw in texts:
        review = str(raw)

        for shorthand, korean in _HANJA_SHORTHAND:
            review = review.replace(shorthand, korean)

        review = _CLEAN_TEXT_PUNCT_RE.sub('', review)  # remove punctuation

        for term, expansion in _DIGIT_TERMS:
            review = review.replace(term, expansion)
        review = re.sub(r'\d+', '', review)  # remove numbers

        for term, expansion in _TERM_EXPANSIONS:
            review = review.replace(term, expansion)

        review = review.lower()
        review = re.sub(r'<[^>]+>', '', review)  # remove HTML-like tags
        review = _NON_LETTER_RE.sub(' ', review)
        review = re.sub(r'\s+', ' ', review).strip()

        corpus.append(review)
    return corpus

basic_preprocessed_train_corpus = clean_text(cleaned_train_corpus)
basic_preprocessed_test_corpus = clean_text(cleaned_test_corpus)

# Stop-word filtering is switched off: the list stays empty and each headline is
# passed through unchanged. The disabled variant read STOPWORDSPATH into
# `stopwords`, tagged each headline with okt.pos and kept only tokens tagged
# "Alpha", "Noun" or "Foreign" that were not stop words.
stopwords = []

removed_stopword_train_corpus = list(basic_preprocessed_train_corpus)
removed_stopword_test_corpus = list(basic_preprocessed_test_corpus)

train_text = removed_stopword_train_corpus
test_text = removed_stopword_test_corpus
train_label = np.asarray(train.topic_idx)

In [None]:
# Store the preprocessed headline lists as a `clear_title` column on each frame.
train['clear_title'] = train_text
test['clear_title'] = test_text

In [None]:
# Character count of every `clear_title` value; the cell displays the largest one.
train_length = train['clear_title'].astype(str).apply(len)
train_length.max()

196

In [None]:
# Symbols treated as noise in a headline; each occurrence becomes one space.
TITLE_NOISE_PATTERN = re.compile(r'[-=+,#:;//●<>▲\?:^$.☆!★()Ⅰ@*\"※~>`\'…》→←]')


def strip_title_noise(title):
    """
    Return ``title`` with literal backslash-n sequences and noise symbols
    replaced by spaces.

    The value is coerced to ``str`` first. Train and test headlines both go
    through this function so the two sets are preprocessed identically
    (previously the train side deleted the backslash-n sequence while the test
    side replaced it with a space, and only the train side coerced to ``str``).
    """
    text = str(title).replace('\\n', ' ')
    return TITLE_NOISE_PATTERN.sub(' ', text)


# NOTE(review): this replaces the `clear_title` column assigned in the earlier
# preprocessing cell, so the model is fed titles cleaned only by this step.
train['clear_title'] = [strip_title_noise(title) for title in tqdm(train['title'])]
test['clear_title'] = [strip_title_noise(title) for title in tqdm(test['title'])]

100%|██████████| 773157/773157 [00:00<00:00, 1220455.14it/s]
100%|██████████| 9131/9131 [00:00<00:00, 1396877.48it/s]


In [None]:
# Instantiate the custom tokenizer for the 'monologg/kobert' checkpoint.
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


In [None]:
# Checkpoint name handed to TFBertClassifier.
model_name = 'monologg/kobert'
# One seed for both TensorFlow and NumPy.
SEED_NUM = 615
tf.random.set_seed(SEED_NUM)
np.random.seed(SEED_NUM)
BATCH_SIZE = 128     # batch_size passed to fit
NUM_EPOCHS = 10      # epochs passed to fit
VALID_SPLIT = 0.2    # validation_split passed to fit
MAX_LEN = 30         # tokenizer max_length and width of the padded input arrays
NUM_CLASS = 7        # number of units in the classifier's output layer
K_SPLIT = 5          # not referenced in the cells visible here

In [None]:
def bert_tokenizer(sent, MAX_LEN):
    """
    Encode one sentence with the module-level ``tokenizer``.

    Special tokens are added and the sequence is truncated to ``MAX_LEN``.

    Returns:
        (input_ids, attention_mask, token_type_ids) taken from the encoding.
    """
    # NOTE(review): padding=True on a single sentence presumably adds no
    # padding; the fixed-width padding is applied later, when the lists are
    # turned into arrays. Confirm against the tokenizer documentation.
    encoding = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding=True,
        return_attention_mask=True,
        truncation=True,
    )

    return (
        encoding['input_ids'],
        encoding['attention_mask'],
        encoding['token_type_ids'],
    )

In [None]:
input_ids = []
attention_masks = []
token_type_ids = []
train_data_labels = []

# The loop variable is `topic_label` rather than `train_label`, so it no longer
# overwrites the module-level `train_label` array built during preprocessing.
# `total` lets tqdm show a real progress bar (zip has no length).
for train_sent, topic_label in tqdm(zip(train["clear_title"], train["topic_idx"]), total=len(train)):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(train_sent, MAX_LEN)
    except Exception as e:
        # Best effort: report the sentence that failed to encode and skip it,
        # so features and labels stay aligned.
        print(e)
        print(train_sent)
        continue

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)
    train_data_labels.append(topic_label)


773157it [02:19, 5528.65it/s]


In [None]:
def get_numpy_from_nonfixed_2d_array(aa, fixed_length, padding_value=0):
    """
    Turn a collection of variable-length 1-D sequences into one 2-D array.

    Each sequence is right-padded with ``padding_value`` and then cut to
    exactly ``fixed_length`` items, so longer sequences are truncated.

    Args:
        aa: iterable of 1-D sequences (lists or arrays).
        fixed_length: number of columns of the result.
        padding_value: value used to fill short rows.
    Returns:
        Array of shape ``(len(aa), fixed_length)``. An empty ``aa`` yields an
        array of shape ``(0, fixed_length)`` instead of raising.
    """
    rows = [
        np.pad(a, (0, fixed_length), 'constant', constant_values=padding_value)[:fixed_length]
        for a in aa
    ]
    if not rows:
        # np.concatenate refuses an empty list; return an empty, correctly shaped array.
        return np.full((0, fixed_length), padding_value)
    return np.concatenate(rows, axis=0).reshape(-1, fixed_length)

In [None]:
# Pad/truncate each per-sentence list to MAX_LEN columns, filling with 0.
train_news_input_ids = get_numpy_from_nonfixed_2d_array(input_ids, fixed_length=MAX_LEN, padding_value=0)
train_news_attention_masks = get_numpy_from_nonfixed_2d_array(attention_masks, fixed_length=MAX_LEN, padding_value=0)
train_news_type_ids = get_numpy_from_nonfixed_2d_array(token_type_ids, fixed_length=MAX_LEN, padding_value=0)


# The three arrays are bundled in the order (input ids, attention masks, token type ids).
train_news_inputs = (train_news_input_ids, train_news_attention_masks, train_news_type_ids)
# Rebinds the label list to an array; re-running this cell alone keeps it an array.
train_data_labels = np.asarray(train_data_labels)

In [None]:
class TFBertClassifier(tf.keras.Model):
    """
    BERT encoder followed by dropout and a dense layer producing ``num_class`` logits.
    """

    def __init__(self, model_name, dir_path, num_class):
        """
        Args:
            model_name: checkpoint name passed to ``TFBertModel.from_pretrained``.
            dir_path: accepted for interface compatibility; not used here.
            num_class: number of output units of the classification layer.
        """
        super(TFBertClassifier, self).__init__()

        # Imported here because the notebook's import cells never bring
        # TFBertModel into scope; without this a fresh kernel raises NameError.
        from transformers import TFBertModel

        # from_pt=True: initialise the TensorFlow model from PyTorch weights.
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)

        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        # self.classifier maps the pooled output onto all topic_idx classes.
        self.classifier = tf.keras.layers.Dense(num_class,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
                                                name="classifier")

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        """Return unnormalised class logits for ``inputs``."""
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Second element of the encoder output is used as the pooled sentence representation.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)

        return logits

# Build the classifier with NUM_CLASS output units on top of the `model_name` checkpoint.
cls_model = TFBertClassifier(model_name=model_name, dir_path='bert_ckpt',num_class=NUM_CLASS)

Downloading:   0%|          | 0.00/369M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Rectified Adam with warm-up over the first 10% of `total_steps`, a floor
# learning rate and gradient-norm clipping.
# NOTE(review): total_steps is hard-coded as 2344*4, whereas the training log in
# this notebook shows 4833 steps per epoch — confirm the schedule length is intended.
optimizer = tfa.optimizers.RectifiedAdam(
    learning_rate=7.0e-5,
    total_steps=2344*4,
    warmup_proportion=0.1,
    min_lr=1e-5,
    epsilon=1e-07,
    clipnorm=1.0,
)

# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Where the best weights are written.
DATA_OUT_PATH = 'kobert/best_model'
checkpoint_path = DATA_OUT_PATH +  '/best_modeling.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create the checkpoint folder on first use and report which case applied.
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
else:
    print("{} -- Folder already exists \n".format(checkpoint_dir))

# Early stopping watches the validation loss.
es_callback = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.0001,
    patience=3,
    baseline=0.4,
)

# Checkpointing watches the validation accuracy and saves weights only,
# overwriting the file whenever the monitored value improves.
cp_callback = ModelCheckpoint(
    checkpoint_path,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

kobert/best_model -- Folder create complete 



In [None]:
# Train on the padded inputs; VALID_SPLIT of the data is held out for validation
# and both callbacks (early stopping, best-weights checkpoint) are active.
history = cls_model.fit(train_news_inputs, train_data_labels, 
                        epochs=NUM_EPOCHS,
                        batch_size=BATCH_SIZE,
                        validation_split = VALID_SPLIT,
                        callbacks=[es_callback, cp_callback]
                        )  

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
 724/4833 [===>..........................] - ETA: 32:03:16 - loss: 1.1673 - accuracy: 0.5722