In [None]:
# colab 사용 시 주석 풀고 mount_path 설정 후 실행
# from google.colab import drive
# drive.mount("/content/drive")

In [None]:
# colab 사용 시 주석 풀고 실행
# !pip install transformers
# vscode
# %pip install transformers

In [None]:
# colab 사용 시 주석 풀고 실행
# !pip install seaborn
# vscode
# %pip install seaborn

In [None]:
from _init import *

from commons import file_util, string_util

import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

from tqdm import tqdm

# from tensorflow.python.keras.optimizer_v1 import Adam
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint

from transformers import TFBertForSequenceClassification, TFBertModel, AutoTokenizer
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, BertConfig

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report, matthews_corrcoef, cohen_kappa_score, log_loss

# Paths (all relative to WORK_DIR) and text-format settings.
WORK_DIR = "../../../"
IN_DIR = f"{WORK_DIR}resources/keyword_extract/"
# NOTE: despite its name, OUT_DIR is the checkpoint *file* path, not a directory.
OUT_DIR = f"{WORK_DIR}resources/keyword_extract_model/bert_model.h5"
OUT_GRAPH_PATH = f"{WORK_DIR}resources/keyword_extract_model/bert_accuracy.png"
OUT_REPORTFILE_PATH = f"{WORK_DIR}resources/keyword_extract_model/cl_report_bert_model.csv"
OUT_CFMATRIX_PATH = f"{WORK_DIR}resources/keyword_extract_model/matrix/cf_matrix_bert_model.png"
OUT_METRIXFILE_PATH = f"{WORK_DIR}resources/keyword_extract_model/matrix/bert_model_metric.csv"
ENCODING = "UTF-8"
DELIM = "\t"
DECIMAL_POINT = 2  # number of decimals kept when rounding the evaluation metrics

# Hyperparameters.
BATCH_SIZE = 32
MAX_SEQ_LEN = 128
DROPOUT_RATE = 0.2
DENCE_UNIT = 2  # width of the final softmax layer
LEARNING_RATE = 2e-5
PATIENCE = 5
VERBOSE = 1
EPOCHS = 25

# Pretrained BERT checkpoint to use (alternatives kept for reference).
# BERT_MODEL_NAME = "bert-base-uncased"
BERT_MODEL_NAME = "bert-base-multilingual-cased"
# BERT_MODEL_NAME = "klue/bert-base"

In [None]:
# Report whether TensorFlow exposes a GPU under the expected device name.
device_name = tf.test.gpu_device_name()

if device_name != "/device:GPU:0":
    print("GPU device not found")
else:
    print(f"Found GPU at : {device_name}")

In [None]:
# Select the PyTorch device: CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("There are %d GPU(s) available." % torch.cuda.device_count())
    print("We will use the GPU : ", torch.cuda.get_device_name(0))
else:
    print("No GPU available, using the CPU instead.")

In [None]:
# Load the pretrained configuration and tokenizer for BERT_MODEL_NAME.
config = BertConfig.from_pretrained(BERT_MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
# tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME, do_lower_case=False)

# Backbone model: keep only the variant that should be used enabled.
# model = BertModel.from_pretrained(BERT_MODEL_NAME)
model = TFBertModel.from_pretrained(BERT_MODEL_NAME, config=config)
# model.cuda()

In [None]:
# vs code 사용 시 데이터 불러오기
# data -> list로 넘길 때 사용
def load(in_file_path: str, encoding: str, out_list: list) :
    """Append every non-empty, preprocessed line found under ``in_file_path`` to ``out_list``.

    Args:
        in_file_path: Path handed to ``file_util.get_file_paths`` to enumerate the input files.
        encoding: Text encoding used to open each file.
        out_list: List that receives the lines; it is mutated in place.

    Returns:
        None. When no input file is found, ``out_list`` is left unchanged.
    """
    file_paths = file_util.get_file_paths(in_file_path, True)

    for file_path in file_paths:
        in_file = file_util.open_file(file_path, encoding, "r")

        # Close every file right after reading it, even if preprocessing raises.
        # Closing once after the loop only released the last file and failed with
        # a NameError when there were no input files at all.
        try:
            while True:
                line = in_file.readline()

                if not line:
                    break

                line = file_util.preprocess(line)
                if string_util.is_empty(line, True):
                    continue

                out_list.append(line)
        finally:
            in_file.close()

# 이거 활성화 시켜서 사용하면 됨.
# raw_data = []
# load(IN_DIR, ENCODING, raw_data)
# raw_data = [line.strip().split(DELIM) for line in raw_data]

# raw_df = pd.DataFrame(raw_data, columns=["train", "label"])
# print(raw_df.shape)
# raw_df.head()

In [None]:
# Extract the input file name: the last path returned for IN_DIR wins.
file_paths = file_util.get_file_paths(IN_DIR, True)

# Reset first so a re-run cannot silently reuse a stale name from an earlier run.
file_name = None
for file_path in file_paths:
    file_name = file_util.get_file_name(file_path)

# Fail here with a clear message instead of a NameError in a later cell.
if file_name is None:
    raise FileNotFoundError(f"No input file found under {IN_DIR!r}")

In [None]:
# Build raw_df from the DELIM-separated input file; the two columns are named
# "train" (the text) and "label".
raw_df = pd.read_csv(
    IN_DIR + file_name,
    names=["train", "label"],
    sep=DELIM,
)
print(raw_df.shape)
raw_df.head()

In [None]:
# Split raw_df into a training part and a held-out test part.
# Overall target ratios: train 0.81, validation 0.09, test 0.10.
print("여기서부터 문맥에 맞게 설정")
print(f"train - test : {int(raw_df.shape[0] * 0.9)}")
# Derive the boundary from the data (first 90% of the rows) instead of hard-coding a
# row count, so the split stays valid when the input file changes size.
DATA_RATE = int(raw_df.shape[0] * 0.9)

train_df = raw_df[:DATA_RATE]
test_df = raw_df[DATA_RATE:]
train_df.tail(), test_df.head()

In [None]:
# Training texts (as a Series) and their labels (as an array).
sentences = train_df["train"]
labels = train_df["label"].values
sentences, labels

In [None]:
# Build input_ids and attention masks for every training sentence.
input_ids = []
attention_masks = []

for sent in tqdm(sentences):
    b_input_texts = tokenizer.encode_plus(
        sent,                          # input sentence
        add_special_tokens=True,
        max_length=MAX_SEQ_LEN,        # fixed length of every encoded sequence
        # Explicit replacements for the deprecated `pad_to_max_length=True`:
        # pad short sentences and truncate long ones so every row has exactly
        # MAX_SEQ_LEN entries and the lists stack into rectangular arrays.
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    input_ids.append(b_input_texts["input_ids"])
    attention_masks.append(b_input_texts["attention_mask"])

input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
# One-hot encode the labels. The dtype is pinned because pandas >= 2.0 returns boolean
# dummies by default, while earlier versions returned uint8.
target = np.array(pd.get_dummies(labels, dtype=np.uint8))

In [None]:
# Sanity check: look up the token string for vocabulary id 101
# (presumably the [CLS] special token for this vocabulary — confirm in the cell output).
tokenizer.convert_ids_to_tokens(101)

In [None]:
# Split the encoded training data into train / validation parts (90% / 10%).
# train_test_split returns a (train, held-out) pair per input array, in input order,
# so the *_label names below hold the held-out portion of ids, targets and masks.
# A fixed random_state keeps the split identical across kernel restarts, so the
# reported metrics are reproducible.
X_train, X_label, y_train, y_label, train_mask, label_mask = train_test_split(
    input_ids,
    target,
    attention_masks,
    test_size=0.1,
    random_state=42,
)
model.summary()

In [None]:
def create_model(model_) :
    """Wrap a backbone model in a small Keras classification head.

    Args:
        model_: Callable backbone invoked as ``model_(input_ids, attention_masks)``.
            Element ``[0]`` of its output is sliced with ``[:, 0, :]``
            (presumably the hidden state at the first token position — confirm).

    Returns:
        An uncompiled ``tf.keras`` model that takes ``[input_ids, attention_masks]``
        and ends in a ``DENCE_UNIT``-wide softmax layer. Its first three layers are
        marked non-trainable.
    """
    ids_in = tf.keras.Input(shape=(MAX_SEQ_LEN,), dtype="int64")
    mask_in = tf.keras.Input(shape=(MAX_SEQ_LEN,), dtype="int64")

    first_position = model_(ids_in, mask_in)[0][:, 0, :]

    # NOTE(review): the hidden layer width reuses BATCH_SIZE; this looks like a
    # coincidental reuse of the constant rather than an intended coupling — confirm.
    hidden = tf.keras.layers.Dense(BATCH_SIZE, activation="relu")(first_position)
    hidden = tf.keras.layers.Dropout(DROPOUT_RATE)(hidden)
    probabilities = tf.keras.layers.Dense(DENCE_UNIT, activation="softmax")(hidden)
    classifier = tf.keras.models.Model(inputs=[ids_in, mask_in], outputs=probabilities)

    # Freeze the first three layers of the assembled model.
    for frozen_layer in classifier.layers[:3]:
        frozen_layer.trainable = False

    return classifier

# Build the classifier around the pretrained backbone.
# NOTE(review): `model` is rebound from the backbone to the wrapped classifier, so
# re-running this cell would wrap the already-wrapped model; re-run the cell that
# loads the backbone first.
model = create_model(model)
model.summary()

In [None]:
def model_compile(model) :
    """Compile ``model`` in place and return it.

    Uses an Adam optimizer with ``LEARNING_RATE``, categorical cross-entropy as the
    loss, and accuracy as the tracked metric.

    Args:
        model: Object exposing a Keras-style ``compile`` method.

    Returns:
        The same ``model`` instance that was passed in.
    """
    model.compile(
        optimizer=Adam(learning_rate=LEARNING_RATE),
        metrics=["accuracy"],
        loss="categorical_crossentropy",
    )
    return model

In [None]:
# Compile the classifier; model_compile returns the model, so it is also this cell's output.
model_compile(model)

In [None]:
# Stop training once validation accuracy has not improved by at least `min_delta`
# for PATIENCE epochs.
# The callback is taken from the public `tf.keras.callbacks` namespace so that it comes
# from the same Keras implementation as the `tf.keras` model it is attached to; the
# imported name resolves to the private `tensorflow.python.keras` package.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    min_delta=1e-3,
    patience=PATIENCE,
)

In [None]:
# Save the model during training, keeping only the checkpoint with the best
# validation accuracy at OUT_DIR.
# The callback is taken from the public `tf.keras.callbacks` namespace so that it comes
# from the same Keras implementation as the `tf.keras` model it saves; the imported
# name resolves to the private `tensorflow.python.keras` package.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=OUT_DIR,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    verbose=VERBOSE,
)

In [None]:
# Train the model; the held-out split is passed as validation data and both
# callbacks (early stopping, best-checkpoint saving) are attached.
history = model.fit(
    x=[X_train, train_mask],
    y=y_train,
    validation_data=([X_label, label_mask], y_label),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    callbacks=[early_stopping, model_checkpoint],
)

In [None]:
# Plot training vs. validation accuracy per epoch and save the figure.
train_accuracy = history.history["accuracy"]
val_accuracy = history.history["val_accuracy"]

epochs = range(1, len(train_accuracy) + 1)
plt.figure(figsize=(14, 7))
plt.title("Bert accuracy")
plt.plot(epochs, train_accuracy, "b", label="train_acc")
plt.plot(epochs, val_accuracy, "r", label="val_acc")
plt.grid()
# Derive the y-limits from the recorded values (with a small margin) instead of a
# fixed 0.55-0.70 window, which hides any curve that falls outside that range.
accuracy_values = list(train_accuracy) + list(val_accuracy)
plt.ylim(max(0.0, min(accuracy_values) - 0.02), min(1.0, max(accuracy_values) + 0.02))
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.savefig(OUT_GRAPH_PATH)
plt.show()

In [None]:
# Reload the checkpoint written to OUT_DIR; TFBertModel is supplied as a custom
# object for deserialization.
best_model = tf.keras.models.load_model(
    OUT_DIR,
    custom_objects=dict(TFBertModel=TFBertModel),
)

In [None]:
# Evaluate the reloaded model and show the result.
# NOTE(review): X_label / label_mask / y_label are the held-out part produced by
# train_test_split, which is also the validation data used for checkpoint selection;
# test_df is not used here — confirm this is the intended "test" evaluation.
best_test_res = best_model.evaluate([X_label, label_mask], y_label)
best_test_res

In [None]:
# Model outputs per class for the held-out split, then the index of the largest
# output per row as the predicted class.
predicted_value = best_model.predict([X_label, label_mask])
predicted_label = predicted_value.argmax(axis=1)

In [None]:
# Convert the one-hot targets back to class indices.
# np.argmax decodes the rows the same way predicted_label is derived from the model
# output, and stays correct for more than two classes; the previous hand-written
# check (`value[0] == 0`) only worked for the binary case.
new_y_test = np.argmax(y_label, axis=1).tolist()

In [None]:
# Per-class classification report, rounded to 3 decimals, saved as CSV and printed.
cl_report = classification_report(new_y_test, predicted_label, output_dict=True)
cl_report_df = pd.DataFrame(cl_report).T.round(3)
cl_report_df.to_csv(OUT_REPORTFILE_PATH)
print(cl_report_df)

In [None]:
# Render the confusion matrix as a heatmap and save the image.
cf_matrix = confusion_matrix(new_y_test, predicted_label)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cf_matrix, annot=True, fmt="d", ax=ax)
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
fig.savefig(OUT_CFMATRIX_PATH)
plt.show()

In [None]:
# Evaluation metrics to be stored, each rounded to DECIMAL_POINT decimals.
accuracy_score_v = round(accuracy_score(new_y_test, predicted_label), DECIMAL_POINT)
precision_score_v = round(precision_score(new_y_test, predicted_label, average="weighted"), DECIMAL_POINT)
recall_score_v = round(recall_score(new_y_test, predicted_label, average="weighted"), DECIMAL_POINT)
f1_score_v = round(f1_score(new_y_test, predicted_label, average="weighted"), DECIMAL_POINT)
# ROC AUC ranks samples by score, so it is computed from the predicted probability of
# class 1 (second softmax column) rather than from the thresholded labels.
roc_auc_score_v = round(roc_auc_score(new_y_test, predicted_value[:, 1], average="weighted"), DECIMAL_POINT)
cohen_kappa_score_v = round(cohen_kappa_score(new_y_test, predicted_label), DECIMAL_POINT)
matthews_corrcoef_v = round(matthews_corrcoef(new_y_test, predicted_label), DECIMAL_POINT)
# Log loss is defined on probabilities: pass the softmax outputs, not hard 0/1 labels.
# `labels` is given explicitly so the column order is fixed even if one class is
# missing from the held-out split.
log_loss_v = round(
    log_loss(new_y_test, predicted_value, labels=list(range(predicted_value.shape[1]))),
    DECIMAL_POINT,
)

In [None]:
# Collect the evaluation metrics into a single-row table, save it as CSV and print it.
metric_total = pd.DataFrame(
    [
        {
            "PLM": "bert",
            "Optimizer": "adam",
            "Accuracy": accuracy_score_v,
            "Precision": precision_score_v,
            "Recall": recall_score_v,
            "F1_score": f1_score_v,
            "ROC_AUC_score": roc_auc_score_v,
            "Cohen_kappa_score": cohen_kappa_score_v,
            "Matthews_corrcoef": matthews_corrcoef_v,
            "Log_loss": log_loss_v,
        }
    ],
    index=["-"],
)
metric_total.to_csv(OUT_METRIXFILE_PATH)
print(metric_total)