In [1]:
!pip install transformers
!pip install datasets #데이터 세트 다운로드 Hugging Face 연동



In [2]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

import matplotlib.pyplot as plt

plt.rcParams["font.family"] = 'NanumGothic'

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-nanum is already the newest version (20200506-1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 12 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/

In [3]:
import copy # 특정한 파이썬 객체를 통째로 메모리에 copy할 때
import json # json 형식으로 데이터를 표현할 때
import logging # 학습 과정 등 전반적인 프로그램의 진행 상황을 로깅할 때
import os # 파일 입출력 등 현재 컴퓨터에 대한 기능 수행할 때

# 경고(warning) 메시지가 너무 많이 나오는 것을 대비하여 무시 처리
import warnings
warnings.filterwarnings("ignore")

# 로깅할 때 기본적으로 오류(error) 사항으로 로그 메시지를 남기겠다는 의미
import logging
logging.basicConfig(level=logging.ERROR)

# 벡터, 행렬 등의 처리를 위한 NumPy, 테이블(엑셀) 형식의 데이터 처리할 때 Pandas
import numpy as np
import pandas as pd

from datasets import load_dataset
# train_test_split: 별도로 구분된 validation 세트가 없을 때
# 학습 데이터 세트에서 일부를 train과 validation으로 나눌 때 자주 사용 (8:2 정도로 나눔)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch

### 학습한 모델 관련 라이브러리 불러오기

In [4]:
import transformers
# Auto Model For Sequence Classification: 텍스트 분류를 위한 모델 → Cross-Entropy loss 사용
from transformers import AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoTokenizer
# linear_schedule_with_warmup: 단계적으로 learning rate 줄여나가는 방법
# AdamW: SGD와 같이 optimization 방법 중 하나
from transformers import AdamW, get_linear_schedule_with_warmup

### 우리가 쓸 모델

- KoBigBird를 사용하고, 다음과 같은 형태로 사용 가능
- KoBigBird: BigBird 특유의 sparse attention 사용 (default)

In [5]:
from transformers import AutoModel, AutoTokenizer

# by default its in `block_sparse` mode with num_random_blocks=3, block_size=64
# 이름에서부터 알 수 있듯이 KoBigBird는 BERT 기반의 모델
model = AutoModel.from_pretrained("monologg/kobigbird-bert-base")

# Tokenizer도 마찬가지로 BERT 기반에서 가져온 것을 확인
tokenizer = AutoTokenizer.from_pretrained("monologg/kobigbird-bert-base")
text = "한국어 BigBird 모델을 공개합니다!"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

Attention type 'block_sparse' is not possible if sequence_length: 12 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


### 실험을 위한 하이퍼 파라미터 설정

In [6]:
# SimpleNamespace는 온점(.)으로 속성의 값을 정의할 수 있도록 해주는 라이브러리
# config.task = "cls"라고 하면, 나중에 print(config.task)했을 때 "cls"라고 출력
from types import SimpleNamespace

config = SimpleNamespace()

config.task = "cls"
config.dataset = "comment"

config.cache_dir = "cache" # 현재 데이터 세트에 대한 단어집 등 현재 task를 위한 임시적인 폴더
config.output_dir = "output" # 최종적인 모델이 저장되고, 결과가 저장되는 폴더

config.use_tpu = False
config.model_name_or_path = "monologg/kobigbird-bert-base" # Model name or path (HuggingFace에서 불러와 사용할 모델 이름)
config.data_dir = "./" # The input data dir ("10000_labeled.csv"가 있는 경로)


config.train_file = "complete_train.csv" # 미리 준비된 학습 데이터 세트 경로
config.test_file = "complete_test.csv" # 미리 준비된 평가 데이터 세트 경로

config.max_seq_length = 1024 # The maximum total input sequence length after tokenization. (최대 토큰 길이)
config.train_batch_size = 4 # Batch size for training. (학습할 때 batch_size)
config.eval_batch_size = 2 # Batch size for evaluation. (평가할 때 batch_size)

config.learning_rate = 3e-5 # The initial learning rate for Adam. (Adam optimizer에서 쓸 learning rate)
config.num_train_epochs = 10 # Total number of training epochs to perform. (전체 학습 epoch 수)

config.num_labels = 5 # 현재 task에서 선호도(1: 극진보, 2: 진보, 3: 중립, 4: 보수, 5: 극보수)의 개수는 5개이므로
# 5 classes multi-class classification 문제로 이해 할 수 있음
config.gradient_accumulation_steps = 2 # Number of updates steps to accumulate before performing a backward/update pass.
# batch_size가 큰 것처럼 처리하기 위해서, backward()를 매 번 수행하지 않고, gradient를 누적(acculmulation)하는 것

config.threads = 4
config.seed = 42 # random seed for initialization

config.do_train = True # Whether to run training.
config.do_eval_during_train = True
config.do_eval = True # Whether to run prediction.

config.do_lower_case = False
config.weight_decay = 0.0 # Weight decay if we apply some.
config.adam_epsilon = 1e-8 # Epsilon for Adam optimizer.
config.max_grad_norm = 1.0 # Max gradient norm.
config.warmup_proportion = 0.0 # Warmup proportion for linear warmup
# BigBird에서는 full attention을 하면, 메모리는 조금 더 소모되지만, 더 정확도가 향상
# config.attention_type = "original_full"

### 학습 데이터 전처리
- 학습 text를 매번 tokenization을 하지 않고, 모델 학습 시작전에 미리 모든 텍스트를 tokenization 한 결과를 저장한다.

In [7]:
# 본 실습에서 사용할 tokenizer 객체 초기화
tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path, cache_dir=config.cache_dir)

In [8]:
def train_split(config, texts, labels, is_train):
    if is_train:
        x_train, x_val, y_train, y_val = train_test_split(
            texts, labels, test_size=500, random_state=config.seed, stratify=None
        )
        return x_train, y_train, x_val, y_val
    else:
        return texts, labels

In [9]:
import pandas as pd
import os
import json
from sklearn.model_selection import train_test_split

# 데이터 전처리 및 분리 함수
def process_and_split_dataset(config, data_file):
    is_train = True if 'train' in data_file else False

    df = pd.read_csv(data_file)
    df = df.dropna(subset=['title', 'content', 'label1', 'label2'])

    # 매 줄에서 "label1(정치성향)", "label2(편향여부)" 열 추출
    politics = (df["label1"] - 1).astype(int).values.tolist()
    governments = (df["label2"]).astype(int).values.tolist()
    labels = []
    for i in range(len(politics)):
        politic = politics[i]
        government = governments[i]
        labels.append([politic, government])

    # title과 content를 합쳐서 texts로 표현
    texts = (df["title"].str.replace('\n', ' ') + " " + df["content"].str.replace('\n', ' ')).astype(str).values.tolist()

    # 데이터를 학습 및 검증용으로 분할
    return train_split(config, texts, labels, is_train)





def data_pretokenizing(config, tokenizer, mode="train"):
    # 파일 경로 설정
    data_path = os.path.join(config.data_dir, config.train_file if mode == "train" else config.test_file)
    processed_data = process_and_split_dataset(config, data_path)

    datasets = {}
    if mode == "train":
        train_data_texts, train_data_labels, valid_data_texts, valid_data_labels = processed_data
        datasets["train"] = (train_data_texts, train_data_labels)
        datasets["valid"] = (valid_data_texts, valid_data_labels)
    else:  # mode == "test"
        test_data_texts, test_data_labels = processed_data
        datasets["test"] = (test_data_texts, test_data_labels)

    dataset_files = {}
    for key, (texts, labels) in datasets.items():
        label1 = [l[0] for l in labels]
        label2 = [l[1] for l in labels]

        # 데이터 파일명 설정
        dataset_file = os.path.join(config.data_dir, f"{config.dataset}_{config.model_name_or_path.replace('/', '_')}_{config.max_seq_length}_{key}_dataset.txt")
        print("dataset_file:", dataset_file)

        with open(dataset_file, "w", encoding="utf-8") as writer_file:
            for text, l1, l2 in zip(texts, label1, label2):
                print(f"Processing text: {text[:50]}...")  # 첫 50자만 출력
                print(f"Processing label1: {l1}")
                print(f"Processing label2: {l2}")

                feature = tokenizer(text, max_length=config.max_seq_length, padding="max_length", truncation=True, add_special_tokens=True)

                writed_data = {
                    "input_ids": feature["input_ids"],
                    "attention_mask": feature["attention_mask"],
                    "politic": l1,
                    "government": l2,
                }
                writer_file.write(json.dumps(writed_data) + "\n")

        dataset_files[key] = dataset_file

    return dataset_files



In [10]:
# 학습 및 검증 데이터 토큰화
train_valid_files = data_pretokenizing(config, tokenizer=tokenizer, mode="train")
train_dataset_file = train_valid_files["train"]
valid_dataset_file = train_valid_files["valid"]

# 테스트 데이터 토큰화
test_files = data_pretokenizing(config, tokenizer=tokenizer, mode="test")
test_dataset_file = test_files["test"]


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Processing text: 육군, 간부 '복지회관 갑질' 논란에 "전 부대 실태 점검"         임태훈 군인권센...
Processing label1: 2
Processing label2: 2
Processing text: 현대아산? 아산현대? 오염수 선동글 올린 의사... 전문가들 “정치 지망생이시군” 1일 오...
Processing label1: 2
Processing label2: 0
Processing text: 이태원 막말' 김미나, 이번엔 '민주화 유적 비하'…국민의힘, 윤리위 회부도 안해 이태원 ...
Processing label1: 0
Processing label2: 0
Processing text: 민주, 김건희 일가 양평 땅 의혹 ‘고속도로게이트’ 조사키로 윤석열 대통령 부인 김건희 여...
Processing label1: 1
Processing label2: 1
Processing text: [오늘의 주요일정] 정치·정부(3일, 목)         찜통 더위로 전국 대부분 지역에 ...
Processing label1: 2
Processing label2: 3
Processing text: "우리같이 뚱뚱하면…" 尹 '동네 패딩' 리투아니아서도 입었다 북대서양조약기구(NATO·나...
Processing label1: 2
Processing label2: 3
Processing text: 박성민 의원, 민간보조금 개혁 3법 발의…"이권 카르텔 방지"         박성민 국회의...
Processing label1: 3
Processing label2: 0
Processing text: [단독] 문재인 다큐에 1억 지원… 전주영화제 “이게 우리 색깔” 전주국제영화제 조직위원회...
Processing label1: 2
Processing label2: 0
Processing text: 中국가기관 인근서 ‘찰칵’, 체포될 수 있다... 反

### 데이터로더 초기화

#### 데이터패딩

In [11]:
class IterableDatasetPad(torch.utils.data.IterableDataset):
    def __init__(
        self,
        dataset: torch.utils.data.IterableDataset,
        batch_size: int = 1,
        num_devices: int = 1,
        seed: int = 0,
    ):
        self.dataset = dataset
        self.batch_size = batch_size
        self.seed = seed
        self.num_examples = 0

        chunk_size = self.batch_size * num_devices
        length = len(dataset)
        self.length = length + (chunk_size - length % chunk_size)

    def __len__(self):
        return self.length

    def __iter__(self):
        self.num_examples = 0
        if (
            not hasattr(self.dataset, "set_epoch")
            and hasattr(self.dataset, "generator")
            and isinstance(self.dataset.generator, torch.Generator)
        ):
            self.dataset.generator.manual_seed(self.seed + self.epoch)

        first_batch = None
        current_batch = []
        for element in self.dataset:
            self.num_examples += 1
            current_batch.append(element)
            # Wait to have a full batch before yielding elements.
            if len(current_batch) == self.batch_size:
                for batch in current_batch:
                    yield batch
                    if first_batch is None:
                        first_batch = batch.copy()
                current_batch = []

        while self.num_examples < self.length:
            add_num = self.batch_size - len(current_batch)
            self.num_examples += add_num
            current_batch += [first_batch] * add_num
            for batch in current_batch:
                yield batch
            current_batch = []

#### 전처리된 데이터를 DataLoader로 불러옴

In [12]:
import torch.utils.data as torch_data

def collate_fn(features):
    input_ids = [sample["input_ids"] for sample in features]
    attention_mask = [sample["attention_mask"] for sample in features]

    politic = [sample["politic"] for sample in features]
    government = [sample["government"] for sample in features]

    input_ids = torch.tensor(np.array(input_ids).astype(np.int64), dtype=torch.long)
    attention_mask = torch.tensor(np.array(attention_mask).astype(np.int8), dtype=torch.long)
    politic = torch.tensor(np.array(politic).astype(np.int64), dtype=torch.long)
    government = torch.tensor(np.array(government).astype(np.int64), dtype=torch.long)
    inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
    }
    labels = {
        "politic": politic,
        "government": government
    }
    return inputs, labels

# 본 코드에서 학습을 수행하려는 경우
if config.do_train:
    # 학습 데이터 로더 초기화
    train_dataset = load_dataset("text", data_files=train_dataset_file, download_mode="force_redownload")["train"]
    train_dataset = train_dataset.map(lambda x: json.loads(x["text"]), batched=False)

    train_dataloader = torch_data.DataLoader(
        train_dataset,
        sampler=torch_data.RandomSampler(train_dataset),
        drop_last=False,
        batch_size=config.train_batch_size,
        collate_fn=(collate_fn),
    )

# 검증 데이터 세트 전처리(validation)
valid_dataset = load_dataset("text", data_files=valid_dataset_file, download_mode="force_redownload")["train"]
valid_dataset = valid_dataset.map(lambda x: json.loads(x["text"]), batched=False)
valid_dataset = IterableDatasetPad(
    dataset=valid_dataset,
    batch_size=config.eval_batch_size,
    num_devices=1,
    seed=config.seed,
)

valid_dataloader = torch_data.DataLoader(
    valid_dataset,
    sampler=None,
    drop_last=False,
    batch_size=config.eval_batch_size,
    collate_fn=(collate_fn),
)

# 테스트 데이터 세트 로딩 및 전처리
test_dataset = load_dataset("text", data_files=test_dataset_file, download_mode="force_redownload")["train"]
test_dataset = test_dataset.map(lambda x: json.loads(x["text"]), batched=False)
test_dataset = IterableDatasetPad(
    dataset=test_dataset,
    batch_size=config.eval_batch_size,
    num_devices=1,
    seed=config.seed,
)

test_dataloader = torch_data.DataLoader(
    test_dataset,
    sampler=None,
    drop_last=False,
    batch_size=config.eval_batch_size,
    collate_fn=(collate_fn),
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

### 텍스트 분류 모델 정의

In [13]:
from transformers import AutoModel
import torch.nn as nn

# 텍스트 분류 모델 정의
class ClsModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        data_file = os.path.join(config.data_dir, str(config.train_file))

        # We change the model to AutoModelForSequenceClassification since it can return attentions
        model_config = AutoConfig.from_pretrained(config.model_name_or_path, num_labels=config.num_labels, output_attentions=True)

        self.model = AutoModelForSequenceClassification.from_pretrained(
            config.model_name_or_path, config=model_config, cache_dir=config.cache_dir
        )

        self.classifier1 = nn.Linear(768, 5)
        self.classifier2 = nn.Linear(768, 6)
        self.tokenizer = tokenizer

    # "학습된 모델"을 save_dir에 저장하는 함수
    def save_pretrained(self, save_dir):
        self.model.save_pretrained(save_dir)
        # Tokenizer는 기본적으로 "special_tokens_map_file", "tokenizer_file"을 가질 수 있음
        # 이러한 값을 제거한 뒤에 save_dir에 저장하겠다는 의미
        for key in ["special_tokens_map_file", "tokenizer_file"]:
            self.tokenizer.init_kwargs.pop(key, None)
        self.tokenizer.save_pretrained(save_dir)

    def get_optimizer(self): # 현재 모델을 학습하기 위한 최적화 방법(AdamW) 객체를 불러오는 함수
        # bias랑 LayerNorm에는 decay 적용하지 않겠다는 의미
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": config.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": config.weight_decay,
            },
        ]
        # AdamW의 첫 번째 인자(params)는 "학습할 가중치", weight_decay는 가중치에 적용되는 regularization 기법
        optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
        return optimizer

    def get_scheduler(self, batch_num, optimizer): # AdamW로 학습할 때, learning rate을 단계적으로 줄이기 위한 함수
        if config.warmup_proportion == 0.0:
            return None

        t_total = batch_num // config.gradient_accumulation_steps * config.num_train_epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(t_total * config.warmup_proportion),
            num_training_steps=t_total,
        )

        return scheduler

    def forward(self, inputs):
        outputs = self.model.base_model(**inputs)
        hidden = outputs.last_hidden_state
        attention = outputs.attentions  # This now contains attention scores

        cls_token_embeddings = hidden[:, 0, :]
        output_1 = self.classifier1(cls_token_embeddings)
        output_2 = self.classifier2(cls_token_embeddings)

        return output_1, output_2, attention  # Return attention along with the outputs

    def eval_step(self, inputs, labels, outputs):
        logits_1 = outputs[0].detach().cpu()
        logits_2 = outputs[1].detach().cpu()
        labels_1 = self.tensor_to_list(labels["politic"])
        labels_2 = self.tensor_to_list(labels["government"])
        predictions_1 = self.tensor_to_list(torch.argmax(logits_1, dim=-1))
        predictions_2 = self.tensor_to_list(torch.argmax(logits_2, dim=-1))
        results_1 = [{"prediction": prediction, "label": label} for prediction, label in zip(predictions_1, labels_1)]
        results_2 = [{"prediction": prediction, "label": label} for prediction, label in zip(predictions_2, labels_2)]
        return {"results_1": results_1, "results_2": results_2}


    def visualize_attention(self, attention, input_tokens):
        # Assuming attention is of shape [num_layers, batch_size, num_heads, seq_length, seq_length]
        attention = attention[-1][0]  # Get the attention from the last layer and first example in batch
        mean_attention = attention.mean(dim=0)  # Average over the attention heads

        plt.figure(figsize=(10, 10))
        sns.heatmap(mean_attention.cpu().detach().numpy(), xticklabels=input_tokens, yticklabels=input_tokens)
        plt.show()


    # PyTorch의 Tensor 객체를 NumPy 객체로 변환
    def tensor_to_array(self, tensor):
        return tensor.detach().cpu().numpy()

    # PyTorch의 Tensor 객체를 Python의 리스트(list) 자료형으로 변환
    def tensor_to_list(self, tensor):
        return self.tensor_to_array(tensor).tolist()

In [14]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

def cal_running_avg_loss(loss, running_avg_loss, decay=0.99):
    if running_avg_loss == 0:
        return loss
    running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    return running_avg_loss

### 모델 학습 및 평가 라이브러리


In [15]:
from functools import partial
import sklearn.metrics as sklearn_metrics

"""binary_metrics = {
    "accuracy": sklearn_metrics.accuracy_score,
    "precision": sklearn_metrics.precision_score, # TP / (TP + FP)
    "recall": sklearn_metrics.recall_score, # recall = sensitivity (민감도)
    "f1": sklearn_metrics.f1_score,
    "matthews_corrcoef": sklearn_metrics.matthews_corrcoef,
    "roc_auc": sklearn_metrics.roc_auc_score,
}""" # 우리는 두가지 task 다 다중분류임으로 안씀

metrics = {
    "accuracy": sklearn_metrics.accuracy_score,
    "f1-macro": partial(sklearn_metrics.f1_score, average="macro"),
}


def eval_cls(results_1, results_2, **kwargs):
    predictions_1 = np.array([result["prediction"] for result in results_1])
    labels_1 = np.array([result["label"] for result in results_1])
    predictions_2 = np.array([result["prediction"] for result in results_2])
    labels_2 = np.array([result["label"] for result in results_2])
    results_1 = {
        metric: round(f(labels_1, predictions_1) * 100, 2)
        for metric, f in metrics.items()
    }
    results_2 = {
        metric: round(f(labels_2, predictions_2) * 100, 2)
        for metric, f in metrics.items()
    }

    return {
        "results_1": results_1,
        "results_2": results_2,
        "best_score_1": results_1["accuracy"],
        "best_score_2": results_2["accuracy"],

    }


### Epoch 동안 학습 및 평가를 수행하는 함수 정의

In [16]:
def _run_epoch(model, loader, device=None, context=None, **kwargs):
    config = kwargs["config"]
    is_train = kwargs["is_train"]

    avg_loss = 0
    results = []
    batch_num = len(loader)

    if is_train:
        model.train()
        if config.use_tpu:
            optimizer = context.getattr_or("optimizer", lambda: model.get_optimizer())
            scheduler = context.getattr_or("scheduler", lambda: model.get_scheduler(batch_num, optimizer))
        else:
            optimizer = kwargs["optimizer"]
            scheduler = kwargs["scheduler"]
    else:
        model.eval()

    is_master = True

    pbar = tqdm(enumerate(loader), total=batch_num, disable=not is_master, dynamic_ncols=True, position=0, leave=True)

    corrected_1 = 0
    corrected_2 = 0
    total = 0

    for i, (inputs, labels) in pbar:
        if not config.use_tpu:
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)
            for k, v in labels.items():
                if isinstance(v, torch.Tensor):
                    labels[k] = v.to(device)

        outputs_1, outputs_2, attentions = model(inputs)

        labels_1 = labels["politic"]
        labels_2 = labels["government"]

        loss_function_1 = nn.CrossEntropyLoss()
        loss_1 = loss_function_1(outputs_1, labels_1)

        total += outputs_1.shape[0]

        _, predicted_1 = outputs_1.max(1)
        corrected_1 += predicted_1.eq(labels_1).sum().item()

        loss_function_2 = nn.CrossEntropyLoss()
        loss_2 = loss_function_2(outputs_2, labels_2)

        _, predicted_2 = outputs_2.max(1)
        corrected_2 += predicted_2.eq(labels_2).sum().item()

        w_1 = 1
        w_2 = 1
        loss = w_1 * loss_1 + w_2 * loss_2

        avg_loss = cal_running_avg_loss(loss.item(), avg_loss)
        loss /= config.gradient_accumulation_steps

        if is_train:
            loss.backward()
            if i % config.gradient_accumulation_steps == 0 or i == batch_num - 1:
                if config.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

                optimizer.step()
                optimizer.zero_grad()

                if scheduler is not None:
                    scheduler.step()
        else:
            result = (model.module if hasattr(model, "module") else model).eval_step(inputs, labels, outputs=(outputs_1, outputs_2))
            result["loss_1"] = loss_1.item()
            result["loss_2"] = loss_2.item()
            result["attentions"] = attentions
            results.append(result)


    return {
        "loss": avg_loss,
        "result": results,
    }


# 학습 코드에서 호출하는 함수
def run_epoch(**kwargs):
    model = kwargs.pop("model")
    if kwargs["config"].use_tpu:
        results = model(_run_epoch, **kwargs)
    else:
        results = _run_epoch(model, **kwargs)

    if isinstance(results, list):
        loss = sum([result["loss"] for result in results]) / len(results)
        result = []
        for res in results:
            result.extend(res["result"])
        results = {"loss": loss, "result": result}

    return results

### 딥러닝 모델 초기화 및 설정

In [17]:
# 현재 모델 이름이 "monologg/kobigbird-bert-base" 이므로, Hugging Face에서 찾아서 불러옴
set_seed(config.seed)

# 딥러닝 모델 초기화
model = ClsModel()

print(f"configuration: {str(config)}")

if torch.cuda.is_available(): # GPU를 사용할 수 있다면
    gpu_count = torch.cuda.device_count()
    print(f"{gpu_count} GPU device detected")
    devices = ["cuda:{}".format(i) for i in range(gpu_count)]
    model_dp = torch.nn.DataParallel(model, device_ids=devices)
    model.to(devices[0])
else: # GPU를 사용할 수 없다면 CPU로 구동
    devices = ["cpu"]
    model_dp = model

# 학습 결과를 저장하기 위한 폴더 만들기
if not os.path.exists(config.cache_dir):
    os.makedirs(config.cache_dir)

output_dir = os.path.join(config.output_dir, config.task, config.dataset)
print("Output directory:", output_dir)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 실제 학습을 위해 optimizer 및 scheduler 초기화
optimizer = None
scheduler = None
if config.do_train: # 학습 모드(train mode)인 경우
    optimizer = model.get_optimizer()
    scheduler = model.get_scheduler(len(train_dataloader), optimizer)

params = {
    "config": config,
    "model": model_dp,
    "optimizer": optimizer,
    "scheduler": scheduler,
}
if not config.use_tpu:
    params["device"] = devices[0]

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


configuration: namespace(task='cls', dataset='comment', cache_dir='cache', output_dir='output', use_tpu=False, model_name_or_path='monologg/kobigbird-bert-base', data_dir='./', train_file='complete_train.csv', test_file='complete_test.csv', max_seq_length=1024, train_batch_size=4, eval_batch_size=2, learning_rate=3e-05, num_train_epochs=10, num_labels=5, gradient_accumulation_steps=2, threads=4, seed=42, do_train=True, do_eval_during_train=True, do_eval=True, do_lower_case=False, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, warmup_proportion=0.0)
1 GPU device detected
Output directory: output/cls/comment


In [18]:
def do_eval(epoch):
    with torch.no_grad():
        results = run_epoch(loader=valid_dataloader, epoch=epoch, is_train=False, **params)["result"]

        # 검증 손실의 평균을 계산합니다.
        total_loss_1 = sum([result["loss_1"] for result in results if "loss_1" in result])
        total_loss_2 = sum([result["loss_2"] for result in results if "loss_2" in result])
        avg_loss = (total_loss_1 + total_loss_2) / len(results)

        results_1 = [result['results_1'] for result in results]
        results_2 = [result['results_2'] for result in results]

        results_1 = [item for sublist in results_1 for item in sublist]
        results_2 = [item for sublist in results_2 for item in sublist]

        eval_results = eval_cls(
            config=config,
            model=model,
            loader=valid_dataloader,
            tokenizer=model.tokenizer,
            results_1=results_1,
            results_2=results_2,
        )

    print("Eval results for output 1.")
    for k, v in eval_results["results_1"].items():
        print(f"{k} : {v}")

    print("Eval results for output 2.")
    for k, v in eval_results["results_2"].items():
        print(f"{k} : {v}")

    return avg_loss, eval_results["best_score_1"], eval_results["best_score_2"], eval_results


best_model_path = "best_model.pth"  # 최고 정확도 모델 저장 경로

# 초기에 이러한 리스트들을 정의합니다.
train_losses = []
valid_losses = []
valid_accuracies_1 = []
valid_accuracies_2 = []
valid_f1_scores_1 = []
valid_f1_scores_2 = []

best_model_path = "best_model.pth"

if config.do_train:
    best_score = (0, 0)
    for epoch in range(config.num_train_epochs):
        train_results = run_epoch(loader=train_dataloader, epoch=epoch, is_train=True, **params)
        train_losses.append(train_results['loss'])

        if config.do_eval_during_train:
            avg_loss, score1, score2, eval_results = do_eval(epoch)
            valid_losses.append(avg_loss)
            f1_score_1 = eval_results["results_1"].get("f1-macro", 0)
            f1_score_2 = eval_results["results_2"].get("f1-macro", 0)
            valid_f1_scores_1.append(f1_score_1)
            valid_f1_scores_2.append(f1_score_2)
            valid_accuracies_1.append(score1)  # Using best_score_1 as accuracy for label 1
            valid_accuracies_2.append(score2)

            if score1 >= best_score[0] and score2 >= best_score[1]:
                best_score = (score1, score2)
                torch.save(
                    model_dp.module.state_dict()
                    if hasattr(model_dp, "module")
                    else model_dp._models[0].state_dict()
                    if hasattr(model_dp, "_models")
                    else model_dp.state_dict(),
                    best_model_path
                )
                print(f"Best model saved to {best_model_path}.")



100%|██████████| 1000/1000 [16:59<00:00,  1.02s/it]
 13%|█▎        | 32/251 [00:07<00:54,  4.04it/s]


OutOfMemoryError: ignored

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 6))

# Train and Valid Losses 변화를 그립니다.
plt.subplot(1, 3, 1)
plt.plot(train_losses, label="Train Loss")
plt.plot(valid_losses, label="Valid Loss")
plt.title("Losses over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

# Accuracy 변화를 그립니다.
plt.subplot(1, 3, 2)
plt.plot(valid_accuracies_1, label="Accuracy Label 1")
plt.plot(valid_accuracies_2, label="Accuracy Label 2")
plt.title("Accuracy over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()

# F1-Score 변화를 그립니다.
plt.subplot(1, 3, 3)
plt.plot(valid_f1_scores_1, 'g-', alpha=0.7, label="Valid F1 Score Label 1")  # added alpha for transparency
plt.plot(valid_f1_scores_2, 'g--', label="Valid F1 Score Label 2")
plt.title("F1 Scores over Epochs")
plt.xlabel("Epoch")
plt.ylabel("F1 Score")
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# 모델 가중치 불러오기
model.load_state_dict(torch.load(best_model_path))
model.to(devices[0])  # 필요한 경우 GPU로 모델 이동
model.eval()

all_predictions_1 = []
all_predictions_2 = []
all_labels_1 = []
all_labels_2 = []
top2_correct_1 = 0
top2_correct_2 = 0

# test_dataset 평가
with torch.no_grad():
    for inputs, labels in test_dataloader:
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(devices[0])
        for k, v in labels.items():
            if isinstance(v, torch.Tensor):
                labels[k] = v.to(devices[0])

        outputs = model(inputs)
        _, top2_predicted_1 = outputs[0].topk(2, 1, True, True)
        _, top2_predicted_2 = outputs[1].topk(2, 1, True, True)

        all_predictions_1.extend(top2_predicted_1[:, 0].cpu().tolist())
        all_predictions_2.extend(top2_predicted_2[:, 0].cpu().tolist())
        all_labels_1.extend(labels["politic"].cpu().tolist())
        all_labels_2.extend(labels["government"].cpu().tolist())

        # Top-2 정확도 계산
        top2_correct_1 += sum([true_label in pred_labels for true_label, pred_labels in zip(labels["politic"].cpu().tolist(), top2_predicted_1.cpu().tolist())])
        top2_correct_2 += sum([true_label in pred_labels for true_label, pred_labels in zip(labels["government"].cpu().tolist(), top2_predicted_2.cpu().tolist())])

        # 실제 값과 예측 값을 출력
        for true_label_1, pred_label_1, true_label_2, pred_label_2 in zip(labels["politic"].cpu().tolist(), top2_predicted_1[:, 0].cpu().tolist(), labels["government"].cpu().tolist(), top2_predicted_2[:, 0].cpu().tolist()):
            print(f"Output 1 - True Label: {true_label_1}, Predicted: {pred_label_1} | Output 2 - True Label: {true_label_2}, Predicted: {pred_label_2}")

# 정확도 및 f1-score 계산
accuracy_1 = accuracy_score(all_labels_1, all_predictions_1)
f1_1 = f1_score(all_labels_1, all_predictions_1, average='macro')
accuracy_2 = accuracy_score(all_labels_2, all_predictions_2)
f1_2 = f1_score(all_labels_2, all_predictions_2, average='macro')
top2_accuracy_1 = top2_correct_1 / len(all_labels_1)
top2_accuracy_2 = top2_correct_2 / len(all_labels_2)

print(f"\nOutput 1 - Accuracy: {accuracy_1:.4f}, F1 Score: {f1_1:.4f}, Top-2 Accuracy: {top2_accuracy_1:.4f}")
print(f"Output 2 - Accuracy: {accuracy_2:.4f}, F1 Score: {f1_2:.4f}, Top-2 Accuracy: {top2_accuracy_2:.4f}")


In [None]:
# 테스트 데이터셋 불러오기
test_df = pd.read_csv(config.test_file)

# 예측 결과를 새로운 열로 추가
# 리스트의 길이를 테스트 데이터 길이와 맞추기 위해 [:-2]로 슬라이스
test_df["Predicted_Label_1"] = all_predictions_1[:-2]
test_df["Predicted_Label_2"] = all_predictions_2[:-2]

# 결과를 저장할 파일명 정의 (예: "test_with_predictions.csv")
output_file = os.path.join(os.path.dirname(config.test_file), "test_with_predictions.csv")

# DataFrame을 CSV 파일로 저장 (index=False로 설정하여 인덱스 열을 저장하지 않음)
test_df.to_csv(output_file, index=False)
print(f"Predictions saved to {output_file}")



In [None]:
print(len(all_predictions_1))
print(len(all_predictions_2))
print(len(test_df))


In [None]:
test_data_sample = next(iter(test_dataloader))  # 첫 번째 배치만 가져오기
inputs, labels = test_data_sample

# GPU 사용이 가능한 경우 해당 디바이스로 이동
for k, v in inputs.items():
    if isinstance(v, torch.Tensor):
        inputs[k] = v.to(devices[0])

with torch.no_grad():
    outputs = model(inputs)
    attention_scores = outputs[-1]  # attention scores는 마지막에 반환됨


In [None]:
import seaborn as sns

# 첫 번째 데이터의 첫 번째 head의 attention scores를 선택
attention_map = attention_scores[0][0, 0].cpu().detach().numpy()

# attention map 그리기
plt.figure(figsize=(10, 10))
sns.heatmap(attention_map, cmap="YlGnBu")
plt.show()


In [None]:
print(f"Min value in the attention map: {attention_map.min()}")
print(f"Max value in the attention map: {attention_map.max()}")


In [None]:
import numpy as np

# attention_scores는 모델에서 반환받은 attention 값이라고 가정합니다.
attention_data = attention_scores[0][0, 0].cpu().detach().numpy()

# 기본 통계치 계산
mean_val = np.mean(attention_data)
std_val = np.std(attention_data)
min_val = np.min(attention_data)
max_val = np.max(attention_data)

# 출력
print(f"Mean of Attention Weights: {mean_val}")
print(f"Standard Deviation of Attention Weights: {std_val}")
print(f"Minimum Value of Attention Weights: {min_val}")
print(f"Maximum Value of Attention Weights: {max_val}")


In [None]:
# 예시로 첫 번째 데이터의 첫 번째 head의 attention scores 선택
sample_attention = attention_scores[0][0, 0].cpu().detach().numpy()

# 토큰화된 입력 문장 가져오기
input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# 평균 attention score 사용
avg_attention = sample_attention.mean(axis=1)

# scatter plot
plt.figure(figsize=(50, 5))  # 크기를 조절합니다.
plt.scatter(input_tokens, [1] * len(input_tokens), c=avg_attention, s=500, cmap='YlGnBu')
plt.colorbar()
plt.xticks(rotation=45, fontsize=10)  # 폰트 크기를 조절합니다.
plt.yticks([])
plt.show()



In [None]:
# 예시로 첫 번째 데이터의 첫 번째 head의 attention scores 선택
sample_attention = attention_scores[0][0, 0].cpu().detach().numpy()

# 토큰화된 입력 문장 가져오기
input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# 평균 attention score 사용
avg_attention = sample_attention.mean(axis=1)

# 토큰과 그에 해당하는 attention score 출력
for token, score in zip(input_tokens, avg_attention):
    print(f"Token: {token}, Score: {score:.5f}")


In [None]:
# 예시로 첫 번째 데이터의 첫 번째 head의 attention scores 선택
sample_attention = attention_scores[0][0, 0].cpu().detach().numpy()

# 토큰화된 입력 문장 가져오기
input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# 평균 attention score 사용
avg_attention = sample_attention.mean(axis=1)

# 토큰과 그에 해당하는 attention score의 튜플 리스트 생성
token_score_pairs = [(token, score) for token, score in zip(input_tokens, avg_attention)]

# attention score를 기준으로 정렬
sorted_token_score_pairs = sorted(token_score_pairs, key=lambda x: x[1], reverse=True)

# 정렬된 결과 출력
for token, score in sorted_token_score_pairs:
    print(f"Token: {token}, Score: {score:.5f}")
