In [1]:
import os
import math
import random
import pandas as pd
import regex as re
import numpy as np
from typing import Optional, Sequence

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, f1_score

from tqdm import tqdm
import torch
from torch import nn
from torch import Tensor
from torch.nn import functional as F
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, EarlyStoppingCallback, AutoModel, AutoConfig

import gc
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the competition files; the training frame does not keep its ID column.
train = pd.read_csv('train.csv').drop(columns=['ID'])
test = pd.read_csv('test.csv')

In [3]:
train

Unnamed: 0,문장,유형,극성,시제,확실성,label
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,사실형,긍정,미래,확실,사실형-긍정-미래-확실
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
...,...,...,...,...,...,...
16536,"＇신동덤＇은 ＇신비한 동물사전＇과 ＇해리 포터＇ 시리즈를 잇는 마법 어드벤처물로, ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
16537,"수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목, 어깨, 팔꿈치, ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
16538,김금희 소설가는 ＂계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 ...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
16539,1만명이 넘는 방문자수를 기록한 이번 전시회는 총 77개 작품을 넥슨 사옥을 그대로...,사실형,긍정,과거,불확실,사실형-긍정-과거-불확실


In [4]:
train.유형.unique(), train.극성.unique(), train.시제.unique(), train.확실성.unique()

(array(['사실형', '추론형', '예측형', '대화형'], dtype=object),
 array(['긍정', '부정', '미정'], dtype=object),
 array(['현재', '과거', '미래'], dtype=object),
 array(['확실', '불확실'], dtype=object))

In [5]:
def _clean_sentence(text):
    """Strip everything except spaces, ASCII letters/digits and Hangul syllables,
    then collapse runs of spaces into a single space.

    The previous second pass used the character class "[ +]" (one space OR one
    plus sign), which replaced every space by itself and never collapsed runs.
    """
    text = re.sub("[^ A-Za-z0-9가-힣]", "", text)
    return re.sub(" +", " ", text)


# Apply the same normalisation to both splits so train and test text match.
train['문장'] = train['문장'].apply(_clean_sentence)
test['문장'] = test['문장'].apply(_clean_sentence)

In [6]:
train['문장']

0                       075포인트 금리 인상은 1994년 이후 28년 만에 처음이다
1        이어 앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정이라며 그 이전이라도 ...
2        정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30에서 37까지 확대한다
3        서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만 하루 만에 차...
4                  익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다
                               ...                        
16536    신동덤은 신비한 동물사전과 해리 포터 시리즈를 잇는 마법 어드벤처물로 전편에 이어 ...
16537    수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목 어깨 팔꿈치 등 허...
16538    김금희 소설가는 계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 고...
16539    1만명이 넘는 방문자수를 기록한 이번 전시회는 총 77개 작품을 넥슨 사옥을 그대로...
16540                                           목민심서의 내용이다
Name: 문장, Length: 16541, dtype: object

# Text augmentation (random word swap)

In [7]:
# train, X_val, _, _ = train_test_split(train, train.label, test_size=0.1, random_state=42)

In [8]:
# https://github.com/catSirup/KorEDA/blob/master/eda.py
def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    Args:
        new_words: list of tokens; it is modified in place.

    Returns:
        The same list object. It is returned unchanged when it holds fewer
        than two items, or when no second index was accepted within the
        retry budget.
    """
    # Nothing to swap with fewer than two words; this also avoids
    # random.randint(0, -1), which raises ValueError for an empty list.
    if len(new_words) < 2:
        return new_words

    last = len(new_words) - 1
    first_idx = random.randint(0, last)
    second_idx = first_idx
    attempts = 0

    while second_idx == first_idx:
        second_idx = random.randint(0, last)
        attempts += 1
        # Give up after the fourth draw (kept from the KorEDA reference code).
        if attempts > 3:
            return new_words

    new_words[first_idx], new_words[second_idx] = new_words[second_idx], new_words[first_idx]
    return new_words

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` passes through ``swap_word``.

    The input sequence itself is left untouched; all swaps happen on the copy.
    """
    shuffled = words.copy()
    for _pass in range(n):
        shuffled = swap_word(shuffled)
    return shuffled

def text_aug(sentence, alpha_rs = 0.1, num_aug=3):
    """Create augmented variants of ``sentence`` by randomly swapping words.

    Args:
        sentence: space-separated text; empty tokens are ignored.
        alpha_rs: fraction of the word count that sets the number of swap
            passes per variant (at least one pass).
        num_aug: when >= 1, the number of variants returned (truncated to an
            int). When < 1, a single candidate is generated and kept with
            probability ``num_aug``, so the result may be empty.

    Returns:
        A list of augmented sentences.
    """
    words = [word for word in sentence.split(' ') if word != ""]
    n_rs = max(1, int(alpha_rs * len(words)))

    # range() needs an int, and the keep-probability branch needs at least one
    # candidate to divide by; previously any num_aug < 1 raised an exception.
    num_candidates = int(num_aug) if num_aug >= 1 else 1

    augmented_sentences = []
    for _ in range(num_candidates):
        # Sentences with zero or one word cannot be reordered.
        a_words = random_swap(words, n_rs) if len(words) > 1 else list(words)
        augmented_sentences.append(" ".join(a_words))

    random.shuffle(augmented_sentences)

    if num_aug >= 1:
        return augmented_sentences[:num_candidates]

    keep_prob = num_aug / len(augmented_sentences)
    return [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

# One list of augmented variants per training sentence (default text_aug settings).
aug = train['문장'].apply(text_aug)

In [9]:
# Build three copies of the training frame, each carrying one augmented variant
# of every sentence (variant k of each entry in `aug`).
augmented_frames = []
for variant_idx in range(3):
    frame = train.copy()
    frame['문장'] = [variants[variant_idx] for variants in aug]
    augmented_frames.append(frame)

tmp1, tmp2, tmp3 = augmented_frames

In [10]:
# Merge the original and augmented rows, drop exact duplicates and shuffle.
# The shuffle is seeded so that the row order - and therefore the K-fold splits
# computed on this frame later - is reproducible across runs.
# NOTE(review): augmentation happens before the K-fold split, so swapped copies
# of one sentence can land in both the training and the validation fold, which
# likely inflates the validation scores - confirm whether this is intended.
train = (
    pd.concat([train, tmp1, tmp2, tmp3])
    .drop_duplicates(keep='first')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
train

Unnamed: 0,문장,유형,극성,시제,확실성,label
0,국민 대장주로 불리던 삼성전자가 5만원 선에 머무른 것은 지난 2020년 11월 이...,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,안 아들 하박은 스무 살도 둘째 누이동생이 빛이 두려워하는 나이였음에도 전혀 없이 ...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
2,감염자 비말이나 감염자와의 직접 전파된다 의해 주로 접촉에,사실형,긍정,현재,확실,사실형-긍정-현재-확실
3,연장 1차전에서는 세 명 모두 버디 그리고 2차전에서 두 번째 샷이 벙커에 빠진 유...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
4,나이 겨우 오십대 초반이었고 통풍을 1주일이 진단받기까지 걸렸다,사실형,긍정,과거,확실,사실형-긍정-과거-확실
...,...,...,...,...,...,...
64629,걸죽하고 진득한 이 막걸리는 산미가 낸다 생기를 주고 달고 진한 당도가 잘 어우러져...,사실형,긍정,현재,확실,사실형-긍정-현재-확실
64630,평소 사용하던 얇은 두꺼운 조금 그립보다 그립으로 바꿨죠,대화형,긍정,과거,확실,대화형-긍정-과거-확실
64631,우리금융지주 이사회는 사람이 회장 연임은 찬성했지만 행장직은 다른 손태승 맡도록 했다,사실형,긍정,과거,확실,사실형-긍정-과거-확실
64632,선조가 후궁에게 불상을 만들어주기 위해 황랍을 소문이 한다는 사용하려고 궐내에 파다...,사실형,긍정,과거,확실,사실형-긍정-과거-확실


In [11]:
train['문장'].str.len().max(), test['문장'].str.len().max()

(496, 378)

# Dataset

In [12]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook can still be executed (e.g. for inference) without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "monologg/kobigbird-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Longest training sentence measured in characters; passed to the tokenizer as
# max_length further down. Cast to a plain Python int.
length = int(train['문장'].str.len().max())

In [13]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset over tokenizer encodings with optional four-head labels.

    Args:
        encodings: mapping of field name -> per-sample sequences (lists or
            tensors); must contain "input_ids".
        labels: optional mapping with the keys 'type', 'polarity', 'tense' and
            'certainty', each indexable by sample position.
    """

    # Order in which the four label heads are packed into item["labels"].
    LABEL_KEYS = ('type', 'polarity', 'tense', 'certainty')

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor without torch's copy-construct warning."""
        # torch.tensor(existing_tensor) emits a UserWarning; clone().detach()
        # is the recommended way to copy a tensor.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, plus a 4-tuple of label tensors if labels exist."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = tuple(
                torch.tensor(self.labels[key][idx]) for key in self.LABEL_KEYS
            )
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

# HuggingFace Phase

## config

In [14]:
# Load the pretrained model's configuration so it can be inspected and tweaked
# before the encoder is instantiated.
config=AutoConfig.from_pretrained(model_path)
# Overwrite the recorded name/path with a custom tag.
# NOTE(review): the purpose of this tag is not evident from this notebook.
config._name_or_path = 'kr.kim'
# Show the number of encoder layers declared by the configuration.
print(config.num_hidden_layers)
# Disabled experiment: override the number of encoder layers.
# config.num_hidden_layers = 17
# Display the full configuration as the cell output.
config

12


BigBirdConfig {
  "_name_or_path": "kr.kim",
  "architectures": [
    "BigBirdForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rescale_embeddings": false,
  "sep_token_id": 3,
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_bias": true,
  "use_cache": true,
  "vocab_size": 32500
}

## custom model

In [15]:
class CustomModel(nn.Module):
    """Pretrained encoder with four classification heads.

    The first-token vector of the encoder output is projected by four separate
    Linear+BatchNorm blocks. The heads consume cumulative sums of those
    projections: certainty uses projection 4, tense uses 3+4, polarity uses
    2+3+4 and type uses 1+2+3+4.

    Layer sizes are derived from the encoder's hidden size instead of being
    hard-coded, so ``self.out`` and the actual layer widths always agree.
    Attribute names and Sequential layouts are unchanged, so state dicts saved
    from the previous definition still load.
    """

    def __init__(self):
        super(CustomModel, self).__init__()
        if model_path == 'monologg/kobigbird-bert-base':
            # Switch KoBigBird from block-sparse to full attention.
            # NOTE: this mutates the module-level `config` object.
            config.attention_type = "original_full"
        self.base_model = AutoModel.from_pretrained(model_path, config=config)

        hidden_size = self.base_model.config.hidden_size
        self.out = hidden_size // 2   # width of every projection / classifier input
        self.norm = self.out          # BatchNorm feature count (kept as an attribute)

        self.Linear1 = self._projection(hidden_size)
        self.Linear2 = self._projection(hidden_size)
        self.Linear3 = self._projection(hidden_size)
        self.Linear4 = self._projection(hidden_size)

        self.type_classifier = self._classifier(4)
        self.polarity_classifier = self._classifier(3)
        self.tense_classifier = self._classifier(3)
        self.certainty_classifier = self._classifier(2)

    def _projection(self, in_features):
        """Linear + BatchNorm block mapping the encoder vector to ``self.out`` features."""
        return nn.Sequential(
            nn.Linear(in_features, self.out),
            nn.BatchNorm1d(self.norm))

    def _classifier(self, num_classes):
        """Dropout + Linear head producing ``num_classes`` logits."""
        return nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(in_features=self.out, out_features=num_classes),
        )

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        """Return the logits of the four heads.

        ``labels`` and ``token_type_ids`` are accepted but not used here.

        Returns:
            Tuple ``(type_output, polarity_output, tense_output, certainty_output)``.
        """
        # First element of the encoder output, reduced to the first token's vector.
        x = self.base_model(input_ids=input_ids, attention_mask=attention_mask)[0]
        x = x[:,0,:]

        # Classify sentence type, polarity, tense and certainty separately;
        # each later head also receives the projections of the earlier ones.
        out4 = self.Linear4(x)
        certainty_output = self.certainty_classifier(out4)

        out3 = self.Linear3(x)
        tense_output = self.tense_classifier((out3+out4))

        out2 = self.Linear2(x)
        polarity_output = self.polarity_classifier((out2+out3+out4))

        out1 = self.Linear1(x)
        type_output = self.type_classifier((out1+out2+out3+out4))

        return type_output, polarity_output, tense_output, certainty_output

## Training arguments

In [16]:
# Trainer arguments
lr = 1e-4      # learning rate passed to TrainingArguments
stop = 3       # early-stopping patience (number of evaluations)
epoch = 1000   # maximum number of training epochs
batch = 16     # per-device batch size for both training and evaluation
seed = 42      # seed used for the K-fold split and for TrainingArguments

## Loss functions and metrics

In [17]:
class FocalLoss(nn.Module):
    """ Focal Loss, as described in https://arxiv.org/abs/1708.02002.
    It is essentially an enhancement to cross entropy loss and is
    useful for classification tasks when there is a large class imbalance.
    x is expected to contain raw, unnormalized scores for each class.
    y is expected to contain class labels.
    Shape:
        - x: (batch_size, C) or (batch_size, C, d1, d2, ..., dK), K > 0.
        - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0.
    """

    def __init__(self,
                 alpha: Optional[Tensor] = None,
                 gamma: float = 0.,
                 reduction: str = 'mean',
                 ignore_index: int = -100):
        """Constructor.
        Args:
            alpha (Tensor, optional): Weights for each class. Defaults to None.
            gamma (float, optional): A constant, as described in the paper.
                Defaults to 0.
            reduction (str, optional): 'mean', 'sum' or 'none'.
                Defaults to 'mean'.
            ignore_index (int, optional): class label to ignore.
                Defaults to -100.
        Raises:
            ValueError: if ``reduction`` is not one of the supported values.
        """
        if reduction not in ('mean', 'sum', 'none'):
            raise ValueError(
                'Reduction must be one of: "mean", "sum", "none".')

        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index
        self.reduction = reduction

        # Unreduced NLL so the focal term can be applied per sample.
        self.nll_loss = nn.NLLLoss(
            weight=alpha, reduction='none', ignore_index=ignore_index)

    def __repr__(self):
        arg_keys = ['alpha', 'gamma', 'ignore_index', 'reduction']
        arg_vals = [self.__dict__[k] for k in arg_keys]
        arg_strs = [f'{k}={v!r}' for k, v in zip(arg_keys, arg_vals)]
        arg_str = ', '.join(arg_strs)
        return f'{type(self).__name__}({arg_str})'

    def forward(self, x: Tensor, y: Tensor) -> Tensor:
        """Compute the focal loss of logits ``x`` against class indices ``y``."""
        if x.ndim > 2:
            # (N, C, d1, d2, ..., dK) --> (N * d1 * ... * dK, C)
            c = x.shape[1]
            x = x.permute(0, *range(2, x.ndim), 1).reshape(-1, c)
            # (N, d1, d2, ..., dK) --> (N * d1 * ... * dK,)
            y = y.view(-1)

        unignored_mask = y != self.ignore_index
        y = y[unignored_mask]
        if len(y) == 0:
            # Every target is ignored: return a zero that lives on x's device,
            # has x's dtype and stays attached to the graph, so that a
            # following backward() still works (gradients are all zero).
            return x.sum() * 0.
        x = x[unignored_mask]

        # compute weighted cross entropy term: -alpha * log(pt)
        # (alpha is already part of self.nll_loss)
        log_p = F.log_softmax(x, dim=-1)
        ce = self.nll_loss(log_p, y)

        # get true class column from each row
        all_rows = torch.arange(len(x), device=x.device)
        log_pt = log_p[all_rows, y]

        # compute focal term: (1 - pt)^gamma
        pt = log_pt.exp()
        focal_term = (1 - pt)**self.gamma

        # the full loss: -alpha * ((1 - pt)^gamma) * log(pt)
        loss = focal_term * ce

        if self.reduction == 'mean':
            loss = loss.mean()
        elif self.reduction == 'sum':
            loss = loss.sum()

        return loss


def focal_loss(alpha: Optional[Sequence] = None,
               gamma: float = 0.,
               reduction: str = 'mean',
               ignore_index: int = -100,
               device='cpu',
               dtype=torch.float32) -> FocalLoss:
    """Build a FocalLoss, converting the class weights to a tensor first.

    Args:
        alpha (Sequence, optional): Per-class weights. Anything that is not
            already a Tensor is wrapped with ``torch.tensor``. Defaults to None.
        gamma (float, optional): Focusing constant from the paper.
            Defaults to 0.
        reduction (str, optional): 'mean', 'sum' or 'none'.
            Defaults to 'mean'.
        ignore_index (int, optional): Class label to ignore.
            Defaults to -100.
        device (str, optional): Device the weights are moved to.
            Defaults to 'cpu'.
        dtype (torch.dtype, optional): dtype the weights are cast to.
            Defaults to torch.float32.
    Returns:
        A FocalLoss object
    """
    weights = alpha
    if weights is not None:
        as_tensor = weights if isinstance(weights, Tensor) else torch.tensor(weights)
        weights = as_tensor.to(device=device, dtype=dtype)

    return FocalLoss(
        alpha=weights,
        gamma=gamma,
        reduction=reduction,
        ignore_index=ignore_index)

class ASLSingleLabel(nn.Module):
    '''
    Asymmetric loss for single-label classification: the log-probabilities of
    the target and non-target classes are down-weighted with separate
    focusing exponents, with optional label smoothing.
    '''
    def __init__(self, gamma_pos=0, gamma_neg=4, eps: float = 0.1, reduction='mean'):
        super(ASLSingleLabel, self).__init__()

        self.gamma_pos = gamma_pos
        self.gamma_neg = gamma_neg
        self.eps = eps
        self.reduction = reduction
        self.logsoftmax = nn.LogSoftmax(dim=-1)
        # Holds the (possibly smoothed) target matrix of the latest forward pass.
        self.targets_classes = []

    def forward(self, inputs, target):
        '''
        "input" dimensions: - (batch_size,number_classes)
        "target" dimensions: - (batch_size)
        Returns the mean loss when reduction == 'mean', otherwise one value per sample.
        '''
        class_count = inputs.size()[-1]
        log_probs = self.logsoftmax(inputs)

        # One-hot encoding of the targets and its complement.
        one_hot = torch.zeros_like(inputs).scatter_(1, target.long().unsqueeze(1), 1)
        complement = 1 - one_hot

        # Asymmetric weights: (1 - p)^gamma_pos on the target class,
        # p^gamma_neg on every other class.
        probs = torch.exp(log_probs)
        pos_part = probs * one_hot
        neg_part = (1 - probs) * complement
        exponent = self.gamma_pos * one_hot + self.gamma_neg * complement
        weighted_log_probs = log_probs * torch.pow(1 - pos_part - neg_part, exponent)

        # Label smoothing (skipped when eps is not positive).
        if self.eps > 0:
            soft_targets = one_hot.mul(1 - self.eps).add(self.eps / class_count)
        else:
            soft_targets = one_hot
        self.targets_classes = soft_targets

        per_sample = (- soft_targets.mul(weighted_log_probs)).sum(dim=-1)
        if self.reduction == 'mean':
            return per_sample.mean()
        return per_sample
        
def compute_metrics(pred):
    """Average the weighted F1 score over the four classification heads.

    Args:
        pred: evaluation result exposing ``label_ids`` (indexed as
            ``[:, head]``) and ``predictions`` (indexed as ``[head]``), where
            each prediction entry holds predicted class ids.

    Returns:
        Dict with the single key 'f1-sum'. Despite the name, the value is the
        mean of the four per-head weighted F1 scores.
    """
    labels = pred.label_ids
    preds = pred.predictions
    f1 = [
        f1_score(y_true=labels[:, head], y_pred=preds[head], average='weighted')
        for head in range(4)
    ]
    return {
        'f1-sum': sum(f1)/4
    }

## trainer

In [19]:
# Define trainer
class CustomTrainer(Trainer):
    """Trainer whose loss is the sum of one asymmetric loss (ASLSingleLabel) per head.

    Plain cross entropy and FocalLoss were tried as alternative criteria; they
    would be applied per head and summed in the same way.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the summed four-head loss for one batch.

        Args:
            model: module returning ``(type, polarity, tense, certainty)`` logits.
            inputs: batch dict; its "labels" entry is removed and indexed as
                ``[:, head]``.
            return_outputs: when True, also return the per-head argmax predictions.
            num_items_in_batch: accepted for compatibility with transformers
                releases that pass this argument; it is not used.

        Returns:
            ``loss`` or ``(loss, outputs)`` where ``outputs`` is
            ``(None, type_pred, polarity_pred, tense_pred, certainty_pred)``.
        """
        labels = inputs.pop("labels").to(torch.int64)

        type_logit, polarity_logit, tense_logit, certainty_logit = model(**inputs)
        head_logits = (type_logit, polarity_logit, tense_logit, certainty_logit)

        # The criterion has no parameters or buffers, so one instance serves
        # all heads and it does not need to be moved to a device.
        criterion = ASLSingleLabel()
        loss = criterion(head_logits[0], labels[:, 0])
        for head in range(1, len(head_logits)):
            loss = loss + criterion(head_logits[head], labels[:, head])

        # Leading None keeps the layout the evaluation loop expects: it drops
        # the first element and treats the rest as predictions.
        outputs = (None,) + tuple(torch.argmax(logit, dim=1) for logit in head_logits)
        return (loss, outputs) if return_outputs else loss

# Fold

In [20]:
# One fitted LabelEncoder per target column (type, polarity, tense, certainty).
# LabelEncoder.fit returns the encoder itself, so creation and fitting chain.
유형 = LabelEncoder().fit(train['유형'])
극성 = LabelEncoder().fit(train['극성'])
시제 = LabelEncoder().fit(train['시제'])
확실성 = LabelEncoder().fit(train['확실성'])

def encoding(X_train, X_val):
    """Encode the four target columns of both frames into integer label arrays.

    The input frames are no longer modified. Previously the string columns were
    overwritten in place, which mutated the caller's data and made a second
    call on the same frame fail.

    Args:
        X_train: frame with the columns '유형', '극성', '시제', '확실성'.
        X_val: frame with the same columns.

    Returns:
        ``(train_labels, val_labels)``: two dicts with the keys 'type',
        'polarity', 'tense' and 'certainty', each holding the encoded array.
    """
    # label key -> (source column, fitted module-level encoder)
    targets = {
        'type': ('유형', 유형),
        'polarity': ('극성', 극성),
        'tense': ('시제', 시제),
        'certainty': ('확실성', 확실성),
    }

    def encode(frame):
        return {
            key: encoder.transform(frame[column])
            for key, (column, encoder) in targets.items()
        }

    return encode(X_train), encode(X_val)

In [21]:
# Reload a fresh configuration before training (the model constructor edits it).
config = AutoConfig.from_pretrained(model_path)
config._name_or_path = 'kr.kim'
print(f'hidden_layers : {config.num_hidden_layers}')
config.num_hidden_layers = 12
print(f'now_hidden_layers : {config.num_hidden_layers}')

# 5-fold cross-validation: one model is trained and checkpointed per fold.
# NOTE(review): `train` already contains augmented copies at this point, so
# variants of one sentence can appear on both sides of a split.
kf = KFold(n_splits=5, random_state=seed, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(train)):
    print(f'Round {i}')
    # KFold yields positional indices, so select rows by position; copy so each
    # fold works on an independent frame.
    X_train, X_val = train.iloc[train_index].copy(), train.iloc[test_index].copy()
    train_labels, val_labels = encoding(X_train, X_val)
    token_train = tokenizer(X_train.문장.tolist(), padding=True, truncation=True, max_length=length)
    token_val = tokenizer(X_val.문장.tolist(), padding=True, truncation=True, max_length=length)
    train_dataset, val_dataset = CustomDataset(token_train, train_labels), CustomDataset(token_val, val_labels)
    model = CustomModel()
    model.to(device)
    args = TrainingArguments(run_name = f'fold_{i}',                                # run name
                             output_dir= f"fold_{i}",                               # checkpoint directory
                             evaluation_strategy="steps",                           # evaluate every eval_steps
                             eval_steps=100,                                        # evaluate every 100 steps
                             save_steps=100,                                        # save a checkpoint every 100 steps
                             save_total_limit = 2,                                  # number of checkpoints kept
                             logging_steps=100,                                     # log the training loss every 100 steps
                             per_device_train_batch_size=batch,                     # training batch size per device
                             per_device_eval_batch_size=batch,                      # evaluation batch size per device
                             gradient_accumulation_steps=16,                        # gradient accumulation (virtual batch)
                             num_train_epochs=epoch,                                # upper bound; early stopping ends training
                             learning_rate=lr,                                      # learning rate
                             seed=seed,                                             # seed
                             load_best_model_at_end=True,                           # reload the best checkpoint when training ends
                             fp16=torch.cuda.is_available(),                        # mixed precision needs a CUDA device
                             do_train=True,
                             do_eval=True,
                             # metric_for_best_model
                             # greater_is_better = True,
    )
    trainer = CustomTrainer(model=model,
                            args=args,                                                        # training arguments
                            train_dataset=train_dataset,                                      # training data
                            eval_dataset=val_dataset,                                         # validation data
                            compute_metrics=compute_metrics,                                  # evaluation metric
                            callbacks=[EarlyStoppingCallback(early_stopping_patience=stop)],) # early stopping
    trainer.train()
    # Release the fold's model before the next one is created.
    del model
    del trainer
    gc.collect()              # free Python-side objects
    torch.cuda.empty_cache()  # return cached GPU memory

hidden_layers : 12
now_hidden_layers : 12
Round 0


Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 51707
  Num Epochs 

Step,Training Loss,Validation Loss,F1-sum
100,3.0462,2.056103,0.910028
200,2.2659,1.611284,0.921783
300,1.7215,1.075441,0.940627
400,1.3469,0.827551,0.951954
500,0.9292,0.610854,0.963477
600,0.7047,0.492422,0.970344
700,0.498,0.368659,0.977215
800,0.4199,0.267672,0.982015
900,0.3042,0.235516,0.984779
1000,0.2564,0.245581,0.983976


***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_0\checkpoint-100
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_0\checkpoint-1] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_0\checkpoint-200
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_0\checkpoint-2] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_0\checkpoint-300
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_0\checkpoint-100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_0\checkpoint-400
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Del

Round 1


loading weights file pytorch_model.bin from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\pytorch_model.bin
Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (in

Step,Training Loss,Validation Loss,F1-sum
100,3.1352,2.153731,0.897209
200,2.2227,1.646648,0.923495
300,1.6557,1.240236,0.939917
400,1.2514,0.85744,0.9512
500,0.854,0.696137,0.956579
600,0.663,0.446088,0.971299
700,0.464,0.377549,0.975649
800,0.3744,0.278187,0.981633
900,0.2824,0.232633,0.983884
1000,0.227,0.191362,0.987649


***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_1\checkpoint-100
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_1\checkpoint-200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_1\checkpoint-200
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_1\checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_1\checkpoint-300
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_1\checkpoint-100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_1\checkpoint-400
Trainer.model is not a `PreTrainedModel`, only saving its state dict.

Round 2


loading weights file pytorch_model.bin from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\pytorch_model.bin
Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (in

Step,Training Loss,Validation Loss,F1-sum
100,2.9494,1.954585,0.907448
200,2.1472,1.51684,0.927318
300,1.6265,1.147422,0.942105
400,1.2358,0.858577,0.951348
500,0.8705,0.657138,0.961959
600,0.675,0.425842,0.972619
700,0.4687,0.410099,0.97623
800,0.4032,0.303434,0.979921
900,0.2934,0.235442,0.985335
1000,0.2587,0.210275,0.987337


***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_2\checkpoint-100
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_2\checkpoint-200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_2\checkpoint-200
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_2\checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_2\checkpoint-300
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_2\checkpoint-100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_2\checkpoint-400
Trainer.model is not a `PreTrainedModel`, only saving its state dict.

Round 3


loading weights file pytorch_model.bin from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\pytorch_model.bin
Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (in

Step,Training Loss,Validation Loss,F1-sum
100,2.9897,1.969756,0.908166
200,2.2035,1.552715,0.925843
300,1.6852,1.187674,0.939147
400,1.2876,0.842769,0.954004
500,0.8845,0.63471,0.963379
600,0.6787,0.580924,0.965142
700,0.4821,0.35471,0.97773
800,0.386,0.298473,0.979098
900,0.287,0.24957,0.983898
1000,0.2644,0.201522,0.986912


***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_3\checkpoint-100
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_3\checkpoint-200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_3\checkpoint-200
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_3\checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_3\checkpoint-300
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_3\checkpoint-100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12927
  Batch size = 16
Saving model checkpoint to fold_3\checkpoint-400
Trainer.model is not a `PreTrainedModel`, only saving its state dict.

Round 4


loading weights file pytorch_model.bin from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\pytorch_model.bin
Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (in

Step,Training Loss,Validation Loss,F1-sum
100,2.9271,1.797583,0.920987
200,2.1144,1.644698,0.930593
300,1.6225,1.022613,0.943607
400,1.2134,0.764746,0.950379
500,0.8378,0.60453,0.962633
600,0.6439,0.424613,0.971782
700,0.4675,0.349737,0.976844
800,0.3796,0.273496,0.981883
900,0.2998,0.240864,0.984266
1000,0.25,0.18923,0.988147


***** Running Evaluation *****
  Num examples = 12926
  Batch size = 16
Saving model checkpoint to fold_4\checkpoint-100
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 12926
  Batch size = 16
Saving model checkpoint to fold_4\checkpoint-200
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_4\checkpoint-100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12926
  Batch size = 16
Saving model checkpoint to fold_4\checkpoint-300
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_4\checkpoint-400] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12926
  Batch size = 16
Saving model checkpoint to fold_4\checkpoint-400
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [fold_4\checkpoint-200] due to args.save_total_limit

# Predict

In [22]:
def recent_file(path):
    """Return the path of the newest entry (file or directory) inside ``path``.

    "Newest" is decided by ``os.path.getctime``. Note that this is the creation
    time on Windows but the last metadata-change time on Unix, so the result is
    only a true "most recently created" lookup on Windows.

    Args:
        path: Directory to scan. It is interpolated into an f-string, so any
            object with a sensible ``str()`` works.

    Returns:
        ``f"{path}/{name}"`` for the entry with the largest ctime. When several
        entries share the largest ctime, the one listed first by ``os.listdir``
        wins.

    Raises:
        FileNotFoundError: If ``path`` contains no entries. Errors raised by
            ``os.listdir`` / ``os.path.getctime`` propagate unchanged.
    """
    entry_names = os.listdir(f"{path}")
    if not entry_names:
        # Fail with a message that names the directory instead of an opaque
        # IndexError from indexing an empty list.
        raise FileNotFoundError(f"no files or directories found in: {path}")
    # A single pass with max() is enough; sorting every entry is unnecessary.
    newest_name = max(entry_names, key=lambda name: os.path.getctime(f"{path}/{name}"))
    return f"{path}/{newest_name}"

In [23]:
# Release whatever is reclaimable before building the inference models.
gc.collect()  # Python-side garbage collection
torch.cuda.empty_cache()  # hand cached GPU blocks back to the driver

# Tokenize every test sentence once; the same dataset is reused for all folds.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenized = tokenizer(test.문장.tolist(), padding=True, truncation=True, max_length=length, return_tensors="pt")
# Second argument is None — presumably the labels slot, since the test set has
# no labels; confirm against CustomDataset.
test_dataset = CustomDataset(tokenized, None)
test_args = TrainingArguments(
    output_dir = './',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 512,
    dataloader_drop_last = False
)

# Count the consecutive fold directories fold_0, fold_1, ... that exist on disk.
tmp = 0
while os.path.isdir(f'fold_{tmp}'):
    tmp += 1

# One prediction output per fold, in fold order.
test_results = []
for i in range(tmp):
    print(f'Round {i}')
    model = CustomModel().to(device)
    # Load the checkpoint onto the CPU first: load_state_dict copies the values
    # into the model's existing parameters, so this avoids holding a second
    # copy of the weights on the GPU and also works on a CPU-only machine.
    state_dict = torch.load(f"{recent_file(f'fold_{i}')}/pytorch_model.bin", map_location="cpu")
    model.load_state_dict(state_dict)
    del state_dict
    trainer = CustomTrainer(
                  model = model,
                  args = test_args,
                  compute_metrics = compute_metrics)
    test_results.append(trainer.predict(test_dataset))
    # Drop the references BEFORE collecting; otherwise this fold's model is
    # still alive when gc / empty_cache run and nothing is actually freed.
    del model
    del trainer
    gc.collect()  # Python-side garbage collection
    torch.cuda.empty_cache()  # hand cached GPU blocks back to the driver

loading file vocab.txt from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\vocab.txt
loading file tokenizer.json from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\tokenizer_config.json
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to 

Round 0


Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of BigBirdModel were initialized from the model checkpoint at monologg/kobigbird-bert-b

loading weights file pytorch_model.bin from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\pytorch_model.bin


Round 1


Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of BigBirdModel were initialized from the model checkpoint at monologg/kobigbird-bert-b

loading weights file pytorch_model.bin from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\pytorch_model.bin


Round 2


Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of BigBirdModel were initialized from the model checkpoint at monologg/kobigbird-bert-b

loading weights file pytorch_model.bin from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\pytorch_model.bin


Round 3


Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of BigBirdModel were initialized from the model checkpoint at monologg/kobigbird-bert-b

loading weights file pytorch_model.bin from cache at C:\Users\hist/.cache\huggingface\hub\models--monologg--kobigbird-bert-base\snapshots\ceacda477e20abef2c929adfa4a07c6f811323be\pytorch_model.bin


Round 4


Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of BigBirdModel were initialized from the model checkpoint at monologg/kobigbird-bert-b

In [24]:
def _ensemble_labels(results, head_idx, encoder):
    """Average one head's scores over all folds and decode them to labels.

    Args:
        results: Per-fold prediction outputs; each must expose
            ``.predictions[head_idx]``.
        head_idx: Position of the classification head inside ``predictions``.
        encoder: Fitted encoder whose ``inverse_transform`` maps class indices
            back to label strings.

    Returns:
        Array with one decoded label per test row.
    """
    mean_scores = sum(fold.predictions[head_idx] for fold in results) / len(results)
    # Assumes each head's scores are 2-D (n_samples, n_classes) — TODO confirm.
    # Decoding all rows in one call replaces thousands of single-row
    # inverse_transform calls and the follow-up unwrapping pass.
    return encoder.inverse_transform(np.argmax(mean_scores, axis=1))


# Heads are indexed in the order: 유형, 극성, 시제, 확실성.
test['유형'] = _ensemble_labels(test_results, 0, 유형)
test['극성'] = _ensemble_labels(test_results, 1, 극성)
test['시제'] = _ensemble_labels(test_results, 2, 시제)
test['확실성'] = _ensemble_labels(test_results, 3, 확실성)

In [25]:
# Display the test frame, which now carries the four decoded prediction columns.
test

Unnamed: 0,ID,문장,유형,극성,시제,확실성
0,TEST_0000,장욱진의 가족은 허물 없는 가족애를 처음 공개되는 정약용의 정효자전과 정부인전은 강...,사실형,긍정,현재,확실
1,TEST_0001,조지 W 부시 버락 오바마 전 대통령도 전쟁 위험 때문에 버린 카드다,사실형,긍정,현재,확실
2,TEST_0002,지난해 1분기 128억원이었던 영업이익이 올해 1분기 505억원으로 급증했다,사실형,긍정,과거,확실
3,TEST_0003,수상 작가와 맺으려던 계약서 내용 가운데 일부가 독소 조항으로 해석돼 수정을 요청받...,사실형,긍정,현재,확실
4,TEST_0004,결국 최근 KDB산업은행은 대규모 손실 위기에 닥친 에어부산에 140억원 금융지원을...,사실형,긍정,과거,확실
...,...,...,...,...,...,...
7085,TEST_7085,2020 세계국가편람 모바일 앱은 세계 216개국의 국가개황과 주요 경제지표 사회개...,사실형,긍정,현재,확실
7086,TEST_7086,탈세계화 징후들이 반갑지 않은 이유다,추론형,긍정,현재,확실
7087,TEST_7087,틱톡은 6월 인터넷 안전의 달을 맞아 올바른 개인정보 보호 관리 방법 앱 내 유용한...,사실형,긍정,현재,확실
7088,TEST_7088,만약 3개월 간 채굴자들의 투표를 거쳐 23 이상의 해시파워가 채굴세 도입에 찬성한...,추론형,긍정,미래,확실


In [26]:
# Join the four predicted attributes into one hyphen-separated label,
# e.g. 사실형-긍정-현재-확실, then display the frame.
test['label'] = test['유형'].str.cat(
    [test['극성'], test['시제'], test['확실성']],
    sep='-',
)
test

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
0,TEST_0000,장욱진의 가족은 허물 없는 가족애를 처음 공개되는 정약용의 정효자전과 정부인전은 강...,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,TEST_0001,조지 W 부시 버락 오바마 전 대통령도 전쟁 위험 때문에 버린 카드다,사실형,긍정,현재,확실,사실형-긍정-현재-확실
2,TEST_0002,지난해 1분기 128억원이었던 영업이익이 올해 1분기 505억원으로 급증했다,사실형,긍정,과거,확실,사실형-긍정-과거-확실
3,TEST_0003,수상 작가와 맺으려던 계약서 내용 가운데 일부가 독소 조항으로 해석돼 수정을 요청받...,사실형,긍정,현재,확실,사실형-긍정-현재-확실
4,TEST_0004,결국 최근 KDB산업은행은 대규모 손실 위기에 닥친 에어부산에 140억원 금융지원을...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
...,...,...,...,...,...,...,...
7085,TEST_7085,2020 세계국가편람 모바일 앱은 세계 216개국의 국가개황과 주요 경제지표 사회개...,사실형,긍정,현재,확실,사실형-긍정-현재-확실
7086,TEST_7086,탈세계화 징후들이 반갑지 않은 이유다,추론형,긍정,현재,확실,추론형-긍정-현재-확실
7087,TEST_7087,틱톡은 6월 인터넷 안전의 달을 맞아 올바른 개인정보 보호 관리 방법 앱 내 유용한...,사실형,긍정,현재,확실,사실형-긍정-현재-확실
7088,TEST_7088,만약 3개월 간 채굴자들의 투표를 거쳐 23 이상의 해시파워가 채굴세 도입에 찬성한...,추론형,긍정,미래,확실,추론형-긍정-미래-확실


In [27]:
# Fill the sample submission with the predicted labels (aligned on the row index).
sub = pd.read_csv('sample_submission.csv')
sub['label'] = test['label']

# Find the first index whose submission file does not exist yet, so earlier
# submissions are never overwritten.
tmp = 0
while True:
    if not os.path.exists(f'제출{tmp}.csv'):
        break
    tmp += 1

sub.to_csv(f'제출{tmp}.csv', index=False, mode='w')