# Import

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import random
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer


from tqdm.auto import tqdm
from tqdm.contrib import tzip

import warnings
warnings.filterwarnings(action='ignore') 

In [None]:
from transformers import ElectraModel, ElectraTokenizer
from transformers import AutoModel, AutoTokenizer

In [None]:
import torch, gc
# Run a garbage-collection pass first, then ask PyTorch to release its cached
# CUDA memory so a re-run of the notebook starts with as much free GPU memory as possible.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Setting

### Hyperparameter Setting

In [None]:
# Experiment configuration shared by the cells below.
CFG = {
    'EPOCHS':70,               # upper bound on epochs; train() may stop earlier
    'LEARNING_RATE':1e-6,
    'BATCH_SIZE':16,
    'SEED':41,                 # used for the train/validation split and seed_everything()
    'NSPLITS': 5,
    'MODEL': 'monologg/kobigbird-bert-base',   # checkpoint id passed to from_pretrained()
    'MODEL_NAME' : 'kobigbird-bert-base',      # short name used in output file names
    "save_path" : "/content/drive/MyDrive/Colab_Notebooks/NLP/submission/sentence/",
    "NOTE" : 'k-fold-type_row_lr'  # Update this tag every time the model setup changes; it is part of the output file names.
}

### Data Load

In [None]:
# Read the training and test CSV files from the mounted Google Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/NLP/data/sentence_classifi/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/NLP/data/sentence_classifi/test.csv')

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train, val = train_test_split(df, test_size=0.2, random_state=CFG['SEED'])

### Fixed RandomSeed

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic
    mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different kernel on every run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED'])

### draw_graph

In [None]:
def draw_graph(title, loss, valid_loss,
               save_dir="/content/drive/MyDrive/Colab_Notebooks/NLP/submission/sentence/result/"):
    """Plot training and validation loss curves, save them as a PNG and show them.

    Args:
        title: Figure title; also used as the PNG file name (without extension).
        loss: Sequence of per-epoch training losses.
        valid_loss: Sequence of per-epoch validation losses.
        save_dir: Directory the PNG is written to. Created when missing.
    """
    # savefig() raises FileNotFoundError when the target directory is missing.
    os.makedirs(save_dir, exist_ok=True)

    plt.plot(loss, label="Training Loss")
    plt.plot(valid_loss, label="Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend(loc='upper right')
    plt.title(title)
    plt.savefig(os.path.join(save_dir, title + ".png"))
    plt.show()
    # Release the figure so repeated calls do not draw on top of old curves.
    plt.close()

# Model Build

### Label Encoding

##### label preprocessing encoder

In [None]:
# 2. Label encoding of the four target columns (type, polarity, tense, certainty).
# Each encoder is fitted on the training split only and then reused, unchanged,
# for the validation split.
type_le = preprocessing.LabelEncoder()
polarity_le = preprocessing.LabelEncoder()
tense_le = preprocessing.LabelEncoder()
certainty_le = preprocessing.LabelEncoder()

for target_col, encoder in (
    ("유형", type_le),          # sentence type
    ("극성", polarity_le),      # sentence polarity
    ("시제", tense_le),         # sentence tense
    ("확실성", certainty_le),   # sentence certainty
):
    train[target_col] = encoder.fit_transform(train[target_col].values)
    val[target_col] = encoder.transform(val[target_col].values)

In [None]:
# Encoded training targets, one array per task
# (sentence type, polarity, tense, certainty).
train_type, train_polarity, train_tense, train_certainty = (
    train[column].values for column in ("유형", "극성", "시제", "확실성")
)

train_labels = dict(
    type=train_type,
    polarity=train_polarity,
    tense=train_tense,
    certainty=train_certainty,
)

In [None]:
# Encoded validation targets, one array per task
# (sentence type, polarity, tense, certainty).
val_type, val_polarity, val_tense, val_certainty = (
    val[column].values for column in ("유형", "극성", "시제", "확실성")
)

val_labels = dict(
    type=val_type,
    polarity=val_polarity,
    tense=val_tense,
    certainty=val_certainty,
)

In [None]:
train_labels

{'type': array([1, 1, 1, ..., 0, 1, 1]),
 'polarity': array([0, 0, 0, ..., 0, 0, 0]),
 'tense': array([0, 0, 2, ..., 2, 0, 2]),
 'certainty': array([1, 1, 1, ..., 1, 1, 1])}

### preprocessing

### CustomDataset

In [None]:
class CustomDataset(Dataset):
    """Dataset over pre-tokenised sentences with optional labels.

    Args:
        ids: Indexable collection of token-id sequences.
        masks: Indexable collection of attention masks, aligned with ``ids``.
        st_labels: Indexable collection of labels aligned with ``ids``, or
            ``None`` when no labels are available (e.g. for the test split).
    """

    def __init__(self, ids, masks, st_labels):
        self.ids, self.masks, self.st_labels = ids, masks, st_labels

    def __len__(self):
        # The number of samples is defined by the token-id collection.
        return len(self.ids)

    def __getitem__(self, index):
        """Return ``(ids, mask, label)``, or ``(ids, mask)`` when unlabelled."""
        sample = (self.ids[index], self.masks[index])
        if self.st_labels is None:
            return sample
        return sample + (self.st_labels[index],)

In [None]:
# train_dataset = CustomDataset(ids, masks, train_labels)
# train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

# val_dataset = CustomDataset(val_ids, val_masks, val_labels)
# val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

### Data Preprocessing

In [None]:
def convert_examples_to_features(examples, max_seq_len, tokenizer=None):
    """Tokenise the sentences of ``examples`` into fixed-length model inputs.

    Args:
        examples: Mapping or DataFrame whose ``'문장'`` entry holds the
            sentences to encode.
        max_seq_len: Every sentence is padded or truncated to this many tokens.
        tokenizer: Optional, already loaded tokenizer. When omitted, the
            tokenizer of ``CFG['MODEL']`` is loaded; pass one in to avoid
            reloading it for every split.

    Returns:
        A pair ``(input_ids, attention_masks)``: two lists with one 1-D tensor
        of length ``max_seq_len`` per sentence.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(CFG['MODEL'])

    sentences = list(examples['문장'])
    if not sentences:
        return [], []

    # Encode the whole split in one call instead of once per sentence.
    encoded = tokenizer(
        sentences,
        return_tensors='pt',
        padding='max_length',
        max_length=max_seq_len,
        truncation=True,
    )

    # Split the (n_sentences, max_seq_len) tensors back into per-sentence rows.
    return list(encoded['input_ids']), list(encoded['attention_mask'])

In [None]:
# Tokenise every split with the same fixed length: each sentence is padded or
# truncated to 128 tokens.
ids, masks = convert_examples_to_features(train, 128)
val_ids, val_masks = convert_examples_to_features(val, 128)
test_ids, test_masks = convert_examples_to_features(test, 128)

In [None]:
ids[0].shape

torch.Size([128])

In [None]:
# One dataset/loader pair per task (type, polarity, tense, certainty) and per split.
def _make_loader(input_ids, attention_masks, labels, shuffle):
    """Wrap one task's inputs and labels in a CustomDataset and its DataLoader."""
    dataset = CustomDataset(input_ids, attention_masks, labels)
    loader = DataLoader(dataset, batch_size=CFG['BATCH_SIZE'], shuffle=shuffle, num_workers=0)
    return dataset, loader


# Training loaders shuffle; each loader has its own sampler.
type_train_dataset, type_train_loader = _make_loader(ids, masks, train_type, True)
polarity_train_dataset, polarity_train_loader = _make_loader(ids, masks, train_polarity, True)
tense_train_dataset, tense_train_loader = _make_loader(ids, masks, train_tense, True)
certainty_train_dataset, certainty_train_loader = _make_loader(ids, masks, train_certainty, True)

# Validation loaders keep the rows in their original order.
type_val_dataset, type_val_loader = _make_loader(val_ids, val_masks, val_type, False)
polarity_val_dataset, polarity_val_loader = _make_loader(val_ids, val_masks, val_polarity, False)
tense_val_dataset, tense_val_loader = _make_loader(val_ids, val_masks, val_tense, False)
certainty_val_dataset, certainty_val_loader = _make_loader(val_ids, val_masks, val_certainty, False)

In [None]:
# Group the loaders by task name; train() and validation() look them up by these keys.
train_loader_dict = dict(
    type=type_train_loader,
    polarity=polarity_train_loader,
    tense=tense_train_loader,
    certainty=certainty_train_loader,
)

val_loader_dict = dict(
    type=type_val_loader,
    polarity=polarity_val_loader,
    tense=tense_val_loader,
    certainty=certainty_val_loader,
)

### Model Define

##### Base Model

In [None]:
class BaseModel(nn.Module):
    """Fully connected multi-task classifier over a fixed-size feature vector.

    A shared stack of three Linear -> BatchNorm -> LeakyReLU blocks produces a
    512-dimensional representation that feeds four independent heads
    (dropout followed by a linear layer).

    Args:
        input_dim: Size of the input feature vector.
    """

    def __init__(self, input_dim=9351):
        super().__init__()

        def dense_block(n_in, n_out):
            # Linear layer followed by batch normalisation and LeakyReLU.
            return [
                nn.Linear(in_features=n_in, out_features=n_out),
                nn.BatchNorm1d(n_out),
                nn.LeakyReLU(),
            ]

        def head(n_classes):
            # Dropout + linear classifier on top of the shared 512-d features.
            return nn.Sequential(
                nn.Dropout(p=0.3),
                nn.Linear(in_features=512, out_features=n_classes),
            )

        self.feature_extract = nn.Sequential(
            *dense_block(input_dim, 1024),
            *dense_block(1024, 1024),
            *dense_block(1024, 512),
        )
        self.type_classifier = head(4)
        self.polarity_classifier = head(3)
        self.tense_classifier = head(3)
        self.certainty_classifier = head(2)

    def forward(self, x):
        """Return the (type, polarity, tense, certainty) logits for ``x``."""
        shared = self.feature_extract(x)
        # Sentence type, polarity, tense and certainty are classified separately.
        return (
            self.type_classifier(shared),
            self.polarity_classifier(shared),
            self.tense_classifier(shared),
            self.certainty_classifier(shared),
        )

##### roberta-small

In [None]:
# bert = AutoModel.from_pretrained(CFG['MODEL'])
# bert_pool_iden = bert.pooler
# bert.pooler = nn.Identity()

In [None]:
# pretrained model weight freezing
# bert = AutoModel.from_pretrained(CFG['MODEL'])

# for param in bert.parameters():
#     param.requires_grad = True

In [None]:
class Roberta(nn.Module):
    """One shared pretrained encoder with four classification heads.

    The encoder's pooled output (768 features) is fed to four independent
    heads that predict sentence type (4 classes), polarity (3), tense (3) and
    certainty (2).

    Args:
        dropout: Dropout probability applied at the input of every head.
        encoder: Optional, already constructed encoder module. When omitted,
            the pretrained model named by ``CFG['MODEL']`` is loaded.
    """

    # Width of the encoder's pooled output that the heads are built for.
    HIDDEN_SIZE = 768

    def __init__(self, dropout=0.5, encoder=None):
        super(Roberta, self).__init__()

        self.bert = encoder if encoder is not None else AutoModel.from_pretrained(CFG['MODEL'])

        self.type_classifier = self._make_head(4, dropout)
        self.polarity_classifier = self._make_head(3, dropout)
        self.tense_classifier = self._make_head(3, dropout)
        self.certainty_classifier = self._make_head(2, dropout)

    @classmethod
    def _make_head(cls, num_classes, dropout):
        """Build one head: dropout -> 768x384 linear -> LeakyReLU -> linear."""
        return nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(in_features=cls.HIDDEN_SIZE, out_features=384),
            nn.LeakyReLU(),
            nn.Linear(in_features=384, out_features=num_classes),
        )

    def forward(self, ids, masks):
        """Return the (type, polarity, tense, certainty) logits.

        Args:
            ids: Token ids passed to the encoder as ``input_ids``.
            masks: Attention mask passed to the encoder as ``attention_mask``.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used.
        _, pooled = self.bert(input_ids=ids, attention_mask=masks, return_dict=False)

        type_output = self.type_classifier(pooled)
        polarity_output = self.polarity_classifier(pooled)
        tense_output = self.tense_classifier(pooled)
        certainty_output = self.certainty_classifier(pooled)

        return type_output, polarity_output, tense_output, certainty_output

#### Separate classification

 - 각 레이블별로 학습을 진행하여 모델의 분류를 실행.

In [None]:
# Load the pretrained encoder used to build a BERT model for each of the four labels.
# Note: this statement loads a single encoder instance.
bert = AutoModel.from_pretrained(CFG['MODEL'])

Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
print(bert)

In [None]:
class DuplicateModel(nn.Module):
    """Four independent encoder + classifier pipelines, one per label.

    Each task (sentence type, polarity, tense, certainty) gets its own
    pretrained encoder and its own classification head, so the tasks do not
    share any weights.

    Args:
        dropout: Dropout probability applied at the input of every head.
        encoder_factory: Optional zero-argument callable returning a *new*
            encoder module on every call. Defaults to loading the pretrained
            model named by ``CFG['MODEL']``.
    """

    # RoBERTa and kobigbird-bert-base both produce a 768-dimensional output.
    HIDDEN_SIZE = 768

    def __init__(self, dropout=0.5, encoder_factory=None):
        super(DuplicateModel, self).__init__()

        if encoder_factory is None:
            def encoder_factory():
                return AutoModel.from_pretrained(CFG['MODEL'])

        # Build one encoder per task. Assigning a single module object to all
        # four attributes would alias the same weights four times instead of
        # giving every label its own encoder.
        # NOTE(review): four encoders need roughly four times the memory of one.
        self.type_bert = encoder_factory()
        self.polarity_bert = encoder_factory()
        self.tense_bert = encoder_factory()
        self.certainty_bert = encoder_factory()

        self.type_classifier = self._make_head(4, dropout)
        self.polarity_classifier = self._make_head(3, dropout)
        self.tense_classifier = self._make_head(3, dropout)
        self.certainty_classifier = self._make_head(2, dropout)

    @classmethod
    def _make_head(cls, num_classes, dropout):
        """Build one head: dropout -> 768x384 linear -> LeakyReLU -> linear."""
        return nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(in_features=cls.HIDDEN_SIZE, out_features=384),
            nn.LeakyReLU(),
            nn.Linear(in_features=384, out_features=num_classes),
        )

    def forward(self,
                type_ids, type_masks,
                polarity_ids, polarity_masks,
                tense_ids, tense_masks,
                certainty_ids, certainty_masks):
        """Run every task's batch through its own encoder and head.

        Each ``*_ids`` / ``*_masks`` pair is passed to the matching encoder as
        ``input_ids`` / ``attention_mask``.

        Returns:
            Tuple of logits ``(type, polarity, tense, certainty)``.
        """
        # With return_dict=False each encoder returns a tuple; only the second
        # element (the pooled output) is used.
        _, type_x = self.type_bert(input_ids=type_ids, attention_mask=type_masks, return_dict=False)
        _, polarity_x = self.polarity_bert(input_ids=polarity_ids, attention_mask=polarity_masks, return_dict=False)
        _, tense_x = self.tense_bert(input_ids=tense_ids, attention_mask=tense_masks, return_dict=False)
        _, certainty_x = self.certainty_bert(input_ids=certainty_ids, attention_mask=certainty_masks, return_dict=False)

        type_output = self.type_classifier(type_x)
        polarity_output = self.polarity_classifier(polarity_x)
        tense_output = self.tense_classifier(tense_x)
        certainty_output = self.certainty_classifier(certainty_x)

        return type_output, polarity_output, tense_output, certainty_output

### Train

In [None]:
# NOTE(review): defining this function rebinds the notebook-level name `train`,
# which the data-loading cells use for the training DataFrame.
def train(model, optimizer, train_loader_dict, val_loader_dict, scheduler, device):
    """Train the four-task model with early stopping and return the best weights.

    Every step draws one batch from each task's loader, runs them through the
    model and minimises the equally weighted (0.25 each) sum of the four
    cross-entropy losses. After every epoch the model is validated, the
    scheduler (if any) is stepped with the validation loss, and the weights
    are checkpointed whenever the validation loss improves. Training stops
    after 3 epochs with a validation loss worse than the best one, or after
    ``CFG['EPOCHS']`` epochs. The loss curves are drawn once at the end.

    Args:
        model: Module called with ``<task>_ids`` / ``<task>_masks`` keyword
            arguments for the tasks type, polarity, tense and certainty, and
            returning the four logits in that order.
        optimizer: Optimizer over the model's parameters.
        train_loader_dict: Mapping task name -> training DataLoader.
        val_loader_dict: Mapping task name -> validation DataLoader.
        scheduler: Scheduler whose ``step`` accepts the validation loss, or None.
        device: Device the model and batches are moved to.

    Returns:
        ``model`` with the weights of the best validation epoch loaded, or
        ``None`` when no epoch ever improved on the initial best loss.
    """
    tasks = ('type', 'polarity', 'tense', 'certainty')

    model.to(device)

    criterion = {task: nn.CrossEntropyLoss().to(device) for task in tasks}
    loaders = [train_loader_dict[task] for task in tasks]

    # Name the artifacts after the optimizer that is actually used.
    optimizer_name = type(optimizer).__name__
    checkpoint_dir = CFG['save_path'] + 'checkpoint/'
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = checkpoint_dir + '{}_{}_{}_example.pth'.format(
        CFG['MODEL_NAME'], optimizer_name, CFG['NOTE'])

    best_loss = float('inf')
    best_state = None

    patience = 3
    earlystopping = 0

    loss_plot = []
    val_loss_plot = []
    last_epoch = 0

    print("--------------------------------------------")
    print("model : {}".format(CFG['MODEL']))
    print("learning rate : {}".format(CFG['LEARNING_RATE']))

    for epoch in range(1, CFG['EPOCHS'] + 1):
        last_epoch = epoch
        model.train()
        batch_losses = []

        # zip() stops at the shortest loader; only the first one drives the progress bar.
        for batches in zip(tqdm(loaders[0]), *loaders[1:]):
            model_inputs = {}
            targets = {}
            for task, (input_ids, attention_mask, label) in zip(tasks, batches):
                model_inputs[task + '_ids'] = input_ids.to(device)
                model_inputs[task + '_masks'] = attention_mask.to(device)
                targets[task] = label.to(device)

            optimizer.zero_grad()

            logits = dict(zip(tasks, model(**model_inputs)))

            # Equally weighted multi-task loss.
            loss = sum(0.25 * criterion[task](logits[task], targets[task]) for task in tasks)

            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

        train_loss = np.mean(batch_losses)
        val_loss, val_type_f1, val_polarity_f1, val_tense_f1, val_certainty_f1 = validation(
            model, val_loader_dict, criterion, device)

        # Keep the per-epoch curves for the final plot.
        loss_plot.append(train_loss)
        val_loss_plot.append(val_loss)

        print(f'Epoch : [{epoch}] Train Loss : [{train_loss:.5f}] Val Loss : [{val_loss:.5f}] 유형 F1 : [{val_type_f1:.5f}] 극성 F1 : [{val_polarity_f1:.5f}] 시제 F1 : [{val_tense_f1:.5f}] 확실성 F1 : [{val_certainty_f1:.5f}]')

        if scheduler is not None:
            scheduler.step(val_loss)

        if best_loss > val_loss:
            best_loss = val_loss
            # Snapshot the weights: keeping a reference to `model` would just
            # follow the later (possibly worse) epochs.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
            torch.save(model.state_dict(), checkpoint_path)
            print('------------------ Model Saved ------------------')
            earlystopping = 0

        elif best_loss < val_loss:
            earlystopping += 1
            if earlystopping >= patience:
                print("------------stop----------------")
                break

    # Draw the curves once, whether training stopped early or ran all epochs.
    if loss_plot:
        title = "{}_{}_epoch:{} loss, note: {}".format(
            CFG['MODEL_NAME'], optimizer_name, last_epoch, CFG['NOTE'])
        draw_graph(title, loss_plot, val_loss_plot)

    if best_state is None:
        return None

    # Restore the best epoch so the returned model matches the saved checkpoint.
    model.load_state_dict(best_state)
    return model

#### validation

In [None]:
def validation(model, val_loader_dict, criterion, device):
    """Evaluate the four-task model on the validation loaders.

    Args:
        model: Module called with ``<task>_ids`` / ``<task>_masks`` keyword
            arguments and returning the (type, polarity, tense, certainty) logits.
        val_loader_dict: Mapping task name -> validation DataLoader.
        criterion: Mapping task name -> loss function.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_loss, type_f1, polarity_f1, tense_f1, certainty_f1)``
        where the F1 scores are weighted averages.
    """
    task_names = ('type', 'polarity', 'tense', 'certainty')

    model.eval()

    batch_losses = []
    predicted = {name: [] for name in task_names}
    expected = {name: [] for name in task_names}
    loaders = [val_loader_dict[name] for name in task_names]

    with torch.no_grad():
        # One batch per task and step; only the first loader drives the progress bar.
        for batches in zip(tqdm(loaders[0]), *loaders[1:]):
            model_inputs = {}
            targets = {}
            for name, batch in zip(task_names, batches):
                model_inputs[name + '_ids'] = batch[0].to(device)
                model_inputs[name + '_masks'] = batch[1].to(device)
                targets[name] = batch[2].to(device)

            type_logit, polarity_logit, tense_logit, certainty_logit = model(**model_inputs)
            logits = dict(zip(task_names, (type_logit, polarity_logit, tense_logit, certainty_logit)))

            # Equally weighted sum of the four task losses.
            loss = sum(0.25 * criterion[name](logits[name], targets[name]) for name in task_names)
            batch_losses.append(loss.item())

            for name in task_names:
                predicted[name] += logits[name].argmax(1).detach().cpu().numpy().tolist()
                expected[name] += targets[name].detach().cpu().numpy().tolist()

    scores = [f1_score(expected[name], predicted[name], average='weighted') for name in task_names]

    return (np.mean(batch_losses), *scores)

### Run

In [None]:
# Build the model and warm-start it from a previously saved checkpoint.
model = DuplicateModel().to(device)
save_model = torch.load("/content/drive/MyDrive/Colab_Notebooks/NLP/submission/sentence/checkpoint/kobigbird-bert-base_AdamW_k-fold-type_example.pth", map_location=device)
model.load_state_dict(save_model)
model.eval()  # train() switches the model back to training mode at the start of every epoch
optimizer = torch.optim.AdamW(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# Halve the learning rate when the validation loss stops improving (patience=2), down to min_lr=1e-8.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)

# Continue training from the loaded weights; train() steps the scheduler with the validation loss.
infer_model = train(model, optimizer, train_loader_dict, val_loader_dict, scheduler, device)

--------------------------------------------
model : monologg/kobigbird-bert-base
learning rate : 1e-06


  0%|          | 0/827 [00:00<?, ?it/s]

Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


  0%|          | 0/207 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.16417] Val Loss : [0.21758]] 유형 F1 : [0.88616] 극성 F1 : [0.97457] 시제 F1 : [0.90274] 확실성 F1 : [0.92524
------------------ Model Saved ------------------


  0%|          | 0/827 [00:00<?, ?it/s]

  0%|          | 0/207 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.15973] Val Loss : [0.21913]] 유형 F1 : [0.88598] 극성 F1 : [0.97565] 시제 F1 : [0.90262] 확실성 F1 : [0.92541


  0%|          | 0/827 [00:00<?, ?it/s]

  0%|          | 0/207 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.15416] Val Loss : [0.21856]] 유형 F1 : [0.88632] 극성 F1 : [0.97515] 시제 F1 : [0.90178] 확실성 F1 : [0.92662


  0%|          | 0/827 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

### Inference

In [None]:
# No labels are passed for the test split, so CustomDataset yields (ids, mask) pairs only.
# shuffle=False keeps the predictions in the same order as the test rows.
test_dataset = CustomDataset(test_ids, test_masks, None)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    """Predict the class index of every task for each batch of ``test_loader``.

    Args:
        model: Module called with ``<task>_ids`` / ``<task>_masks`` keyword
            arguments and returning the (type, polarity, tense, certainty) logits.
        test_loader: Iterable yielding ``(ids, masks)`` batches.
        device: Device the model and batches are moved to.

    Returns:
        Four lists of predicted class indices:
        ``(type_preds, polarity_preds, tense_preds, certainty_preds)``.
    """
    tasks = ('type', 'polarity', 'tense', 'certainty')

    model.to(device)
    model.eval()

    preds = {task: [] for task in tasks}

    with torch.no_grad():
        for ids, masks in tqdm(test_loader):
            # Every task sees the same sentences at inference time, so the
            # batch is moved to the device once and reused for all four inputs.
            ids = ids.to(device)
            masks = masks.to(device)

            logits = model(
                type_ids=ids, type_masks=masks,
                polarity_ids=ids, polarity_masks=masks,
                tense_ids=ids, tense_masks=masks,
                certainty_ids=ids, certainty_masks=masks,
            )

            for task, logit in zip(tasks, logits):
                preds[task] += logit.argmax(1).detach().cpu().numpy().tolist()

    return preds['type'], preds['polarity'], preds['tense'], preds['certainty']

In [None]:
type_preds, polarity_preds, tense_preds, certainty_preds = inference(model, test_loader, device)

  0%|          | 0/444 [00:00<?, ?it/s]

In [None]:
# test_type = ['대화형' if i==0 else '사실형' if i==1 else '예측형' if i==2 else '추론형' for i in [np.argmax(p) for p in type_preds]]
# test_polarity = ['긍정' if i==0 else '미정' if i==1 else '부정' for i in [np.argmax(p) for p in polarity_preds]]
# test_tense = ['과거' if i==0 else '미래' if i==1 else '현재' for i in [np.argmax(p) for p in tense_preds]]
# test_certainty = ['불확실' if i==0 else '확실' for i in [np.argmax(p) for p in certainty_preds]]

In [None]:
# Map the predicted class indices back to the original string labels using the
# encoders fitted on the training split.
# NOTE(review): the index lists are overwritten in place, so re-running this cell
# would pass already decoded strings to inverse_transform - re-run inference first.
type_preds = type_le.inverse_transform(type_preds)
polarity_preds = polarity_le.inverse_transform(polarity_preds)
tense_preds = tense_le.inverse_transform(tense_preds)
certainty_preds = certainty_le.inverse_transform(certainty_preds)

In [None]:
# Join the four decoded labels of every sentence into one
# "type-polarity-tense-certainty" string.
predictions = [
    type_pred+'-'+polarity_pred+'-'+tense_pred+'-'+certainty_pred
    for type_pred, polarity_pred, tense_pred, certainty_pred
    in zip(type_preds, polarity_preds, tense_preds, certainty_preds)
]

### Submission

In [None]:
# Fill the sample submission's 'label' column with the combined predictions.
submit = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/NLP/data/sentence_classifi/sample_submission.csv')
submit['label'] = predictions

In [None]:
submit.head()

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-과거-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-부정-현재-확실
4,TEST_0004,사실형-긍정-과거-확실


In [None]:
# Write the submission file; its name carries the model name, optimizer tag and experiment note.
submit.to_csv('/content/drive/MyDrive/Colab_Notebooks/NLP/submission/sentence/submit_{}_{}_{}.csv'.format(CFG['MODEL_NAME'], 'AdamW', CFG['NOTE']), index=False)

# EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Find the longest training sentence (by character count).
sentence_lengths = df['문장'].str.len()
count = sentence_lengths.idxmax()   # index label of the first longest sentence
max_len = df['문장'][count]          # despite its name, this holds the sentence text

print(max_len)
print(count)
# Use the computed index instead of a hard-coded row number so the cell stays
# correct when the data changes.
print(len(df['문장'][count]))
display(df['문장'][count])

반작(봄에 환곡을 나눠줄 때는 주지 않고도 주었다고 하고, 가을에 거두어들일 때는 회수하고도 회수하지 않았다고 해 중간에 이득을 나눔), 입본(농사 상황과 곡식 시세를 살펴서 돈과 곡식 간의 교환을 통해 이득을 챙김), 가집(상급 부서에서 지시한 것보다 더 많은 곡식을 방출하고 남는 것을 횡령), 암류(환곡을 제때에 대출하지 않고 창고에 쌓아 두었다가 값이 오르면 팔고 내리면 사들임), 반백(농민을 속여 대출 때 곡식의 절반을 가로채고 갚을 때는 모두 갚게 함), 분석(곡식에 돌, 쭉정이를 섞어 늘어난 양만큼 횡령), 집신(묵은 곡식은 나눠주고 햇곡식은 자기들이 가짐), 탄정(흉년이 들면 정부에서 환곡의 수량을 감해주는데 백성들에게는 환곡을 전량 징수하고 감액만큼 착복), 세전(환곡으로 받은 곡식과 세금으로 받은 곡식을 이리저리 돌려 이익을 남김), 요합(민간이 부역 대신 곡식으로 납부할 때 거슬러주어야 할 쌀을 횡령), 사흔(아전이 환곡을 징수하면서 자기들의 수고비를 같이 징수), 채륵(아전이 개인 채무까지 환곡과 혼합해 착복)이 그것이다.
8838
534


'반작(봄에 환곡을 나눠줄 때는 주지 않고도 주었다고 하고, 가을에 거두어들일 때는 회수하고도 회수하지 않았다고 해 중간에 이득을 나눔), 입본(농사 상황과 곡식 시세를 살펴서 돈과 곡식 간의 교환을 통해 이득을 챙김), 가집(상급 부서에서 지시한 것보다 더 많은 곡식을 방출하고 남는 것을 횡령), 암류(환곡을 제때에 대출하지 않고 창고에 쌓아 두었다가 값이 오르면 팔고 내리면 사들임), 반백(농민을 속여 대출 때 곡식의 절반을 가로채고 갚을 때는 모두 갚게 함), 분석(곡식에 돌, 쭉정이를 섞어 늘어난 양만큼 횡령), 집신(묵은 곡식은 나눠주고 햇곡식은 자기들이 가짐), 탄정(흉년이 들면 정부에서 환곡의 수량을 감해주는데 백성들에게는 환곡을 전량 징수하고 감액만큼 착복), 세전(환곡으로 받은 곡식과 세금으로 받은 곡식을 이리저리 돌려 이익을 남김), 요합(민간이 부역 대신 곡식으로 납부할 때 거슬러주어야 할 쌀을 횡령), 사흔(아전이 환곡을 징수하면서 자기들의 수고비를 같이 징수), 채륵(아전이 개인 채무까지 환곡과 혼합해 착복)이 그것이다.'

In [None]:
display(df.head())
display(df.describe())

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
2,TRAIN_00002,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,사실형,긍정,미래,확실,사실형-긍정-미래-확실
3,TRAIN_00003,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
4,TRAIN_00004,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실


Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
count,16541,16541,16541,16541,16541,16541,16541
unique,16541,16506,4,3,3,2,64
top,TRAIN_00000,이들 게임은 국내 구글 플레이 매출 톱10 진입이 예상되는 기대작이다.,사실형,긍정,과거,확실,사실형-긍정-과거-확실
freq,1,2,13558,15793,8032,15192,7113


In [None]:
# Row counts per class for the combined label and for each target column.
# The groupby objects get their own names: reusing `train`, `train_type`, ...
# would overwrite the train() function and the label arrays that the modelling
# cells above depend on, breaking any re-run of those cells.
label_groups = df.groupby('label')

type_groups = df.groupby('유형')
polarity_groups = df.groupby('극성')
tense_groups = df.groupby('시제')
certainty_groups = df.groupby('확실성')

display(type_groups.count())
display(polarity_groups.count())
display(tense_groups.count())
display(certainty_groups.count())

Unnamed: 0_level_0,ID,문장,극성,시제,확실성,label
유형,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
대화형,575,575,575,575,575,575
사실형,13558,13558,13558,13558,13558,13558
예측형,257,257,257,257,257,257
추론형,2151,2151,2151,2151,2151,2151


Unnamed: 0_level_0,ID,문장,유형,시제,확실성,label
극성,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
긍정,15793,15793,15793,15793,15793,15793
미정,183,183,183,183,183,183
부정,565,565,565,565,565,565


Unnamed: 0_level_0,ID,문장,유형,극성,확실성,label
시제,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
과거,8032,8032,8032,8032,8032,8032
미래,1643,1643,1643,1643,1643,1643
현재,6866,6866,6866,6866,6866,6866


Unnamed: 0_level_0,ID,문장,유형,극성,시제,label
확실성,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
불확실,1349,1349,1349,1349,1349,1349
확실,15192,15192,15192,15192,15192,15192
