# Dacon 한국어 문장 관계 분류

Kaggle notebook gpu 환경에서 학습을 진행하였습니다.  
데이터: 기존 학습데이터 + klue 검증데이터(기존 test셋에서 중복되는 데이터 6개 제거)  
모델: roberta-large에 「Self-Explaining Structures Improve NLP Models」 논문의 구조를 적용하여 사용하였습니다.  
5 fold 전략과 가중치 초기화를 적용하였습니다.


추가 데이터: https://aistages-prod-server-public.s3.amazonaws.com/app/Competitions/000068/data/klue-nli-v1.1.tar.gz  
Self-Explaining NLP 논문: https://arxiv.org/abs/2012.01786

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import AdamW, AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler
from sklearn.model_selection import train_test_split
import random
import os
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F
from torch.nn import Parameter
import math
from torch.optim import Adam
from sklearn.model_selection import KFold
import urllib.request
from typing import List
from functools import partial
import torchmetrics

In [2]:
# One seed is shared by Python's RNG, NumPy, PyTorch (CPU generator and every
# CUDA device) and the fold splitter.
seed_num = 22
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_num)
# Shuffled 5-fold splitter seeded with the same value.
kf = KFold(n_splits=5, shuffle=True, random_state=seed_num)

In [3]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU, and
# report which one was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [4]:
# Load the three CSV inputs from the Kaggle input directory: the training
# table, the test table and the sample submission that is filled in later.
train = pd.read_csv('/kaggle/input/addtrain/add_train.csv')
test = pd.read_csv('/kaggle/input/kornli/test_data.csv')
submission = pd.read_csv('/kaggle/input/kornli/sample_submission.csv')

In [5]:
# Cell output: number of rows in `train` as loaded.
len(train)

In [6]:
# Drop rows whose (premise, hypothesis) pair repeats an earlier row, then
# show the remaining row count as the cell output.
train = train.drop_duplicates(subset=['premise', 'hypothesis'])
len(train)

In [7]:
# Replace the row index with a fresh one; with its default arguments pandas
# moves the previous index values into a new column.
train = train.reset_index()

In [8]:
# Keep only the four columns used below and overwrite 'index' with
# consecutive positions 0..n-1.
train = train[['index', 'premise', 'hypothesis', 'label']].assign(
    index=list(range(len(train)))
)

In [9]:
class NLIDataset(Dataset):
    """Sentence-pair dataset that turns premise/hypothesis rows into token-id tensors.

    Every item is laid out as
    ``[CLS_TOKEN_ID] + premise_ids + [SEP_TOKEN_ID] + hypothesis_ids + [SEP_TOKEN_ID]``.
    The part between the first and the last token is cut so that the whole
    sequence never exceeds ``max_length`` tokens.

    Args:
        data: Table with ``premise`` and ``hypothesis`` columns and, when
            ``is_train`` is true, a ``label`` column whose values are keys of
            ``label_dict``.
        is_train: When true, items also carry the encoded label.
    """

    # Token ids placed around and between the two sentences.
    # NOTE(review): presumably the [CLS]/[SEP] ids of the klue/roberta-large
    # vocabulary — confirm against the tokenizer.
    CLS_TOKEN_ID = 0
    SEP_TOKEN_ID = 2

    def __init__(self, data, is_train=True):
        super().__init__()
        self.max_length = 70
        self.label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
        self.is_train = is_train

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the row at *position* ``idx``.

        Returns:
            ``(input_ids, label, length)`` when ``is_train`` is true, otherwise
            ``(input_ids, length)``. ``label`` and ``length`` are one-element
            ``LongTensor`` objects; ``length`` counts the two outer tokens.
        """
        # Samplers hand out positions in [0, len(self)). Positional access
        # keeps working when the table's index labels are not 0..n-1 (for
        # example after rows were dropped without resetting the index).
        sentence_1 = self.data['premise'].iloc[idx]
        sentence_2 = self.data['hypothesis'].iloc[idx]
        if self.is_train:
            label = torch.LongTensor([self.label_dict[self.data['label'].iloc[idx]]])

        sentence_1_input_ids = self.tokenizer.encode(sentence_1, add_special_tokens=False)
        sentence_2_input_ids = self.tokenizer.encode(sentence_2, add_special_tokens=False)
        input_ids = sentence_1_input_ids + [self.SEP_TOKEN_ID] + sentence_2_input_ids
        # Reserve two slots for the leading and trailing special tokens.
        input_ids = input_ids[:self.max_length - 2]
        length = torch.LongTensor([len(input_ids) + 2])
        input_ids = torch.LongTensor([self.CLS_TOKEN_ID] + input_ids + [self.SEP_TOKEN_ID])

        if self.is_train:
            return input_ids, label, length
        return input_ids, length

In [10]:
# Number of samples per batch handed to the DataLoaders.
batch_size = 32

In [11]:
# Wrap the training table; `is_train` is left at its default, so items
# include the encoded label.
train_dataset = NLIDataset(train)

In [12]:
def collate_to_max_length(batch: List[List[torch.Tensor]], max_len: int = None, fill_values: List[float] = None) -> \
    List[torch.Tensor]:
    """Pad a batch of samples to a common length and append span bookkeeping tensors.

    Every sample is a sequence of 1-D tensors ("fields"). The first field must
    hold the token ids and the last field the one-element sequence length; any
    fields in between (e.g. a label) are padded and passed through. This makes
    the function usable for labelled and unlabelled samples alike.

    Args:
        batch: Samples to collate; all samples have the same number of fields.
        max_len: When given, every field is padded to exactly this length
            instead of the longest length found in the batch.
        fill_values: Padding value per field; defaults to ``0.0`` for all.

    Returns:
        The padded fields (one ``[batch, field_len]`` tensor each) followed by
        ``start_indexs``, ``end_indexs`` (both ``[num_spans]``) and
        ``span_masks`` (``[batch, num_spans]``). A mask entry is ``0`` for a
        span that lies inside the sample's real tokens without covering the
        first separator token, and ``1000000`` otherwise.
    """
    lengths = np.array([[len(field_data) for field_data in sample] for sample in batch])
    batch_size, num_fields = lengths.shape
    fill_values = fill_values or [0.0] * num_fields
    max_lengths = lengths.max(axis=0)
    if max_len:
        assert max_lengths.max() <= max_len
        max_lengths = np.ones_like(max_lengths) * max_len

    output = [torch.full([batch_size, int(max_lengths[field_idx])],
                         fill_value=fill_values[field_idx],
                         dtype=batch[0][field_idx].dtype)
              for field_idx in range(num_fields)]
    for sample_idx, sample in enumerate(batch):
        for field_idx, data in enumerate(sample):
            output[field_idx][sample_idx][: data.shape[0]] = data

    # Enumerate every span (i, j) with 1 <= i <= j <= max_sentence_length - 2,
    # i.e. all spans that exclude the first and the last padded position.
    max_sentence_length = int(max_lengths[0])
    inner_positions = range(1, max_sentence_length - 1)
    start_indexs = [i for i in inner_positions for _ in range(i, max_sentence_length - 1)]
    end_indexs = [j for i in inner_positions for j in range(i, max_sentence_length - 1)]
    starts = torch.LongTensor(start_indexs)
    ends = torch.LongTensor(end_indexs)

    span_masks = []
    for sample in batch:
        input_ids, length = sample[0], sample[-1]
        # Position of the first token with id 2 (the separator between the
        # two sentences).
        middle_index = input_ids.tolist().index(2)
        last_inner = length.item() - 2
        # starts/ends are >= 1 by construction, so only the upper bound and
        # the "does not cover the separator" condition need checking.
        allowed = (starts <= last_inner) & (ends <= last_inner) & (
            (starts > middle_index) | (ends < middle_index))
        span_masks.append((~allowed).long() * 1000000)

    output.append(starts)
    output.append(ends)
    output.append(torch.stack(span_masks))
    return output

In [13]:
class SICModel(nn.Module):
    """Builds one representation per (start, end) span from per-token hidden states.

    Four square linear projections of size ``hidden_size`` are applied to the
    hidden states; the projected start/end vectors of each span are then mixed
    by addition, subtraction and element-wise product and squashed with tanh.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.W_1 = nn.Linear(hidden_size, hidden_size)
        self.W_2 = nn.Linear(hidden_size, hidden_size)
        self.W_3 = nn.Linear(hidden_size, hidden_size)
        self.W_4 = nn.Linear(hidden_size, hidden_size)
        self._init_params()

    def _init_params(self):
        """Xavier-normal weights and zero biases for all four projections."""
        for projection in (self.W_1, self.W_2, self.W_3, self.W_4):
            nn.init.xavier_normal_(projection.weight)
            nn.init.constant_(projection.bias, 0)

    def forward(self, hidden_states, start_indexs, end_indexs):
        """Return ``tanh`` span features selected along dimension 1.

        ``start_indexs`` and ``end_indexs`` are index tensors into dimension 1
        of ``hidden_states``; the result has one entry per index pair there.
        """
        def pick(projected, positions):
            # Gather the projected vectors at the given positions of dim 1.
            return torch.index_select(projected, 1, positions)

        proj_1 = self.W_1(hidden_states)
        proj_2 = self.W_2(hidden_states)
        proj_3 = self.W_3(hidden_states)
        proj_4 = self.W_4(hidden_states)

        additive = pick(proj_1, start_indexs) + pick(proj_2, end_indexs)
        difference = pick(proj_3, start_indexs) - pick(proj_3, end_indexs)
        product = torch.mul(pick(proj_4, start_indexs), pick(proj_4, end_indexs))

        return torch.tanh(additive + difference + product)

In [14]:
class InterpretationModel(nn.Module):
    """Scores every span, turns the scores into weights and pools the spans."""

    def __init__(self, hidden_size):
        super().__init__()
        # Maps each span vector to a single scalar score.
        self.h_t = nn.Linear(hidden_size, 1)

    def forward(self, h_ij, span_masks):
        """Return the weighted sum of span vectors and the span weights.

        ``span_masks`` is subtracted from the raw scores before the softmax
        over dimension 1, so large mask values push a span's weight to ~0.
        """
        span_scores = self.h_t(h_ij).squeeze(-1) - span_masks
        a_ij = span_scores.softmax(dim=1)
        weighted_spans = a_ij.unsqueeze(-1) * h_ij
        return weighted_spans.sum(dim=1), a_ij

In [15]:
class ExplainableModel(nn.Module):
    """Pretrained encoder followed by span collection, span pooling and a 3-way classifier.

    Args:
        model_name: Checkpoint passed to ``AutoModel.from_pretrained``. The
            width of all layers stacked on top is read from the loaded
            encoder's config instead of being hard-coded.
    """

    def __init__(self, model_name="klue/roberta-large"):
        super().__init__()
        self.intermediate = AutoModel.from_pretrained(model_name)
        hidden_size = self.intermediate.config.hidden_size
        self.span_info_collect = SICModel(hidden_size)
        self.interpretation = InterpretationModel(hidden_size)
        self.output = nn.Linear(hidden_size, 3)
        self._init_params()

    def _init_params(self):
        """Xavier-normal weight and zero bias for the classification layer."""
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

    def forward(self, input_ids, start_indexs, end_indexs, span_masks):
        """Return ``(logits, span_weights)`` for a padded batch of token ids."""
        # Positions holding id 1 are excluded from attention.
        # NOTE(review): 1 is assumed to be the padding id used when batches
        # are collated — confirm against the collate fill value.
        attention_mask = (input_ids != 1).long()
        x = self.intermediate(input_ids, attention_mask=attention_mask)
        h_ij = self.span_info_collect(x.last_hidden_state, start_indexs, end_indexs)
        H, a_ij = self.interpretation(h_ij, span_masks)
        out = self.output(H)
        return out, a_ij

In [16]:
# Number of passes over the training data.
epochs = 3
# Classification loss applied to the model's logits.
criterion = nn.CrossEntropyLoss()

In [17]:
# Module-level torchmetrics accuracy metric instance.
train_acc = torchmetrics.Accuracy()
def cal_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max equals the label in ``Y``.

    Args:
        X: Score tensor with one row per example; the predicted class is the
            arg-max over the last dimension (soft-maxing first would not
            change it).
        Y: Integer labels as a tensor or NumPy array of any shape holding one
            label per example; it is flattened before comparison.

    Returns:
        A 0-dim float tensor on the CPU.
    """
    # Use the labels that were passed in rather than a same-named global, so
    # the result does not depend on what the caller's module scope contains.
    predict_labels = torch.argmax(X, dim=-1).cpu()
    targets = torch.as_tensor(Y).reshape(-1).cpu()
    return (predict_labels == targets).float().mean()

In [18]:
# K-fold training: for every fold a fresh model is fine-tuned for `epochs`
# epochs and the whole model object is saved whenever the fold's validation
# accuracy improves.
# Only the number of samples determines the split, so plain positions are
# split instead of handing the dataset object to scikit-learn.
for fold, (train_idx, valid_idx) in enumerate(kf.split(np.arange(len(train_dataset)))):
    # Both loaders read from the same dataset; the samplers restrict them to
    # this fold's training / validation positions.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_idx)
    # fill_values: pad token ids with 1, label and length fields with 0.
    train_dataLoader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler, collate_fn=partial(collate_to_max_length, fill_values=[1, 0, 0]))
    valid_dataLoader = DataLoader(train_dataset, batch_size=batch_size, sampler=valid_subsampler, collate_fn=partial(collate_to_max_length, fill_values=[1, 0, 0]))
    best_acc = 0
    model = ExplainableModel().to(device)
    # Parameters whose name contains one of these markers get no weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, betas=(0.9, 0.98), lr=2e-5, eps=1e-8)
    # Linear decay of the learning rate over all optimizer steps, no warm-up.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                    num_training_steps=len(train_dataLoader) * epochs)
    model.zero_grad()
    print(f'------------fold no---------{fold + 1}----------------------')
    for epoch_i in range(0, epochs):
        # ---- training ----
        model.train()
        total_loss = 0
        train_accuracy = 0
        nb_train_steps = 0
        for batch in tqdm(train_dataLoader):
            batch = tuple(t.to(device) for t in batch)
            sen, label, length, start, end, span = batch
            outputs, a_ij = model(sen, start, end, span)
            # `y` stays a module-level name: code outside this loop may read
            # it — NOTE(review): confirm before moving this loop into a function.
            y = label.view(-1)
            ce_loss = criterion(outputs, y)
            # Regulariser on the span weights: mean over the batch of the
            # sum of squared weights.
            reg_loss = 1.0 * a_ij.pow(2).sum(dim=1).mean()
            loss = ce_loss + reg_loss
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            logits = outputs
            tmp_train_accuracy = cal_accuracy(logits, label.to('cpu').numpy())
            train_accuracy += tmp_train_accuracy
            nb_train_steps += 1
        avg_train_loss = total_loss / len(train_dataLoader)
        print('')
        print(epoch_i + 1, f'  Average training loss: {avg_train_loss:.4f}')
        print(f'  Accuracy: {train_accuracy/(nb_train_steps):.4f}')

        # ---- validation (cross-entropy only, no regulariser) ----
        model.eval()
        eval_accuracy = 0
        nb_eval_steps = 0
        valid_loss = 0
        for batch in tqdm(valid_dataLoader):
            batch = tuple(t.to(device) for t in batch)
            sen, label, length, start, end, span = batch
            with torch.no_grad():
                outputs, a_ij = model(sen, start, end, span)
                y = label.view(-1)
                ce_loss = criterion(outputs, y)
            valid_loss += ce_loss.item()
            logits = outputs
            tmp_eval_accuracy = cal_accuracy(logits, label.to('cpu').numpy())
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1
        avg_valid_loss = valid_loss / len(valid_dataLoader)
        # Mean of the per-batch accuracies.
        valid_accuracy = eval_accuracy/(nb_eval_steps)
        if best_acc < valid_accuracy:
            best_acc = valid_accuracy
            torch.save(model, f'/kaggle/working/model{fold + 1}')
            print(f'model{fold + 1} saved')
        print(epoch_i + 1, f'  Average valid loss: {avg_valid_loss:.4f}')
        print(f'  Accuracy: {valid_accuracy:.4f}')

In [19]:
# Unlabelled test set; `is_train=False` makes items (input_ids, length) only.
# The dataset class defined in this file is NLIDataset.
test_dataset = NLIDataset(test, is_train=False)

In [20]:
def collate_test(batch: List[List[torch.Tensor]], max_len: int = None, fill_values: List[float] = None) -> \
    List[torch.Tensor]:
    """Collate unlabelled ``(input_ids, length)`` samples into padded tensors plus span tensors.

    Args:
        batch: Samples made of two 1-D tensors each: token ids and a
            one-element length.
        max_len: When given, every field is padded to exactly this length.
        fill_values: Padding value per field; defaults to ``0.0`` for all.

    Returns:
        ``[padded_ids, padded_lengths, start_indexs, end_indexs, span_masks]``.
        A mask entry is ``0`` for a span that stays inside the sample's real
        tokens without covering the first token with id 2, else ``1e6``
        (stored as an integer).
    """
    field_lengths = np.array([[len(tensor) for tensor in example] for example in batch])
    batch_size, num_fields = field_lengths.shape
    if not fill_values:
        fill_values = [0.0] * num_fields
    max_lengths = field_lengths.max(axis=0)
    if max_len:
        assert max_lengths.max() <= max_len
        max_lengths = np.ones_like(max_lengths) * max_len

    # Pad field by field.
    output = []
    for field_idx in range(num_fields):
        padded = torch.full([batch_size, max_lengths[field_idx]],
                            fill_value=fill_values[field_idx],
                            dtype=batch[0][field_idx].dtype)
        for row, example in enumerate(batch):
            tensor = example[field_idx]
            padded[row][: tensor.shape[0]] = tensor
        output.append(padded)

    # All (start, end) pairs with 1 <= start <= end <= padded_length - 2.
    span_stop = max_lengths[0] - 1
    span_pairs = [(i, j) for i in range(1, span_stop) for j in range(i, span_stop)]
    start_indexs = [i for i, _ in span_pairs]
    end_indexs = [j for _, j in span_pairs]

    span_masks = []
    for input_ids, length in batch:
        middle_index = input_ids.tolist().index(2)
        last_inner = length.item() - 2
        span_masks.append([
            0 if (1 <= i <= last_inner and 1 <= j <= last_inner
                  and (i > middle_index or j < middle_index)) else 1e6
            for i, j in span_pairs
        ])

    output.extend([
        torch.LongTensor(start_indexs),
        torch.LongTensor(end_indexs),
        torch.LongTensor(span_masks),
    ])
    return output

In [21]:
# Loader over the test set; no sampler or shuffle flag is passed.
# fill_values: pad token ids with 1 and the length field with 0.
test_dataLoader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=partial(collate_test, fill_values=[1, 0]),
)

In [22]:
# Run every saved fold model over the test set and collect its logits.
# `pred[k]` ends up as a list with one logits tensor per test example.
folds = 5
pred = []
for i in range(folds) :
    # map_location places the checkpoint on the device chosen for this run,
    # whatever device it was saved from.
    model = torch.load(f'/kaggle/working/model{i + 1}', map_location=device)
    model.eval()
    result = []
    for batch in tqdm(test_dataLoader):
        batch = tuple(t.to(device) for t in batch)
        sen, length, start, end, span = batch
        with torch.no_grad():
            outputs, a_ij = model(sen, start, end, span)
        # Keep the per-example logits on the CPU so predictions of all folds
        # do not accumulate in GPU memory.
        result.extend(outputs.cpu())
    pred.append(result)

In [24]:
# Ensemble: add up the logits of all fold models per test example and take
# the arg-max class. zip(*pred) walks the folds in parallel, so this works
# for any number of folds rather than exactly five.
output = [int(torch.argmax(sum(fold_logits))) for fold_logits in zip(*pred)]

In [25]:
import datetime

# Timestamp of this run; its calendar date (YYYY-MM-DD) is used as the stem
# of the submission file name.
dt_now = datetime.datetime.now()
print(dt_now)
fname = dt_now.date().isoformat()

In [26]:
label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}
# Invert the mapping once so class ids are looked up explicitly, instead of
# rebuilding the key list per prediction and relying on its ordering.
id_to_label = {class_id: name for name, class_id in label_dict.items()}
out = [id_to_label[class_id] for class_id in output]

submission["label"] = out

In [27]:
# Cell output: show the filled-in submission table.
submission

In [29]:
# Write the submission as /kaggle/working/<date>_1.csv without the row index.
submission.to_csv(f'/kaggle/working/{fname}_1.csv', index=False)