In [1]:
## best_score_code

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import BertModel, AdamW, BertTokenizer, RobertaTokenizer, RobertaModel, AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler
from sklearn.model_selection import train_test_split
import random
import os
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F
from torch.nn import Parameter
import math
from torch.optim import Adam
from sklearn.model_selection import KFold

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Seed Python's `random`, NumPy and PyTorch (CPU generator and all CUDA devices)
# with one shared value; the 5-fold splitter below reuses the same seed.
seed_num = 22
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_num)
kf = KFold(n_splits=5, shuffle=True, random_state=seed_num)

In [None]:
# Choose the compute device once; later cells move the model and every batch onto it.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    gpu_name = torch.cuda.get_device_name(0)
    print('We will use the GPU:', gpu_name)

In [None]:
# Read the train, test and sample-submission CSVs from the attached dataset.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/kor-nli/dacon/open/train_data.csv',
        '/kaggle/input/kor-nli/dacon/open/test_data.csv',
        '/kaggle/input/kor-nli/dacon/open/sample_submission.csv',
    )
)

In [None]:
# Build, for both frames, the marker-wrapped sentence columns and their concatenation:
#   premise_    = "[CLS]" + premise + "[SEP]"
#   hypothesis_ = hypothesis + "[SEP]"
#   text_sum    = premise_ + " " + hypothesis_
# NOTE(review): the tokenizer calls in the encoding cells use default settings;
# Hugging Face tokenizers normally add their own special tokens, so the markers
# written here may end up duplicated in the encoded sequence - confirm this is intended.
for frame in (train, test):
    frame["premise_"] = "[CLS]" + frame["premise"] + "[SEP]"
    frame["hypothesis_"] = frame["hypothesis"] + "[SEP]"
    frame["text_sum"] = frame.premise_ + " " + frame.hypothesis_

In [None]:
# Checkpoints tried earlier, kept commented out for reference:
# tokenizer = BertTokenizer.from_pretrained('klue/bert-base')
# tokenizer = BertTokenizer.from_pretrained("klue/roberta-large")
# tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-discriminator")
# Active tokenizer: same checkpoint name ("klue/roberta-large") that STSClassifier
# loads as its encoder. Used as a module-level global by encoding1/encoding2/make_token.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

In [None]:
# Mapping from the string class names in the CSV to the integer ids used as
# targets for nn.CrossEntropyLoss in the training loops.
label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}

# Encode only while the column still holds the raw string labels. Without the
# guard, re-running this cell looks up the already-encoded integers in
# label_dict and raises KeyError. An unknown string label still raises KeyError.
if not pd.api.types.is_integer_dtype(train['label']):
    train['label'] = train['label'].apply(lambda x: label_dict[x])

# First premise / hypothesis of the training frame (sample values; these names
# are reassigned by the token-type loop further down).
sen1 = train.premise[0]

sen2 = train.hypothesis[0]

In [None]:
def encoding1(sen):
    """Tokenize ``sen`` with the module-level ``tokenizer`` and return its ``input_ids``.

    Truncation is switched on but no ``max_length`` is passed here; the fixed
    length of 70 is enforced afterwards by ``padding``.
    """
    return tokenizer(sen, truncation=True)['input_ids']

def encoding2(sen):
    """Tokenize ``sen`` with the module-level ``tokenizer`` and return its ``attention_mask``.

    Uses exactly the same tokenizer arguments as ``encoding1``, so the mask
    lines up with the ids that function returns for the same text.
    """
    return tokenizer(sen, truncation=True)['attention_mask']

In [None]:
# Encode the concatenated premise/hypothesis text: one Series of token ids and
# one Series of attention masks. Each helper runs the tokenizer itself, so the
# text is tokenized twice.
text_sum_column = train['text_sum']
sen = text_sum_column.apply(encoding1)
att = text_sum_column.apply(encoding2)

In [None]:
def make_token(sen1, sen2):
    """Encode ``sen1``/``sen2`` as a sentence pair and return the ``token_type_ids``.

    The pair is truncated to at most 70 tokens by the module-level ``tokenizer``.

    NOTE(review): the input ids and attention mask used for training come from
    encoding the single concatenated ``text_sum`` string, not from this pair
    encoding - confirm that the token-type ids returned here line up with them.
    """
    pair_encoding = tokenizer(sen1, sen2, max_length=70, padding=True, truncation=True)
    return pair_encoding['token_type_ids']

In [None]:
# Compute token-type ids for every (premise_, hypothesis_) row and store them
# as a new `token_type` column. The loop variables intentionally stay
# `sen1`/`sen2`, so those notebook-level names end up with the same values as before.
pair_columns = zip(train["premise_"], train["hypothesis_"])
result = []
for sen1, sen2 in pair_columns:
    result += [make_token(sen1, sen2)]
train["token_type"] = result

In [None]:
def padding(sentence, max_len=70, pad_value=0):
    """Return ``sentence`` as a new list of exactly ``max_len`` items.

    Shorter inputs are right-padded with ``pad_value``; longer inputs are cut
    after the first ``max_len`` items. The input itself is never modified.

    Args:
        sentence: Sequence of token ids (list, tuple or any sliceable sequence).
        max_len: Target length. Defaults to 70, the length used throughout
            this notebook, so existing one-argument calls behave as before.
        pad_value: Filler appended to short sequences. Defaults to 0.

    Returns:
        A list of length ``max_len``.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError("max_len must be non-negative")
    # Slicing handles both cases: a no-op copy for short inputs, a cut for long ones.
    clipped = list(sentence[:max_len])
    return clipped + [pad_value] * (max_len - len(clipped))

In [None]:
# Bring ids, attention masks and token-type ids to the fixed length enforced by
# `padding`, so every row has the same length before tensor conversion.
tok = train['token_type'].apply(padding)
sen, att = (encoded.apply(padding) for encoded in (sen, att))

In [None]:
# Convert the padded per-row lists (and the integer labels) into tensors.
# Each Series is turned into a plain Python list first, so the conversion does
# not depend on how torch.tensor walks a pandas Series (which is slow and,
# presumably, sensitive to the Series' index labels - e.g. after filtering rows).
input_ids = torch.tensor(sen.tolist())
att_mask = torch.tensor(att.tolist())
token_type = torch.tensor(tok.tolist())
label = torch.tensor(train['label'].tolist())

In [None]:
# Earlier layout with separately encoded sentences, kept for reference:
# train_dataset = TensorDataset(input_ids1, att_mask1, input_ids2, att_mask2, label)
# Each item is (input_ids, attention_mask, token_type_ids, label); the training
# loops unpack every batch in exactly this order.
train_dataset = TensorDataset(input_ids, att_mask, token_type, label)

In [None]:
# Mini-batch size shared by the train and validation DataLoaders built inside
# the fold loops; also passed to the STSClassifier constructor.
batch_size = 32

# DataLoaders are created per fold further down, so no global loader is built here.
# train_dataLoader = DataLoader(train_dataset, batch_size=batch_size)

In [None]:
class STSClassifier(nn.Module):
    """Three-class sentence-pair classifier on top of a pretrained encoder.

    The encoder is loaded from the ``klue/roberta-large`` checkpoint. Its last
    hidden states are mean-pooled with the attention mask as weights, passed
    through dropout and projected to 3 logits by ``fc3``.

    ``fc``, ``bn``, ``fc2``, ``bn2``, ``sigmoid`` and ``act`` are created but
    not used by ``forward``; they are kept, in their original construction
    order, so the module's set of parameters is unchanged.
    """

    def __init__(self, batch_size):
        """Build the encoder and the classification head.

        Args:
            batch_size: Stored on the instance as ``self.batch_size``; not
                used by ``forward``.
        """
        super().__init__()
        self.fc_dim = 1024
        self.bert_model = AutoModel.from_pretrained('klue/roberta-large')
        hidden_size = self.bert_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, self.fc_dim)
        self.bn = nn.BatchNorm1d(self.fc_dim)
        self.fc2 = nn.Linear(hidden_size * 3, 3)
        self._init_params()
        self.sigmoid = torch.nn.Sigmoid()
        self.batch_size = batch_size
        self.dropout = nn.Dropout(p=0.5)
        # forward() feeds the pooled encoder output straight into fc3 (the
        # fc/bn projection is disabled), so fc3 must accept the encoder's
        # hidden size. Sizing it with fc_dim only works while the two happen
        # to be equal.
        self.fc3 = nn.Linear(hidden_size, 3)
        self.bn2 = nn.BatchNorm1d(3)
        self.act = nn.ReLU()

    def _init_params(self):
        """Xavier-normal init for ``fc`` weights; constant init for ``fc`` bias and ``bn``."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, sen, mask, token, label=None):
        """Return the 3-way logits for a batch.

        Args:
            sen: Token ids, passed to the encoder as ``input_ids``.
            mask: Attention mask, passed to the encoder and also used as the
                pooling weights (assumed shape (batch, seq_len) - TODO confirm).
            token: Passed to the encoder as ``token_type_ids``.
            label: Unused; accepted for call-site compatibility.

        Returns:
            Output of ``fc3``: one row of 3 logits per input row.
        """
        x = self.bert_model(input_ids=sen, token_type_ids=token, attention_mask=mask)
        # Masked mean over the sequence axis: padded positions contribute zero
        # to the sum and are excluded from the divisor.
        x = torch.sum(x.last_hidden_state * mask.unsqueeze(-1), dim=1) / mask.sum(dim=1, keepdims=True)
        return self.fc3(self.dropout(x))

In [None]:
# Training hyper-parameter and loss shared by both cross-validation cells.
epochs = 3
criterion = nn.CrossEntropyLoss()

# NOTE(review): each fold loop below builds its own STSClassifier before
# training, so this instance is replaced without being trained - confirm
# whether it is still needed.
model = STSClassifier(batch_size).to(device)
model.zero_grad()

In [None]:
# def cal_accuracy(preds, labels):
# #     pred_flat = preds>0.5
#     pred_flat = np.argmax(preds, axis=0).flatten()
#     labels_flat = labels
#     return np.sum(pred_flat == labels_flat) / len(labels_flat)
def cal_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose largest column index equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per example.
        Y: 1-D tensor of integer class labels, same length as ``X``.

    Returns:
        NumPy scalar in [0, 1]: correct predictions divided by number of rows.
    """
    predicted = torch.max(X, 1)[1]
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    return n_correct / predicted.size()[0]

In [None]:
# 5-fold cross-validation with a single AdamW parameter group.
# For every fold: build a fresh model, train for `epochs` epochs, evaluate on
# the held-out fold after each epoch and save the model whenever validation
# accuracy is at least as good as the best seen so far in that fold.
# Checkpoints go to /kaggle/working/model{fold}; the weight-decay variant of
# this loop in the next cell writes to the same paths and overwrites them.
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_dataset)):
    # Both loaders wrap the same TensorDataset; the samplers restrict each one
    # to the indices of its side of the split.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_idx)
    train_dataLoader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
    valid_dataLoader = DataLoader(train_dataset, batch_size=batch_size, sampler=valid_subsampler)
    best_acc = 0
    # New model and optimizer per fold so nothing carries over between folds.
    model = STSClassifier(batch_size).to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    model.zero_grad()
    print(f'------------fold no---------{fold + 1}----------------------')
    for epoch_i in range(0, epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0
        train_correct = 0
        train_seen = 0
        for batch in tqdm(train_dataLoader):
            # Batch tensors get their own names: unpacking into sen/att/tok/label
            # would overwrite the notebook-level Series/tensors of those names
            # and break re-running the preprocessing cells.
            batch_ids, batch_mask, batch_tok, batch_label = (t.to(device) for t in batch)
            outputs = model(batch_ids, batch_mask, batch_tok)
            loss = criterion(outputs.to(torch.float32), batch_label.to(torch.int64))
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()
            # Weight each batch by its size so the smaller final batch does not
            # distort the epoch accuracy.
            n_in_batch = batch_label.size(0)
            train_correct += cal_accuracy(outputs, batch_label) * n_in_batch
            train_seen += n_in_batch
        avg_train_loss = total_loss / len(train_dataLoader)
        print("")
        print(epoch_i + 1, "  Average training loss: {0:.4f}".format(avg_train_loss))
        print("  Accuracy: {0:.4f}".format(train_correct / train_seen))

        # ---- validation pass ----
        model.eval()
        valid_loss = 0
        valid_correct = 0
        valid_seen = 0
        for batch in tqdm(valid_dataLoader):
            batch_ids, batch_mask, batch_tok, batch_label = (t.to(device) for t in batch)
            with torch.no_grad():
                outputs = model(batch_ids, batch_mask, batch_tok)
                loss = criterion(outputs.to(torch.float32), batch_label.to(torch.int64))
            valid_loss += loss.item()
            n_in_batch = batch_label.size(0)
            valid_correct += cal_accuracy(outputs, batch_label) * n_in_batch
            valid_seen += n_in_batch
        avg_valid_loss = valid_loss / len(valid_dataLoader)
        valid_accuracy = valid_correct / valid_seen
        # `<=` means a tie with the best accuracy also overwrites the checkpoint.
        if best_acc <= valid_accuracy:
            best_acc = valid_accuracy
            # torch.save on the module stores the whole object, not only a state_dict.
            torch.save(model, f'/kaggle/working/model{fold + 1}')
            print(f'model{fold + 1} saved')
        print(epoch_i + 1, "  Average valid loss: {0:.4f}".format(avg_valid_loss))
        print("  Accuracy: {0:.4f}".format(valid_accuracy))

In [None]:
## new best 0.87

# 5-fold cross-validation with grouped weight decay.
# Same procedure as the previous cell, except that AdamW gets two parameter
# groups: parameters whose name contains 'bias' or 'LayerNorm.weight' use
# weight_decay 0.0, all others use 0.01.
# Checkpoints go to /kaggle/working/model{fold}, the same paths the previous
# cell writes to, so running both cells overwrites the earlier checkpoints.
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_dataset)):
    # Both loaders wrap the same TensorDataset; the samplers restrict each one
    # to the indices of its side of the split.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_idx)
    train_dataLoader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
    valid_dataLoader = DataLoader(train_dataset, batch_size=batch_size, sampler=valid_subsampler)
    best_acc = 0
    # New model and optimizer per fold so nothing carries over between folds.
    model = STSClassifier(batch_size).to(device)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
    model.zero_grad()
    print(f'------------fold no---------{fold + 1}----------------------')
    for epoch_i in range(0, epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0
        train_correct = 0
        train_seen = 0
        for batch in tqdm(train_dataLoader):
            # Batch tensors get their own names: unpacking into sen/att/tok/label
            # would overwrite the notebook-level Series/tensors of those names
            # and break re-running the preprocessing cells.
            batch_ids, batch_mask, batch_tok, batch_label = (t.to(device) for t in batch)
            outputs = model(batch_ids, batch_mask, batch_tok)
            loss = criterion(outputs.to(torch.float32), batch_label.to(torch.int64))
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()
            # Weight each batch by its size so the smaller final batch does not
            # distort the epoch accuracy.
            n_in_batch = batch_label.size(0)
            train_correct += cal_accuracy(outputs, batch_label) * n_in_batch
            train_seen += n_in_batch
        avg_train_loss = total_loss / len(train_dataLoader)
        print("")
        print(epoch_i + 1, "  Average training loss: {0:.4f}".format(avg_train_loss))
        print("  Accuracy: {0:.4f}".format(train_correct / train_seen))

        # ---- validation pass ----
        model.eval()
        valid_loss = 0
        valid_correct = 0
        valid_seen = 0
        for batch in tqdm(valid_dataLoader):
            batch_ids, batch_mask, batch_tok, batch_label = (t.to(device) for t in batch)
            with torch.no_grad():
                outputs = model(batch_ids, batch_mask, batch_tok)
                loss = criterion(outputs.to(torch.float32), batch_label.to(torch.int64))
            valid_loss += loss.item()
            n_in_batch = batch_label.size(0)
            valid_correct += cal_accuracy(outputs, batch_label) * n_in_batch
            valid_seen += n_in_batch
        avg_valid_loss = valid_loss / len(valid_dataLoader)
        valid_accuracy = valid_correct / valid_seen
        # `<=` means a tie with the best accuracy also overwrites the checkpoint.
        if best_acc <= valid_accuracy:
            best_acc = valid_accuracy
            # torch.save on the module stores the whole object, not only a state_dict.
            torch.save(model, f'/kaggle/working/model{fold + 1}')
            print(f'model{fold + 1} saved')
        print(epoch_i + 1, "  Average valid loss: {0:.4f}".format(avg_valid_loss))
        print("  Accuracy: {0:.4f}".format(valid_accuracy))