In [None]:
# Task: classify each premise/hypothesis pair as true (Entailment), false (Contradiction), or neutral (Neutral).

In [None]:
# Mount Google Drive at /content/drive; later cells read the CSVs from it and write the submission to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install mxnet
!pip install gluonnlp
!pip install transformers
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting mxnet
  Downloading mxnet-1.9.0-py3-none-manylinux2014_x86_64.whl (47.3 MB)
[K     |████████████████████████████████| 47.3 MB 1.2 MB/s 
[?25hCollecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-1.9.0
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 5.4 MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp37-cp37m-linux_x86_64.whl size=595739 sha256=8986dd3ac8a0f7f4cf858a4fc25def80bf524fab3e70b528a261c2d09c978550
  Stored in directory: /root/.cache/pip/wheels/be/b4/06/7f3fdfaf707e6b5e98b79c041e023acffb

In [None]:
import pandas as pd
from glob import glob
import os
import numpy as np
from tqdm import tqdm, tqdm_notebook


import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import gluonnlp as nlp
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW

In [None]:
# Load the train / test / sample-submission CSVs from the mounted Drive folder.
# The directory is absolute and matches the '/content/drive' mount point, so the
# reads no longer depend on the kernel's current working directory.
DATA_DIR = "/content/drive/MyDrive/open"
train = pd.read_csv(f"{DATA_DIR}/train_data.csv")
test = pd.read_csv(f"{DATA_DIR}/test_data.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Hyperparameters and runtime configuration for fine-tuning.
max_len = 70          # max_seq_length handed to the BERT sentence transform
batch_size = 64       # samples per DataLoader batch
warmup_ratio = 0.1    # presumably the LR warm-up fraction; not referenced by the cells visible here
num_epochs = 10       # number of passes over the training split
max_grad_norm = 1     # max norm passed to gradient clipping in the training loop
log_interval = 200    # presumably a logging period in batches; not referenced by the cells visible here
learning_rate = 5e-5  # learning rate handed to the AdamW optimizer

# Use the first CUDA GPU when one is available; otherwise fall back to CPU so the
# notebook still runs (slowly) on a runtime without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
bertmodel, vocab = get_pytorch_kobert_model(cachedir = ".cache") # load the KoBERT model and its vocab; downloaded files are cached under ".cache"

tokenizer = get_tokenizer() # presumably the path to the SentencePiece model (.spiece) that KoBERT downloads - TODO confirm
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower = False) # tokenizer bound to the loaded vocab; lower = False is passed, so presumably no lower-casing

/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
class BERTDataset(Dataset):
    """Dataset of BERT inputs that are transformed eagerly from a list of rows.

    Every row is passed through ``nlp.data.BERTSentenceTransform`` once, inside
    ``__init__``; indexing afterwards only looks up the stored results.

    Args:
        dataset: iterable of rows. In "train" mode each row is indexed with
            ``sent_idx`` and ``label_idx``; in any other mode the whole row is
            handed to the transform unchanged.
        sent_idx: position of the text inside a row ("train" mode only).
        label_idx: position of the label inside a row ("train" mode only).
        bert_tokenizer: tokenizer forwarded to ``BERTSentenceTransform``.
        max_len: forwarded as ``max_seq_length``.
        pad: forwarded as ``pad``.
        pair: forwarded as ``pair``.
        mode: "train" stores labels next to the inputs; any other value stores
            inputs only.
    """
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                pad, pair, mode = "train"):
        self.mode = mode
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length = max_len, pad = pad, pair = pair)

        if self.mode != "train":
            # Unlabeled data: each row goes to the transform as-is.
            self.sentences = [to_features(row) for row in dataset]
            return

        # Labeled data: wrap the selected text in a one-element list for the
        # transform and store the label as a 32-bit integer.
        self.sentences = [to_features([row[sent_idx]]) for row in dataset]
        self.labels = [np.int32(row[label_idx]) for row in dataset]

    def __getitem__(self, i):
        """Return the stored inputs of sample ``i``; in "train" mode the label is appended."""
        features = self.sentences[i]
        if self.mode != 'train':
            return features
        return features + (self.labels[i], )

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.sentences)

In [None]:
# Show the distinct label strings that occur in the training data.
print(train["label"].unique())

# Integer class id assigned to each label string.
label_dict = dict(entailment = 0, contradiction = 1, neutral = 2)

['contradiction' 'entailment' 'neutral']


In [None]:
# Wrap both sentences with literal marker strings:
#   premise_    = "[CLS]" + premise + "[SEP]"
#   hypothesis_ = hypothesis + "[SEP]"
# NOTE(review): with pair=False the sentence transform presumably adds its own special
# tokens and these literal markers look like they are tokenized as plain text (see the
# ids printed for data_train[0]). Consider feeding [premise, hypothesis] with pair=True
# instead - that needs a matching change in BERTDataset.
train["premise_"] = "[CLS]" + train["premise"] + "[SEP]"
train["hypothesis_"] = train["hypothesis"] + "[SEP]"

# Same treatment for the test set.
test["premise_"] = "[CLS]" + test["premise"] + "[SEP]"
test["hypothesis_"] = test["hypothesis"] + "[SEP]"

# Join the two pieces into a single string: "[CLS]premise[SEP] hypothesis[SEP]".
train["text_sum"] = train.premise_ + " " + train.hypothesis_
test["text_sum"] = test.premise_ + " " + test.hypothesis_

# Rows handed to BERTDataset: [text, label id as a string] for train, [text] for test
# (the test set has no labels). zip pairs each text with its own label by position,
# so this is also correct when the DataFrame index is not the default 0..n-1 range.
train_content = [[text, str(label_dict[label])] for text, label in zip(train.text_sum, train.label)]
test_content = [[text] for text in test.text_sum]

# Positional hold-out split: the first TRAIN_SPLIT rows train the model, the rest validate it.
TRAIN_SPLIT = 20000
dataset_train = train_content[:TRAIN_SPLIT]
dataset_valid = train_content[TRAIN_SPLIT:]
dataset_test = test_content

# Transform every row up front (pad=True, pair=False); only the labeled splits use mode="train".
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False, mode = "train")
data_valid = BERTDataset(dataset_valid, 0, 1, tok, max_len, True, False, mode = "train")
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False, mode = "test")

# Never ask for more worker processes than the machine has CPUs; requesting 5 on a
# smaller runtime makes DataLoader emit a "suggested max number of worker" warning.
num_workers = min(5, os.cpu_count() or 1)

# Only the training loader is shuffled (reshuffled every epoch). Validation and test keep
# their order so that predictions line up with the rows of the submission file.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size = batch_size, num_workers = num_workers, shuffle = True)
valid_dataloader = torch.utils.data.DataLoader(data_valid, batch_size = batch_size, num_workers = num_workers)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size = batch_size, num_workers = num_workers)

  cpuset_checked))


In [None]:
# Sanity-check the prepared data: the first row of each derived text column, the first
# packed train/test rows, the split sizes, and the first transformed training sample.
for column in ("premise_", "hypothesis_", "text_sum"):
    print(train[column][0])
for rows in (train_content, test_content):
    print(rows[0])
for split in (dataset_train, dataset_valid):
    print(len(split))
print(data_train[0])

[CLS]씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이 넓고 평평한 백사장이나 마당에서 모여 서로 힘과 슬기를 겨루는 것이다.[SEP]
씨름의 여자들의 놀이이다.[SEP]
[CLS]씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이 넓고 평평한 백사장이나 마당에서 모여 서로 힘과 슬기를 겨루는 것이다.[SEP] 씨름의 여자들의 놀이이다.[SEP]
['[CLS]씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이 넓고 평평한 백사장이나 마당에서 모여 서로 힘과 슬기를 겨루는 것이다.[SEP] 씨름의 여자들의 놀이이다.[SEP]', '1']
['[CLS]다만 조금 좁아서 케리어를 펼치기 불편합니다.[SEP] 케리어를 펼치기에 공간이 충분했습니다.[SEP]']
20000
4998
(array([   2,  702,  638,  315,  517,  363, 3088, 6117, 7086, 2658, 5439,
       6708, 6080, 4059, 7245, 1442, 6965, 1423, 5939, 1678, 1504, 7096,
       6081,  517,   46, 2822, 5712, 7098, 3954, 7227, 5940, 1459, 5439,
       4841, 7724, 7828, 2298, 6493, 7178, 7098, 1907, 5804, 6903, 2064,
       2720, 5211, 5468, 2948, 5573,  517, 5411, 6095, 5760,  913,  517,
         54,  702,  687,  282,  333,  517,  363, 3088, 6117, 7095, 3318,
       5939, 1504, 7096,    3], dtype=int32), array(70, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: encoder module. It is called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keyword arguments and must return a sequence
            whose element at index 1 is the pooled output.
        hidden_size: width of the pooled output fed to the linear head.
        num_classes: number of logits produced per sample.
        dr_rate: dropout probability applied to the pooled output. ``None`` or
            ``0`` disables dropout.
        params: unused; kept so existing callers keep working.
    """
    def __init__(self, bert, hidden_size = 768, num_classes=3, dr_rate=None, params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Linear head mapping the pooled output to one logit per class.
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask shaped like ``token_ids``: 1.0 for real tokens, 0.0 for padding.

        Args:
            token_ids: 2-D tensor of token ids; only its shape and device are used.
            valid_length: one non-negative length per row of ``token_ids``
                (tensor or sequence). Row ``i`` gets ones in its first
                ``valid_length[i]`` positions.

        Returns:
            Float tensor on the same device as ``token_ids``.
        """
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        # Broadcasting compares every position with its row's length in one shot,
        # replacing a per-sample Python loop.
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits with one row per input sample.

        Args:
            token_ids: 2-D tensor of token ids.
            valid_length: number of non-padding tokens per sample.
            segment_ids: tensor passed to the encoder as ``token_type_ids``
                (cast to long first).
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Index instead of unpacking so this works whether the encoder returns a plain
        # tuple (of any length >= 2) or an output object that supports integer indexing.
        pooler = outputs[1]

        # Without dropout the pooled output goes straight to the head; previously this
        # path raised UnboundLocalError because `out` was only set inside the dropout branch.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
# Classifier built on the loaded BERT model with dropout 0.5, moved to the target device.
model = BERTClassifier(bertmodel, dr_rate = 0.5).to(device)

# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

def _is_no_decay(param_name):
    """Return True when ``param_name`` contains any fragment listed in ``no_decay``."""
    return any(fragment in param_name for fragment in no_decay)

# Two optimizer groups: weight decay 0.01 for everything else, 0.0 for the no-decay set.
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters() if not _is_no_decay(n)],
        'weight_decay': 0.01,
    },
    {
        'params': [p for n, p in model.named_parameters() if _is_no_decay(n)],
        'weight_decay': 0.0,
    },
]

# Optimizer and loss used by the training loop.
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()



In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows in ``X`` whose highest-scoring column equals ``Y``.

    Args:
        X: 2-D tensor of scores, one row per sample.
        Y: 1-D tensor of target class ids, one per row of ``X``.

    Returns:
        NumPy scalar: matching rows divided by the number of rows.
    """
    _, predicted = torch.max(X, 1)  # column index of the per-row maximum
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

In [None]:
for e in range(num_epochs):
    # ---- training pass ----
    # Running totals are weighted by batch size, so the epoch figure is the true
    # per-sample accuracy even when the last batch is smaller than the others.
    train_correct, train_seen = 0.0, 0
    model.train()
    for token_ids, valid_length, segment_ids, label in tqdm(train_dataloader, total=len(train_dataloader)):
        optimizer.zero_grad()  # clear gradients accumulated by the previous step
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is passed through as loaded
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()  # back-propagate the batch loss
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # cap the gradient norm at max_grad_norm
        optimizer.step()  # apply the parameter update
        # calc_accuracy returns a per-batch fraction; turn it back into a count.
        train_correct += calc_accuracy(out, label) * label.size(0)
        train_seen += label.size(0)

    train_acc = train_correct / max(train_seen, 1)
    print("epoch {} train acc {}".format(e+1, train_acc))

    # ---- validation pass ----
    model.eval()
    valid_correct, valid_seen = 0.0, 0
    # No gradients are needed for evaluation; skipping the autograd graph saves memory and time.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, label in tqdm(valid_dataloader, total=len(valid_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            valid_correct += calc_accuracy(out, label) * label.size(0)
            valid_seen += label.size(0)

    valid_acc = valid_correct / max(valid_seen, 1)
    print("epoch {} valid acc {}".format(e+1, valid_acc))

  cpuset_checked))
100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 1 train acc 0.5373901757188498



100%|██████████| 79/79 [00:41<00:00,  1.93it/s]

epoch 1 valid acc 0.7360232067510549



100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 2 train acc 0.7614816293929713



100%|██████████| 79/79 [00:41<00:00,  1.93it/s]

epoch 2 valid acc 0.7552742616033755



100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 3 train acc 0.8351138178913738



100%|██████████| 79/79 [00:41<00:00,  1.93it/s]

epoch 3 valid acc 0.7570543248945147



100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 4 train acc 0.8866313897763578



100%|██████████| 79/79 [00:40<00:00,  1.93it/s]

epoch 4 valid acc 0.751714135021097



100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 5 train acc 0.9112420127795527



100%|██████████| 79/79 [00:41<00:00,  1.93it/s]

epoch 5 valid acc 0.7289688818565401



100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 6 train acc 0.9312100638977636



100%|██████████| 79/79 [00:41<00:00,  1.93it/s]

epoch 6 valid acc 0.7560654008438819



100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 7 train acc 0.9469349041533547



100%|██████████| 79/79 [00:41<00:00,  1.93it/s]

epoch 7 valid acc 0.7554720464135021



100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 8 train acc 0.9543730031948882



100%|██████████| 79/79 [00:40<00:00,  1.93it/s]

epoch 8 valid acc 0.7616033755274262



100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 9 train acc 0.959564696485623



100%|██████████| 79/79 [00:40<00:00,  1.93it/s]

epoch 9 valid acc 0.7572521097046413



100%|██████████| 313/313 [07:25<00:00,  1.42s/it]

epoch 10 train acc 0.9651557507987221



100%|██████████| 79/79 [00:41<00:00,  1.93it/s]

epoch 10 valid acc 0.7513185654008439





In [None]:
# Run the trained model over the test loader and collect one output tensor per batch.
result = []
model.eval()
with torch.no_grad():
    progress = tqdm(enumerate(test_dataloader), total=len(test_dataloader))
    for batch_id, (token_ids, valid_length, segment_ids) in progress:
        # Inputs are cast to long and moved to the device; valid_length is passed as loaded.
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        batch_output = model(token_ids, valid_length, segment_ids)
        result.append(batch_output)

  cpuset_checked))
100%|██████████| 27/27 [00:13<00:00,  1.94it/s]


In [None]:
# Flatten the per-batch model outputs into one predicted class id per test row.
result_ = [int(class_id) for logits in result for class_id in torch.argmax(logits, dim=1)]

# Invert label_dict (label string -> class id) once. Looking ids up in the inverted
# mapping does not depend on the dict's insertion order happening to match the ids,
# and avoids rebuilding the list of keys for every prediction.
id_to_label = {class_id: name for name, class_id in label_dict.items()}
out = [id_to_label[class_id] for class_id in result_]

submission["label"] = out

submission.to_csv("drive/MyDrive/open/submission.csv", index = False)

In [None]:
# Display three randomly chosen rows of the submission as a quick visual check.
submission.sample(3)

Unnamed: 0,index,label
1192,1192,entailment
1390,1390,contradiction
27,27,entailment
