In [1]:
import os
from tqdm import tqdm

import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import BartForSequenceClassification, PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv('../5/data/train_data.csv')
df_test = pd.read_csv('../5/data/test_data.csv')

df_train.head()

Unnamed: 0,index,premise,hypothesis,label
0,0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,contradiction
1,1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,contradiction
2,2,이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.,예측적 범죄예방 시스템 구축하고 고도화하는 것은 목적이 있기 때문이다.,entailment
3,3,광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민 보호에 ...,원주민들은 종합대책에 만족했다.,neutral
4,4,"진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 상황에서는...",이런 상황에서 책임 있는 모습을 보여주는 기업은 아주 드물다.,neutral


In [3]:
def preprocessing(df):
    df['premise'] = df['premise'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]', "")
    df['hypothesis'] = df['hypothesis'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]', "")
    df['premise'] = '<s>' + df['premise'] + '<unused0>'
    df['hypothesis'] = df['hypothesis'] + '</s>'
    return df

df_train = preprocessing(df_train)
df_test = preprocessing(df_test)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [5]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer
import gc
gc.collect()
torch.cuda.empty_cache()


MODEL_NAME = 'gogamza/kobart-base-v2'

tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_NAME)

config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 3

model = BartForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

print(model)
print(config)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at gogamza/kobart-base-v2 and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
       

In [6]:
print(torch.cuda.memory_allocated() / 1024 /1024)

0.0


In [7]:
from sklearn.model_selection import train_test_split

train_dataset, eval_dataset = train_test_split(df_train, test_size=0.2, shuffle=True, stratify=df_train['label'])

tokenized_train = tokenizer(
    list(train_dataset['premise']),
    list(train_dataset['hypothesis']),
    return_tensors="pt",
    max_length=256, # Max_Length = 190
    padding=True,
    truncation=True,
    add_special_tokens=True
)

tokenized_eval = tokenizer(
    list(eval_dataset['premise']),
    list(eval_dataset['hypothesis']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

print(tokenized_train['input_ids'][0])
print(tokenizer.decode(tokenized_train['input_ids'][0]))

tensor([    0, 14137, 11265, 11821, 22954, 15107, 15509, 10948, 26139, 16116,
        16464, 27140, 15033, 15105, 15926, 15581, 14432, 15313,     7, 14137,
        11265, 11821, 22954, 15107, 15509, 10948, 26139, 16154, 27140, 15033,
        20131, 14270, 19649, 15581, 14432, 15313,     1,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3])
<s> 장석영 과학기술정보통신부 제2차관을 비롯한 간담회 참석자들이 기념촬영 하고 있다<unused0> 장석영 과학기술정보통신부 제2차관은 간담회 참석자들과 함께기념촬영 하고 있다</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [8]:
class kobartDataset(torch.utils.data.Dataset):
    def __init__(self, pair_dataset, label):
        self.pair_dataset = pair_dataset
        self.label = label

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.pair_dataset.items()}
        item['label'] = torch.tensor(self.label[idx])
        
        return item

    def __len__(self):
        return len(self.label)

def label_to_num(label):
    label_dict = {"entailment": 0, "contradiction": 1, "neutral": 2, "answer": 3}
    num_label = []

    for v in label:
        num_label.append(label_dict[v])
    
    return num_label


train_label = label_to_num(train_dataset['label'].values)
eval_label = label_to_num(eval_dataset['label'].values)

In [9]:
import copy

processed_train = copy.copy(tokenized_train)
processed_train.pop('token_type_ids')
processed_eval = copy.copy(tokenized_eval)
processed_eval.pop('token_type_ids')

train_dataset = kobartDataset(processed_train, train_label)
eval_dataset = kobartDataset(processed_eval, eval_label)

model = model.to(device)

print(train_dataset.__len__())
print(train_dataset.__getitem__(19997))
print(tokenizer.decode(train_dataset.__getitem__(19997)['input_ids']))

19998
{'input_ids': tensor([    0, 16263, 16954, 15606, 14358, 15015, 15352, 14453, 25156, 21786,
        15119, 25492, 22480, 14103, 15173, 15853, 17514, 14518, 16239, 22929,
        18809, 24258, 16982,     7, 16954, 15606, 14358, 17514, 14518, 16239,
        22929, 18809, 24258, 14242, 14677,  9754,     1,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'label': tenso

In [10]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  """ validation을 위한 metrics function """
  labels = pred.label_ids
  preds = pred.predictions[0].argmax(-1)
  probs = pred.predictions

  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds) # 리더보드 평가에는 포함되지 않습니다.

  return {
      'accuracy': acc,
  }

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./result',
    num_train_epochs=7,
    per_device_train_batch_size=32,
    save_strategy = 'no',
    save_total_limit=5,
    save_steps=500,
    evaluation_strategy='steps',
    eval_steps = 500,
    load_best_model_at_end = False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [12]:
trainer.train()
model.save_pretrained('./result/best_model')

 11%|█▏        | 500/4375 [02:07<17:03,  3.78it/s]

{'loss': 0.8158, 'learning_rate': 4.428571428571428e-05, 'epoch': 0.8}


                                                  
 11%|█▏        | 500/4375 [02:40<17:03,  3.78it/s]

{'eval_loss': 0.604905903339386, 'eval_accuracy': 0.7598, 'eval_runtime': 33.0555, 'eval_samples_per_second': 151.261, 'epoch': 0.8}


 23%|██▎       | 1000/4375 [04:49<14:15,  3.95it/s]  

{'loss': 0.4782, 'learning_rate': 3.857142857142858e-05, 'epoch': 1.6}


                                                   
 23%|██▎       | 1000/4375 [05:06<14:15,  3.95it/s]

{'eval_loss': 0.5557742118835449, 'eval_accuracy': 0.7956, 'eval_runtime': 16.888, 'eval_samples_per_second': 296.068, 'epoch': 1.6}


 34%|███▍      | 1500/4375 [07:13<12:11,  3.93it/s]  

{'loss': 0.314, 'learning_rate': 3.285714285714286e-05, 'epoch': 2.4}


                                                   
 34%|███▍      | 1500/4375 [07:30<12:11,  3.93it/s]

{'eval_loss': 0.7383450269699097, 'eval_accuracy': 0.792, 'eval_runtime': 16.897, 'eval_samples_per_second': 295.911, 'epoch': 2.4}


 46%|████▌     | 2000/4375 [09:34<09:39,  4.10it/s]  

{'loss': 0.2006, 'learning_rate': 2.714285714285714e-05, 'epoch': 3.2}


                                                   
 46%|████▌     | 2000/4375 [09:51<09:39,  4.10it/s]

{'eval_loss': 0.9102541208267212, 'eval_accuracy': 0.7976, 'eval_runtime': 16.33, 'eval_samples_per_second': 306.185, 'epoch': 3.2}


 57%|█████▋    | 2500/4375 [11:54<07:33,  4.13it/s]  

{'loss': 0.1138, 'learning_rate': 2.1428571428571428e-05, 'epoch': 4.0}



 57%|█████▋    | 2500/4375 [12:10<07:33,  4.13it/s]

{'eval_loss': 0.9224607348442078, 'eval_accuracy': 0.798, 'eval_runtime': 16.32, 'eval_samples_per_second': 306.373, 'epoch': 4.0}


 69%|██████▊   | 3000/4375 [14:12<05:34,  4.11it/s]  

{'loss': 0.0575, 'learning_rate': 1.5714285714285715e-05, 'epoch': 4.8}


                                                   
 69%|██████▊   | 3000/4375 [14:29<05:34,  4.11it/s]

{'eval_loss': 1.2661964893341064, 'eval_accuracy': 0.7994, 'eval_runtime': 16.288, 'eval_samples_per_second': 306.975, 'epoch': 4.8}


 80%|████████  | 3500/4375 [16:30<03:31,  4.15it/s]  

{'loss': 0.0378, 'learning_rate': 1e-05, 'epoch': 5.6}


                                                   
 80%|████████  | 3500/4375 [16:46<03:31,  4.15it/s]

{'eval_loss': 1.415041208267212, 'eval_accuracy': 0.7986, 'eval_runtime': 16.264, 'eval_samples_per_second': 307.427, 'epoch': 5.6}


 91%|█████████▏| 4000/4375 [18:48<01:31,  4.11it/s]  

{'loss': 0.0253, 'learning_rate': 4.285714285714286e-06, 'epoch': 6.4}


                                                   
 91%|█████████▏| 4000/4375 [19:04<01:31,  4.11it/s]

{'eval_loss': 1.4580540657043457, 'eval_accuracy': 0.8006, 'eval_runtime': 16.303, 'eval_samples_per_second': 306.692, 'epoch': 6.4}


100%|██████████| 4375/4375 [20:35<00:00,  3.54it/s]


{'train_runtime': 1235.7925, 'train_samples_per_second': 3.54, 'epoch': 7.0}


In [13]:
test_label = label_to_num(df_test['label'].values)

tokenized_test = tokenizer(
    list(df_test['premise']),
    list(df_test['hypothesis']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

processed_test = copy.copy(tokenized_test)
processed_test.pop('token_type_ids')
test_dataset = kobartDataset(processed_test, test_label)

print(test_dataset.__len__())
print(test_dataset.__getitem__(1665))
print(tokenizer.decode(test_dataset.__getitem__(6)['input_ids']))

1666
{'input_ids': tensor([    0, 14784, 20343, 27194, 10047, 14584, 24093, 14200, 14205,  9561,
        14304,     7, 14784, 20343,  1700, 10079,  9495, 14380,  9866, 21693,
        22717,     1,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'label': tensor(3)}
<s> 18일 귀국이라 발인도 지켜드리지 못해 더욱 죄송할 따름입니다<unused0> 18일 배를 타고 여행을 떠났습니다</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [14]:


dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model.eval()
output_pred = []
output_prob = []

for i, data in enumerate(tqdm(dataloader)):
    with torch.no_grad():
        outputs = model(
            input_ids=data['input_ids'].to(device),
            attention_mask=data['attention_mask'].to(device),
        )
    logits = outputs[0]
    prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
    logits = logits.detach().cpu().numpy()
    result = np.argmax(logits, axis=-1)

    output_pred.append(result)
    output_prob.append(prob)
  
pred_answer, output_prob = np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()
print(pred_answer)

100%|██████████| 105/105 [00:03<00:00, 30.81it/s]

[1, 2, 2, 1, 1, 1, 2, 0, 0, 0, 1, 0, 1, 0, 2, 2, 0, 2, 1, 2, 2, 1, 1, 0, 1, 1, 1, 2, 2, 2, 1, 0, 1, 2, 2, 1, 0, 1, 0, 1, 2, 0, 1, 2, 2, 1, 2, 0, 1, 0, 1, 0, 1, 2, 0, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1, 0, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 1, 2, 0, 1, 0, 2, 0, 1, 0, 2, 2, 0, 2, 0, 2, 0, 2, 2, 1, 1, 1, 2, 1, 0, 2, 2, 1, 1, 0, 2, 1, 2, 1, 2, 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 2, 1, 2, 2, 1, 2, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 2, 0, 0, 2, 0, 0, 0, 1, 1, 2, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 0, 2, 2, 0, 1, 2, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 1, 0, 2, 2, 0, 2, 1, 1, 1, 2, 1, 1, 2, 0, 1, 1, 2, 1, 0, 1, 2, 0, 2, 2, 1, 1, 0, 2, 1, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 0, 1, 2, 2, 1, 0, 1, 2, 0, 1, 2, 2, 1, 0, 0, 2, 1, 1, 2, 2, 2, 1, 2, 1, 0, 2, 2, 0, 0, 2, 1, 1, 1, 2, 2, 2, 0, 0, 0, 2, 0, 2, 1, 0, 2, 0, 1, 0, 2, 0, 1, 1, 1, 2, 2, 2, 0, 2, 1, 1, 0, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 1, 0, 1, 2, 1, 1, 2, 0, 2, 




In [15]:
def num_to_label(label):
    label_dict = {0: "entailment", 1: "contradiction", 2: "neutral"}
    str_label = []

    for i, v in enumerate(label):
        str_label.append([i,label_dict[v]])
    
    return str_label

answer = num_to_label(pred_answer)
print(answer)

[[0, 'contradiction'], [1, 'neutral'], [2, 'neutral'], [3, 'contradiction'], [4, 'contradiction'], [5, 'contradiction'], [6, 'neutral'], [7, 'entailment'], [8, 'entailment'], [9, 'entailment'], [10, 'contradiction'], [11, 'entailment'], [12, 'contradiction'], [13, 'entailment'], [14, 'neutral'], [15, 'neutral'], [16, 'entailment'], [17, 'neutral'], [18, 'contradiction'], [19, 'neutral'], [20, 'neutral'], [21, 'contradiction'], [22, 'contradiction'], [23, 'entailment'], [24, 'contradiction'], [25, 'contradiction'], [26, 'contradiction'], [27, 'neutral'], [28, 'neutral'], [29, 'neutral'], [30, 'contradiction'], [31, 'entailment'], [32, 'contradiction'], [33, 'neutral'], [34, 'neutral'], [35, 'contradiction'], [36, 'entailment'], [37, 'contradiction'], [38, 'entailment'], [39, 'contradiction'], [40, 'neutral'], [41, 'entailment'], [42, 'contradiction'], [43, 'neutral'], [44, 'neutral'], [45, 'contradiction'], [46, 'neutral'], [47, 'entailment'], [48, 'contradiction'], [49, 'entailment'], 

In [16]:
df = pd.DataFrame(answer, columns=['index', 'label'])

df.to_csv('./result/submission.csv', index=False)

print(df)

      index          label
0         0  contradiction
1         1        neutral
2         2        neutral
3         3  contradiction
4         4  contradiction
...     ...            ...
1661   1661        neutral
1662   1662     entailment
1663   1663        neutral
1664   1664        neutral
1665   1665        neutral

[1666 rows x 2 columns]
