In [None]:
!nvidia-smi
from google.colab import drive
drive.mount('/content/drive')
!ls drive/'My Drive'/'Colab Notebooks'/
!pip install -r drive/'My Drive'/'Colab Notebooks'/test_project/requirements.txt
import sys
sys.path.append('drive/My Drive/Colab Notebooks/')

Tue Jan 25 07:34:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
# Both variables are set before torch is imported below; keep this order.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid and is
# expected to slow training down - confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# NOTE(review): presumably restricts the process to GPU index 0 - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

In [None]:
# Pretrained KoGPT2 language model plus the tokenizer configured with the
# special tokens used throughout this notebook.
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='<s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<unused0>',
)

root_path = 'drive/My Drive/Colab Notebooks/test_project'
data_path = f"{root_path}/Chatbot_data/ChatbotData.csv"

# For testing purposes only the first 300 rows are processed.
Chatbot_Data = pd.read_csv(data_path).head(300)
Chatbot_Data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [None]:
class ChatbotDataset(Dataset):
    """Dataset that turns chat rows (Q, A, label) into fixed-length training samples.

    Each sample is the token sequence ``<usr> Q <unused1> label <sys> A </s>``
    padded (or truncated) to ``max_len`` positions, together with a 0/1 mask
    marking the answer positions and the label ids for those positions.
    """

    def __init__(self, chats, max_len=40, tokenizer=None):
        """Store the chat table and prepare the tokenizer.

        Args:
            chats: table supporting ``len()`` and ``.iloc[idx]`` whose rows
                expose the keys ``'Q'``, ``'A'`` and ``'label'``
                (e.g. a pandas DataFrame).
            max_len: number of token positions in every returned sample.
            tokenizer: optional, already-loaded tokenizer to reuse. When it is
                ``None`` the ``skt/kogpt2-base-v2`` tokenizer is loaded.

        Raises:
            ValueError: if ``max_len`` is smaller than 2, because a sample
                needs at least one question and one answer position.
        """
        if max_len < 2:
            raise ValueError(f'max_len must be at least 2, got {max_len}')
        self._data = chats
        self.max_len = max_len
        self.q_token = '<usr>'
        self.a_token = '<sys>'
        self.sent_token = '<unused1>'
        self.eos = '</s>'
        self.mask = '<unused0>'
        if tokenizer is None:
            tokenizer = PreTrainedTokenizerFast.from_pretrained(
                "skt/kogpt2-base-v2", bos_token='<s>', eos_token='</s>',
                unk_token='<unk>', pad_token='<pad>', mask_token='<unused0>')
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of chat rows."""
        return len(self._data)

    def __getitem__(self, idx):
        """Build the training sample for row ``idx``.

        Returns:
            A tuple ``(token_ids, mask, labels_ids)``:
            ``token_ids`` is a list of ``max_len`` ids for question + answer
            followed by padding; ``mask`` is a numpy array of ``max_len``
            entries that is 1 on the answer positions and 0 elsewhere;
            ``labels_ids`` is a list of ``max_len`` ids holding the mask token
            on the question positions, then the answer tokens shifted by one,
            then padding.
        """
        turn = self._data.iloc[idx]
        sentiment = str(turn['label'])
        q_toked = self.tokenizer.tokenize(
            self.q_token + turn['Q'] + self.sent_token + sentiment)
        a_toked = self.tokenizer.tokenize(self.a_token + turn['A'] + self.eos)
        q_len = len(q_toked)
        a_len = len(a_toked)

        if q_len + a_len > self.max_len:
            a_len = self.max_len - q_len
            if a_len <= 0:
                # The question alone fills the window: keep only its last
                # max_len // 2 tokens so the answer gets the remaining room.
                q_toked = q_toked[-(self.max_len // 2):]
                q_len = len(q_toked)
                a_len = self.max_len - q_len
                assert a_len > 0
            # NOTE(review): truncating here can cut off the trailing </s>;
            # confirm that this is acceptable for long samples.
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        pad_id = self.tokenizer.pad_token_id

        # Labels: mask token on every question position, then the answer
        # shifted left by one token; the remainder is padded below.
        labels = [self.mask] * q_len + a_toked[1:]
        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)
        labels_ids += [pad_id] * (self.max_len - len(labels_ids))

        # Mask: 0 for the question, 1 for the answer, 0 for the padding.
        mask = [0] * q_len + [1] * a_len + [0] * (self.max_len - q_len - a_len)

        token_ids = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked)
        token_ids += [pad_id] * (self.max_len - len(token_ids))

        return (token_ids, np.array(mask), labels_ids)

In [None]:
def collate_batch(batch):
    """Stack a list of dataset samples into three int64 tensors.

    Args:
        batch: sequence of ``(token_ids, mask, labels_ids)`` samples whose
            fields all have the same length across the batch.

    Returns:
        Tuple ``(data, mask, label)`` of ``torch.long`` tensors, each with one
        row per sample.
    """
    data = [item[0] for item in batch]
    mask = [item[1] for item in batch]
    label = [item[2] for item in batch]

    def _to_long(rows):
        # Stack the rows with numpy first: building a tensor directly from a
        # list of numpy arrays (the mask field) is a slow path in PyTorch.
        return torch.as_tensor(np.asarray(rows), dtype=torch.long)

    return _to_long(data), _to_long(mask), _to_long(label)

# Wrap the chat table in the dataset and batch it with the custom collate function.
train_set = ChatbotDataset(Chatbot_Data, max_len=40)
train_dataloader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_batch,
)

In [None]:
# Pick the GPU when one is available and move the model there in training mode.
ctx = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(ctx)
model.to(device)
model.train()

# reduction="none" keeps one loss value per position so the training loop can
# normalise by the number of answer tokens itself.
learning_rate = 3e-5
criterion = torch.nn.CrossEntropyLoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Number of training epochs.
epoch = 1
# Large negative value written over the logits of masked-out positions.
Sneg = -1e18

In [None]:
save_ckpt_path = '/content/drive/MyDrive/Colab Notebooks/test_project/checkpoint/test.pth'
# A checkpoint is written every `save_step` training steps.
save_step = 100

# Defaults used when no checkpoint exists yet; total_train_step is included so
# the name is defined on a fresh run as well.
pre_epoch, pre_loss, train_step, total_train_step = 0, 0, 0, 0

# ---------------------------- load trained model ----------------------------
if os.path.isfile(save_ckpt_path):
    # NOTE: torch.load unpickles the file - only load checkpoints you created.
    checkpoint = torch.load(save_ckpt_path, map_location=device)
    pre_epoch = checkpoint['epoch']
    pre_loss = checkpoint['loss']
    train_step = checkpoint['train_no']
    total_train_step = checkpoint['total_train_step']

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    print(f"load pretrain from: {save_ckpt_path}, epoch={pre_epoch}")
# -----------------------------------------------------------------------------

# NOTE(review): the step counter restarts at 0 even when a checkpoint was
# restored (train_step / pre_epoch are not used afterwards) - confirm intended.
count = 0

load pretrain from: /content/drive/MyDrive/Colab Notebooks/test_project/checkpoint/test.pth, epoch=0


In [None]:
print ("start")

# Make sure the checkpoint directory exists so saving cannot fail on a
# missing folder.
_ckpt_dir = os.path.dirname(save_ckpt_path)
if _ckpt_dir:
    os.makedirs(_ckpt_dir, exist_ok=True)


def _save_checkpoint(epoch_idx, step, loss_value):
    """Write model/optimizer state and progress counters to save_ckpt_path."""
    torch.save({
        'epoch': epoch_idx,
        'train_no': step,
        'model_state_dict': model.state_dict(),
        'total_train_step': len(train_dataloader),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss_value,
    }, save_ckpt_path)


# A separate loop variable keeps `epoch` (the number of epochs) intact, so
# re-running this cell trains again instead of iterating over range(0).
epoch_idx = 0
# Scalar average loss of the most recent step; stays None if no step ran.
last_loss = None
for epoch_idx in range(epoch):
    for samples in train_dataloader:
        optimizer.zero_grad()
        token_ids, mask, label = (t.to(ctx) for t in samples)
        logits = model(token_ids).logits

        # Keep the logits on answer positions (mask == 1) and overwrite every
        # other position with the constant Sneg. The (batch, seq, 1) condition
        # broadcasts over the vocabulary axis, so no full-size mask is built.
        keep = mask.unsqueeze(dim=2) == 1
        mask_out = torch.where(keep, logits, torch.full_like(logits, Sneg))
        loss = criterion(mask_out.transpose(2, 1), label)

        # Average loss: summed per-position loss divided by the number of
        # answer tokens in the batch.
        avg_loss = loss.sum() / mask.sum()
        avg_loss.backward()
        optimizer.step()
        last_loss = avg_loss.item()

        if count > 0 and count % save_step == 0:
            _save_checkpoint(epoch_idx, count, last_loss)
        count += 1

_save_checkpoint(epoch_idx, count, last_loss)
print ("end")

start
end


In [None]:
# sent = '0'
# with torch.no_grad():
#     while 1:
#         q = input("user > ").strip()
#         if q == "quit":
#             break
#         a = ""
#         while 1:
#             input_ids = torch.LongTensor(tokenizer.encode("<usr>" + q + '<unused1>' + sent + "<sys>" + a)).unsqueeze(dim=0)
#             if ctx == 'cuda':
#                 input_ids = input_ids.to(ctx)
#             pred = model(input_ids)
#             pred = pred.logits
#             if ctx == 'cuda':
#                 pred = pred.to(ctx)
#             gen = tokenizer.convert_ids_to_tokens(torch.argmax(pred, dim=-1).squeeze().cpu().numpy().tolist())[-1]
#             if gen == '</s>':
#                 break
#             a += gen.replace("▁", " ")
#         print("Chatbot > {}".format(a.strip()))