In [1]:
import math
import numpy as np
import pandas as pd
import random
import re
import torch
import urllib.request
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast
import urllib.request

## **전처리**

In [3]:
all_data = pd.read_csv("/content/drive/MyDrive/KUBIG/여름CONTEST/AllData.csv")


In [4]:
all_data.head()

Unnamed: 0,req,res
0,너 좋아하는 차 종류 있어?,무슨 차? 자동차? 마시는 차?
1,ㅋㅋ 마시는 차 말한 거야!,"아하 나 둥글레, 옥수수, 보리차 좋아해"
2,완전 곡물류 좋아하네 ㅋㅋ,야쓰 끓이기 귀찮아서 냉침해 먹어
3,그럼 오래 걸리지 않아?,끓이는 것보다는 훨씬 오래 걸리지 ㅠ
4,근데 냉침 하는 것도 귀찮겠다 ㅜㅠ,응! 그래서 매일은 안 먹고 가끔 마셔


In [5]:
all_data.dropna(inplace=True)

In [6]:
# 찾은 규칙 기반 텍스트 정제
import re

def preprocess(input_string) :
    cleaned1 = re.sub(r'(~|#)(?=\S)|(?<=\S)(~|#)', '', input_string)

    cleaned2 = re.sub(r'&+\w+&|@+\w+&|\@\w+', '<고유명사>', cleaned1)

    cleaned3 = re.sub(r'\S*\(\(\w*\)\)\S*', '', cleaned2)

    # Excluding <unk>
    special = re.compile(r'([^가-힣\s\d.,?,<알수없음>])')
    cleaned4 = special.sub('', cleaned3)

    # 특정 단어 제거
    words_to_remove = ['아', '그', '음', '오', '힉', '에', '아하', '엥', '흑', '헐', '아아', '후', '홀', '웅', '마자', '아아아', '아아아아',
                       '헤헷', '앗', '어', '헉', '휴', '흠', '하', '야쓰', '열', '궈궈궈', '키키', '글구', '마즈마즈', '쏴리', '흐흐']
    pattern = r'\b(' + '|'.join(map(re.escape, words_to_remove)) + r')~?\b'
    cleaned5 = re.sub(pattern, '', cleaned4)


    cleaned6 = re.sub(r'\*\*.*?\*\*', '', cleaned5)

    cleaned_text_final = re.sub(r"[^가-힣0-9a-zA-Z\s.,!?]", "", cleaned6)

    cleaned_text_final = re.sub(r'\s+', ' ', cleaned_text_final).strip()

    return cleaned_text_final

In [7]:
# 정제
all_data['req'] = all_data['req'].apply(preprocess)
all_data['res'] = all_data['res'].apply(preprocess)

In [8]:
all_data.replace("", float("NaN"), inplace=True)
print(all_data.isnull().values.any())
all_data.dropna(inplace=True) # False

True


In [9]:
all_data = all_data.drop_duplicates(['req']).reset_index(drop=True)
print(f"필터링된 데이터셋 총 개수 : {len(all_data)}") # 88691

필터링된 데이터셋 총 개수 : 88691


In [10]:
all_data = all_data.drop_duplicates(['res']).reset_index(drop=True)
print(f"필터링된 데이터셋 총 개수 : {len(all_data)}") # 87968

필터링된 데이터셋 총 개수 : 87968


In [11]:
# 한 글자 이하로 되어 있는 모든 행 제거
all_data = all_data[~all_data['req'].str.len().le(1) & ~all_data['res'].str.len().le(1)]

# 100자 이상인 행 모두 제거
all_data = all_data[~all_data['req'].str.len().gt(100) & ~all_data['res'].str.len().gt(100)]

## **Pre-trained KoGPT2 fine tuning**

In [12]:
import torch
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

Q_TKN = "<usr>"
A_TKN = "<sys>"
BOS = '</s>'
EOS = '</s>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'

In [13]:
koGPT2_TOKENIZER = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token='<unk>',
            pad_token=PAD, mask_token=MASK)
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [14]:
class ChatbotDataset(Dataset):
    def __init__(self, chats, max_len=40):  # 데이터셋의 전처리를 해주는 부분
        self._data = chats
        self.max_len = max_len
        self.q_token = Q_TKN
        self.a_token = A_TKN
        self.sent_token = SENT
        self.eos = EOS
        self.mask = MASK
        self.tokenizer = koGPT2_TOKENIZER

    def __len__(self):  # chatbotdata 의 길이를 리턴
        return len(self._data)

    def __getitem__(self, idx):  # 챗봇 데이터를 차례차례 DataLoader로 넘겨주는 메서드
        turn = self._data.iloc[idx]
        q = turn["req"]  # 질문을 가져온다.
        q = re.sub(r"([?.!,])", r" ", q)  # 구둣점들을 제거한다.

        a = turn["res"]  # 답변을 가져온다.
        a = re.sub(r"([?.!,])", r" ", a)  # 구둣점들을 제거한다.

        q_toked = self.tokenizer.tokenize(self.q_token + q + self.sent_token)
        q_len = len(q_toked)

        a_toked = self.tokenizer.tokenize(self.a_token + a + self.eos)
        a_len = len(a_toked)

        #질문의 길이가 최대길이보다 크면
        if q_len > self.max_len:
            a_len = self.max_len - q_len        #답변의 길이를 최대길이 - 질문길이
            if a_len <= 0:       #질문의 길이가 너무 길어 질문만으로 최대 길이를 초과 한다면
                q_toked = q_toked[-(int(self.max_len / 2)) :]   #질문길이를 최대길이의 반으로
                q_len = len(q_toked)
                a_len = self.max_len - q_len              #답변의 길이를 최대길이 - 질문길이
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        #질문의 길이 + 답변의 길이가 최대길이보다 크면
        if q_len + a_len > self.max_len:
            a_len = self.max_len - q_len        #답변의 길이를 최대길이 - 질문길이
            if a_len <= 0:       #질문의 길이가 너무 길어 질문만으로 최대 길이를 초과 한다면
                q_toked = q_toked[-(int(self.max_len / 2)) :]   #질문길이를 최대길이의 반으로
                q_len = len(q_toked)
                a_len = self.max_len - q_len              #답변의 길이를 최대길이 - 질문길이
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        # 답변 labels = [mask, mask, ...., mask, ..., <bos>,..답변.. <eos>, <pad>....]
        labels = [self.mask,] * q_len + a_toked[1:]

        # mask = 질문길이 0 + 답변길이 1 + 나머지 0
        mask = [0] * q_len + [1] * a_len + [0] * (self.max_len - q_len - a_len)
        # 답변 labels을 index 로 만든다.
        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)
        # 최대길이만큼 PADDING
        while len(labels_ids) < self.max_len:
            labels_ids += [self.tokenizer.pad_token_id]

        # 질문 + 답변을 index 로 만든다.
        token_ids = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked)
        # 최대길이만큼 PADDING
        while len(token_ids) < self.max_len:
            token_ids += [self.tokenizer.pad_token_id]

        #질문+답변, 마스크, 답변
        return (token_ids, np.array(mask), labels_ids)

In [15]:
def collate_batch(batch):
    data = [item[0] for item in batch]
    mask = [item[1] for item in batch]
    label = [item[2] for item in batch]
    return torch.LongTensor(data), torch.LongTensor(mask), torch.LongTensor(label)

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_set = ChatbotDataset(all_data, max_len=40)
train_dataloader = DataLoader(train_set,
                              batch_size=32,
                              num_workers=0,
                              shuffle=True,
                              collate_fn=collate_batch,
                              )

model.to(device)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [17]:
learning_rate = 3e-5
criterion = torch.nn.CrossEntropyLoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Sneg = -1e18

In [18]:
from tqdm import tqdm

print("start")
num_epochs = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for batch_idx, samples in enumerate(tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{num_epochs}")):
        optimizer.zero_grad()
        token_ids, mask, label = samples
        token_ids = token_ids.to(device)
        mask = mask.to(device)
        label = label.to(device)
        out = model(token_ids)
        out = out.logits      #Returns a new tensor with the logit of the elements of input
        mask_3d = mask.unsqueeze(dim=2).repeat_interleave(repeats=out.shape[2], dim=2)
        mask_out = torch.where(mask_3d == 1, out, Sneg * torch.ones_like(out))
        loss = criterion(mask_out.transpose(2, 1), label)
        avg_loss = loss.sum() / mask.sum()
        avg_loss.backward()
        optimizer.step()

print("end")

start
Epoch 1/10


  return torch.LongTensor(data), torch.LongTensor(mask), torch.LongTensor(label)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Training Epoch 1/10:  46%|████▌     | 1261/2746 [03:59<04:41,  5.27it/s]


KeyboardInterrupt: 

In [None]:
with torch.no_grad():
    while 1:
        q = input("user > ").strip()
        if q == "quit":
            break
        a = ""
        while 1:
            input_ids = torch.LongTensor(koGPT2_TOKENIZER.encode(Q_TKN + q + SENT + A_TKN + a)).unsqueeze(dim=0)
            model = model.to('cpu')
            pred = model(input_ids)
            pred = pred.logits
            gen = koGPT2_TOKENIZER.convert_ids_to_tokens(torch.argmax(pred, dim=-1).squeeze().numpy().tolist())[-1]
            if gen == EOS:
                break
            a += gen.replace("▁", " ")
        print("Chatbot > {}".format(a.strip()))

In [None]:
import torch

model = torch.load('/content/drive/MyDrive/KUBIG/여름CONTEST/model_final')
tokenizer = torch.load("/content/drive/MyDrive/KUBIG/여름CONTEST/tokenizer_final")

model.eval()

In [None]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

# 모델과 토크나이저 저장 경로
model_save_path = "/content/drive/MyDrive/KUBIG/여름CONTEST/model_final"
tokenizer_save_path = "/content/drive/MyDrive/KUBIG/여름CONTEST/tokenizer_final"

# 모델 저장
model.save_pretrained(model_save_path)

# 토크나이저 저장
koGPT2_TOKENIZER.save_pretrained(tokenizer_save_path)


In [None]:
# ### 기록1
# with torch.no_grad():
#     while 1:
#         q = input("user > ").strip()
#         if q == "quit":
#             break
#         a = ""
#         while 1:
#             input_ids = torch.LongTensor(koGPT2_TOKENIZER.encode(Q_TKN + q + SENT + A_TKN + a)).unsqueeze(dim=0)
#             model = model.to('cpu')
#             pred = model(input_ids)
#             pred = pred.logits
#             gen = koGPT2_TOKENIZER.convert_ids_to_tokens(torch.argmax(pred, dim=-1).squeeze().numpy().tolist())[-1]
#             if gen == EOS:
#                 break
#             a += gen.replace("▁", " ")
#         print("Chatbot > {}".format(a.strip()))

In [None]:
# from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

# # 모델과 토크나이저 저장 경로
# model_save_path = "/content/drive/MyDrive/KUBIG/여름CONTEST/model"
# tokenizer_save_path = "/content/drive/MyDrive/KUBIG/여름CONTEST/tokenizer"

# # 모델 저장
# model.save_pretrained(model_save_path)

# # 토크나이저 저장
# koGPT2_TOKENIZER.save_pretrained(tokenizer_save_path)


In [7]:
import torch
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

Q_TKN = "<usr>"
A_TKN = "<sys>"
BOS = '</s>'
EOS = '</s>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'

In [5]:
model_save_path = '/content/drive/MyDrive/KUBIG/여름CONTEST/model_final'

tokenizer_save_path = '/content/drive/MyDrive/KUBIG/여름CONTEST/tokenizer_final'

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

model_chatbot = GPT2LMHeadModel.from_pretrained(model_save_path)
tokenizer_chatbot = PreTrainedTokenizerFast.from_pretrained(tokenizer_save_path)
model_chatbot.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [8]:
with torch.no_grad():
    while 1:
        q = input("user > ").strip()
        if q == "quit":
            break
        a = ""
        while 1:
            input_ids = torch.LongTensor(tokenizer_chatbot.encode(Q_TKN + q + SENT + A_TKN + a)).unsqueeze(dim=0)
            model_chatbot = model_chatbot.to('cpu')
            pred = model_chatbot(input_ids)
            pred = pred.logits
            gen = tokenizer_chatbot.convert_ids_to_tokens(torch.argmax(pred, dim=-1).squeeze().numpy().tolist())[-1]
            if gen == EOS:
                break
            a += gen.replace("▁", " ")
        # input_ids = tokenizer_int_cc.encode(a, return_tensors = "pt").to(device)
        # # 모델 예측
        # output_ids = model_int_cc.generate(input_ids, max_length = 50, num_beams = 5, early_stopping = True)
        # # 결과 해독
        # output_sentence = tokenizer_int_cc.decode(output_ids[0], skip_special_tokens=True)
        print("Chatbot > {}".format(a.strip()))

user > 요즘 날씨 어때?
Chatbot > 진짜 오락가락해
user > 둘이 사귀면 잘 어울리겠다 으휴
Chatbot > 근데 요즘은 다들 결혼하고 잘 살더라
user > 난 커널스 카라멜이 젤 맛있어 아삭바삭해
Chatbot > 난 지금 배고파서 그냥 그랬어
user > 저녁 뭐 먹을까?
Chatbot > 오늘은 뭐 먹게 점심
user > 오늘 고기 먹고 싶었는데
Chatbot > 고기 파티 했니
user > quit
