In [6]:
import pandas as pd
import numpy as np
import json
import re

import torch
import torch.nn as nn
import transformers
from collections import Counter
from transformers import T5TokenizerFast, T5ForConditionalGeneration, AdamW
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import TrainerCallback
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


### Data 로드

In [7]:
def open_json(url):
  with open(url) as f:
    data = json.load(f)
  return data

In [8]:
# 초등
train1 = open_json('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/DATA/01-1.정식개방데이터/Training/01.원천데이터/TS_01. 학교급_01. 초등/상담기록_데이터_초등학교.json')
# 중등
train2 = open_json('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/DATA/01-1.정식개방데이터/Training/01.원천데이터/TS_01. 학교급_02. 중등/상담기록_데이터_중학교.json')
# 고등
train3 = open_json('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/DATA/01-1.정식개방데이터/Training/01.원천데이터/TS_01. 학교급_03. 고등/상담기록_데이터_고등학교.json')

In [9]:
# 초등
test1 = open_json('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/DATA/01-1.정식개방데이터/Validation/01.원천데이터/VS_01. 학교급_01. 초등/상담기록_데이터_초등학교.json')
# 중등
test2 = open_json('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/DATA/01-1.정식개방데이터/Validation/01.원천데이터/VS_01. 학교급_02. 중등/상담기록_데이터_중학교.json')
# 고등
test3 = open_json('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/DATA/01-1.정식개방데이터/Validation/01.원천데이터/VS_01. 학교급_03. 고등/상담기록_데이터_고등학교.json')

### 함수 정의

#### 대화 추출

In [10]:
def get_conversation(data):
  conversation = []

  for i in range(len(data)): # 각 학생별로
    student = data[str(i)]['conversation'] # 각 학생의 대화
    # student['conversation'] # 대화 내용
    # print(len(student))

    for j in range(len(student)):
      talk = student[j]['utterances'] # 각 학생의 진짜 대화
      utter = []
      first_speech = True

      for k in range(len(talk)):

        if talk[k]['speaker_idx'].startswith("S"): # 학생이 하는 말이면
          if (
              any(word in talk[k]['utterance'] for word in ["아니오","아니요","안녕","감사"]) # 얘네가 있으면
              or talk[k]['utterance'] in ['네!', '네 [이모티콘]', '네', '네~', '네~!', '네.', '넵', '넵!', '넵~', '넵 [이모티콘]', '네 하하', '하하하', '하하','넵 하하']):  # 네 라면
            continue

        if talk[k]['speaker_idx'].startswith("T") and talk[k]['utterance'] == '파일':
          continue

        if len(talk[k]['utterance']) < 5: # 10글자 이하라면
          continue

        if first_speech: # 각 학생의 첫 번째 대화
          speaker = talk[k]['speaker_idx']
          utter = [talk[k]['utterance']]
          first_speech = False

        else: # 그 다음 대화
          if talk[k]['speaker_idx'] == speaker: # 같은 사람이면
            utter.append(talk[k]['utterance']) # 이번 발화를 저번 발화에 이어 붙이고

          else: # 다른 사람이 되면
            conversation.append({
                'speaker' : speaker,
                'utterance' : ' '.join(utter)
            })
            utter = [talk[k]['utterance']]

          speaker = talk[k]['speaker_idx'] # speaker 갱신

      if not first_speech and utter:
        conversation.append({
            'speaker' : speaker,
            'utterance' : ' '.join(utter)
        })

  return conversation

#### 기호, [이모티콘] 처리

In [11]:
def remove_punc(data):
  for sent in data:
    # '[이모티콘]' -> ~
    sent['utterance'] = sent['utterance'].replace('[이모티콘]','')
    # 링크 제거
    sent['utterance'] = re.sub(r'https://\S+', '', sent['utterance'])
    sent['utterance'] = re.sub(r'http://\S+', '', sent['utterance'])
    # , . ? ! 는 그대로 살려 (앞뒤 공백만 추가) -> 공백이 너무 많아서 일단 이거 제거
    # sent['utterance'] = re.sub(r'([,.~!?])', r' \1 ', sent['utterance'])
    # 한글, 영어, 숫자, 앞서 살리기로 한 기호가 아니면 제거
    sent['utterance'] = re.sub(r'[^ㄱ-ㅎ가-힣,.~!?\d\w ]', '', sent['utterance'])

  return data

#### 띄어쓰기 한 번으로

In [12]:
def one_space(data):
  for sent in data:
    sent['utterance'] = re.sub(r'\s+',' ', sent['utterance'])
  return data

#### 질문 / 답변 페어링

In [26]:
def make_pairs(data):
  # pairs = pd.DataFrame()
  questions = []
  answers = []

  for i in range(len(data)-1): # 각 줄 별로

    num = 2*i

    if num+1 == len(data):
      break

    question = data[num]['utterance']
    answer = data[num+1]['utterance']

    questions.append(question)
    answers.append(answer)


  # pairs['Question'] = questions
  # pairs['Answer'] = answers

  return questions, answers

### 함수 적용 - 전처리 완료

In [13]:
# 대화 추출
train_conv1 = get_conversation(train1)
train_conv2 = get_conversation(train2)
train_conv3 = get_conversation(train3)
# ------
test_conv1 = get_conversation(test1)
test_conv2 = get_conversation(test2)
test_conv3 = get_conversation(test3)

In [14]:
# 통합
train_conv = train_conv1 + train_conv2 + train_conv3
test_conv = test_conv1 + test_conv2 + test_conv3

In [15]:
# 기호 제거
train_wo_punc = remove_punc(train_conv)
test_wo_punc = remove_punc(test_conv)

In [16]:
# 띄어쓰기 제거
train_wo_space = one_space(train_wo_punc)
test_wo_space = one_space(test_wo_punc)

In [None]:
# 질문-답변 페어링
train_question, train_answer = make_pairs(train_wo_space)
test_question, test_answer = make_pairs(test_wo_space)

In [None]:
train_question[:10]

['어서 오세요 방학인데 휴가는 다녀왔어요? 어디로 다녀왔나요? 네~ 조부모님 댁인가요?',
 '여행이요 하하',
 '학생은 형제자매가 어떻게 돼요?',
 '여동생 한 명이에요!',
 '동생이 몇 살인가요?',
 '9살이에요',
 '동생이 있어서 좋은 점은 뭘까요?',
 '심심하지 않다는 점?',
 '네~ 그럼 불편한 점도 있나요?',
 '많이 시끄러워요 장난도 심해요']

In [None]:
train_answer[:10]

['여행이요 하하',
 '학생은 형제자매가 어떻게 돼요?',
 '여동생 한 명이에요!',
 '동생이 몇 살인가요?',
 '9살이에요',
 '동생이 있어서 좋은 점은 뭘까요?',
 '심심하지 않다는 점?',
 '네~ 그럼 불편한 점도 있나요?',
 '많이 시끄러워요 장난도 심해요',
 '어떤 장난을 해요?']

#### 문장 길이 확인

In [None]:
train_question_len = [len(sent) for sent in train_question]
train_answer_len = [len(sent) for sent in train_answer]

In [None]:
print(pd.Series(train_question_len).describe())

count    302072.000000
mean         49.247640
std          70.344794
min           0.000000
25%          16.000000
50%          30.000000
75%          57.000000
max        5100.000000
dtype: float64


In [None]:
print(pd.Series(train_answer_len).describe())

count    302072.000000
mean         49.247818
std          70.344856
min           0.000000
25%          16.000000
50%          30.000000
75%          57.000000
max        5100.000000
dtype: float64


### Tokenize & Encoding

In [None]:
## paust/pko-chat-t5-large <- 실패..
tokenizer = T5TokenizerFast.from_pretrained('paust/pko-t5-small')
max_length = 70

#### 토큰 개수 확인

In [None]:
for sent in train_wo_space[:10]:
  print(sent['utterance'])

어서 오세요 방학인데 휴가는 다녀왔어요? 어디로 다녀왔나요? 네~ 조부모님 댁인가요?
여행이요 하하
학생은 형제자매가 어떻게 돼요?
여동생 한 명이에요!
동생이 몇 살인가요?
9살이에요
동생이 있어서 좋은 점은 뭘까요?
심심하지 않다는 점?
네~ 그럼 불편한 점도 있나요?
많이 시끄러워요 장난도 심해요


In [None]:
check_tokens = []
for sent in train_wo_space: # 페어링 안 된 상태에서
  token_ids = tokenizer.encode(sent['utterance']) # 토큰화된 상태의 id
  tokens = [tokenizer.decode([id]) for id in token_ids] # id -> 토큰으로
  check_tokens.append(tokens)

In [None]:
flattened = [token for tokens in check_tokens for token in tokens]

In [None]:
counts = Counter(flattened)

In [None]:
counts.most_common(10)

[(' ', 3463372),
 ('요', 322884),
 ('</s>', 302073),
 ('.', 151358),
 ('이', 147261),
 ('?', 139914),
 ('을', 123349),
 ('에', 102827),
 (',', 98579),
 ('것', 86706)]

#### ChatBotDataset Class 정의

In [None]:
class ChatBotDataset(Dataset):
  def __init__(self, questions, answers, tokenizer, max_length):
    self.questions = questions
    self.answers = answers
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.questions)

  def __getitem__(self, idx): # 한 문장씩 받아와서
    question = self.questions[idx] # 문자열인지 확인
    answer = self.answers[idx]

    inputs = self.tokenizer(question,
                            max_length=self.max_length,
                            padding='max_length',
                            truncation=True,
                            return_tensors='pt')

    labels = self.tokenizer(answer,
                            max_length=self.max_length,
                            padding='max_length',
                            truncation=True,
                            return_tensors = 'pt')

    return{
        'input_ids' : inputs.input_ids[0],
        'labels' : labels.input_ids[0]
    }

In [None]:
train_data = ChatBotDataset(train_question, train_answer, tokenizer, max_length)
dev_data = ChatBotDataset(test_question, test_answer, tokenizer, max_length)

### 모델 불러오기

In [None]:
# model 정의
## paust/pko-chat-t5-large <- 실패..
model = T5ForConditionalGeneration.from_pretrained('paust/pko-t5-small')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(50358, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50358, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

### 학습 진행

In [None]:
# optimizer, loss function은 모델에 내장된 거 사용
training_arguments = TrainingArguments(
    # fp16 = True, # 메모리 줄이기 위함 - 성능이 약간은 떨어질 수 있다고 하지만 일단은... 시도해보자
    output_dir = '/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    learning_rate = 2e-3, # 기존 2e-5
    per_device_train_batch_size = 64, # 기존 64
    per_device_eval_batch_size = 64, # 기존 64
    gradient_accumulation_steps = 2, # 중간에  GPU가 부족하면 batch size 조정
    num_train_epochs = 5,
    weight_decay = 0.01,
    logging_dir = './logs',
    logging_steps = 500,
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    greater_is_better = False,
    save_total_limit = 2, # 저장할 체크포인트 개수 (최신 2개만)
    report_to = 'none' # W&B 비활성화,,
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience = 3,
    early_stopping_threshold = 0.001
)

class SaveModelCallback(TrainerCallback): # epoch 완료시 저장

    def on_epoch_end(self, args, state, control, **kwargs):
        save_path = f"/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/epoch-{state.epoch}"
        print(f"Epoch {state.epoch} 완료. 모델 저장 중: {save_path}")
        kwargs['model'].save_pretrained(save_path)

trainer = Trainer(
    model = model,
    args = training_arguments,
    callbacks = [early_stopping_callback, SaveModelCallback()],
    train_dataset = train_data,
    eval_dataset = dev_data
)



#### 학습 첫 실행

In [None]:
trainer.train(resume_from_checkpoint = False)
trainer.evaluate()

model.save_pretrained('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/End/')
tokenizer.save_pretrained('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/End/')
print(" 모델 저장 완료! :D ")

#### 학습 이어서 하기

In [None]:
model_path = "/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/epoch-1.0" ## 어디서 멈췄는지 알아야
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5TokenizerFast.from_pretrained('paust/pko-t5-small')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(50358, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50358, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [None]:
trainer = Trainer(
    model = model,
    args = training_arguments,
    callbacks = [early_stopping_callback, SaveModelCallback()],
    train_dataset = train_data,
    eval_dataset = dev_data
)

In [None]:
try:
  trainer.train(resume_from_checkpoint = True)
  trainer.evaluate()
except Exception as e:
  print(f"오류 발생! {e}")

model.save_pretrained('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/')
tokenizer.save_pretrained('/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/')

print(" 모델 저장 완료! :D ")

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Epoch,Training Loss,Validation Loss
2,0.8099,0.814245
3,0.7725,0.801747
4,0.7247,0.799309
5,0.6835,0.803274


Epoch 2.0 완료. 모델 저장 중: /content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/epoch-2.0
Epoch 3.0 완료. 모델 저장 중: /content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/epoch-3.0
Epoch 4.0 완료. 모델 저장 중: /content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/epoch-4.0
Epoch 5.0 완료. 모델 저장 중: /content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/epoch-5.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


 모델 저장 완료! :D 


### 응답 생성하기

In [3]:
model_path = "/content/drive/MyDrive/2. KOREA UNIV./2024 Fall/청소년 데이터 공모전/Model/epoch-4.0" ## 어디서 멈췄는지 알아야
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5TokenizerFast.from_pretrained('paust/pko-t5-small')
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(50358, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50358, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [5]:
# 이전 3개 대화 참고하게끔

conversation_history = []

while (1):
  user_input = input("사용자: ")

  conversation_history.append(f"사용자 : {user_input}")
  if len(conversation_history) > 3: # 3개보다 많이 입력되면
    conversation_history.pop(0) # 가장 오래된 대화 제거

  if user_input == '그만할래요':
    break

  conversation_text = "\n".join(conversation_history)

  # 프롬프트 추가
  prompt = f"너는 사용자의 고민과 이야기를 들어주는 진로 추천 전문 상담사야. 사용자와 대화를 주고받으며 사용자에게 가장 어울리는 진로를 추천하거나 조언을 해 줘. \n\n{conversation_text}\n\n답:\n"
  input_ids = tokenizer(prompt, return_tensors = 'pt').input_ids.to(device)

  # 모델에 입력
  logits = model.generate(
      input_ids,
      max_length = 150,
      temperature = 0.9,
      no_repeat_ngram_size = 6,
      do_sample = True,
      num_return_sequences = 1
  )

  # 모델 응답 추출
  model_response = tokenizer.batch_decode(logits, skip_special_tokens = True)[0]

  # 답변에 응답 추가
  conversation_history.append(f"답: {model_response}")
  if len(conversation_history) > 3:
    conversation_history.pop(0)

  print("답: ",model_response)

사용자: 뭐를 하고 싶은지 모르겠어요ㅠㅜ 도와주세요!
답:  네. 좋아요, 답변 부탁드릴게요.
사용자: MBTI 검사 결과 ENTP가 나왔는데, 저한테 어울리는 직업은 뭘까요?
답:  INFP가 나온 것 같아요. 제 기억으로는 앱 개발자 이런 거였던 거 같아요.
사용자: 저는 ENTP인걸요..
답:  아 그렇군요!
사용자: 저는 뭐를 하면 좋을까요? ENTP에게 어울리는 직업!
답:  그럼 과학 고등학교는 어디로 가고 싶은가요?
사용자: 과학 고등학교를 가면 무엇을 할 수 있죠?
답:  저는 대학교 때 공과대학교를 전공해서 로봇 공학 쪽으로 가고 싶어요


KeyboardInterrupt: Interrupted by user