In [4]:
from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
from flask import Flask, request, jsonify
from google.colab import drive
import pandas as pd
import numpy as np
import torch
import os


In [5]:
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
raw_data = pd.read_csv('AI_chatbot(4th_prj)/data/preprocessed_qna_hs_v2.csv')
df = pd.DataFrame(raw_data)

In [7]:
questions = df['questions'].tolist()
answers = df['answers'].tolist()

for q, a in zip(questions[:5], answers[:5]):
    print(f"Q: {q}\nA: {a}\n")

Q: 대성각의 대표적인 메뉴를 알 수 있나요?
A: 대성각의 메뉴에는 짜장면, 짬뽕이 있습니다

Q: 대성각의 영업시간은 어떻게 되나요?
A: 대성각의 영업시간은 11:00 - 15:00/21:00입니다

Q: 대성각의 연락처를 알 수 있나요?
A: 대성각의 연락처는 02-356-2194입니다

Q: 대성각에 인접한 시설을 알 수 있나요?
A: 대성각에 인접한 시설에는 역촌역 3번출구, 그림나라아동미술, 메리피아노음악교습소가 있습니다

Q: 대성각의 휴무일을 알 수 있나요?
A: 대성각의 휴무일은 매주 일요일입니다



In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [9]:
class QADataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length=128):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]
        input_ids = self.tokenizer.encode(question, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length').squeeze()
        label_ids = self.tokenizer.encode(answer, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length').squeeze()
        return input_ids, label_ids

In [10]:
qa_dataset = QADataset(questions, answers, tokenizer)
qa_dataloader = DataLoader(qa_dataset, batch_size=4, shuffle=True)

model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model.resize_token_embeddings(len(tokenizer))
model.to('cuda')

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [11]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [12]:
class EarlyStopping:
    def __init__(self, patience=3, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = float('inf')
        self.patience_counter = 0
        self.early_stop = False

    def __call__(self, loss):
        if self.best_loss - loss > self.delta:
            self.best_loss = loss
            self.patience_counter = 0
        else:
            self.patience_counter += 1
            if self.patience_counter >= self.patience:
                self.early_stop = True

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for input_ids, label_ids in dataloader:
            input_ids = input_ids.to(device)
            label_ids = label_ids.to(device)
            outputs = model(input_ids, labels=label_ids)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [13]:
early_stopping = EarlyStopping(patience=3, delta=0.01)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [None]:
for epoch in range(50):
    model.train()
    total_loss = 0
    for input_ids, label_ids in qa_dataloader:
        input_ids = input_ids.to(device)
        label_ids = label_ids.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, labels=label_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(qa_dataloader)
    print(f"Epoch: {epoch}, Loss: {avg_loss}")

    val_loss = evaluate(model, qa_dataloader, device)
    print(f"Validation Loss: {val_loss}")

    early_stopping(val_loss)

    if early_stopping.early_stop:
        print("Early stopping triggered.")
        break

    if val_loss < early_stopping.best_loss:
        print("Saving the best model...")
        torch.save(model.state_dict(), 'best_model.pth')

# 모델 저장
torch.save(model.state_dict(), 'final_model.pth')

Epoch: 0, Loss: 1.1844942704835801
Validation Loss: 0.9939034619064535
Epoch: 1, Loss: 0.9862318199569711
Validation Loss: 0.8991119613186463


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

def generate_response(question, tokenizer, model, max_length=512):
    inputs = tokenizer.encode(question, return_tensors='pt').to(device)

    with torch.no_grad():
        outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

if __name__ == "__main__":
    while True:
        user_input = input("사용자 질문: ")
        if user_input.lower() in ['exit', 'quit']:
            print("종료합니다.")
            break

        response = generate_response(user_input, tokenizer, model)

        print(f"챗봇 응답: {response}")