<a href="https://colab.research.google.com/github/MrKkondae/practiceLLM/blob/main/miniLLM_GPT2_JSON_DataSet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. 라이브러리 설치 및 환경 설정

In [None]:
!pip install transformers datasets accelerate --upgrade

import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 491.5/491.5 kB 10.2 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 2.9 MB/s eta 0:00:00
Downloading fsspec-2025.3.0-py3-none-any.whl (… output truncated)

2. KoGPT2 모델 및 토크나이저 로드

In [None]:
# Load the pretrained KoGPT2 checkpoint with its language-modeling head.
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")

# Load the tokenizer from the same checkpoint, spelling out every special
# token explicitly (note that BOS and EOS share the same '</s>' token).
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
    bos_token='</s>',
    eos_token='</s>',
)

3. 파인튜닝용 대화 데이터셋 가져오기
📌 1단계: Colab에서 파일 업로드

In [None]:
from google.colab import files

# Open the Colab file-upload dialog; the value returned by the call is kept
# in `uploaded`.
uploaded = files.upload()

📌 2단계: JSON 파일을 Hugging Face Dataset으로 로딩

In [None]:
import json
from datasets import Dataset

# Read the UTF-8 encoded JSON file and parse it into Python objects.
with open("대화_데이터셋_UTF8.json", mode="r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

# Wrap the parsed records in a Hugging Face Dataset.
dataset = Dataset.from_list(data)

# Sanity check: show the first record.
print(dataset[0])


4. 데이터셋 토큰화

In [None]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of one dataset record.

    The text is truncated and padded to a fixed length of 128 tokens so that
    every encoded record has the same size.

    Args:
        example: A dataset record that provides a ``"text"`` entry.

    Returns:
        The tokenizer's output for ``example["text"]``.
    """
    encoded = tokenizer(
        example["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded


# Drop the raw "text" column so only the tokenizer outputs remain.
tokenized_dataset = dataset.map(function=tokenize_function, remove_columns=["text"])

5. 학습설정 및 Trainer 구성

In [None]:
# Collator for causal language modeling (mlm=False disables masked-LM masking).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training hyperparameters plus logging / checkpoint cadence.
training_args = TrainingArguments(
    output_dir="./kogpt2-finetuned",
    report_to="none",
    remove_unused_columns=False,
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_steps=5,
    save_steps=10,
    save_total_limit=2,
)

# Tie the model, the tokenized data, the collator and the arguments together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)


6. 파인튜닝 실행

In [None]:
# Start fine-tuning with the previously configured Trainer.
trainer.train()

✅ 7. 파인튜닝된 모델로 응답 테스트

In [None]:
def chat_kogpt2(prompt, max_length=100):
    """Sample a continuation of ``prompt`` from the module-level KoGPT2 model.

    Args:
        prompt: Text the generation is conditioned on.
        max_length: Forwarded to ``model.generate`` as ``max_length``; this
            limit counts the prompt tokens together with the generated ones.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens removed and surrounding whitespace stripped.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    input_ids = input_ids.to(model.device)
    output = model.generate(
        input_ids,
        # A single, un-padded sequence: every position is a real token. Passing
        # the mask explicitly matters because pad is set to eos below, so the
        # mask cannot be inferred from the ids.
        attention_mask=input_ids.new_ones(input_ids.shape),
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
    # The returned sequence starts with the prompt tokens. Cut them off by
    # token count rather than by character count: the decoded text is not
    # guaranteed to reproduce the prompt string character-for-character, so a
    # character-offset slice could clip the reply or leave prompt residue.
    prompt_token_count = input_ids.shape[-1]
    generated_ids = output[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()



# 대화 루프 시작
print("🗨️ KoGPT2 챗봇입니다. '종료'라고 입력하면 끝나요.")
while True:
    user_input = input("👤 사용자: ")
    if user_input.strip() == "종료":
        print("🤖 챗봇: 안녕히 가세요!")
        break
    full_prompt = f"사용자: {user_input}\n챗봇:"
    response = chat_kogpt2(full_prompt)
    print(f"🤖 챗봇: {response}"