In [None]:
import os
os.environ["NCCL_P2P_DISABLE"] = "2"
os.environ["NCCL_IB_DISABLE"] = "2"
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import torch
import pandas as pd
import io

### Config
BASE_MODEL = "google/flan-t5-base"
GENERATE_ANSWER_MODEL = "../../experiment/generate_data/generate_answer_flan_t5_all_squad"
TRAIN_SQUAD_PATH = '../../data/squad/train-v1.1.json'
DEV_SQUAD_PATH = '../../data/squad/dev-v1.1.json'
TRAIN_HOTPOT_PATH = '../../data/target/hotpotqa_train_classified.jsonl'
DEV_HOTPOT_PATH = '../../data/target/hotpotqa_dev.jsonl'

with open(TRAIN_SQUAD_PATH, 'r') as f:
    train_squad_data = json.load(f)
with open(DEV_SQUAD_PATH, 'r') as f:
    dev_squad_data = json.load(f)
# with open(TRAIN_CNN_PATH, 'r') as f:
#     train_cnn_data = json.load(f)
# with open(DEV_CNN_PATH, 'r') as f:
#     dev_cnn_data = json.load(f)
TRAIN_HOTPOT = []
with io.open(TRAIN_HOTPOT_PATH, 'r', encoding='utf-8') as f:
    for sample in f:
        TRAIN_HOTPOT.append(json.loads(sample))
    
target_context = []
# for article in train_cnn_data['data']:
#     for para in article['paragraphs']:
#         target_context.append(para['context'])
for article in TRAIN_HOTPOT:
    target_context.append(article['context'])
        
target_question = []
# for article in train_cnn_data['data']:
#     for para in article['paragraphs']:
#         for qa in para['qas']:
#             target_question.append(qa['question'])
for article in TRAIN_HOTPOT:
    target_question.append(article['qas'][0]['question'])

  from .autonotebook import tqdm as notebook_tqdm
2025-10-01 13:37:48.888321: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load base model 
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL, device_map="auto", torch_dtype=torch.float16)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, peft_config)

`torch_dtype` is deprecated! Use `dtype` instead!
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [None]:
def create_answer_prompt(context, question):
    """
    Context와 Question이 주어졌을 때 Answer 추출을 위한 프롬프트 생성
    """
    prompt = f"""Given the context and question, extract the answer from the context that best answers the question.
    Context: "{context}"
    Question: "{question}"
    Answer: [extracted_answer]"""
    return prompt.strip()

def tokenize_gen_answer(example, tokenizer, max_input_length=512, max_target_length=128):
    prompt = create_answer_prompt(example["context"], example["question"])
    """
    QA 데이터를 토크나이징하는 함수
    """
    inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt"
    )

    targets = tokenizer(
        example["answer"],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
        return_tensors="pt"
    )

    labels = targets["input_ids"].squeeze()
    labels[labels == tokenizer.pad_token_id] = -100  # 중요!

    return {
        "input_ids": inputs["input_ids"].squeeze(), # 모델이 인코더에 넣을 토큰 ID들(독립변수)
        "attention_mask": inputs["attention_mask"].squeeze(),# 패딩 부분 무시하도록 표시
        "labels": labels # 모델이 디코더에서 예측해야 할 정답 토큰 ID들(종속변수)
    }

In [34]:
train_qa_data = []

for idx1 in range(0,len(train_squad_data['data'][:5])): ### 자르기
    for idx2 in range(0,len(train_squad_data['data'][idx1]['paragraphs'])):
        qas = train_squad_data['data'][idx1]['paragraphs'][idx2]['qas']
        question_li = [qas[i]['question'] for i in range(len(qas))]
        answer_li = [qas[i]['answers'][0]['text'] for i in range(len(qas))]
        context_li = [train_squad_data['data'][idx1]['paragraphs'][idx2]['context']] * (len(qas))

        for i,j,x in zip(context_li, question_li, answer_li):    
            train_qa_data.append({"context":i, "question":j, "answer":x})

# 데이터셋 생성 및 토크나이징 적용
train_dataset = Dataset.from_pandas(pd.DataFrame(train_qa_data))
train_dataset = train_dataset.map(lambda x: tokenize_gen_answer(x, tokenizer), batched=False)

dev_qa_data = []
for idx1 in range(len(dev_squad_data['data'][:2])):   # dev 전체 사용
    for idx2 in range(len(dev_squad_data['data'][idx1]['paragraphs'])):
        qas = dev_squad_data['data'][idx1]['paragraphs'][idx2]['qas']
        question_li = [qas[i]['question'] for i in range(len(qas))]
        answer_li = [qas[i]['answers'][0]['text'] for i in range(len(qas))]
        context_li = [dev_squad_data['data'][idx1]['paragraphs'][idx2]['context']] * len(qas)

        for context, question, answer in zip(context_li, question_li, answer_li):
            dev_qa_data.append({
                "context": context, 
                "question": question, 
                "answer": answer
            })
test_dataset = Dataset.from_pandas(pd.DataFrame(dev_qa_data))
test_dataset = test_dataset.map(lambda x: tokenize_gen_answer(x, tokenizer), batched=False)

Map: 100%|██████████| 1483/1483 [00:01<00:00, 1314.74 examples/s]
Map: 100%|██████████| 1057/1057 [00:00<00:00, 1268.64 examples/s]


In [35]:
train_sample = train_dataset[0]
print(train_sample["context"])
print(train_sample["question"])
print(train_sample["answer"])
print(train_sample["labels"])
print(tokenizer.decode([id for id in train_sample["labels"] if id != -100]))

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Saint Bernadette Soubirous
[2788, 8942, 9, 26, 1954, 264, 8371, 8283, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10

In [36]:
# LoRA 적용되었는지 확인
model.print_trainable_parameters()

trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


In [37]:
# 평가에서 loss 확인
trainer.evaluate(tokenized_dataset.select(range(100)))

{'eval_loss': 0.13497143983840942,
 'eval_runtime': 1.2666,
 'eval_samples_per_second': 78.951,
 'eval_steps_per_second': 10.264,
 'epoch': 3.0}

- Train

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=GENERATE_ANSWER_MODEL,      # 모델 저장 경로
    per_device_train_batch_size=4,         # 배치 크기
    num_train_epochs=3,                    # 학습 epoch 수
    learning_rate=5e-5,                    # 학습률 (조금 낮게 설정 권장)
    logging_strategy="steps",              # 로깅 전략
    logging_steps=200,   
    logging_first_step=True,               #  50 step마다 로그 출력
    save_strategy="epoch",                 # epoch 단위로 저장
    save_total_limit=2,                    # 최대 2개 모델만 저장
    # evaluation_strategy="steps",           # 일정 step마다 평가
    eval_steps=500,                        # 평가 주기
    # predict_with_generate=True,            # generate() 기반 평가 활성화
    fp16=True,                             # GPU가 FP16 지원 시 속도 ↑
    report_to="none"                       # wandb 등 외부 리포팅 비활성화
)

# Data collator (padding, label shift 자동 처리)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset, 
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Save final model
trainer.save_model(GENERATE_ANSWER_MODEL)

- Test

In [4]:
# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(GENERATE_ANSWER_MODEL)
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL, device_map="auto", torch_dtype=torch.float16)
# Load the PEFT model
model = PeftModel.from_pretrained(base_model, GENERATE_ANSWER_MODEL)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# results = trainer.evaluate(test_dataset)
# print(results)

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.
  trainer = Trainer(


- Inference

In [8]:
# Define a function to generate answers based on context and question
def generate_answer(context, question):
    """
    Context와 Question이 주어졌을 때 Answer를 생성하는 함수
    """
    # T5 모델용 프롬프트 생성 (question answering format)
    prompt = f"question: {question} context: {context}"
    
    # Tokenize the prompt
    inputs = tokenizer(
        prompt, 
        return_tensors="pt", 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )
    
    # Generate the answer using the model
    outputs = model.generate(
        input_ids=inputs["input_ids"], 
        attention_mask=inputs["attention_mask"], 
        max_length=128,     # Answer는 보통 question보다 길 수 있음
        min_length=5,       # 최소 길이 설정
        num_beams=4,        # Beam search for better quality
        early_stopping=True,
        do_sample=False,    # Deterministic generation
        temperature=1.0     # 필요시 조정
    )
    
    # Decode the generated answer
    generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_answer

# Generate the answer
generated_answer = generate_answer(target_context[0], target_question[0])

# Print the generated answer
print(f"Context: {target_context[0]}")
print(f"Question: {target_question[0]}")
print(f"Generated Answer: {generated_answer}")
print(f"Answer: {TRAIN_HOTPOT[0]['qas'][0]['answers'][0]}")

Context: [PAR] [TLE] The Oberoi Group [SEP] The Oberoi Group is a hotel company with its head office in Delhi.  Founded in 1934, the company owns and/or operates 30+ luxury hotels and two river cruise ships in six countries, primarily under its Oberoi Hotels & Resorts and Trident Hotels brands. [PAR] [TLE] Oberoi family [SEP] The Oberoi family is an Indian family that is famous for its involvement in hotels, namely through The Oberoi Group. 
Question: The Oberoi family is part of a hotel company that has a head office in what city?
Generated Answer: Delhi. Founded in 1934
Answer: Delhi


In [None]:
# # Answer Extraction용 데이터셋
# qa_data = [
#     {
#         "context": "NASA is the United States government agency responsible for the civilian space program, as well as aeronautics and space research.",
#         "question": "What does NASA stand for?",
#         "answer": "National Aeronautics and Space Administration"
#     },
#     {
#         "context": "Photosynthesis is the process by which green plants use sunlight to synthesize foods from carbon dioxide and water. This process occurs mainly in the leaves.",
#         "question": "What is photosynthesis?", 
#         "answer": "the process by which green plants use sunlight to synthesize foods from carbon dioxide and water"
#     },
#     {
#         "context": "Albert Einstein was a German-born theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics.",
#         "question": "Who was Albert Einstein?",
#         "answer": "a German-born theoretical physicist who developed the theory of relativity"
#     },
# ]