In [1]:
import os
os.environ["NCCL_P2P_DISABLE"] = "3"
os.environ["NCCL_IB_DISABLE"] = "3"
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType
import torch
import pandas as pd

### Config
BASE_MODEL = "google/flan-t5-base"
GENERATE_QUESTION_MODEL = "../../experiment/generate_data/generate_question_flan_t5"
TRAIN_SQUAD_PATH = '../../data/squad/train-v1.1_classified.json'
DEV_SQUAD_PATH = '../../data/squad/dev-v1.1.json'
TRAIN_CNN_PATH = '../../data/cnn/cnn_train_classified.json'
DEV_CNN_PATH = '../../data/cnn/cnn_dev.json'

with open(TRAIN_SQUAD_PATH, 'r') as f:
    train_squad_data = json.load(f)
with open(DEV_SQUAD_PATH, 'r') as f:
    dev_squad_data = json.load(f)
with open(TRAIN_CNN_PATH, 'r') as f:
    train_cnn_data = json.load(f)
with open(DEV_CNN_PATH, 'r') as f:
    dev_cnn_data = json.load(f)
    
target_context = []
for article in train_cnn_data['data']:
    for para in article['paragraphs']:
        target_context.append(para['context'])

  from .autonotebook import tqdm as notebook_tqdm
2025-09-25 17:10:33.342660: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load base model 
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL, device_map="auto", torch_dtype=torch.float16)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, peft_config)

`torch_dtype` is deprecated! Use `dtype` instead!
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [3]:
# Prepare the dataset
def create_question_prompt(input_text, question_type):
  # Define the prompt format for each question type
  prompt = f"""
  Given the context, generate a question based on the specified question type ({question_type}).

  Question Type: {question_type}
  Example: 
    Input: "{input_text}"
    Output: [generated_question]
  """
  return prompt.strip()


# Step 3: Tokenize the data
# def tokenize_gen_question(example,tokenizer, max_input_length=512, max_target_length=128):
#   prompt = create_question_prompt(example["context"], example["q_type"])
#   # Input 토크나이징 (context + question_type)
#   inputs = tokenizer(prompt,padding="max_length", truncation=True, max_length=512, return_tensors="pt")
#   # Traget 토크나이징 (question)
#   targets = tokenizer(example["question"], padding="max_length", truncation=True, max_length=64, return_tensors="pt")
#   return {"input_ids": inputs["input_ids"].squeeze(), "attention_mask": inputs["attention_mask"].squeeze(), "labels": targets["input_ids"].squeeze()}

def tokenize_gen_question(example, tokenizer, max_input_length=512, max_target_length=128):
    prompt = create_question_prompt(example["context"], example["q_type"])
    
    # 입력 토큰화 (context + question_type)
    model_inputs = tokenizer(
        prompt,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    # 타겟 토큰화 (question)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["question"],
            max_length=max_target_length,
            truncation=True,
            padding="max_length"
        )

    # padding 토큰(-100으로 변환 → loss 계산에서 무시)
    labels["input_ids"] = [
        (l if l != tokenizer.pad_token_id else -100) for l in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [4]:
qa_data = []

for idx1 in range(0,len(train_squad_data['data'][:1])):
    for idx2 in range(0,len(train_squad_data['data'][idx1]['paragraphs'])):
        qas = train_squad_data['data'][idx1]['paragraphs'][idx2]['qas']
        question_li = [qas[i]['question'] for i in range(len(qas))]
        q_type_li = [qas[i]['q_type'] for i in range(len(qas))]
        answer_li = [qas[i]['answers'][0]['text'] for i in range(len(qas))]
        context_li = [train_squad_data['data'][idx1]['paragraphs'][idx2]['context']] * (len(qas))

        for i,j,x,z in zip(context_li, question_li, answer_li, q_type_li):    
            qa_data.append({"context":i, "question":j,"answer":x,"q_type":z})

# Convert data into a Dataset object
dataset = Dataset.from_pandas(pd.DataFrame(qa_data))
# Apply the tokenizer to the dataset
tokenized_dataset = dataset.map(lambda x: tokenize_gen_question(x, tokenizer), batched=False)

print('len(qa_data): ', len(qa_data))
print('qa_data[0]: ', end='')
qa_data[0]

Map: 100%|██████████| 269/269 [00:00<00:00, 1005.60 examples/s]

len(qa_data):  269
qa_data[0]: 




{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answer': 'Saint Bernadette Soubirous',
 'q_type': 3}

In [5]:
# Define training arguments

args = TrainingArguments(
output_dir=GENERATE_QUESTION_MODEL, # 최종 모델 저장 경로
    per_device_train_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-4,
    logging_steps=10,
    save_strategy="no",  # Disable automatic saving
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

# Train the model using the Trainer API
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

# Start the training
trainer.train()
trainer.save_model(GENERATE_QUESTION_MODEL)

  trainer = Trainer(


Step,Training Loss
10,0.0
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0


- Inference

In [None]:
# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(GENERATE_QUESTION_MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(GENERATE_QUESTION_MODEL)

In [8]:
# Define a function to generate questions based on the context and question type
def generate_question(context, question_type):
    # Create the prompt for the T5 model
    prompt = f"""
    Given the context, generate a question based on the specified question type ({question_type}).

    Question Type: {question_type}
    Example: 
      Input: "{context}"
      Output: [generated_question]
    """

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

    # Generate the question using the model
    outputs = model.generate(
        input_ids=inputs["input_ids"], 
        attention_mask=inputs["attention_mask"], 
        max_length=64,  # Limit the length of the generated question
        num_beams=4,    # Use beam search for better quality
        early_stopping=True
    )

    # Decode the generated question
    generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_question

# Test with an example context
context = "Albert Einstein was a theoretical physicist who developed the theory of relativity."
question_type = "ENTY"  # Choose the question type (ABBR, DESC, ENTY, HUM, LOC, NUM)

# Generate the question
generated_question = generate_question(context, question_type)

# Print the generated question
print(f"Generated Question: {generated_question}")

Generated Question: What was Albert Einstein's occupation?


In [9]:
question_type_list = ['DESC', 'ENTY', 'ABBR', 'HUM', 'LOC', 'NUM']
context_list = [
    "Shakespeare was an English playwright and poet. He is widely regarded as one of the greatest writers in the English language.",
    "The Amazon River is the largest river in the world by discharge of water. It flows through South America, primarily in Brazil."]

for i in question_type_list:
    print(f"Question Type: {i}")
    for j in context_list:
        print(f"Context: {j}")
        print(f"Generated Question: {generate_question(j, i)}")
        print('-'*50)

Question Type: DESC
Context: Shakespeare was an English playwright and poet. He is widely regarded as one of the greatest writers in the English language.
Generated Question: What is the full name of the person who wrote Shakespeare?
--------------------------------------------------
Context: The Amazon River is the largest river in the world by discharge of water. It flows through South America, primarily in Brazil.
Generated Question: What is the largest river in the world by discharge of water?
--------------------------------------------------
Question Type: ENTY
Context: Shakespeare was an English playwright and poet. He is widely regarded as one of the greatest writers in the English language.
Generated Question: What was Shakespeare's occupation?
--------------------------------------------------
Context: The Amazon River is the largest river in the world by discharge of water. It flows through South America, primarily in Brazil.
Generated Question: What is the largest river in 

In [None]:
# Sample data for fine-tuning (you can replace this with your own dataset)
data = [
    {
        "context": "NASA is the United States government agency responsible for the civilian space program.",
        "q_type": "ABBR",
        "question": "What does the abbreviation 'NASA' stand for?"
    },
    {
        "context": "Photosynthesis is the process by which green plants use sunlight to synthesize foods from carbon dioxide and water.",
        "q_type": "DESC",
        "question": "Can you describe the process of photosynthesis?"
    },
    {
        "context": "Albert Einstein was a theoretical physicist who developed the theory of relativity.",
        "q_type": "ENTY",
        "question": "Who was Albert Einstein?"
    },
    {
        "context": "The first president of the United States was George Washington.",
        "q_type": "HUM",
        "question": "Who was the first president of the United States?"
    },
    {
        "context": "The Eiffel Tower is located in Paris, France.",
        "q_type": "LOC",
        "question": "Where is the Eiffel Tower located?"
    },
    {
        "context": "There are seven continents on Earth.",
        "q_type": "NUM",
        "question": "How many continents are there on Earth?"
    }
]