In [2]:
question_generation_prompt = """
Given the context, generate a question based on the specified question type (ABBR, DESC, ENTY, HUM, LOC, NUM). 

Question Type: ABBR (Abbreviation)
Example: 
  Input: "NASA is the United States government agency responsible for the civilian space program."
  Output: "What does the abbreviation 'NASA' stand for?"
  
Question Type: DESC (Description)
Example: 
  Input: "Photosynthesis is the process by which green plants use sunlight to synthesize foods from carbon dioxide and water."
  Output: "Can you describe the process of photosynthesis?"

Question Type: ENTY (Entity)
Example: 
  Input: "Albert Einstein was a theoretical physicist who developed the theory of relativity."
  Output: "Who was Albert Einstein?"

Question Type: HUM (Human)
Example: 
  Input: "The first president of the United States was George Washington."
  Output: "Who was the first president of the United States?"

Question Type: LOC (Location)
Example: 
  Input: "The Eiffel Tower is located in Paris, France."
  Output: "Where is the Eiffel Tower located?"

Question Type: NUM (Number)
Example: 
  Input: "There are seven continents on Earth."
  Output: "How many continents are there on Earth?"

Input: [target_context]
Output: [generated_question]
"""

In [1]:
import os
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType
import torch
import pandas as pd

train_squad_path = '/home/gayeon39/gayeon/[DA]/model/QC4QA/data/squad/train-v1.1_classified.json'
dev_squad_path = '/home/gayeon39/gayeon/[DA]/model/QC4QA/data/squad/dev-v1.1.json'
train_cnn_path = '/home/gayeon39/gayeon/[DA]/model/QC4QA/data/target/cnn_train_classified.json'
dev_cnn_path = '/home/gayeon39/gayeon/[DA]/model/QC4QA/data/target/cnn_dev.json'

with open(train_squad_path, 'r') as f:
    train_squad_data = json.load(f)
with open(dev_squad_path, 'r') as f:
    dev_squad_data = json.load(f)
with open(train_cnn_path, 'r') as f:
    train_cnn_data = json.load(f)
with open(dev_cnn_path, 'r') as f:
    dev_cnn_data = json.load(f)
    
target_context = []
for article in train_cnn_data['data']:
    for para in article['paragraphs']:
        target_context.append(para['context'])
target_context[:3]

  from .autonotebook import tqdm as notebook_tqdm


['-- two of the most influential papers for voters in Iowa and New Hampshire -- the first two states to weigh in at the polls in 2008 -- both endorsed John McCain in the GOP presidential race , but differed in their choice in the Democratic contest . the Register backs Hillary Clinton , while the Globe picks Obama . the Register backed Hillary Clinton , while the Globe picked Obama , in excerpts of sunday \'s editorials posted on their papers \' web sites saturday night . the Iowa caucuses are january 3 , and New Hampshire \'s primary follows five days later . the Globe \'s editorial board dismissed concerns over the Illinois senator \'s relative lack of Washington experience . " it is true that all the other Democratic contenders have more conventional resumes , and have spent more time in Washington , " the board wrote . " but that exposure has tended to give them a sense of government \'s constraints . Obama is more open to its possibilities . " but the Register \'s editorial board 

In [2]:
# Step 1: Load base model (FLAN-T5)
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, peft_config)

`torch_dtype` is deprecated! Use `dtype` instead!
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [3]:
# Step 2: Prepare the dataset
def create_prompt(input_text, question_type):
    # Define the prompt format for each question type
    prompt = f"""
    Given the context, generate a question based on the specified question type ({question_type}).
    
    Question Type: {question_type}
    Example: 
      Input: "{input_text}"
      Output: [generated_question]
    """
    return prompt.strip()

# Sample data for fine-tuning (you can replace this with your own dataset)
data = [
    {
        "target_context": "NASA is the United States government agency responsible for the civilian space program.",
        "question_type": "ABBR",
        "generated_question": "What does the abbreviation 'NASA' stand for?"
    },
    {
        "target_context": "Photosynthesis is the process by which green plants use sunlight to synthesize foods from carbon dioxide and water.",
        "question_type": "DESC",
        "generated_question": "Can you describe the process of photosynthesis?"
    },
    {
        "target_context": "Albert Einstein was a theoretical physicist who developed the theory of relativity.",
        "question_type": "ENTY",
        "generated_question": "Who was Albert Einstein?"
    },
    {
        "target_context": "The first president of the United States was George Washington.",
        "question_type": "HUM",
        "generated_question": "Who was the first president of the United States?"
    },
    {
        "target_context": "The Eiffel Tower is located in Paris, France.",
        "question_type": "LOC",
        "generated_question": "Where is the Eiffel Tower located?"
    },
    {
        "target_context": "There are seven continents on Earth.",
        "question_type": "NUM",
        "generated_question": "How many continents are there on Earth?"
    }
]

# Step 3: Tokenize the data
def tokenize_data(example):
    prompt = create_prompt(example["target_context"], example["question_type"])
    inputs = tokenizer(prompt, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    targets = tokenizer(example["generated_question"], padding="max_length", truncation=True, max_length=64, return_tensors="pt")
    return {"input_ids": inputs["input_ids"].squeeze(), "attention_mask": inputs["attention_mask"].squeeze(), "labels": targets["input_ids"].squeeze()}

# Convert data into a Dataset object
dataset = Dataset.from_pandas(pd.DataFrame(data))

# Apply the tokenizer to the dataset
tokenized_dataset = dataset.map(tokenize_data, batched=False)

Map: 100%|██████████| 6/6 [00:00<00:00, 589.34 examples/s]


In [None]:
# Step 4: Define training arguments
# args = TrainingArguments(
#     output_dir="./results",         # Output directory for model checkpoints
#     learning_rate=5e-5,             # Learning rate
#     per_device_train_batch_size=8,  # Batch size
#     per_device_eval_batch_size=8,   # Evaluation batch size
#     num_train_epochs=10,             # Number of training epochs
#     save_steps=10_000,              # Save model checkpoints every 10,000 steps
#     save_total_limit=2,             # Keep only the last 2 checkpoints
#     logging_dir="./logs",           # Log directory
#     logging_steps=500,              # Log every 500 steps
# )
args = TrainingArguments(
output_dir="./flan_t5_domain_qa",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-4,
    logging_steps=10,
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

# Step 5: Train the model using the Trainer API
trainer = Trainer(
    model=model,                    # Model to train
    args=args,              # Training arguments
    train_dataset=tokenized_dataset, # Training dataset
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

# Start the training
trainer.train()
trainer.save_model("./fine_tuned_t5")

  trainer = Trainer(


Step,Training Loss


- Inference

In [5]:
# Step 1: Load the fine-tuned model and tokenizer
model_name = "./fine_tuned_t5"  
# model_name = "./results/checkpoint-10"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [6]:
# Step 2: Define a function to generate questions based on the context and question type
def generate_question(context, question_type):
    # Create the prompt for the T5 model
    prompt = f"""
    Given the context, generate a question based on the specified question type ({question_type}).

    Question Type: {question_type}
    Example: 
      Input: "{context}"
      Output: [generated_question]
    """

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

    # Generate the question using the model
    outputs = model.generate(
        input_ids=inputs["input_ids"], 
        attention_mask=inputs["attention_mask"], 
        max_length=64,  # Limit the length of the generated question
        num_beams=4,    # Use beam search for better quality
        early_stopping=True
    )

    # Decode the generated question
    generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_question

# Step 3: Test with an example context
context = "Albert Einstein was a theoretical physicist who developed the theory of relativity."
question_type = "ENTY"  # Choose the question type (ABBR, DESC, ENTY, HUM, LOC, NUM)

# Generate the question
generated_question = generate_question(context, question_type)

# Print the generated question
print(f"Generated Question: {generated_question}")

Generated Question: What was Albert Einstein's occupation?


In [12]:
question_type_list = ['DESC', 'ENTY', 'ABBR', 'HUM', 'LOC', 'NUM']
# context_list = [
#     "NASA is the United States government agency responsible for the civilian space program.",
#     "Albert Einstein was a theoretical physicist who developed the theory of relativity.",
#     "The Eiffel Tower is located in Paris, France.",]
context_list = [
    "Shakespeare was an English playwright and poet. He is widely regarded as one of the greatest writers in the English language.",
    "The Amazon River is the largest river in the world by discharge of water. It flows through South America, primarily in Brazil."]

for i in question_type_list:
    print(f"Question Type: {i}")
    for j in context_list:
        print(f"Context: {j}")
        print(f"Generated Question: {generate_question(j, i)}")
        print('-'*50)

Question Type: DESC
Context: Shakespeare was an English playwright and poet. He is widely regarded as one of the greatest writers in the English language.
Generated Question: What is the full name of the person who wrote Shakespeare?
--------------------------------------------------
Context: The Amazon River is the largest river in the world by discharge of water. It flows through South America, primarily in Brazil.
Generated Question: What is the largest river in the world by discharge of water?
--------------------------------------------------
Question Type: ENTY
Context: Shakespeare was an English playwright and poet. He is widely regarded as one of the greatest writers in the English language.
Generated Question: What was Shakespeare's occupation?
--------------------------------------------------
Context: The Amazon River is the largest river in the world by discharge of water. It flows through South America, primarily in Brazil.
Generated Question: What is the largest river in 