In [1]:
from datasets import load_dataset

# Load the train and test splits of WebQuestions and keep only the rows whose
# `answers` list has exactly one entry.
train_ds, test_ds = [
    load_dataset("stanfordnlp/web_questions", split=split_name).filter(
        lambda example: len(example["answers"]) == 1
    )
    for split_name in ("train", "test")
]



In [7]:
# Print every test question followed by the first entry of its `answers` list
# (the only entry, given the length-1 filter applied when the split was loaded).
# Each pair is followed by a blank line.
for q, a in zip(test_ds["question"], test_ds["answers"]):
    print(f"Q: {q}\nA: {a[0]}\n")


Q: what did james k polk do before he was president?
A: Lawyer

Q: what is the oregon ducks 2012 football schedule?
A: University of Oregon

Q: who plays ken barlow in coronation street?
A: Tony Warren

Q: what happened after mr. sugihara died?
A: Yaotsu

Q: who did mozart write his four horn concertos for?
A: wolfgang amadeus mozart used story by pierre beaumarchais

Q: where is jamarcus russell from?
A: Mobile

Q: where was george washington carver from?
A: Diamond

Q: who was richard nixon married to?
A: Pat Nixon

Q: what country did germany invade first in ww1?
A: Belgium

Q: who is governor of ohio 2011?
A: John Kasich

Q: who was vice president after kennedy died?
A: Lyndon B. Johnson

Q: who is the minority leader of the house of representatives now?
A: Nancy Pelosi

Q: who is keyshia cole dad?
A: Leon Cole

Q: what town was martin luther king assassinated in?
A: Memphis

Q: where did edgar allan poe died?
A: Baltimore

Q: when did charles goodyear invented rubber?
A: During th

In [2]:
import random 
def format_example(example):
    """Render one dataset row as a two-line ``Q: ...`` / ``A: ...`` block.

    `example` is indexed with the keys "question" and "answers"; only the
    first entry of "answers" is shown.
    """
    question, answer = example["question"], example["answers"][0]
    return f"Q: {question}\nA: {answer}"

# Create few-shot examples
def get_few_shot_examples(dataset, k=5, seed=42):
    """Build a k-shot prompt from `k` randomly chosen rows of `dataset`.

    Args:
        dataset: Indexable collection supporting ``len()`` whose items are
            accepted by ``format_example``.
        k: Number of examples to draw (without replacement).
        seed: Seed of the private random generator used for the draw.

    Returns:
        The formatted examples joined by blank lines.

    Raises:
        ValueError: If ``k`` is negative or larger than ``len(dataset)``
            (raised by ``random.Random.sample``).
    """
    # A dedicated Random instance selects the same indices as
    # ``random.seed(seed); random.sample(...)`` would, but it does not reset
    # the process-wide `random` state behind the caller's back.
    rng = random.Random(seed)
    indices = rng.sample(range(len(dataset)), k)
    return "\n\n".join(format_example(dataset[i]) for i in indices)

# Build the shared 5-shot context from the training split (the default seed is
# spelled out here) and show it.
few_shot_prompt = get_few_shot_examples(dataset=train_ds, k=5, seed=42)
print(few_shot_prompt)

Q: who is the governor of hawaii now?
A: Neil Abercrombie

Q: what was nelson mandela's religion?
A: Methodism

Q: who played sean in scrubs?
A: Michael Jackson: 30th Anniversary Special

Q: what political party was henry clay?
A: Whig Party

Q: who are iran's major trading partners?
A: Madagascar


In [3]:
# Show the installed transformers release. Per the recorded outputs this was
# 4.49.0, and the model-loading cell that follows failed on it because the
# `qwen3` model type was not recognized.
import transformers
transformers.__version__

'4.49.0'

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

from constants import MODEL_PATHs

model_name = "qwen3_1.7b"

# Resolve the checkpoint location once so the tokenizer and the model are
# guaranteed to come from the same path. To pull from the Hub instead, pass
# "Qwen/Qwen3-1.7B" to both from_pretrained calls.
# NOTE: the recorded run of this cell raised a ValueError on transformers
# 4.49.0 (model type `qwen3` not recognized); upgrade transformers before
# re-running.
model_path = MODEL_PATHs[model_name]
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# The generation helpers pad their batches; reuse EOS as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

ValueError: The checkpoint you are trying to load has model type `qwen3` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. In this case, you can get the most up-to-date code by installing Transformers from source with the command `pip install git+https://github.com/huggingface/transformers.git`

In [None]:
qa_instruction = "Answer the question based on general world knowledge. Provide a short and direct answer."


def generate_answers(prompts):
    """Greedy-decode a short answer for each question using the few-shot QA prompt.

    Args:
        prompts: List of raw question strings.

    Returns:
        ``(full_prompts, answers)``: the fully formatted prompt strings that
        were fed to the model and, aligned with them, the stripped text of the
        newly generated tokens only.
    """
    # Clear the sampling knobs on the shared generation config so they do not
    # conflict with the greedy decoding requested below (do_sample=False).
    model.generation_config.temperature = None
    model.generation_config.top_p = None

    prompts = [f"{qa_instruction}\n\n{few_shot_prompt}\n\nQ: {prompt}\nA:" for prompt in prompts]
    encoded = tokenizer(
        prompts,
        padding=True, truncation=True,
        padding_side='left',
        return_tensors="pt",
        return_attention_mask=True).to(model.device)
    generated_ids = model.generate(
        input_ids=encoded["input_ids"],
        # The pad token is the EOS token, so padding cannot be told apart from
        # real tokens by id alone; hand the mask to generate() explicitly.
        attention_mask=encoded["attention_mask"],
        max_new_tokens=20,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id)

    # With left padding every prompt ends at the same column, so everything
    # past the input width is newly generated. Slicing token ids avoids relying
    # on decode() reproducing the prompt text character for character.
    prompt_width = encoded["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_width:]
    new_generated_texts = [
        text.strip()
        for text in tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)
    ]
    return prompts, new_generated_texts


In [None]:
from torch.utils.data import DataLoader

def webqa_collate_fn(batch):
    """Collate dataset rows into two parallel lists.

    Returns ``(questions, answers)`` where ``questions[i]`` is the "question"
    field of ``batch[i]`` and ``answers[i]`` is its "answers" field, left
    un-flattened (so `answers` is a list of lists).
    """
    questions, answers = [], []
    for item in batch:
        questions.append(item["question"])
        answers.append(item["answers"])
    return questions, answers

In [None]:
import re
import string

def normalize_answer(s):
    """Canonicalize an answer string for exact-match comparison.

    Steps, in order: lowercase; delete every character in
    ``string.punctuation``; replace the standalone words "a", "an" and "the"
    with a space; collapse whitespace runs to single spaces and trim the ends.
    """
    lowered = s.lower()
    no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    no_articles = re.sub(r'\b(a|an|the)\b', ' ', no_punct)
    return ' '.join(no_articles.split())

def exact_match_score(predictions, gold_answers):
    """Fraction of predictions that exactly match their gold answer after normalization.

    Args:
        predictions: Sequence of predicted answer strings.
        gold_answers: Sequence aligned with `predictions`. Each entry is either
            a single gold string or a list/tuple of acceptable gold strings; a
            prediction counts as correct if it matches any of them.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when `predictions` is empty.

    Raises:
        ValueError: If the two sequences differ in length (zip would otherwise
            silently drop the unmatched tail while still dividing by
            ``len(predictions)``).
    """
    if len(predictions) != len(gold_answers):
        raise ValueError(
            f"predictions and gold_answers differ in length: "
            f"{len(predictions)} vs {len(gold_answers)}"
        )
    if len(predictions) == 0:
        return 0.0

    hits = 0
    for pred, gold in zip(predictions, gold_answers):
        # Accept both a bare string and a collection of alternative answers.
        candidates = [gold] if isinstance(gold, str) else gold
        pred_norm = normalize_answer(pred)
        if any(pred_norm == normalize_answer(candidate) for candidate in candidates):
            hits += 1
    return hits / len(predictions)

In [None]:
from tqdm import tqdm
all_preds1 = []
all_prompts1 = []
all_answers1 = []

# DataLoader's keyword for a custom batching function is `collate_fn`; any
# other keyword name is rejected with a TypeError. Shuffling stays at its
# default (off), so results line up with the dataset's row order, which the
# later add_column cells rely on.
dataloader = DataLoader(test_ds, batch_size=8, collate_fn=webqa_collate_fn)
for questions, answers in tqdm(dataloader):
    # Each item's `answers` is a list; keep its first entry as the gold answer.
    answers = [ans[0] for ans in answers]
    prompts, generations = generate_answers(questions)
    # Keep only the first generated line as the prediction.
    predictions = [gen.strip().split('\n')[0] for gen in generations]
    all_preds1.extend(predictions)
    all_answers1.extend(answers)
    all_prompts1.extend(prompts)

In [None]:
# Score the predictions collected from the original (un-paraphrased) questions
# against their gold answers and display the result.
exact_match = exact_match_score(all_preds1, all_answers1)
exact_match

In [None]:
def get_few_shot_paraphrases(few_shot=False, idx=0):
    """Build the instruction header for the paraphrasing prompt.

    Args:
        few_shot: When true, append five worked ``Q: ... / Paraphrase: ...``
            demonstrations after the instruction; otherwise return the
            instruction followed by blank lines only.
        idx: Which of the five hand-written paraphrases (0-4) to show for each
            demonstration question. It is looked up even when `few_shot` is
            false, so an out-of-range value raises IndexError either way.

    Returns:
        The prompt prefix as a single string.
    """
    instruction = """
Paraphrase the following question. Keep the original meaning, but use a different sentence structure and vocabulary. Aim to make the paraphrase sound natural and diverse.
    """

    # (question, [five alternative paraphrases]) pairs, in prompt order.
    demonstrations = [
        (
            "who is the governor of hawaii now?",
            [
                "as of now, who leads Hawaii as its governor?",
                "who's currently serving as Hawaii's governor?",
                "can you tell me who governs Hawaii right now?",
                "who’s in charge of the Hawaii state government these days?",
                "who’s the top executive official in Hawaii right now?",
            ],
        ),
        (
            "what was nelson mandela's religion?",
            [
                "what was Mandela’s faith tradition",
                "can you tell me Mandela’s religion?",
                "what faith did Nelson Mandela practice?",
                "what was the religious affiliation of Nelson Mandela?",
                "what religion did Nelson Mandela follow?",
            ],
        ),
        (
            "who played sean in scrubs?",
            [
                "which actor portrayed Sean in Scrubs?",
                "who took on the role of Sean in Scrubs?",
                "who played the character Sean in the TV show Scrubs?",
                "who was the actor that played Sean in the series Scrubs?",
                "do you know who played the part of Sean in Scrubs?",
            ],
        ),
        (
            "what political party was henry clay?",
            [
                "Henry Clay was a member of which political party?",
                "to which party did Henry Clay pledge his allegiance?",
                "what political affiliation did Henry Clay have?",
                "under which political banner did Henry Clay serve?",
                "where did Henry Clay stand on the political party map?",
            ],
        ),
        (
            "who are iran's major trading partners?",
            [
                "who does Iran trade with the most?",
                "who are the primary countries doing business with Iran?",
                "what are Iran’s strongest trade relationships?",
                "which countries top the list of Iran’s key trade allies?",
                "which countries are central to Iran’s import and export network?",
            ],
        ),
    ]

    # Format the demonstrations before branching so that `idx` is validated in
    # both modes.
    few_shot_prompt = []
    for prompt, variants in demonstrations:
        para = variants[idx]
        few_shot_prompt.append(f"Q: {prompt}\nParaphrase: {para}")

    if not few_shot:
        return f"{instruction}\n\n"
    return f"{instruction}\n" + "\n\n".join(few_shot_prompt)

In [None]:
# Show the few-shot paraphrasing prompt built from the first paraphrase of each
# demonstration (idx=0 is the default, spelled out here).
print(get_few_shot_paraphrases(few_shot=True, idx=0))

In [None]:
import torch
import random
import numpy as np


def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with `seed`.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def generate_paraphrases(prompts, idx, seed=42):
    """Sample one paraphrase per question using the few-shot paraphrasing prompt.

    Args:
        prompts: List of question strings to paraphrase.
        idx: Which demonstration paraphrase to show in the few-shot context;
            forwarded to ``get_few_shot_paraphrases``.
        seed: Passed to ``set_seed`` immediately before sampling, so repeating
            a call with the same inputs repeats the draw.

    Returns:
        List of stripped generated continuations (new tokens only), aligned
        with `prompts`.
    """
    context = get_few_shot_paraphrases(few_shot=True, idx=idx)
    prompts = [f"{context}\n\nQ: {prompt}\nParaphrase:" for prompt in prompts]
    encoded = tokenizer(
        prompts,
        padding=True, truncation=True,
        padding_side='left',
        return_tensors="pt",
        return_attention_mask=True).to(model.device)

    set_seed(seed)
    generated_ids = model.generate(
        input_ids=encoded["input_ids"],
        # The pad token is the EOS token, so padding cannot be told apart from
        # real tokens by id alone; hand the mask to generate() explicitly.
        attention_mask=encoded["attention_mask"],
        max_new_tokens=30,
        do_sample=True,
        temperature=1.5,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id)

    # With left padding every prompt ends at the same column, so everything
    # past the input width is newly generated. Slicing token ids avoids relying
    # on decode() reproducing the prompt text character for character.
    prompt_width = encoded["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_width:]
    return [
        text.strip()
        for text in tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)
    ]

In [None]:
# Accumulators for a second evaluation run.
# NOTE(review): none of the visible cells appends to these three lists (the
# code that would is commented out in a later cell), yet later cells read them;
# confirm whether that run is still intended.
all_preds2 = []
all_answers2 = []
all_prompts2 = []
# paras[i] collects one paraphrase per test question, generated with
# demonstration set i and seed i.
paras = []
for i in range(5):
    print(f"Generating paraphrases for iteration {i+1}")
    all_paraphrases = []
    for questions, answers in tqdm(dataloader):
        # NOTE(review): `answers` is not used again inside this loop.
        answers = [ans[0] for ans in answers]
        # generate_paraphrases re-seeds with `seed` on every call, so each
        # batch of a round starts from the same RNG state.
        generations = generate_paraphrases(questions, idx=i, seed=i)
        # Keep only the first line of each generation as the paraphrase.
        paraphrases = [gen.strip().split('\n')[0] for gen in generations]
        all_paraphrases.extend(paraphrases)
        # break
    paras.append(all_paraphrases)

In [None]:
# Spot check: print the paraphrase stored at position 10 in each round's list.
for all_paras in paras:
    print(all_paras[10])

In [None]:
# prompts, generations2 = generate_answers(paraphrases)
# predictions = [gen.strip().split('\n')[0] for gen in generations2]
# all_preds2.extend(predictions)
# all_answers2.extend(answers)
# all_prompts2.extend(prompts)

# exact_match2 = exact_match_score(all_preds2, all_answers2)
# exact_match2

In [None]:
# Peek at one question/paraphrase pair. Both names are loop variables left
# behind by an earlier cell (presumably the last batch of the paraphrase
# generation loop), so the result depends on which cell ran last.
questions[0], paraphrases[0]

In [None]:
def get_ensemble_fewshot():
    """Build the few-shot context for the question-plus-paraphrase QA prompt.

    Every demonstration has the form
    ``Q: <question> In other words, <paraphrase>\\nA: <answer>``, which is the
    same wording ``generate_answers_ensemble`` uses for the query it appends.
    (The demonstrations previously said "In other word," and so did not match
    the query format.)

    Returns:
        The module-level ``qa_instruction``, a blank line, and the five
        demonstrations separated by blank lines.
    """
    # (question, paraphrase, answer) triples, in prompt order.
    demonstrations = [
        (
            "who is the governor of hawaii now?",
            "as of now, who leads Hawaii as its governor?",
            "Neil Abercrombie",
        ),
        (
            "what was nelson mandela's religion?",
            "what was Mandela’s faith tradition",
            "Methodism",
        ),
        (
            "who played sean in scrubs?",
            "which actor portrayed Sean in Scrubs?",
            "Michael Jackson: 30th Anniversary Special",
        ),
        (
            "what political party was henry clay?",
            "Henry Clay was a member of which political party?",
            "Whig Party",
        ),
        (
            "who are iran's major trading partners?",
            "who does Iran trade with the most?",
            "Madagascar",
        ),
    ]

    few_shot_prompt = [
        f"Q: {prompt} In other words, {para}\nA: {ans}"
        for prompt, para, ans in demonstrations
    ]
    prompts = f"{qa_instruction}\n\n" + "\n\n".join(few_shot_prompt)
    return prompts
# Display the assembled instruction + demonstrations for a visual check.
print(get_ensemble_fewshot())

In [None]:
# Attach the five paraphrase rounds to the test split as columns
# "paraphrase1" ... "paraphrase5" (paras[k] becomes column k + 1).
test_ds2 = dataloader.dataset
for round_idx in range(5):
    test_ds2 = test_ds2.add_column(f"paraphrase{round_idx + 1}", paras[round_idx])

In [None]:
def webqa_collate_fn2(batch, paraphrase_column="paraphrase1"):
    """Collate rows of the paraphrase-augmented dataset into parallel lists.

    The augmented dataset gets its paraphrases through columns named
    "paraphrase1" ... "paraphrase5"; the previous lookup of a "paraphrases"
    key does not match any of them. The default column, "paraphrase1", holds
    the round generated with demonstration set 0, whose paraphrases are the
    ones shown in the ensemble few-shot context.

    Args:
        batch: List of dataset rows (dict-like).
        paraphrase_column: Name of the column to read the paraphrase from.

    Returns:
        ``(questions, answers, paraphrases)``; `answers` stays a list of lists.
    """
    questions = [item["question"] for item in batch]
    answers = [item["answers"] for item in batch]
    paraphrases = [item[paraphrase_column] for item in batch]
    return questions, answers, paraphrases

# DataLoader's keyword for a custom batching function is `collate_fn`; any
# other keyword name is rejected with a TypeError.
dataloader2 = DataLoader(test_ds2, batch_size=8, collate_fn=webqa_collate_fn2)

In [None]:

all_preds3 = []
all_prompts3 = []
all_answers3 = []


def generate_answers_ensemble(prompts, paraphrases):
    """Greedy-decode answers for questions presented together with a paraphrase.

    Args:
        prompts: List of question strings.
        paraphrases: List of paraphrases aligned with `prompts`.

    Returns:
        List of stripped generated continuations (new tokens only), aligned
        with the inputs.

    Side effects:
        Appends the fully formatted prompt strings to the module-level
        ``all_prompts3`` list and clears the sampling settings on the shared
        generation config.
    """
    # Clear the sampling knobs so they do not conflict with greedy decoding.
    model.generation_config.temperature = None
    model.generation_config.top_p = None

    context = get_ensemble_fewshot()
    # Pair paraphrases with the `prompts` argument, not with whatever notebook
    # global named `questions` happens to exist when the function is called.
    prompts = [
        f"{context}\n\nQ: {prompt} In other words, {para}\nA:"
        for prompt, para in zip(prompts, paraphrases)
    ]
    all_prompts3.extend(prompts)

    encoded = tokenizer(
        prompts,
        padding=True, truncation=True,
        padding_side='left',
        return_tensors="pt",
        return_attention_mask=True).to(model.device)
    generated_ids = model.generate(
        input_ids=encoded["input_ids"],
        # The pad token is the EOS token, so padding cannot be told apart from
        # real tokens by id alone; hand the mask to generate() explicitly.
        attention_mask=encoded["attention_mask"],
        max_new_tokens=20,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id)

    # With left padding every prompt ends at the same column, so everything
    # past the input width is newly generated. Slicing token ids avoids relying
    # on decode() reproducing the prompt text character for character.
    prompt_width = encoded["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_width:]
    return [
        text.strip()
        for text in tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)
    ]

# Run the question + paraphrase prompt over the augmented test set, collect
# predictions and gold answers, then score them.
for questions, answers, paraphrases in tqdm(dataloader2):
    # Same reset that generate_answers_ensemble performs on entry.
    model.generation_config.temperature=None
    model.generation_config.top_p=None

    # Each item's `answers` is a list; keep its first entry as the gold answer.
    answers = [ans[0] for ans in answers]
    generated_answers = generate_answers_ensemble(questions, paraphrases)

    # Keep only the first generated line as the prediction.
    predictions = [gen.strip().split('\n')[0] for gen in generated_answers]
    all_preds3.extend(predictions)
    all_answers3.extend(answers)

exact_match3 = exact_match_score(all_preds3, all_answers3)
exact_match3

In [None]:
# Per-example correctness flags (1 = exact match after normalization) for each
# run. Every prediction is compared with ITS OWN gold answer; testing
# membership in the whole list of gold answers would count a prediction as
# correct whenever it equals the answer to any question in the set.
pred_norms1 = [normalize_answer(pred) for pred in all_preds1]
answer_norms1 = [normalize_answer(answer) for answer in all_answers1]
matched1 = [1 if pred == gold else 0 for pred, gold in zip(pred_norms1, answer_norms1)]

# NOTE(review): all_preds2 / all_answers2 are not filled by any visible cell,
# so matched2 may be empty; confirm before relying on the agreement numbers.
pred_norms2 = [normalize_answer(pred) for pred in all_preds2]
answer_norms2 = [normalize_answer(answer) for answer in all_answers2]
matched2 = [1 if pred == gold else 0 for pred, gold in zip(pred_norms2, answer_norms2)]

pred_norms3 = [normalize_answer(pred) for pred in all_preds3]
answer_norms3 = [normalize_answer(answer) for answer in all_answers3]
matched3 = [1 if pred == gold else 0 for pred, gold in zip(pred_norms3, answer_norms3)]

In [None]:
# Agreement between runs: count the examples on which the correctness flags of
# two runs (e12, e23, e13) or of all three runs (e123) coincide, then print
# each count divided by a list length.
flag_triples = list(zip(matched1, matched2, matched3))
e12 = sum(1 for m1, m2, _ in flag_triples if m1 == m2)
e23 = sum(1 for _, m2, m3 in flag_triples if m2 == m3)
e13 = sum(1 for m1, _, m3 in flag_triples if m1 == m3)
e123 = sum(1 for m1, m2, m3 in flag_triples if m1 == m2 == m3)

print(e12/len(matched1), e23/len(matched2), e13/len(matched3), e123/len(matched1))

In [None]:
# Store the exact prompt strings of each run next to the data, one column per
# run.
# NOTE(review): all_prompts2 is not filled by any visible cell; confirm it has
# one entry per row before this cell runs.
for column_name, column_values in (
    ("prompts1", all_prompts1),
    ("prompts2", all_prompts2),
    ("prompts3", all_prompts3),
):
    test_ds2 = test_ds2.add_column(column_name, column_values)

In [None]:
# Persist the augmented test set.
# NOTE(review): the directory name mentions llama3.2_3b_it, while this notebook
# sets model_name = "qwen3_1.7b"; confirm the intended name.
test_ds2.save_to_disk("./webqa_test_fewshot_ensemble_llama3.2_3b_it_paraphrase")

In [None]:
# Reload a saved dataset from disk.
# NOTE(review): this path is not the directory written by the save_to_disk
# cell above ("./webqa_test_fewshot_ensemble_llama3.2_3b_it_paraphrase");
# verify which artifact is meant.
from datasets import load_from_disk
ds = load_from_disk("webqa_test_fewshot_ensemble")