In [1]:
%pip install transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -U datasets fsspec huggingface_hub

Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub)
  Downloading hf_xet-1.1.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading huggingface_hub-0.32.4-py3-none-any.whl (512 kB)
Downloading hf_xet-1.1.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m886.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: hf-xet, huggingface_hub
[2K  Attempting uninstall: huggingface_hub━━━━━━━━━[0m [32m0/2[0m [hf-xet]
[2K    Found existing installation: huggingface-hub 0.31.2/2[0m [hf-xet]
[2K    Uninstalling huggingface-hub-0.31.2:━━━━[0m [32m0/2[0m [hf-xet]
[2K      Successfully uninstalled huggingface-hub-0.31.2m0/2[0m [hf-xet]
[2K   

In [3]:
# TriviaQA

from datasets import load_dataset
import random
random.seed(10)

class QABenchmark:
    """Base container for question-answering benchmark examples.

    Subclasses are expected to fill ``self.dataset`` with a list of examples.
    """

    def __init__(self):
        # Starts empty; subclasses overwrite it with their examples.
        self.dataset = []

    def sample(self, k: int):
        """Return ``k`` examples drawn without replacement, capped at the dataset size."""
        population = self.dataset
        draw_count = min(k, len(population))
        return random.sample(population, draw_count)

    def first_k(self, k: int):
        """Return the slice ``self.dataset[:k]``."""
        head = self.dataset[:k]
        return head


class TriviaQA(QABenchmark):
    """TriviaQA examples stored as ``(question, answers)`` pairs.

    ``answers`` is the de-duplicated list made of the example's answer ``value``
    followed by its ``aliases``.

    Args:
        split: Dataset split passed to ``load_dataset``.
        config: Dataset configuration name passed to ``load_dataset``.
    """

    def __init__(self, split='validation', config='rc'):
        super().__init__()
        loaded_dataset = load_dataset('trivia_qa', config, split=split)
        # dict.fromkeys removes duplicates while keeping first-seen order.
        # A set's iteration order for strings can change between interpreter
        # sessions, which would make a later random.choice over the answers
        # irreproducible even with a fixed random seed.
        self.dataset = [
            (
                example['question'],
                list(dict.fromkeys([example['answer']['value']] + example['answer']['aliases'])),
            )
            for example in loaded_dataset
        ]
        # Report the split that was actually loaded (previously always said "train").
        print(f"Number of examples in {split} split: {len(self.dataset)}")


class Lama(QABenchmark):
    """LAMA cloze statements whose mask is the last token of the sentence.

    Only examples whose ``masked_sentence`` ends with ``'[MASK].'`` are kept.
    Each is stored as ``(sentence_without_that_suffix, obj_label)``.

    Args:
        split: Dataset split passed to ``load_dataset``.
    """

    def __init__(self, split: str = 'train'):
        super().__init__()
        mask_suffix = '[MASK].'
        pairs = []
        for example in load_dataset('lama', split=split):
            sentence = example['masked_sentence']
            if sentence.endswith(mask_suffix):
                # Drop the trailing mask so the sentence can be used as a prompt.
                pairs.append((sentence[:-len(mask_suffix)], example['obj_label']))
        self.dataset = pairs


def get_optional_in_context_demonstrations_for_triviaqa(size: int = 200):
  """Load the TriviaQA train split and return its first ``size`` examples."""
  return TriviaQA(split='train').first_k(k=size)


def get_triviaqa_validation_set(size: int = 100):
  """Load the TriviaQA validation split and return a random sample requested with ``k=size``."""
  validation_benchmark = TriviaQA(split='validation')
  return validation_benchmark.sample(k=size)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# GPT2

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def print_output(output: str):
    """Print an ``Output:`` header, a rule of 100 dashes, and then ``output``."""
    separator = '-' * 100
    print(f"Output:\n{separator}")
    print(output)


def process_generation(text: str):
    """Strip leading separator characters from generated text.

    Leading newlines, colons, spaces, commas and semicolons are removed;
    everything from the first other character onward is kept unchanged.

    Args:
        text: Generated text. Falsy values (``""`` or ``None``) are returned as-is.

    Returns:
        ``text`` without the leading separator characters.
    """
    if not text:
        return text
    # str.lstrip with a character set does the same job as slicing off one
    # character at a time, without re-copying the string on every step.
    return text.lstrip('\n: ,;')


def load_gpt2(model_name: str = 'gpt2-medium'):
    """Load a GPT-2 language model together with its tokenizer.

    The model is created with ``pad_token_id`` set to the tokenizer's EOS token id.

    Args:
        model_name: Checkpoint name passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    gpt2_model = GPT2LMHeadModel.from_pretrained(
        model_name,
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )
    return gpt2_model, gpt2_tokenizer


# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Notebook-wide model and tokenizer shared by the decoding helpers.
model, tokenizer = load_gpt2()
model.to(device)


def sampling(input_text: str, max_length=50, temperature=0.7):
    """Continue ``input_text`` with temperature sampling and return only the continuation.

    Args:
        input_text: Prompt to continue.
        max_length: Number of tokens allowed beyond the prompt (the total
            generation limit is the prompt length plus this value).
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded tokens after the prompt, with special tokens skipped and
        leading separators removed by ``process_generation``.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    # Without it generate() warns that the mask "is not set and cannot be inferred"
    # because the pad token is the same as the EOS token.
    encoded = tokenizer(input_text, return_tensors='pt').to(device)
    input_ids = encoded['input_ids']
    input_ids_len = input_ids.shape[1]
    sample_output = model.generate(
        input_ids,
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=input_ids_len + max_length,
        top_k=0,
        temperature=temperature,
    )
    # Slice off the prompt tokens so only the newly generated text is decoded.
    return process_generation(tokenizer.decode(sample_output[0][input_ids_len:], skip_special_tokens=True))


def beam_search(input_text: str, max_length=20):
    """Continue ``input_text`` with 5-beam search and return only the continuation.

    Args:
        input_text: Prompt to continue.
        max_length: Number of tokens allowed beyond the prompt (the total
            generation limit is the prompt length plus this value).

    Returns:
        The decoded tokens after the prompt, with special tokens skipped and
        leading separators removed by ``process_generation``.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    # Without it generate() warns that the mask "is not set and cannot be inferred"
    # because the pad token is the same as the EOS token.
    encoded = tokenizer(input_text, return_tensors='pt').to(device)
    input_ids = encoded['input_ids']
    input_ids_len = input_ids.shape[1]
    beam_output = model.generate(
        input_ids,
        attention_mask=encoded['attention_mask'],
        max_length=input_ids_len + max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    # Slice off the prompt tokens so only the newly generated text is decoded.
    return process_generation(tokenizer.decode(beam_output[0][input_ids_len:], skip_special_tokens=True))

In [5]:
# Evaluation

import pandas as pd

def normalize_text(s):
    """Normalize text for answer comparison.

    Steps, in order: lowercase, delete ASCII punctuation, replace the standalone
    words "a", "an" and "the" with a space, then collapse all whitespace runs
    into single spaces (which also trims both ends).
    """
    import re
    import string

    lowered = s.lower()
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)
    return " ".join(without_articles.split())


def compute_exact_match(prediction, truth):
    """Return 1 when ``prediction`` and ``truth`` are equal after ``normalize_text``, else 0."""
    is_match = normalize_text(prediction) == normalize_text(truth)
    return 1 if is_match else 0


def check_answer_truthfulness(generated_answer, gold_answers):
    """Check whether any gold answer appears inside the generated answer.

    Both sides are passed through ``normalize_text`` and compared by substring
    containment.

    Args:
        generated_answer: Text produced by the model.
        gold_answers: A single gold answer string or an iterable of them.

    Returns:
        True if at least one non-empty normalized gold answer is a substring of
        the normalized generation, otherwise False.
    """
    if isinstance(gold_answers, str):
        gold_answers = [gold_answers]
    normalized_generation = normalize_text(generated_answer)
    normalized_golds = (normalize_text(answer) for answer in gold_answers)
    # A gold answer that normalizes to "" (e.g. "A" or "The", which article
    # removal wipes out) is a substring of every string and would mark any
    # generation as correct, so empty targets are ignored.
    return any(gold and gold in normalized_generation for gold in normalized_golds)

In [6]:
# Pool of candidate demonstrations (requested size 500) and the evaluation questions (requested size 200).
optional_in_context_demonstrations = get_optional_in_context_demonstrations_for_triviaqa(500)
validation_set = get_triviaqa_validation_set(200)

Downloading data: 100%|██████████| 26/26 [10:46<00:00, 24.85s/files]
Generating train split: 100%|██████████| 138384/138384 [02:38<00:00, 870.60 examples/s] 
Generating validation split: 100%|██████████| 17944/17944 [00:22<00:00, 814.94 examples/s] 
Generating test split: 100%|██████████| 17210/17210 [00:21<00:00, 796.28 examples/s] 


Number of examples in train split: 138384
Number of examples in train split: 17944


In [7]:
# section 1 - fill in your code here
%pip install tqdm

[0mNote: you may need to restart the kernel to use updated packages.


**beam search**

In [8]:
from tqdm import tqdm

def build_in_context_prompts_random(demonstrations, sizes=range(3, 9)):
  """Build one randomly sampled few-shot prompt prefix per demonstration count.

  For every size ``k`` an independent random sample of ``k`` demonstrations is
  drawn, and one answer is picked at random for each sampled question.

  Args:
    demonstrations: Sequence of ``(question, answers)`` pairs, where
      ``answers`` is a non-empty sequence of acceptable answer strings.
    sizes: Iterable of demonstration counts to build prompts for. Defaults to
      3 through 8, the sizes this function originally hard-coded.

  Returns:
    Dict mapping each size ``k`` to a prompt string made of ``k`` consecutive
    "Question: ... / Answer: ..." blocks, each followed by a blank line.

  Raises:
    ValueError: Raised by ``random.sample`` when a size is negative or larger
      than ``len(demonstrations)``.
  """
  prompts = {}
  for k in sizes:
    sampled_examples = random.sample(demonstrations, k)
    prompts[k] = "".join(
      f"Question: {question}\nAnswer: {random.choice(answers)}\n\n"
      for question, answers in sampled_examples
    )
  return prompts

in_context_prompts = build_in_context_prompts_random(optional_in_context_demonstrations)

# Beam-search accuracy on the validation questions for each demonstration count.
results = []
for k in range(3, 9):
  prompt_prefix = in_context_prompts[k]
  print(f"Evaluating in-context size {k}...")
  # Count the questions whose generated answer contains a gold answer.
  correct_count = sum(
    1
    for question, gold_answers in tqdm(validation_set, desc=f"In-Context Size {k}")
    if check_answer_truthfulness(
      beam_search(f"{prompt_prefix}\n\nQuestion: {question}\nAnswer:"),
      gold_answers,
    )
  )
  accuracy = correct_count / len(validation_set)
  results.append({"In-Context Size": k, "Accuracy": accuracy})

print("\n")
df_beam_search = pd.DataFrame(results)
print(df_beam_search)


Evaluating in-context size 3...


In-Context Size 3:   0%|          | 0/200 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
In-Context Size 3: 100%|██████████| 200/200 [02:31<00:00,  1.32it/s]


Evaluating in-context size 4...


In-Context Size 4: 100%|██████████| 200/200 [03:16<00:00,  1.02it/s]


Evaluating in-context size 5...


In-Context Size 5: 100%|██████████| 200/200 [03:32<00:00,  1.06s/it]


Evaluating in-context size 6...


In-Context Size 6: 100%|██████████| 200/200 [02:08<00:00,  1.56it/s]


Evaluating in-context size 7...


In-Context Size 7: 100%|██████████| 200/200 [02:12<00:00,  1.51it/s]


Evaluating in-context size 8...


In-Context Size 8: 100%|██████████| 200/200 [02:21<00:00,  1.41it/s]




   In-Context Size  Accuracy
0                3     0.155
1                4     0.135
2                5     0.150
3                6     0.155
4                7     0.135
5                8     0.145


**sampling decoding (temperature = 0.7)**

In [9]:

# Sampling-decoding accuracy on the validation questions for each demonstration count.
results = []
for k in range(3, 9):
  prompt_prefix = in_context_prompts[k]
  print(f"Evaluating in-context size {k}...")
  # Count the questions whose generated answer contains a gold answer.
  correct_count = sum(
    1
    for question, gold_answers in tqdm(validation_set, desc=f"In-Context Size {k}")
    if check_answer_truthfulness(
      sampling(f"{prompt_prefix}\n\nQuestion: {question}\nAnswer:"),
      gold_answers,
    )
  )
  accuracy = correct_count / len(validation_set)
  results.append({"In-Context Size": k, "Accuracy": accuracy})

print("\n")
df_sampling_decoding = pd.DataFrame(results)
print(df_sampling_decoding)

Evaluating in-context size 3...


In-Context Size 3: 100%|██████████| 200/200 [02:19<00:00,  1.44it/s]


Evaluating in-context size 4...


In-Context Size 4: 100%|██████████| 200/200 [02:16<00:00,  1.46it/s]


Evaluating in-context size 5...


In-Context Size 5: 100%|██████████| 200/200 [02:09<00:00,  1.54it/s]


Evaluating in-context size 6...


In-Context Size 6: 100%|██████████| 200/200 [02:23<00:00,  1.39it/s]


Evaluating in-context size 7...


In-Context Size 7: 100%|██████████| 200/200 [02:34<00:00,  1.30it/s]


Evaluating in-context size 8...


In-Context Size 8: 100%|██████████| 200/200 [02:34<00:00,  1.30it/s]



   In-Context Size  Accuracy
0                3     0.055
1                4     0.060
2                5     0.095
3                6     0.085
4                7     0.055
5                8     0.050





Temperature-based sampling performed significantly worse than beam search.
This is expected, as our goal wasn't to generate long or creative text, but rather short answers with a specific correct value. In such cases, introducing randomness through sampling can be harmful, as it increases the chance of generating incorrect or imprecise responses. Beam search, being more deterministic, is better suited for tasks that require accuracy over diversity.

In [10]:
from transformers import AutoTokenizer, AutoModel

def cls_pooling(model_output, attention_mask):
    """Return position 0 along the second axis of ``model_output[0]``.

    ``attention_mask`` is accepted for signature compatibility but is not used.
    """
    first_output = model_output[0]
    return first_output[:, 0]

# The encoder and its tokenizer are loaded from the same sentence-transformers checkpoint.
model_bert = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-cls-token')
tokenizer_bert = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-cls-token')


def encode_question(question: str, max_length: int = 512):
  """Embed a single question with the BERT sentence encoder.

  Args:
    question: Text to embed.
    max_length: Token limit used for truncation. The tokenizer of this
      checkpoint defines no limit of its own, so ``truncation=True`` alone is
      ignored with a warning. 512 is assumed to match the encoder's
      position-embedding capacity (BERT-base) -- TODO confirm for other checkpoints.

  Returns:
    The tensor produced by ``cls_pooling`` for the one-element batch (one row
    per input, so a single row here).
  """
  encoded_input = tokenizer_bert(
    [question],
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors='pt',
  )

  # Inference only: no gradients are needed.
  with torch.no_grad():
      model_output = model_bert(**encoded_input)

  # Perform pooling. In this case, CLS pooling (the first token's embedding).
  sentence_embeddings = cls_pooling(model_output, encoded_input['attention_mask'])

  return sentence_embeddings

In [11]:
# section 2 - fill in your code here
import torch

# Embed every demonstration question once up front so retrieval only has to encode the query.
encoded_questions = torch.stack(
    [encode_question(demo_question) for demo_question, _ in optional_in_context_demonstrations]
)

def build_in_context_prompt_better(demonstrations, encoded_ques, question, k: int = 8):
  """Build a few-shot prompt prefix from the demonstrations most similar to ``question``.

  Similarity is the dot product between the question embedding returned by
  ``encode_question`` and each pre-computed demonstration embedding. The
  selected demonstrations are written in descending order of similarity.

  Args:
    demonstrations: Sequence of ``(question, answers)`` pairs.
    encoded_ques: Stacked embeddings of the demonstration questions, aligned
      index-for-index with ``demonstrations``.
    question: The query question.
    k: Maximum number of demonstrations to include. Defaults to 8, the value
      that was previously hard-coded; capped at the number of available scores.

  Returns:
    Prompt string made of the selected "Question: ... / Answer: ..." blocks,
    each followed by a blank line. One answer is picked at random per block.
  """
  q_vec = encode_question(question)
  # One score per demonstration. flatten() removes the singleton dimensions left
  # by stacking per-question embeddings, so topk yields plain integer indices.
  scores = torch.matmul(encoded_ques, q_vec.T).flatten()
  # Cap k so small demonstration pools do not make topk raise.
  top_indices = torch.topk(scores, k=min(k, scores.numel())).indices.tolist()
  return "".join(
    f"Question: {demonstrations[i][0]}\nAnswer: {random.choice(demonstrations[i][1])}\n\n"
    for i in top_indices
  )


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


**Beam search**

In [12]:
# Retrieval-based demonstrations + beam search; every prompt is recorded with its verdict.
correct_count = 0
promt_examples = []
for question, gold_answers in tqdm(validation_set, desc="Beam serach"):
  prompt_prefix = build_in_context_prompt_better(optional_in_context_demonstrations, encoded_questions, question)
  full_prompt = f"{prompt_prefix}Question: {question}\nAnswer:"
  generated_answer = beam_search(full_prompt)
  is_correct = bool(check_answer_truthfulness(generated_answer, gold_answers))
  correct_count += is_correct
  promt_examples.append((is_correct, full_prompt))
accuracy = correct_count / len(validation_set)
print(f"\nDemonstrations Retrieval Beam search is {accuracy}")

Beam serach: 100%|██████████| 200/200 [02:35<00:00,  1.29it/s]


Demonstrations Retrieval Beam search is 0.15





**Sampling**

In [13]:
# Retrieval-based demonstrations + sampling; prompts are appended to the existing promt_examples list.
correct_count = 0
for question, gold_answers in tqdm(validation_set, desc="sampling decoding"):
  prompt_prefix = build_in_context_prompt_better(optional_in_context_demonstrations, encoded_questions, question)
  full_prompt = f"{prompt_prefix}Question: {question}\nAnswer:"
  generated_answer = sampling(full_prompt)
  is_correct = bool(check_answer_truthfulness(generated_answer, gold_answers))
  correct_count += is_correct
  promt_examples.append((is_correct, full_prompt))
accuracy = correct_count / len(validation_set)

print(f"\nDemonstrations Retrieval sampling decoding is {accuracy}")

sampling decoding: 100%|██████████| 200/200 [02:36<00:00,  1.28it/s]


Demonstrations Retrieval sampling decoding is 0.085





**New prompts**

In [14]:
example_batch_size = 3
# Split the recorded prompts by whether their generated answer was judged correct.
right_ans = [prompt_text for was_correct, prompt_text in promt_examples if was_correct]
wrong_ans = [prompt_text for was_correct, prompt_text in promt_examples if not was_correct]

# Show at most example_batch_size prompts from each group.
print("✅ Correct Answers")
for promt in right_ans[:example_batch_size]:
    print("\n-", promt)

print("\n❌ Incorrect Answers")
for promt in wrong_ans[:example_batch_size]:
    print("\n-", promt)

✅ Correct Answers

- Question: Where is New York's Empire State College located?
Answer: Saratoga Springs (NY)

Question: Which port lies between Puget Sound and Lake Washington?
Answer: St. Anne Seattle

Question: In which town or city was General Motors founded?
Answer: Flint michigan

Question: Which company first manufactured the electric toothbrush?
Answer: Bristol-Myers Squibb Epsilon Holdings

Question: Which Disney film had the theme tune A Whole New World?
Answer: Aladdin and the magic lamp

Question: In which year did Alcock and Brown make their Atlantic crossing?
Answer: 1919

Question: What is the Alaskan terminus of the Alaskan Highway?
Answer: Fairbanks, AL

Question: Where in the former Soviet Union was Yul Brynner born?
Answer: East Siberia

Question: What lake can be found on the border of Vermont and New York?
Answer:

- Question: Where was The Iron Triangle?
Answer: Socialist Republic of Viet Nam

Question: Which element along with polonium did the Curies discover?
A

The accuracy using semantic similarity-based context is lower than for some of the randomly selected context sizes.

This is because semantic similarity does not guarantee helpful context. The purpose of in-context examples is to demonstrate to the model the kind of question-answer format expected, not just to provide related content.

However, when selecting context based on semantic similarity, the retrieved questions tend to be similar in content but may differ in structure or format. As a result, the model may not learn the proper answering behavior from these examples — whereas randomly selected examples often provide a more diverse range of formats, which can help guide the model more effectively.

In [15]:
# Random sample of LAMA examples, requested with k=200, used for the zero-shot evaluation.
lama_validation_set = Lama().sample(k=200)

Downloading data: 100%|██████████| 74.6M/74.6M [00:01<00:00, 73.1MB/s]
Downloading data: 100%|██████████| 13.1k/13.1k [00:00<00:00, 16.0MB/s]
Generating train split: 100%|██████████| 1304391/1304391 [00:44<00:00, 29336.54 examples/s]


In [16]:
# section 3 - fill in your code here

**TriviaQA**

In [19]:
# Zero-shot TriviaQA: no demonstrations, only the question in the Question/Answer format.
correct_count = 0
for question, gold_answers in tqdm(validation_set, desc="sampling decoding"):
  generated_answer = sampling(f"Question: {question}\nAnswer:")
  correct_count += bool(check_answer_truthfulness(generated_answer, gold_answers))
accuracy_sampling = correct_count / len(validation_set)

correct_count = 0
for question, gold_answers in tqdm(validation_set, desc="beam_search"):
  generated_answer = beam_search(f"Question: {question}\nAnswer:")
  correct_count += bool(check_answer_truthfulness(generated_answer, gold_answers))
accuracy_beam = correct_count / len(validation_set)

print("\nTriviaQA data set\n")
triviaqa_rows = [
  {"Model": "Sampling", "Accuracy": accuracy_sampling},
  {"Model": "Beam Search", "Accuracy": accuracy_beam},
]
print(pd.DataFrame(triviaqa_rows))

sampling decoding: 100%|██████████| 200/200 [02:20<00:00,  1.42it/s]
beam_search: 100%|██████████| 200/200 [01:21<00:00,  2.44it/s]


TriviaQA data set

         Model  Accuracy
0     Sampling     0.100
1  Beam Search     0.085





**LAMA**

In [None]:
# Zero-shot LAMA: each cloze sentence (mask removed) is used directly as the prompt.
wrong_samples = []
sample_size = 4

correct_count = 0
for question, gold_answers in tqdm(lama_validation_set, desc="sampling decoding"):
  generated_answer = sampling(f"{question}")
  correct_count += bool(check_answer_truthfulness(generated_answer, gold_answers))
accuracy_sampling = correct_count / len(lama_validation_set)

correct_count = 0
for question, gold_answers in tqdm(lama_validation_set, desc="beam_search"):
  full_prompt = f"{question}"
  generated_answer = beam_search(full_prompt)
  if check_answer_truthfulness(generated_answer, gold_answers):
    correct_count += 1
    continue
  # Keep beam-search misses as "<prompt>__<generation>" for manual inspection.
  wrong_samples.append(full_prompt + "__" + generated_answer)
accuracy_beam = correct_count / len(lama_validation_set)

print("\nLAMA data set\n")
lama_rows = [
  {"Model": "Sampling", "Accuracy": accuracy_sampling},
  {"Model": "Beam Search", "Accuracy": accuracy_beam},
]
print(pd.DataFrame(lama_rows))
print("\nsome answers that were classified as wrong:\n")
print("\n\n".join(wrong_samples[:sample_size]))

sampling decoding: 100%|██████████| 200/200 [02:13<00:00,  1.49it/s]
beam_search: 100%|██████████| 200/200 [01:23<00:00,  2.38it/s]


LAMA data set

         Model  Accuracy
0     Sampling     0.205
1  Beam Search     0.270

 some answers that were classified as wrong:

Lophospermum is a genus of herbaceous perennial climbers or scramblers, native to mountainous regions of Mexico and __iced-over areas of Central and South America. It has been used as an ornamental plant for

He had also won the silver medal at the 2000 Summer Olympics in __ Tokyo, Japan, where he was a member of the team that won gold in the men

It is located in the northwest corner of Ohio, approximately two miles from Indiana and six miles from ___________.

It was built in 1885 and is owned by the Ohio Historical Society.

Choe's diary accounts of his travels in China became widely printed in the 16th century in both Korea and __한국어 (Korean: 기리�





Some of the generated sentences do seem correct but are cut off in the middle: the limit we set on the generation length appears to stop the model before it produces the desired answer. The model's actual results are therefore supposed to be better than the reported accuracy, because the length limit we imposed hurts them.