Student Name: Mohammad Mahdi Razmjoo

Student Number: 400101272

# Import Libraries

In [1]:
import os
from pathlib import Path
import json
import re
from tqdm import tqdm
from collections import Counter
import torch
from huggingface_hub import login
from collections import defaultdict

In [2]:
!pip install datasets transformers --quiet

In [3]:
from datasets import load_dataset
import transformers

# Configurations & Utils


In [None]:
# TODO: Research the Mistral-7B-Instruct-v0.3 model (capabilities, use cases, limitations)
model_id: str = "mistralai/Mistral-7B-Instruct-v0.3"

# TODO: Research Hugging Face API token.
hugging_face_token: str = "#############"

# The directory path to save our intermediate results.
output_dir: str = Path("outputs")
output_dir.mkdir(exist_ok=True, parents=True)

# TODO: Research the impact of max_new_tokens, do_sample, and temperature on model outputs.
max_new_tokens: int = 256 
do_sample: bool = False 
temperature: float = 1.0

# The number of data points we choose for our mini project.
sample_numbers: int = 50 # you can adjust it to 100 if you have more GPU, the accuracy becomes better.

# The number of shots for few-shot prompting (default is 2).
shots: int = 2

# The number of generated outputs in the self-consistency setting.
K: int = 3

In [5]:
# Functions to write and read data in a JSON file (for storing intermediate outputs).

def write_json(data, filename):
    """
    Writes data to a JSON file.

    Args:
        data (dict or list): The data to be written to the file.
        filename (str): The path to the JSON file.
    """
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)


def read_json(filename):
    """
    Reads and loads data from a JSON file.

    Args:
        filename (str): The path to the JSON file.

    Returns:
        dict or list: The parsed JSON content.
    """
    with open(filename, "r", encoding="utf-8") as file:
        return json.load(file)

In [6]:
# Function to read content from a .txt file (used for reading demonstrations)

def read_from_txt(file_path):
    """
    Reads the entire content of a .txt file.

    Args:
        file_path (str): Path to the text file.

    Returns:
        str: The content of the file as a string.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

In [7]:
# Functions to extract an answer from generated text and postprocess the final answer
def extract_answer(generated_text):
    """
    Extracts the last numerical expression from the generated text.

    Args:
        generated_text (str): The text output from which to extract a numeric answer.

    Returns:
        str or None: The extracted numerical expression if found, otherwise None.
    """
    # Compile a regex pattern to find both integers and floats (positive or negative)
    pattern = re.compile(r'[-+]?\d*\.\d+|\d+')
    
    # Find all matches in the generated text
    matches = pattern.findall(generated_text)
    
    # If there are matches, return the last one; otherwise return None
    if matches:
        return matches[-1]
    return None

def postprocess_final_answer(numeric_expression):
    """
    Cleans and evaluates a numeric expression to obtain a final answer.

    Args:
        numeric_expression (str): A string containing a numeric expression.

    Returns:
        str: The evaluated result as a string if computation is successful,
             otherwise returns the original input.

    Method:
    1. Removes commas from the numeric expression to avoid formatting issues.
    2. Attempts to evaluate the expression using Python's `eval()`.
    3. If evaluation fails (e.g., due to invalid input), returns the original expression.
    """
    try:
        cleaned_up = numeric_expression.replace(',', '')  # Remove commas for proper evaluation
        result = eval(cleaned_up)  # Evaluate the numeric expression
        return str(result)  # Convert result to string and return
    except Exception:
        return numeric_expression  # Return the original expression if evaluation fails

In [8]:
def aggregate_paths_based_on_scores(paths):
    """
    Aggregates answer paths based on their confidence scores and selects the best answer.

    Args:
        paths (list of tuples): Each tuple contains (answer, confidence, final_answer).
            - answer: The full answer text (str)
            - confidence: A numerical score representing confidence (float/int)
            - final_answer: The final extracted answer (str)

    Returns:
        tuple: (best_full_ans, best_score, best_answer)
               - best_full_ans: The full answer with highest total confidence.
               - best_score: The aggregated confidence score of the best answer.
               - best_answer: The final answer with the highest overall confidence.
    """
    # Dictionary to sum confidence scores for each final answer
    answer_scores = {}
    
    # Dictionary to store an example full answer for each final answer
    answer_text_map = {}

    # Accumulate confidence for each unique final_answer
    for full_answer, confidence, final_answer in paths:
        if final_answer not in answer_scores:
            answer_scores[final_answer] = 0
            # Store the first or most representative full_answer
            answer_text_map[final_answer] = full_answer
        answer_scores[final_answer] += confidence
    
    # Identify the final answer with the highest confidence sum
    best_answer = max(answer_scores, key=answer_scores.get)
    
    # Retrieve the aggregated confidence score and the corresponding full answer
    best_score = answer_scores[best_answer]
    best_full_ans = answer_text_map[best_answer]
    
    return best_full_ans, best_score, best_answer

In [9]:
def get_accuracy(json_path):
    """
    Computes the accuracy based on a JSON file containing correctness labels.

    Args:
        json_path (str): Path to the JSON file containing the evaluation data.

    Returns:
        float: The accuracy, calculated as the proportion of correctly classified items.
    """
    data = read_json(json_path)
    
    total = len(data)
    if total == 0:
        return 0.0  # Avoid divide-by-zero if somehow the file is empty
    
    # Count how many items have 'is_correct' == True
    correct = sum(1 for item in data if item.get('is_correct') == True)
    
    # Return the fraction of items that are correct
    return correct / total

# Dataset

In [10]:
# TODO: research about math reasoning datasets, especially the GSM8K.
def load_gsm8k(sample_number):
    """
    Loads a subset of the GSM8K dataset for evaluation.

    Args:
        sample_number (int): The number of samples to select from the test set.

    Returns:
        Dataset: A subset of the GSM8K test dataset with the specified number of samples.

    Method:
    1. Loads the "openai/gsm8k" dataset from the Hugging Face Datasets library.
    2. Selects the 'test' split of the dataset.
    3. Extracts the first `sample_number` examples from the test set.
    4. Returns the selected subset.
    
    Note:
        Ensure that `sample_number` does not exceed the total number of test samples.
    """
    dataset = load_dataset("openai/gsm8k", 'main')['test']
    dataset_samples = dataset.select(range(sample_number))
    return dataset_samples

In [11]:
gsm8k = load_gsm8k(sample_number= sample_numbers)

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

# Model

In [12]:
# TODO: research about the Mistral model and its capabilities.
# TODO: research about device_map and trasnformers.pipeline.
class LLM:
    """
    A wrapper class for a large language model (LLM) using the Hugging Face Transformers library.

    Attributes:
        model_id (str): The identifier of the model to be loaded.
        max_new_tokens (int): The maximum number of tokens the model should generate.
        do_sample (bool): Whether to use sampling when generating text.
        temperature (float): Controls randomness in text generation.
        llm (transformers.pipeline): The loaded text-generation model.

    Methods:
        load_llm(model_id):
            Loads the specified language model with optimized settings.
        
        generate(system_prompt, user_prompt):
            Generates a response based on a system prompt and user input.
    """

    def __init__(self, model_id, do_sample, temperature, max_new_tokens):
        """
        Initializes the LLM class and loads the model.

        Args:
            model_id (str): The Hugging Face model identifier.
            do_sample (bool): Enables stochastic text generation when True.
            temperature (float): Adjusts randomness in text generation.
            max_new_tokens (int): Defines the maximum length of generated responses.
        """
        self.model_id = model_id
        self.max_new_tokens = max_new_tokens
        self.do_sample = do_sample
        self.temperature = temperature
        self.llm = self.load_llm(model_id)

    def load_llm(self, model_id):
        """
        Loads the text-generation model using the Hugging Face Transformers pipeline.

        Args:
            model_id (str): The model identifier from Hugging Face.

        Returns:
            transformers.pipeline: A text-generation pipeline with optimized settings.
        """
        pipeline = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},  # Use bfloat16 for memory efficiency
            device_map="auto",  # Automatically assigns model to the best available device (GPU/CPU)
        )
        return pipeline

    def generate(self, system_prompt, user_prompt):
        """
        Generates a response from the model based on a system prompt and user input.

        Args:
            system_prompt (str): A system-level instruction to guide the model's response.
            user_prompt (str): The user's query or input.

        Returns:
            str: The generated response from the model.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        outputs = self.llm(
            messages,
            max_new_tokens=self.max_new_tokens,
            do_sample=self.do_sample,
            temperature=self.temperature,
            pad_token_id=self.llm.tokenizer.eos_token_id  # Ensure proper padding
        )

        return outputs[0]["generated_text"][-1]['content']  # Extracts the last generated content


In [13]:
login(hugging_face_token)

In [14]:
llm = LLM(model_id=model_id, do_sample=do_sample,
          temperature=temperature, max_new_tokens=max_new_tokens)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Device set to use cuda:0


# Methods

## Direct Prompting

In [None]:
def direct_prompt(llm: LLM, system_prompt, dataset, save_path):
    """
    Runs direct prompting on a given dataset using an LLM and evaluates accuracy.

    Args:
        llm (LLM): The language model instance used for generating responses.
        system_prompt (str): The system-level instruction for guiding the model.
        dataset (Dataset): A dataset containing questions and their ground-truth answers.
        save_path (str): The file path where the results will be saved in JSON format.

    Returns:
        None: Results are saved to `save_path`.

    Process:
    1. Iterates through each question in the dataset.
    2. Extracts the ground-truth answer and generates a model response.
    3. Post-processes the generated answer and compares it with the ground truth.
    4. Stores the results, including correctness, in a list.
    5. Writes intermediate results to a JSON file at every iteration.
    6. Computes and displays a running accuracy score using a progress bar.
    """
    total_questions = len(dataset)
    results = [] 

    with tqdm(total=total_questions, dynamic_ncols=True) as pbar:
        for query_id, data in enumerate(dataset):
            question = data['question']
            ground_truth_numeric = extract_answer(data['answer'])

            generated_text = llm.generate(
                system_prompt=system_prompt,
                user_prompt=question
            )

            generated_numeric = extract_answer(generated_text)
            final_answer = (
                postprocess_final_answer(generated_numeric)
                if generated_numeric is not None
                else None
            )

            is_correct = (final_answer == ground_truth_numeric)

            results.append({
                "question": question,
                "model_answer": generated_text,
                "generated_numeric": generated_numeric,
                "final_answer": final_answer,
                "ground_truth": ground_truth_numeric,
                "is_correct": is_correct
            })

            write_json(results, save_path)

            correct_answers = sum(1 for r in results if r["is_correct"])
            running_accuracy = (correct_answers / len(results)) * 100
            pbar.set_postfix(idx=query_id + 1, running_accuracy=f"{running_accuracy:.2f}%")
            pbar.update(1)

    write_json(results, save_path)

In [16]:
# I made a slight change to this part of the code and combined two previous sections for better evaluation
system_prompts = {
    "concise": "Provide a direct and concise answer to the question without additional commentary.",
    "step_by_step": "Answer the question step-by-step, showing your reasoning before giving the final answer.",
    "brief_final": "Respond briefly with only the final answer, without any elaboration or additional explanation.",
    "detailed_reasoning": "Think through the problem carefully and then provide a succinct answer with key reasoning steps highlighted."
}

variant_accuracies = {}
for variant_name, prompt in system_prompts.items():
    print(f"Evaluating system prompt variant: {variant_name}")
    save_path_variant = output_dir / f"direct_prompt_{variant_name}.json"
    
    direct_prompt(llm=llm, system_prompt=prompt, dataset=gsm8k, save_path=save_path_variant)
    
    accuracy_variant = get_accuracy(save_path_variant)
    variant_accuracies[variant_name] = accuracy_variant
    print(f"System Prompt Variant '{variant_name}' Accuracy: {accuracy_variant:.2%}\n")

print("All system prompt evaluation results:")
for variant, acc in variant_accuracies.items():
    print(f"{variant}: {acc:.2%}")

Evaluating system prompt variant: concise


 20%|██        | 10/50 [01:11<05:02,  7.56s/it, idx=10, running_accuracy=40.00%]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 50/50 [05:15<00:00,  6.31s/it, idx=50, running_accuracy=24.00%]


System Prompt Variant 'concise' Accuracy: 24.00%

Evaluating system prompt variant: step_by_step


100%|██████████| 50/50 [10:29<00:00, 12.58s/it, idx=50, running_accuracy=18.00%]


System Prompt Variant 'step_by_step' Accuracy: 18.00%

Evaluating system prompt variant: brief_final


100%|██████████| 50/50 [01:01<00:00,  1.23s/it, idx=50, running_accuracy=14.00%]


System Prompt Variant 'brief_final' Accuracy: 14.00%

Evaluating system prompt variant: detailed_reasoning


100%|██████████| 50/50 [09:30<00:00, 11.41s/it, idx=50, running_accuracy=34.00%]

System Prompt Variant 'detailed_reasoning' Accuracy: 34.00%

All system prompt evaluation results:
concise: 24.00%
step_by_step: 18.00%
brief_final: 14.00%
detailed_reasoning: 34.00%





## Zero-shot Prompting

In [None]:
def zero_shot(llm: LLM, system_prompt, cot_prefix, dataset, save_path):
    total_questions = len(dataset)
    results = [] 

    with tqdm(total=total_questions, dynamic_ncols=True) as pbar:
        for query_id, data in enumerate(dataset):
            question = data["question"]
            ground_truth = extract_answer(data["answer"])

            constructed_prompt = f"{question}\n{cot_prefix}\nAnswer:"
            generated_text = llm.generate(
                system_prompt=system_prompt,
                user_prompt=constructed_prompt
            )

            generated_numeric = extract_answer(generated_text)
            final_answer = postprocess_final_answer(generated_numeric) if generated_numeric else None
            is_correct = (final_answer == ground_truth)

            results.append({
                "question": question,
                "prompt_used": constructed_prompt,
                "model_answer": generated_text,
                "generated_numeric": generated_numeric,
                "final_answer": final_answer,
                "ground_truth": ground_truth,
                "is_correct": is_correct
            })

            write_json(results, save_path)

            correct_answers = sum(1 for r in results if r["is_correct"])
            running_accuracy = (correct_answers / (query_id + 1)) * 100

            pbar.set_postfix(idx=query_id + 1, running_accuracy=f"{running_accuracy:.2f}%")
            pbar.update(1)

    write_json(results, save_path)

In [19]:
# I made a slight change to this part of the code and combined two previous sections for better evaluation
cot_prefix_variants = {
    "step_by_step": "let's think step by step",
    "reasoning": "Let's carefully reason through the problem and solve it:",
    "detailed": "Begin by analyzing the problem step by step before arriving at the final answer:"
}

cot_variant_accuracies = {}

for variant_name, cot_prefix in cot_prefix_variants.items():
    print(f"Evaluating CoT prefix variant: {variant_name}")
    
    save_path_variant = output_dir / f"zero_shot_{variant_name}.json"
    
    zero_shot(
        llm=llm,
        system_prompt="Be helpful, answer the question.",
        cot_prefix=cot_prefix,
        dataset=gsm8k,
        save_path=save_path_variant
    )
    
    accuracy_variant = get_accuracy(save_path_variant)
    cot_variant_accuracies[variant_name] = accuracy_variant
    print(f"CoT Variant '{variant_name}' Accuracy: {accuracy_variant:.2%}\n")

print("All CoT prefix evaluation results:")
for variant, acc in cot_variant_accuracies.items():
    print(f"{variant}: {acc:.2%}")

Evaluating CoT prefix variant: step_by_step


100%|██████████| 50/50 [09:42<00:00, 11.66s/it, idx=50, running_accuracy=40.00%]


CoT Variant 'step_by_step' Accuracy: 40.00%

Evaluating CoT prefix variant: reasoning


100%|██████████| 50/50 [09:33<00:00, 11.47s/it, idx=50, running_accuracy=34.00%]


CoT Variant 'reasoning' Accuracy: 34.00%

Evaluating CoT prefix variant: detailed


100%|██████████| 50/50 [10:15<00:00, 12.32s/it, idx=50, running_accuracy=36.00%]

CoT Variant 'detailed' Accuracy: 36.00%

All CoT prefix evaluation results:
step_by_step: 40.00%
reasoning: 34.00%
detailed: 36.00%





## Few-shot prompting

### Few-shot Prompting with Correct Shots

In [None]:
def few_shot(llm, system_prompt, shots, dataset, save_path):
    """
    Performs few-shot prompting on a dataset using an LLM and evaluates accuracy.

    Args:
        llm (LLM): The language model instance used for generating responses.
        system_prompt (str): The system-level instruction guiding the model.
        shots (int): The number of few-shot examples to use.
        dataset (Dataset): A dataset containing questions and their ground-truth answers.
        save_path (str): The file path where results will be saved in JSON format.

    Returns:
        None: Results are saved to `save_path`.

    Process:
    1. Loads pre-written few-shot demonstrations from a text file.
    2. Iterates through each question in the dataset.
    3. Formats the prompt by inserting the question into the demonstration template.
    4. Extracts the ground-truth answer and generates a model response.
    5. Post-processes the generated answer and compares it with the ground truth.
    6. Stores the results, including correctness, in a list.
    7. Writes intermediate results to a JSON file at every iteration.
    8. Computes and displays a running accuracy score using a progress bar.
    """
    total_questions = len(dataset)
    results = []

    demonstrations = read_from_txt(f"/kaggle/input/few-shot-dataset/{shots}_shots.txt")

    with tqdm(total=total_questions, dynamic_ncols=True) as pbar:
        for query_id, data in enumerate(dataset):
            question = data['question']
            ground_truth = extract_answer(data['answer'])

            user_prompt = f"{demonstrations}\n\nQuestion: {question}\nAnswer:"

            generated_text = llm.generate(
                system_prompt=system_prompt,
                user_prompt=user_prompt
            )

            generated_numeric = extract_answer(generated_text)
            final_answer = (postprocess_final_answer(generated_numeric)
                            if generated_numeric is not None
                            else None)

            is_correct = (final_answer == ground_truth)

            results.append({
                "question": question,
                "demonstrations": demonstrations,
                "model_answer": generated_text,
                "generated_numeric": generated_numeric,
                "final_answer": final_answer,
                "ground_truth": ground_truth,
                "is_correct": is_correct
            })

            write_json(results, save_path)

            correct_answers = sum(1 for r in results if r["is_correct"])
            running_accuracy = (correct_answers / (query_id + 1)) * 100
            pbar.set_postfix(idx=query_id + 1, running_accuracy=f"{running_accuracy:.2f}%")
            pbar.update(1)

    write_json(results, save_path)


In [22]:
# I made a slight change to this part of the code and combined two previous sections for better evaluation
shots_list = [2, 4, 8]
shots_accuracy = {}

for shots in shots_list:
    print(f"Evaluating few_shot with {shots} shots...")
    
    few_shot_save_path = output_dir / f"few_shot_{shots}.json"
    
    few_shot(
        llm=llm,
        system_prompt="Be helpful, answer the question.",
        shots=shots,
        dataset=gsm8k,
        save_path=few_shot_save_path
    )
    
    accuracy = get_accuracy(few_shot_save_path)
    shots_accuracy[shots] = accuracy
    print(f"Few_shot with {shots} shots, accuracy: {accuracy:.2%}\n")

print("Few-shot evaluation results:")
for shot_count, acc in shots_accuracy.items():
    print(f"{shot_count} shots: {acc:.2%}")

Evaluating few_shot with 2 shots...


100%|██████████| 50/50 [08:34<00:00, 10.29s/it, idx=50, running_accuracy=40.00%]


Few_shot with 2 shots, accuracy: 40.00%

Evaluating few_shot with 4 shots...


100%|██████████| 50/50 [09:47<00:00, 11.75s/it, idx=50, running_accuracy=40.00%]


Few_shot with 4 shots, accuracy: 40.00%

Evaluating few_shot with 8 shots...


100%|██████████| 50/50 [11:44<00:00, 14.08s/it, idx=50, running_accuracy=36.00%]

Few_shot with 8 shots, accuracy: 36.00%

Few-shot evaluation results:
2 shots: 40.00%
4 shots: 40.00%
8 shots: 36.00%





### Few-shot Prompting with Wrong Shots

In [None]:
def wrong_shot(llm, system_prompt, shots, dataset, save_path):
    """
    Performs few-shot prompting using incorrect demonstrations (negative examples)
    and evaluates the impact on model performance.

    Args:
        llm (LLM): The language model instance used for generating responses.
        system_prompt (str): The system-level instruction guiding the model.
        shots (int): The number of incorrect few-shot examples to use.
        dataset (Dataset): A dataset containing questions and their ground-truth answers.
        save_path (str): The file path where results will be saved in JSON format.

    Returns:
        None: Results are saved to `save_path`.

    Process:
    1. Loads incorrect few-shot demonstrations from a text file (`negative_{shots}_shots.txt`).
    2. Iterates through each question in the dataset.
    3. Formats the prompt by inserting the question into the incorrect demonstration template.
    4. Extracts the ground-truth answer and generates a model response.
    5. Post-processes the generated answer and compares it with the ground truth.
    6. Stores the results, including correctness, in a list.
    7. Writes intermediate results to a JSON file at every iteration.
    8. Computes and displays a running accuracy score using a progress bar.
    """
    total_questions = len(dataset)
    results = []

    demonstrations = read_from_txt(f"/kaggle/input/few-negative-shot/negative_{shots}_shots.txt")

    with tqdm(total=total_questions, dynamic_ncols=True) as pbar:
        for query_id, data in enumerate(dataset):
            question = data['question']
            ground_truth = extract_answer(data['answer'])

            user_prompt = f"{demonstrations}\n\nQuestion: {question}\nAnswer:"

            generated_text = llm.generate(
                system_prompt=system_prompt,
                user_prompt=user_prompt
            )

            generated_numeric = extract_answer(generated_text)
            final_answer = (postprocess_final_answer(generated_numeric)
                            if generated_numeric is not None
                            else None)

            is_correct = (final_answer == ground_truth)

            results.append({
                "question": question,
                "demonstrations": demonstrations,
                "model_answer": generated_text,
                "generated_numeric": generated_numeric,
                "final_answer": final_answer,
                "ground_truth": ground_truth,
                "is_correct": is_correct
            })

            write_json(results, save_path)

            correct_answers = sum(1 for r in results if r["is_correct"])
            running_accuracy = (correct_answers / (query_id + 1)) * 100

            pbar.set_postfix(idx=query_id + 1, running_accuracy=f"{running_accuracy:.2f}%")
            pbar.update(1)

    write_json(results, save_path)

In [24]:
# I made a slight change to this part of the code and combined two previous sections for better evaluation
wrong_shots_list = [2, 4, 8]
wrong_shot_accuracies = {}

for shots in wrong_shots_list:
    print(f"Evaluating wrong_shot with {shots} shots...")
    
    wrong_shot_save_path = output_dir / f"wrong_shot_{shots}.json"
    
    wrong_shot(
        llm=llm,
        system_prompt="Be helpful, answer the question.",
        shots=shots,
        dataset=gsm8k,
        save_path=wrong_shot_save_path
    )
    
    accuracy = get_accuracy(wrong_shot_save_path)
    wrong_shot_accuracies[shots] = accuracy
    print(f"wrong_shot with {shots} shots, accuracy: {accuracy:.2%}\n")

print("Wrong-shot evaluation results:")
for shot_count, acc in wrong_shot_accuracies.items():
    print(f"{shot_count} shots: {acc:.2%}")

Evaluating wrong_shot with 2 shots...


100%|██████████| 50/50 [08:53<00:00, 10.66s/it, idx=50, running_accuracy=34.00%]


wrong_shot with 2 shots, accuracy: 34.00%

Evaluating wrong_shot with 4 shots...


100%|██████████| 50/50 [09:06<00:00, 10.92s/it, idx=50, running_accuracy=34.00%]


wrong_shot with 4 shots, accuracy: 34.00%

Evaluating wrong_shot with 8 shots...


100%|██████████| 50/50 [10:05<00:00, 12.12s/it, idx=50, running_accuracy=42.00%]

wrong_shot with 8 shots, accuracy: 42.00%

Wrong-shot evaluation results:
2 shots: 34.00%
4 shots: 34.00%
8 shots: 42.00%





## Self-Consistency

In [None]:
def self_consistency(llm, system_prompt, K, dataset, save_path):
    """
    Implements the Self-Consistency method for answer generation.
    
    Instead of relying on a single response, this function generates K different responses
    for each question and selects the most frequently occurring answer as the final prediction.
    This approach aims to improve accuracy and robustness.

    Args:
        llm (LLM): The language model instance for text generation.
        system_prompt (str): The system-level instruction guiding the model.
        K (int): The number of independent generations for self-consistency.
        dataset (Dataset): The dataset containing questions and ground-truth answers.
        save_path (str): Path to store results in a JSON file.

    Returns:
        None: Results are saved to `save_path`.

    Process:
    1. Iterates through each question in the dataset.
    2. Generates K independent responses for the same question.
    3. Extracts and post-processes answers from the generated text.
    4. Selects the most frequently occurring answer (majority voting).
    5. Compares the selected answer with the ground truth.
    6. Stores results in a list and updates a progress bar with running accuracy.
    7. Writes results progressively to prevent data loss.
    """
    total_questions = len(dataset)
    results = []

    with tqdm(total=total_questions, dynamic_ncols=True) as pbar:
        for query_id, data in enumerate(dataset):
            question = data['question']
            ground_truth = extract_answer(data['answer'])

            generated_texts = []
            numeric_answers = []

            for _ in range(K):
                response_text = llm.generate(
                    system_prompt=system_prompt,
                    user_prompt=question
                )
                generated_texts.append(response_text)

                numeric_ans = extract_answer(response_text)
                if numeric_ans is not None:
                    final_ans = postprocess_final_answer(numeric_ans)
                else:
                    final_ans = None

                numeric_answers.append(final_ans)

            freq_counter = Counter(numeric_answers)
            if None in freq_counter and len(freq_counter) > 1:
                freq_counter.pop(None)

            best_answer, _ = freq_counter.most_common(1)[0] if freq_counter else (None, 0)

            is_correct = (best_answer == ground_truth)

            results.append({
                "question": question,
                "model_answers": generated_texts,    
                "numeric_answers": numeric_answers,
                "best_answer": best_answer,
                "ground_truth": ground_truth,
                "is_correct": is_correct
            })

            write_json(results, save_path)

            correct_answers = sum(1 for r in results if r["is_correct"])
            running_accuracy = (correct_answers / (query_id + 1)) * 100

            pbar.set_postfix(idx=query_id + 1, running_accuracy=f"{running_accuracy:.2f}%")
            pbar.update(1)

    write_json(results, save_path)

In [None]:
# I made a slight change to this part of the code and combined two previous sections for better evaluation
for current_K in [3, 4, 5]:
    print(f"Running self_consistency with K = {current_K}...")
    self_consistency_save_path = output_dir / f"self_consistency_{current_K}.json"
    llm.temperature = 1.0

    self_consistency(
        llm=llm,
        system_prompt="Be helpful and answer the question concisely.",
        K=current_K,
        dataset=gsm8k,
        save_path=self_consistency_save_path
    )

    accuracy = get_accuracy(self_consistency_save_path)
    print(f"self_consistency with K = {current_K}, accuracy: {accuracy:.2%}\n")

Running self_consistency with K = 3...


100%|██████████| 50/50 [20:41<00:00, 24.82s/it, idx=50, running_accuracy=30.00%]


self_consistency with K = 3, accuracy: 30.00%

Running self_consistency with K = 4...


100%|██████████| 50/50 [27:31<00:00, 33.03s/it, idx=50, running_accuracy=30.00%]


self_consistency with K = 4, accuracy: 30.00%

Running self_consistency with K = 5...


100%|██████████| 50/50 [35:04<00:00, 42.08s/it, idx=50, running_accuracy=30.00%]

self_consistency with K = 5, accuracy: 30.00%






## Verbalized Confidence

In [None]:
def verbalized_confidence(llm, system_prompt, K, dataset, save_path):
    """
    Generates answers to a set of questions using a language model (LLM) and assigns a confidence score 
    to each generated response. Selects the most confident response and evaluates its correctness.

    Args:
        llm: The language model used for generating responses.
        system_prompt (str): The initial system prompt to guide the model.
        K (int): Number of answer candidates to generate per question.
        dataset (list): List of dictionaries, each containing a question and its corresponding answer.
        save_path (str): Path to save the results as a JSON file.

    Returns:
        None (Results are saved to a JSON file).
    """

    total_questions = len(dataset)
    results = []

    with tqdm(total=total_questions, dynamic_ncols=True) as pbar:
        for query_id, data in enumerate(dataset):
            question = data["question"]
            ground_truth = extract_answer(data["answer"])

            candidate_answers = []
            candidate_confidences = []
            candidate_texts = []

            for _ in range(K):
                answer_prompt = (
                    f"{question}\n"
                    "Let's reason this out step-by-step, then give a final numeric answer.\n"
                    "Answer:"
                )
                generated_answer_text = llm.generate(
                    system_prompt=system_prompt,
                    user_prompt=answer_prompt
                )

                numeric_ans = extract_answer(generated_answer_text)
                final_ans = (
                    postprocess_final_answer(numeric_ans) if numeric_ans is not None else None
                )

                confidence_prompt = (
                    f"I provided the answer '{final_ans}'. "
                    "On a scale from 0 to 100, how confident am I in this answer? "
                    "Please only respond with the number."
                )
                generated_confidence_text = llm.generate(
                    system_prompt=system_prompt,
                    user_prompt=confidence_prompt
                )

                conf_match = re.findall(r"[-+]?\d*\.\d+|\d+", generated_confidence_text)
                if conf_match:
                    try:
                        raw_conf = float(conf_match[-1])
                        confidence_value = max(0.0, min(100.0, raw_conf))
                    except ValueError:
                        confidence_value = 0.0
                else:
                    confidence_value = 0.0

                candidate_answers.append(final_ans)
                candidate_confidences.append(confidence_value)
                candidate_texts.append({
                    "answer_text": generated_answer_text,
                    "confidence_text": generated_confidence_text
                })

            if candidate_confidences:
                max_index = max(range(len(candidate_confidences)), key=candidate_confidences.__getitem__)
                best_answer = candidate_answers[max_index]
                best_confidence = candidate_confidences[max_index]
            else:
                best_answer = None
                best_confidence = 0.0

            is_correct = (best_answer == ground_truth)

            results.append({
                "question": question,
                "ground_truth": ground_truth,
                "candidate_answers": candidate_answers,
                "candidate_confidences": candidate_confidences,
                "candidate_texts": candidate_texts,
                "best_answer": best_answer,
                "best_confidence": best_confidence,
                "is_correct": is_correct
            })

            write_json(results, save_path)

            correct_answers = sum(1 for r in results if r["is_correct"])
            running_accuracy = (correct_answers / (query_id + 1)) * 100

            pbar.set_postfix(idx=query_id + 1, running_accuracy=f"{running_accuracy:.2f}%")
            pbar.update(1)

    write_json(results, save_path)

In [None]:
# I made a slight change to this part of the code and combined two previous sections for better evaluation
verbalized_confidence_results = {}

for current_K in [3, 4, 5]:
    print(f"Running verbalized_confidence with K = {current_K} answer candidates...")
    
    save_path = output_dir / f"verbalized_confidence_{current_K}.json"
    llm.do_sample = True  
    llm.temperature = 1.0  

    verbalized_confidence(
        llm=llm,
        system_prompt="Be helpful, answer the question.",
        K=current_K,
        dataset=gsm8k,
        save_path=save_path
    )
    
    accuracy = get_accuracy(save_path)
    verbalized_confidence_results[current_K] = accuracy
    print(f"verbalized_confidence with K = {current_K}, accuracy: {accuracy:.2%}\n")

Running verbalized_confidence with K = 3 answer candidates...


100%|██████████| 50/50 [36:44<00:00, 44.09s/it, idx=50, running_accuracy=36.00%]


verbalized_confidence with K = 3, accuracy: 36.00%

Running verbalized_confidence with K = 4 answer candidates...


100%|██████████| 50/50 [48:20<00:00, 58.00s/it, idx=50, running_accuracy=36.00%]


verbalized_confidence with K = 4, accuracy: 36.00%

Running verbalized_confidence with K = 5 answer candidates...


100%|██████████| 50/50 [59:25<00:00, 71.31s/it, idx=50, running_accuracy=36.00%]

verbalized_confidence with K = 5, accuracy: 36.00%






## Subquestion Prompting

In [None]:
def subquestion(llm, system_prompt, dataset, save_path):
    """
    Decomposes complex questions into a series of subquestions (few-shot style) and
    then solves them step-by-step, returning a final numeric answer.

    We provide a short demonstration for how to break down a math word problem
    into simpler subquestions, then answer them in a second step with chain-of-thought.

    Args:
        llm: The language model used for generating responses.
        system_prompt (str): The initial system prompt to guide the model's general behavior.
        dataset (list): A list of dictionaries, each containing a question and its corresponding answer.
        save_path (str): Path to save the results as a JSON file.
    """
    total_questions = len(dataset)
    results = []

    llm.do_sample = True  
    llm.temperature = 0.7

    decomposition_demo = """\
**Example Decomposition**:
Question: "Tom has 3 apples and 2 oranges. If each fruit costs 2 dollars, how much money does Tom need?"
Subquestions:
1) How many total fruits does Tom have?
2) What is the cost for each fruit?
3) What is the total cost for all fruits?

---

Question: "Sally runs 2 miles every day for 7 days. How many miles does she run in total?"
Subquestions:
1) How many days does Sally run?
2) How many miles does she run each day?
3) What is the total after 7 days?
---
"""

    solving_demo = """\
**Example Solving**:
Subquestions:
1) How many total fruits does Tom have?
Answer: 3 + 2 = 5
2) What is the cost for each fruit?
Answer: 2 dollars per fruit
3) What is the total cost for all fruits?
Answer: 5 fruits * 2 dollars = 10

Final numeric answer: 10

---

Subquestions:
1) How many days does Sally run?
Answer: 7
2) How many miles does she run each day?
Answer: 2
3) What is the total after 7 days?
Answer: 7 * 2 = 14

Final numeric answer: 14
---
"""

    with tqdm(total=total_questions, dynamic_ncols=True) as pbar:
        for query_id, data in enumerate(dataset):
            question = data["question"]
            ground_truth = extract_answer(data["answer"])

            decomposition_prompt = (
                f"{decomposition_demo}"
                f"Now decompose the following question into simpler subquestions:\n\n"
                f"Question:\n{question}\n\n"
                "Subquestions:\n"
            )
            subquestions_text = llm.generate(
                system_prompt=system_prompt,
                user_prompt=decomposition_prompt
            )

            solving_prompt = (
                f"{solving_demo}\n"
                f"Now solve these subquestions step by step:\n"
                f"{subquestions_text}\n\n"
                "Finally, provide the numeric answer at the end.\nAnswer:"
            )
            solving_text = llm.generate(
                system_prompt=system_prompt,
                user_prompt=solving_prompt
            )

            final_answer_raw = extract_answer(solving_text)
            final_answer = postprocess_final_answer(final_answer_raw) if final_answer_raw else None
            is_correct = (final_answer == ground_truth)

            results.append({
                "question": question,
                "subquestions_text": subquestions_text,
                "solving_text": solving_text,
                "final_answer": final_answer,
                "ground_truth": ground_truth,
                "is_correct": is_correct
            })

            write_json(results, save_path)

            correct_answers = sum(1 for r in results if r["is_correct"])
            running_accuracy = (correct_answers / (query_id + 1)) * 100

            pbar.set_postfix(
                idx=query_id + 1,
                running_accuracy=f"{running_accuracy:.2f}%"
            )
            pbar.update(1)

    write_json(results, save_path)

In [None]:
subquestion_save_path = output_dir / "subquestion.json"

llm.do_sample = False  
llm.temperature = 1.0  

subquestion(
    llm, 
    system_prompt="Be helpful, answer the question.",  
    dataset=gsm8k, 
    save_path=subquestion_save_path 
)

100%|██████████| 50/50 [18:31<00:00, 22.22s/it, idx=50, running_accuracy=4.00%]


In [76]:
json_path = output_dir / f"subquestion.json"
accuracy = get_accuracy(json_path)
print(f"method: subquestion, accuracy: {accuracy}")

method: subquestion, accuracy: 0.04


## Comparison

**TODO:** Compare all the algorithms implemented in the notebook in your report, highlighting which ones align with your expectations and which ones do not. Provide a detailed analysis of your observations, explaining the reasons behind each outcome. For instance, Why advanced methods fail to perform effectively.