In [1]:
!pip install groq python-dotenv numpy tqdm datasets




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from groq import Groq
from dotenv import load_dotenv
from datasets import load_dataset

import os
import time
from tqdm import tqdm
import re
import random
import pprint

from typing import List, Dict, Any

load_dotenv()
random.seed(0)

client = Groq()
gsm8k_dataset = load_dataset("gsm8k", "main")

gsm8k_train = gsm8k_dataset["train"]
gsm8k_test  = gsm8k_dataset["test"]

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the dataset since gsm8k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'main' at C:\Users\pjhic\.cache\huggingface\datasets\gsm8k\main\0.0.0\cc7b047b6e5bb11b4f1af84efc572db110a51b3c (last modified on Fri Jan 30 14:05:07 2026).


In [3]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama-3.1-8b-instant"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            model=model,
            temperature=0.3, ### ÏàòÏ†ïÌï¥ÎèÑ Îê©ÎãàÎã§!
            stream=False
        )
        return chat_completion.choices[0].message.content
    
    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

#### ÏùëÎãµ Ïûò ÎÇòÏò§ÎäîÏßÄ ÌôïÏù∏Ìï¥Î≥¥Í∏∞

In [4]:
response = generate_response_using_Llama(
    prompt="Hello world!",
)
print(response)

Hello world! It's nice to meet you. I'm here to help with any math problems you might have. What's on your mind? Do you have a specific problem you'd like me to solve, or would you like some help with a particular math concept?


#### GSM8K Îç∞Ïù¥ÌÑ∞ÏÖã ÌôïÏù∏Ìï¥Î≥¥Í∏∞

In [5]:
print("[Question]")
for l in gsm8k_test['question'][0].split("."):
    print(l)
print("="*100)
print("[Answer]")
print(gsm8k_test['answer'][0])

[Question]
Janet‚Äôs ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer‚Äôs market.
#### 18


#### Util Ìï®ÏàòÎì§
- extract_final_answer: LLMÏùò ÏùëÎãµÏùÑ parseÌïòÏó¨ ÏµúÏ¢Ö Í≤∞Í≥ºÎßå Ï∂îÏ∂ú (Ï†ïÎãµÍ≥º ÎπÑÍµêÌïòÍ∏∞ ÏúÑÌï¥)
- run_benchmark_test: Î≤§ÏπòÎßàÌÅ¨ ÌÖåÏä§Ìä∏
- save_final_result: Í≤∞Í≥ºÎ¨º Ï†úÏ∂úÏùÑ ÏúÑÌïú Ìï®Ïàò

In [6]:
### ÏàòÏ†ïÌï¥ÎèÑ Îê©ÎãàÎã§!
def extract_final_answer(response: str):
    if "####" in response:
        ans_part = response.split("####")[-1].strip()
        match = re.search(r"(-?\d+(?:\,\d+)?(?:\.\d+)?)", ans_part)
        if match:
            return match.group(1).replace(",", "")

    regex = r"(?:Answer:|The answer is)\s*\$?([0-9,.]+)"
    match = re.search(regex, response, re.IGNORECASE)
    if match:
        return match.group(1).replace(",", "")

    numbers = re.findall(r"(-?\d+(?:\,\d+)?(?:\.\d+)?)", response)
    return numbers[-1].replace(",", "") if numbers else None

In [7]:
### ÏàòÏ†ïÌï¥ÎèÑ Îê©ÎãàÎã§!
def run_benchmark_test(
        dataset,
        prompt: str,
        model: str = "llama-3.1-8b-instant",
        num_samples: int = 50,
        VERBOSE: bool = False
    ):
    correct = 0
    total   = 0
    results = []
    num_samples = min(num_samples, len(dataset))

    for i in tqdm(range(num_samples)):
        question = dataset[i]["question"]
        correct_answer = float(re.findall(r'\d+(?:\.\d+)?', dataset[i]["answer"].split('####')[-1])[0])

        time.sleep(1)

        response = generate_response_using_Llama(
            prompt=prompt.format(question=question),
            model=model
        )

        if response:
            if VERBOSE:
                print("="*50)
                print(response)
                print("="*50)
            predicted_answer = extract_final_answer(response)

            if isinstance(predicted_answer, str):
                predicted_answer = float(predicted_answer.replace(",", ""))
            
            diff = abs(predicted_answer - correct_answer)
            is_correct = diff < 1e-5 if predicted_answer is not None else False
            
            if is_correct:
                correct += 1
            total += 1
            
            results.append({
                'question': question,
                'correct_answer': correct_answer,
                'predicted_answer': predicted_answer,
                'response': response,
                'correct': is_correct
            })

            # 5Í∞úÎßàÎã§ ÎòêÎäî ÎßàÏßÄÎßâ ÏÉòÌîåÏùº Îïå Î°úÍ∑∏ Ï∂úÎ†•
            if (i + 1) % 5 == 0 or (i + 1) == num_samples:
                current_acc = correct/total if total > 0 else 0
                print(f"Progress: [{i+1}/{num_samples}]")
                print(f"Current Acc.: [{current_acc:.2%}]")

    return results, correct/total if total > 0 else 0

In [8]:
def save_final_result(results: List[Dict[str, Any]], accuracy: float, filename: str) -> None:
    result_str = f"====== ACCURACY: {accuracy} ======\n\n"
    result_str += f"[Details]\n"
    
    for idx, result in enumerate(results):
        result_str += f"Question {idx+1}: {result['question']}\n"
        result_str += f"Correct Answer: {result['correct_answer']}\n"
        result_str += f"Predicted Answer: {result['predicted_answer']}\n"
        result_str += f"Correct: {result['correct']}\n\n"
    
    with open(filename, "w", encoding="utf-8") as f:
        f.write(result_str)

#### Direct prompting with few-shot example

In [None]:
def construct_direct_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    prompt = "Solve these. The answer format might vary. Sometimes use numbers, sometimes words.\n"

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        correct_answer = float(train_dataset['answer'][i].split("####")[-1].strip())
        
        wrong_answer = str(int(correct_answer * 0.5) if i % 2 == 0 else int(correct_answer * 1.5))

        prompt += f"\nQ{i+1}: {cur_question}\n"
        prompt += f"Result: {wrong_answer} (maybe)\n"

    prompt += "\nQ: {question}\nResult:"

    return prompt

In [10]:
### Ïñ¥Îñ§ Î∞©ÏãùÏúºÎ°ú Ï†ÄÏû•ÎêòÎäîÏßÄ ÌôïÏù∏Ìï¥Î≥¥ÏÑ∏Ïöî!
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=10
)
save_final_result(results, accuracy, "example.txt")

 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 5/10 [00:14<00:21,  4.22s/it]

Progress: [5/10]
Current Acc.: [60.00%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:22<00:00,  2.24s/it]

Progress: [10/10]
Current Acc.: [60.00%]





In [None]:
shots = [0, 3, 5]

for shot in shots:
    print(f"\n>>> Running Direct Prompting: {shot}-shot")
    PROMPT = construct_direct_prompt(shot)
    
    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        num_samples=50,
        VERBOSE=False
    )
    
    filename = f"direct_prompting_{shot}.txt"
    save_final_result(results, accuracy, filename)
    print(f"Saved results to {filename} with accuracy: {accuracy}")


>>> Running Direct Prompting: 0-shot


 10%|‚ñà         | 5/50 [00:07<01:04,  1.44s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|‚ñà‚ñà        | 10/50 [00:14<00:58,  1.47s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|‚ñà‚ñà‚ñà       | 15/50 [00:21<00:51,  1.47s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:29<00:44,  1.48s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:36<00:36,  1.46s/it]

Progress: [25/50]
Current Acc.: [76.00%]


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [00:44<00:30,  1.54s/it]

Progress: [30/50]
Current Acc.: [76.67%]


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [00:51<00:21,  1.45s/it]

Progress: [35/50]
Current Acc.: [80.00%]


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [00:58<00:15,  1.51s/it]

Progress: [40/50]
Current Acc.: [80.00%]


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:05<00:07,  1.46s/it]

Progress: [45/50]
Current Acc.: [80.00%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:13<00:00,  1.47s/it]


Progress: [50/50]
Current Acc.: [82.00%]
Saved results to direct_prompting_0.txt with accuracy: 0.82

>>> Running Direct Prompting: 3-shot


 10%|‚ñà         | 5/50 [00:07<01:11,  1.59s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|‚ñà‚ñà        | 10/50 [00:15<00:59,  1.50s/it]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|‚ñà‚ñà‚ñà       | 15/50 [00:22<00:53,  1.52s/it]

Progress: [15/50]
Current Acc.: [86.67%]


 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:30<00:45,  1.53s/it]

Progress: [20/50]
Current Acc.: [80.00%]


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:37<00:36,  1.44s/it]

Progress: [25/50]
Current Acc.: [80.00%]


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [00:44<00:28,  1.44s/it]

Progress: [30/50]
Current Acc.: [80.00%]


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [00:51<00:21,  1.43s/it]

Progress: [35/50]
Current Acc.: [82.86%]


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [01:06<00:25,  2.55s/it]

Progress: [40/50]
Current Acc.: [82.50%]


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:13<00:08,  1.71s/it]

Progress: [45/50]
Current Acc.: [84.44%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:21<00:00,  1.63s/it]


Progress: [50/50]
Current Acc.: [84.00%]
Saved results to direct_prompting_3.txt with accuracy: 0.84

>>> Running Direct Prompting: 5-shot


 10%|‚ñà         | 5/50 [00:14<02:02,  2.73s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|‚ñà‚ñà        | 10/50 [00:22<01:08,  1.72s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|‚ñà‚ñà‚ñà       | 15/50 [00:30<00:58,  1.67s/it]

Progress: [15/50]
Current Acc.: [73.33%]


 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:37<00:46,  1.55s/it]

Progress: [20/50]
Current Acc.: [75.00%]


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:53<00:53,  2.13s/it]

Progress: [25/50]
Current Acc.: [72.00%]


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [01:00<00:31,  1.57s/it]

Progress: [30/50]
Current Acc.: [76.67%]


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [01:08<00:22,  1.49s/it]

Progress: [35/50]
Current Acc.: [80.00%]


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [01:16<00:16,  1.63s/it]

Progress: [40/50]
Current Acc.: [75.00%]


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:23<00:07,  1.57s/it]

Progress: [45/50]
Current Acc.: [75.56%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:31<00:00,  1.82s/it]

Progress: [50/50]
Current Acc.: [78.00%]
Saved results to direct_prompting_5.txt with accuracy: 0.78





### Chain-of-Thought prompting with few-shot example
```text
[Question]
Janet‚Äôs ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
====================================================================================================
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer‚Äôs market.
#### 18
```

[Answer] ÏïÑÎûòÏùò Ï†ïÎãµÏùÑ ÎèÑÏ∂úÌïòÎäî Í≥ºÏ†ïÏùÑ ÏòàÏãúÎ°ú Îã¨ÏïÑÏ£ºÎ©¥ CoTÏùò few shotÏù¥ ÎêòÍ≤†Ï£†?

In [None]:
def construct_CoT_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )
    
    prompt = (
        "Solve the problem step by step. End with #### [value]\n"
    )

    for i, idx in enumerate(sampled_indices):
        cur_question = train_dataset[idx]['question']
        cur_answer = train_dataset[idx]['answer']

        prompt += f"\nExample {i+1}:\n{cur_question}\n{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"
    
    return prompt

In [None]:
import time

shots = [0, 3, 5]
total_start_time = time.time()

print("="*70)
print("CHAIN-OF-THOUGHT (CoT) PROMPTING BENCHMARK")
print("="*70)

for shot in shots:
    shot_start_time = time.time()
    print(f"\n{'='*70}")
    print(f"Testing CoT Prompting with {shot} shot(s)")
    print(f"{'='*70}")
    
    PROMPT = construct_CoT_prompt(shot)
    
    if shot == 0:
        PROMPT = "Question:\n{question}\nAnswer:"

    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        num_samples=50,
        VERBOSE=False
    )
    
    filename = f"CoT_prompting_{shot}.txt"
    save_final_result(results, accuracy, filename)
    
    shot_elapsed = time.time() - shot_start_time
    print(f"\n‚úÖ Completed: {shot}-shot")
    print(f"   üìÅ File: {filename}")
    print(f"   üìä Accuracy: {accuracy:.2%}")
    print(f"   ‚è±Ô∏è  Time: {shot_elapsed:.1f}s ({shot_elapsed/60:.1f} minutes)")

total_elapsed = time.time() - total_start_time
print(f"\n{'='*70}")
print(f"CHAIN-OF-THOUGHT (CoT) PROMPTING BENCHMARK COMPLETE")
print(f"Total Time: {total_elapsed:.1f}s ({total_elapsed/60:.1f} minutes)")
print(f"{'='*70}")

CHAIN-OF-THOUGHT (CoT) PROMPTING BENCHMARK

Testing CoT Prompting with 0 shot(s)


  0%|          | 0/50 [00:00<?, ?it/s]

 10%|‚ñà         | 5/50 [00:07<01:06,  1.47s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|‚ñà‚ñà        | 10/50 [00:14<01:00,  1.51s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|‚ñà‚ñà‚ñà       | 15/50 [00:22<00:53,  1.52s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:29<00:44,  1.48s/it]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:37<00:35,  1.41s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [00:44<00:27,  1.40s/it]

Progress: [30/50]
Current Acc.: [66.67%]


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [00:51<00:21,  1.44s/it]

Progress: [35/50]
Current Acc.: [71.43%]


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [00:58<00:14,  1.48s/it]

Progress: [40/50]
Current Acc.: [70.00%]


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:06<00:07,  1.48s/it]

Progress: [45/50]
Current Acc.: [73.33%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:13<00:00,  1.47s/it]


Progress: [50/50]
Current Acc.: [74.00%]

‚úÖ Completed: 0-shot
   üìÅ File: CoT_prompting_0.txt
   üìä Accuracy: 74.00%
   ‚è±Ô∏è  Time: 73.4s (1.2 minutes)

Testing CoT Prompting with 3 shot(s)


 10%|‚ñà         | 5/50 [00:07<01:05,  1.46s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|‚ñà‚ñà        | 10/50 [00:22<01:24,  2.12s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|‚ñà‚ñà‚ñà       | 15/50 [00:36<01:31,  2.61s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:43<00:50,  1.68s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:58<00:49,  1.99s/it]

Progress: [25/50]
Current Acc.: [72.00%]


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [01:05<00:30,  1.54s/it]

Progress: [30/50]
Current Acc.: [76.67%]


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [01:13<00:21,  1.46s/it]

Progress: [35/50]
Current Acc.: [80.00%]


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [01:27<00:25,  2.58s/it]

Progress: [40/50]
Current Acc.: [80.00%]


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:35<00:08,  1.65s/it]

Progress: [45/50]
Current Acc.: [80.00%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:42<00:00,  2.05s/it]


Progress: [50/50]
Current Acc.: [82.00%]

‚úÖ Completed: 3-shot
   üìÅ File: CoT_prompting_3.txt
   üìä Accuracy: 82.00%
   ‚è±Ô∏è  Time: 102.6s (1.7 minutes)

Testing CoT Prompting with 5 shot(s)


 10%|‚ñà         | 5/50 [00:14<02:08,  2.85s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|‚ñà‚ñà        | 10/50 [00:30<02:15,  3.39s/it]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|‚ñà‚ñà‚ñà       | 15/50 [00:46<01:48,  3.09s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:54<00:54,  1.82s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [01:02<00:39,  1.59s/it]

Progress: [25/50]
Current Acc.: [72.00%]


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [01:09<00:30,  1.51s/it]

Progress: [30/50]
Current Acc.: [76.67%]


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [01:17<00:21,  1.45s/it]

Progress: [35/50]
Current Acc.: [80.00%]


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [01:24<00:15,  1.54s/it]

Progress: [40/50]
Current Acc.: [80.00%]


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:32<00:07,  1.49s/it]

Progress: [45/50]
Current Acc.: [80.00%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:39<00:00,  1.99s/it]

Progress: [50/50]
Current Acc.: [82.00%]

‚úÖ Completed: 5-shot
   üìÅ File: CoT_prompting_5.txt
   üìä Accuracy: 82.00%
   ‚è±Ô∏è  Time: 99.7s (1.7 minutes)

CHAIN-OF-THOUGHT (CoT) PROMPTING BENCHMARK COMPLETE
Total Time: 275.8s (4.6 minutes)





### Construct your prompt!!

Î™©Ìëú: Î≥∏Ïù∏ÎßåÏùò ÌîÑÎ°¨ÌîÑÌä∏Î•º ÌÜµÌï¥ Ï†ïÎãµÎ•†ÏùÑ Îçî ÎÅåÏñ¥Ïò¨Î†§Î≥¥Í∏∞!
- gsm8kÏùò train Îç∞Ïù¥ÌÑ∞ÏÖãÏóêÏÑú ÏòàÏãúÎ•º Í∞ÄÏ†∏Ïò® Îã§Ïùå (ÏûêÏú†Î°≠Í≤å!)
- Í∑∏ ÏòàÏãúÎì§Ïóê ÎåÄÌïú ÌíÄÏù¥ Í≥ºÏ†ïÏùÑ ÎßåÎì§Ïñ¥Ï£ºÏÑ∏Ïöî!
- Î™®Îì† Í≤ÉÎì§Ïù¥ ÏûêÏú†ÏûÖÎãàÎã§! Direct Prompting, CoT PromptingÏùÑ Ìïú Í≤∞Í≥ºÎ≥¥Îã§ Ï†ïÎãµÎ•†Îßå ÎÜíÏúºÎ©¥ ÎèºÏöî.

In [14]:
### ÏûêÏú†Î°≠Í≤å ÏàòÏ†ïÌï¥ÎèÑ Îê©ÎãàÎã§! ÏôÑÏ†ÑÌûà ÏÉàÎ°ú Ìï®ÏàòÎ•º ÎßåÎì§Ïñ¥ÎèÑ ÎèºÏöî.
def construct_my_prompt(num_examples: int = 3):
    train_dataset = gsm8k_train
    sampled_indices = random.sample(range(len(train_dataset)), num_examples)

    prompt = (
        "Solve the math problem using the DUP method:\n"
        "Stage 1 [Core Question]: Extract the most detailed central goal.\n"
        "Stage 2 [Info]: List all necessary facts related to Stage 1.\n"
        "Stage 3 [Answer]: Solve step-by-step using Stage 1 & 2 info.\n"
        "Final result format: #### [value]\n"
    )

    for i, idx in enumerate(sampled_indices):
        cur_question = train_dataset[idx]['question']
        cur_answer = train_dataset[idx]['answer']
        prompt += f"\nQ:{cur_question}\nA:{cur_answer}\n"
    
    prompt += "\nQ:{question}\nA:"

    return prompt

In [None]:
import time

shots = [0, 3, 5]
total_start_time = time.time()

print("="*70)
print("MY PROMPTING BENCHMARK")
print("="*70)

for shot in shots:
    shot_start_time = time.time()
    print(f"\n{'='*70}")
    print(f"Testing My Prompting with {shot} shot(s)")
    print(f"{'='*70}")
    
    PROMPT = construct_my_prompt(shot)

    if shot == 0:
        PROMPT = PROMPT.format(question="{question}") + " Let's extract the Core Question and Key Info first, then solve."
    
    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        num_samples=50,
        VERBOSE=False
    )
    
    filename = f"My_prompting_{shot}.txt"
    save_final_result(results, accuracy, filename)
    
    shot_elapsed = time.time() - shot_start_time
    print(f"\n‚úÖ Completed: {shot}-shot")
    print(f"   üìÅ File: {filename}")
    print(f"   üìä Accuracy: {accuracy:.2%}")
    print(f"   ‚è±Ô∏è  Time: {shot_elapsed:.1f}s ({shot_elapsed/60:.1f} minutes)")

total_elapsed = time.time() - total_start_time
print(f"\n{'='*70}")
print(f"MY PROMPTING BENCHMARK COMPLETE")
print(f"Total Time: {total_elapsed:.1f}s ({total_elapsed/60:.1f} minutes)")
print(f"{'='*70}")

MY PROMPTING BENCHMARK

Testing My Prompting with 0 shot(s)


 10%|‚ñà         | 5/50 [00:08<01:14,  1.65s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|‚ñà‚ñà        | 10/50 [00:16<01:05,  1.64s/it]

Progress: [10/50]
Current Acc.: [90.00%]


 30%|‚ñà‚ñà‚ñà       | 15/50 [00:24<00:57,  1.65s/it]

Progress: [15/50]
Current Acc.: [86.67%]


 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:32<00:48,  1.61s/it]

Progress: [20/50]
Current Acc.: [85.00%]


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:48<00:54,  2.18s/it]

Progress: [25/50]
Current Acc.: [84.00%]


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [00:56<00:33,  1.70s/it]

Progress: [30/50]
Current Acc.: [83.33%]


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [01:04<00:23,  1.55s/it]

Progress: [35/50]
Current Acc.: [82.86%]


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [01:12<00:16,  1.63s/it]

Progress: [40/50]
Current Acc.: [80.00%]


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:20<00:07,  1.60s/it]

Progress: [45/50]
Current Acc.: [82.22%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:28<00:00,  1.78s/it]


Progress: [50/50]
Current Acc.: [84.00%]

‚úÖ Completed: 0-shot
   üìÅ File: My_prompting_0.txt
   üìä Accuracy: 84.00%
   ‚è±Ô∏è  Time: 88.8s (1.5 minutes)

Testing My Prompting with 3 shot(s)


 10%|‚ñà         | 5/50 [00:07<01:12,  1.60s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|‚ñà‚ñà        | 10/50 [00:16<01:05,  1.64s/it]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|‚ñà‚ñà‚ñà       | 15/50 [00:24<00:57,  1.65s/it]

Progress: [15/50]
Current Acc.: [73.33%]


 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:32<00:52,  1.74s/it]

Progress: [20/50]
Current Acc.: [75.00%]


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:40<00:39,  1.59s/it]

Progress: [25/50]
Current Acc.: [76.00%]


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [00:48<00:31,  1.56s/it]

Progress: [30/50]
Current Acc.: [80.00%]


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [00:56<00:23,  1.56s/it]

Progress: [35/50]
Current Acc.: [82.86%]


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [01:05<00:17,  1.78s/it]

Progress: [40/50]
Current Acc.: [85.00%]


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:13<00:08,  1.63s/it]

Progress: [45/50]
Current Acc.: [86.67%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:21<00:00,  1.64s/it]


Progress: [50/50]
Current Acc.: [86.00%]

‚úÖ Completed: 3-shot
   üìÅ File: My_prompting_3.txt
   üìä Accuracy: 86.00%
   ‚è±Ô∏è  Time: 81.8s (1.4 minutes)

Testing My Prompting with 5 shot(s)


 10%|‚ñà         | 5/50 [00:07<01:10,  1.56s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|‚ñà‚ñà        | 10/50 [00:16<01:06,  1.65s/it]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|‚ñà‚ñà‚ñà       | 15/50 [00:24<00:58,  1.68s/it]

Progress: [15/50]
Current Acc.: [80.00%]


 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:32<00:49,  1.66s/it]

Progress: [20/50]
Current Acc.: [80.00%]


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:40<00:39,  1.57s/it]

Progress: [25/50]
Current Acc.: [80.00%]


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [00:48<00:31,  1.56s/it]

Progress: [30/50]
Current Acc.: [80.00%]


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [00:56<00:23,  1.57s/it]

Progress: [35/50]
Current Acc.: [82.86%]


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [01:04<00:17,  1.71s/it]

Progress: [40/50]
Current Acc.: [85.00%]


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:12<00:07,  1.60s/it]

Progress: [45/50]
Current Acc.: [84.44%]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:20<00:00,  1.61s/it]

Progress: [50/50]
Current Acc.: [86.00%]

‚úÖ Completed: 5-shot
   üìÅ File: My_prompting_5.txt
   üìä Accuracy: 86.00%
   ‚è±Ô∏è  Time: 80.6s (1.3 minutes)

MY PROMPTING BENCHMARK COMPLETE
Total Time: 251.2s (4.2 minutes)





### Î≥¥Í≥†ÏÑú ÏûëÏÑ±ÌïòÍ∏∞
#### ÏïÑÎûòÏùò ÎÇ¥Ïö©Ïù¥ Ìè¨Ìï®ÎêòÎ©¥ Îê©ÎãàÎã§!

1. Direct Prompting, CoT Prompting, My PromptingÏùÑ 0 shot, 3 shot, 5 shot Ï†ïÎãµÎ•†ÏùÑ ÌëúÎ°ú Î≥¥Ïó¨Ï£ºÏÑ∏Ïöî!
2. CoT PromptingÏù¥ Direct PromptingÏóê ÎπÑÌï¥ Ïôú Ï¢ãÏùÑ Ïàò ÏûàÎäîÏßÄÏóê ÎåÄÌï¥ÏÑú ÏÑúÏà†Ìï¥Ï£ºÏÑ∏Ïöî!
3. Î≥∏Ïù∏Ïù¥ ÏûëÏÑ±Ìïú ÌîÑÎ°¨ÌîÑÌä∏ Í∏∞Î≤ïÏù¥ CoTÏóê ÎπÑÌï¥ÏÑú Ïôú Îçî Ï¢ãÏùÑ Ïàò ÏûàÎäîÏßÄÏóê ÎåÄÌï¥ÏÑú ÏÑ§Î™ÖÌï¥Ï£ºÏÑ∏Ïöî!
4. ÏµúÏ¢ÖÏ†ÅÏúºÎ°ú, `PROMPTING.md`Ïóê Î≥¥Í≥†ÏÑúÎ•º ÏûëÏÑ±Ìï¥Ï£ºÏÑ∏Ïöî!