In [1]:
!pip install groq python-dotenv numpy tqdm datasets

Collecting groq
  Downloading groq-1.0.0-py3-none-any.whl.metadata (16 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py313-none-any.whl.metadata (7.2 kB)
Collecting huggingface-hub<2.0,>=0.25.0 (from datasets)
  Downloading huggingface_hub-1.3.4-py3-none-any.whl.metadata (13 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub<2.0,>=0.25.0->datasets)
  Using cached hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl.metadata (4.9 kB)
Collecting typer-slim (from huggingface-hub<2.0,>=0.25.0->datasets)
  Downloading typer_slim-0.21.1-py3-none-any.whl.metadata (16 kB)
Downloading groq-1.0.0-py3-none-any.whl (138 kB)
Downloading datasets-4.5.0-py3-none-any.whl (515 kB)
Downloading huggingface_hub-1.3.4-py3-none-any.whl (536 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [1]:
from groq import Groq
from dotenv import load_dotenv
from datasets import load_dataset

import os
from tqdm import tqdm
import re
import random
import pprint

from typing import List, Dict, Any

load_dotenv()
random.seed(0)

client = Groq()
gsm8k_dataset = load_dataset("gsm8k", "main")

gsm8k_train = gsm8k_dataset["train"]
gsm8k_test  = gsm8k_dataset["test"]

In [29]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama-3.1-8b-instant"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            model=model,
            temperature=0.1, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content
    
    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

#### 응답 잘 나오는지 확인해보기

In [4]:
response = generate_response_using_Llama(
    prompt="Hello world!",
)
print(response)

Hello! It's nice to meet you. I'm here to help with any math problems you might have. What's on your mind? Do you have a specific problem you'd like me to solve, or would you like some help with a particular math concept?


#### GSM8K 데이터셋 확인해보기

In [5]:
print("[Question]")
for l in gsm8k_test['question'][0].split("."):
    print(l)
print("="*100)
print("[Answer]")
print(gsm8k_test['answer'][0])

[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


#### Util 함수들
- extract_final_answer: LLM의 응답을 parse하여 최종 결과만 추출 (정답과 비교하기 위해)
- run_benchmark_test: 벤치마크 테스트
- save_final_result: 결과물 제출을 위한 함수

In [2]:
### 수정해도 됩니다!
def extract_final_answer(response: str):
    # 정규식 패턴: "Answer:" 뒤의 숫자나 단위 앞의 숫자를 찾음 
    regex = r"(?:Answer:|Model response:)\s*\$?([0-9,.]+)\b|([0-9,.]+)\s*(meters|cups|miles|minutes)"
    matches = re.finditer(regex, response, re.MULTILINE)
    
    # 숫자 형태의 텍스트만 리스트에 담고, 비어있는 값은 제외
    results = []
    for match in matches:
        val = match.group(1) if match.group(1) else match.group(2)
        if val:
            results.append(val.replace(",", ""))

    # 만약 위에서 못 찾았다면 텍스트 내 모든 숫자를 검색 
    if not results:
        additional_regex = r"\$?([0-9,.]+)"
        additional_matches = re.finditer(additional_regex, response, re.MULTILINE)
        results.extend([m.group(1).replace(",", "") for m in additional_matches if m.group(1)])

    # 리스트가 비어있지 않은 경우에만 마지막 숫자 반환, 아니면 None
    return results[-1] if results and results[-1] != '' else None

In [3]:
### 수정해도 됩니다!
def run_benchmark_test(
        dataset,
        prompt: str,
        model: str = "llama-3.1-8b-instant",
        num_samples: int = 50,
        VERBOSE: bool = False
    ):
    correct = 0
    total   = 0
    results = []

    for i in tqdm(range(min(num_samples, len(dataset)))):
        question = dataset[i]["question"]
        correct_answer = float(re.findall(r'\d+(?:\.\d+)?', dataset[i]["answer"].split('####')[-1])[0])

        # LLM 답변
        response = generate_response_using_Llama(
            prompt=prompt.format(question=question),
            model=model
        )

        if response:
            if VERBOSE:
                print("="*50)
                print(response)
                print("="*50)

            # 정답 추출
            ans_str = extract_final_answer(response)
            predicted_answer = None
            is_correct = False

            # 빈 문자열이나 None이 아닐 때만 숫자로 변환 시도
            if ans_str is not None and ans_str.strip() != '':
                try:
                    predicted_answer = float(ans_str)
                    diff = abs(predicted_answer - correct_answer)
                    is_correct = diff < 1e-5  # 미세하게 차이날 경우 정답 처리
                except ValueError:
                    predicted_answer = None # 숫자로 변환 불가한 경우
            
            if is_correct:
                correct += 1
            total += 1
            
            results.append({
                'question': question,
                'correct_answer': correct_answer,
                'predicted_answer': predicted_answer,
                'response': response,
                'correct': is_correct
            })

            if (i + 1) % 5 == 0:
                current_acc = correct/total if total > 0 else 0
                print(f"Progress: [{i+1}/{num_samples}]")
                print(f"Current Acc.: [{current_acc:.2%}]")

    return results, correct/total if total > 0 else 0

In [4]:
def save_final_result(results: List[Dict[str, Any]], accuracy: float, filename: str) -> None:
    result_str = f"====== ACCURACY: {accuracy} ======\n\n"
    result_str += f"[Details]\n"
    
    for idx, result in enumerate(results):
        result_str += f"Question {idx+1}: {result['question']}\n"
        result_str += f"Correct Answer: {result['correct_answer']}\n"
        result_str += f"Predicted Answer: {result['predicted_answer']}\n"
        result_str += f"Correct: {result['correct']}\n\n"
    
    with open(filename, "w", encoding="utf-8") as f:
        f.write(result_str)

#### Direct prompting with few-shot example

In [9]:
def construct_direct_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    prompt = "Instruction:\nSolve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.\n"

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_answer = train_dataset['answer'][i].split("####")[-1].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [10]:
### 어떤 방식으로 저장되는지 확인해보세요!
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=10
)
save_final_result(results, accuracy, "example.txt")

 50%|██████████████████████████████████████████████                                              | 5/10 [00:02<00:02,  2.40it/s]

Progress: [5/10]
Current Acc.: [100.00%]


100%|███████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.23it/s]

Progress: [10/10]
Current Acc.: [70.00%]





In [11]:
print("Instruction:\nSolve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.\n")

Instruction:
Solve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.



In [13]:
print(PROMPT)

Instruction:
Solve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.

[Example 1]
Question:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Answer:72

[Example 2]
Question:
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
Answer:10

[Example 3]
Question:
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?
Answer:5

Question:
{question}
Answer:


In [None]:
# TODO: 0 shot, 3 shot, 5 shot direct prompting을 통해 벤치마크 테스트를 한 후, 각각 direct_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 direct_prompting_5.txt
# 항상 num_samples=50 입니다!

In [17]:
### 0 shot direct prompting
PROMPT = construct_direct_prompt(0)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "direct_prompting_0.txt")

 10%|█████████▏                                                                                  | 5/50 [00:01<00:17,  2.58it/s]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██████████████████▏                                                                        | 10/50 [00:04<00:17,  2.33it/s]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███████████████████████████▎                                                               | 15/50 [00:06<00:15,  2.27it/s]

Progress: [15/50]
Current Acc.: [80.00%]


 40%|████████████████████████████████████▍                                                      | 20/50 [00:08<00:13,  2.23it/s]

Progress: [20/50]
Current Acc.: [75.00%]


 50%|█████████████████████████████████████████████▌                                             | 25/50 [00:10<00:12,  2.06it/s]

Progress: [25/50]
Current Acc.: [72.00%]


 60%|██████████████████████████████████████████████████████▌                                    | 30/50 [00:13<00:10,  1.95it/s]

Progress: [30/50]
Current Acc.: [73.33%]


 70%|███████████████████████████████████████████████████████████████▋                           | 35/50 [00:26<00:34,  2.29s/it]

Progress: [35/50]
Current Acc.: [77.14%]


 80%|████████████████████████████████████████████████████████████████████████▊                  | 40/50 [00:40<00:26,  2.69s/it]

Progress: [40/50]
Current Acc.: [77.50%]


 90%|█████████████████████████████████████████████████████████████████████████████████▉         | 45/50 [00:53<00:13,  2.76s/it]

Progress: [45/50]
Current Acc.: [77.78%]


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:06<00:00,  1.34s/it]

Progress: [50/50]
Current Acc.: [80.00%]





In [18]:
### 3 shot direct prompting
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "direct_prompting_3.txt")

 10%|█████████▏                                                                                  | 5/50 [00:03<00:39,  1.14it/s]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██████████████████▏                                                                        | 10/50 [00:10<01:12,  1.82s/it]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███████████████████████████▎                                                               | 15/50 [00:28<01:58,  3.39s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████████████████████████████████████▍                                                      | 20/50 [00:48<01:52,  3.76s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████████████████████████████████████████████▌                                             | 25/50 [01:06<01:32,  3.68s/it]

Progress: [25/50]
Current Acc.: [68.00%]


 60%|██████████████████████████████████████████████████████▌                                    | 30/50 [01:25<01:13,  3.68s/it]

Progress: [30/50]
Current Acc.: [73.33%]


 70%|███████████████████████████████████████████████████████████████▋                           | 35/50 [01:43<00:55,  3.70s/it]

Progress: [35/50]
Current Acc.: [77.14%]


 80%|████████████████████████████████████████████████████████████████████████▊                  | 40/50 [02:02<00:37,  3.79s/it]

Progress: [40/50]
Current Acc.: [77.50%]


 90%|█████████████████████████████████████████████████████████████████████████████████▉         | 45/50 [02:22<00:19,  3.83s/it]

Progress: [45/50]
Current Acc.: [75.56%]


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:42<00:00,  3.24s/it]

Progress: [50/50]
Current Acc.: [76.00%]





### Chain-of-Thought prompting with few-shot example
```text
[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
====================================================================================================
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
```

[Answer] 아래의 정답을 도출하는 과정을 예시로 달아주면 CoT의 few shot이 되겠죠?

In [20]:
def construct_CoT_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )
    prompt = (
        "Instruction:\n"
        "Solve the following mathematical questions step-by-step. "
        "Show your reasoning clearly and provide the final answer after '####'.\n"
    )

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_reasoning = train_dataset['answer'][i].split("####")[0].strip()
        cur_answer = train_dataset['answer'][i].split("####")[-1].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Reasoning:\n{cur_reasoning}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [None]:
# TODO: 0 shot, 3 shot, 5 shot CoT prompting을 통해 벤치마크 테스트를 한 후, 각각 CoT_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 CoT_prompting_5.txt
# 항상 num_samples=50 입니다!

In [21]:
### 0 shot CoT prompting
PROMPT = construct_CoT_prompt(0)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "CoT_prompting_0.txt")

 10%|█████████▏                                                                                  | 5/50 [00:02<00:25,  1.77it/s]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██████████████████▏                                                                        | 10/50 [00:05<00:22,  1.75it/s]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███████████████████████████▎                                                               | 15/50 [00:08<00:21,  1.60it/s]

Progress: [15/50]
Current Acc.: [80.00%]


 40%|████████████████████████████████████▍                                                      | 20/50 [00:17<00:57,  1.92s/it]

Progress: [20/50]
Current Acc.: [85.00%]


 50%|█████████████████████████████████████████████▌                                             | 25/50 [00:30<00:53,  2.12s/it]

Progress: [25/50]
Current Acc.: [88.00%]


 60%|██████████████████████████████████████████████████████▌                                    | 30/50 [00:44<00:51,  2.55s/it]

Progress: [30/50]
Current Acc.: [90.00%]


 70%|███████████████████████████████████████████████████████████████▋                           | 35/50 [00:54<00:30,  2.05s/it]

Progress: [35/50]
Current Acc.: [91.43%]


 80%|████████████████████████████████████████████████████████████████████████▊                  | 40/50 [01:07<00:26,  2.67s/it]

Progress: [40/50]
Current Acc.: [90.00%]


 90%|█████████████████████████████████████████████████████████████████████████████████▉         | 45/50 [01:26<00:16,  3.24s/it]

Progress: [45/50]
Current Acc.: [86.67%]


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:39<00:00,  1.99s/it]

Progress: [50/50]
Current Acc.: [88.00%]





In [22]:
### 3 shot CoT prompting
PROMPT = construct_CoT_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "CoT_prompting_3.txt")

 10%|█████████▏                                                                                  | 5/50 [00:29<04:33,  6.09s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|██████████████████▏                                                                        | 10/50 [00:58<03:56,  5.91s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███████████████████████████▎                                                               | 15/50 [01:28<03:25,  5.87s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████████████████████████████████████▍                                                      | 20/50 [01:57<02:56,  5.90s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████████████████████████████████████████████▌                                             | 25/50 [02:26<02:24,  5.76s/it]

Progress: [25/50]
Current Acc.: [72.00%]


 60%|██████████████████████████████████████████████████████▌                                    | 30/50 [02:55<01:56,  5.81s/it]

Progress: [30/50]
Current Acc.: [76.67%]


 70%|███████████████████████████████████████████████████████████████▋                           | 35/50 [03:24<01:26,  5.80s/it]

Progress: [35/50]
Current Acc.: [80.00%]


 80%|████████████████████████████████████████████████████████████████████████▊                  | 40/50 [03:53<00:58,  5.84s/it]

Progress: [40/50]
Current Acc.: [75.00%]


 90%|█████████████████████████████████████████████████████████████████████████████████▉         | 45/50 [04:23<00:29,  5.87s/it]

Progress: [45/50]
Current Acc.: [73.33%]


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50/50 [04:59<00:00,  5.99s/it]

Progress: [50/50]
Current Acc.: [72.00%]





In [23]:
### 5 shot CoT prompting
PROMPT = construct_CoT_prompt(5)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "CoT_prompting_5.txt")

 10%|█████████▏                                                                                  | 5/50 [00:41<06:18,  8.41s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██████████████████▏                                                                        | 10/50 [01:24<05:44,  8.60s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███████████████████████████▎                                                               | 15/50 [02:17<05:55, 10.15s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████████████████████████████████████▍                                                      | 20/50 [03:00<04:23,  8.80s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████████████████████████████████████████████▌                                             | 25/50 [03:42<03:30,  8.43s/it]

Progress: [25/50]
Current Acc.: [68.00%]


 60%|██████████████████████████████████████████████████████▌                                    | 30/50 [04:21<02:24,  7.23s/it]

Progress: [30/50]
Current Acc.: [73.33%]


 70%|███████████████████████████████████████████████████████████████▋                           | 35/50 [05:02<02:01,  8.07s/it]

Progress: [35/50]
Current Acc.: [77.14%]


 80%|████████████████████████████████████████████████████████████████████████▊                  | 40/50 [05:45<01:26,  8.68s/it]

Progress: [40/50]
Current Acc.: [75.00%]


 90%|█████████████████████████████████████████████████████████████████████████████████▉         | 45/50 [06:29<00:43,  8.76s/it]

Progress: [45/50]
Current Acc.: [75.56%]


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50/50 [07:07<00:00,  8.55s/it]

Progress: [50/50]
Current Acc.: [78.00%]





### Construct your prompt!!

목표: 본인만의 프롬프트를 통해 정답률을 더 끌어올려보기!
- gsm8k의 train 데이터셋에서 예시를 가져온 다음 (자유롭게!)
- 그 예시들에 대한 풀이 과정을 만들어주세요!
- 모든 것들이 자유입니다! Direct Prompting, CoT Prompting을 한 결과보다 정답률만 높으면 돼요.

#### 아이디어 메모 (Phase 1)

1. LLM이 question을 보고 CoT로 풀어봄 (0 shot). 풀이과정도 서술함
2. question, 정답(풀이과정 포함), LLM의 답변(풀이과정 포함), 정답 여부를 바탕으로 각 문제의 난이도를 1~5등급으로 나눔
3. few shot에 다양한 등급의 문제를 넣음

## Phase 1 : few shot example 선정

In [5]:
def generate_response_using_Llama_with_sys_prompt(
        system_prompt: str,
        user_prompt: str,
        model: str = "llama-3.1-8b-instant"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user", 
                    "content": user_prompt
                }
            ],
            model=model,
            temperature=0.1, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content
    
    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

### 1-1. 0shot CoT로 문제 풀고 오답 분류

In [7]:
import random
import time
from tqdm import tqdm

# 1. Sampling from gsm8k_train
sample_size = 50
train_indices = random.sample(range(len(gsm8k_train)), sample_size)
train_subset = gsm8k_train.select(train_indices)

# 2. Run 0-shot CoT to find errors
def solve_problem_cot_0shot(subset):
    problem_solve_list = []
    # 0-shot CoT Prompt
    cot_0shot_system_prompt = "You are a helpful assistant that solves math problems step-by-step."
    cot_0shot_prompt = "Instruction:\nSolve the following question step-by-step. Provide final answer after '####'.\n\nQuestion:\n{question}\nAnswer:"
    
    print(f"Testing {len(subset)} samples from train set...")
    for i in tqdm(range(len(subset))):
        question = subset[i]['question']
        correct_full = subset[i]['answer']
        correct_val = float(re.findall(r'\d+(?:\.\d+)?', correct_full.split('####')[-1])[0])
        
        response = generate_response_using_Llama_with_sys_prompt(
            system_prompt=cot_0shot_system_prompt,
            user_prompt=cot_0shot_prompt.format(question=question)
        )
        
        if response:
            pred_val_str = extract_final_answer(response)
            is_correct = False
            if pred_val_str:
                try:
                    pred_val = float(pred_val_str)
                    is_correct = abs(pred_val - correct_val) < 1e-5
                except ValueError: pass
            
            # 3. Collect only errors
            problem_solve_list.append({
                'question': question,
                'correct_solution': correct_full,
                'model_response': response,
                'is_correct': is_correct
            })
    return problem_solve_list

# 실행: 오답 노트 생성
problem_solve_list = solve_problem_cot_0shot(train_subset)

Testing 50 samples from train set...


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:50<00:00,  2.20s/it]


### 1-2. 정답과 LLM 답변 및 풀이과정을 보고 난이도 분류 후 few shot example 선정

In [9]:
# 3. Difficulty Classification (Grade 1~5)

graded_problems = []
system_prompt = "You are an expert math educator who evaluates problem difficulty based on reasoning complexity."
classification_prompt_template = """Instruction:
Evaluate the mathematical complexity of the question and assign a Grade from 1 (Hardest) to 5 (Easiest).
Consider the inherent logic of the correct solution and where the model might struggle.

[Question]: {question}
[Correct Solution]: {correct_sol}
[Model's Attempt]: {attempt}
[Answer Verification]: {is_correct}

Response Format (Strict):
Grade: [Number]
Reasoning: [1-sentence explanation]"""

print("Classifying error difficulty...")
for i, problem in enumerate(problem_solve_list):
    
    prompt = classification_prompt_template.format(
        question=problem['question'][:1000],          # 질문 길이를 제한
        correct_sol=problem['correct_solution'][:2000], # 정답 해설 길이를 제한
        attempt=problem['model_response'][:2000],       # 모델 답변 길이를 제한
        is_correct=problem['is_correct']
    )

    grade_resp = generate_response_using_Llama_with_sys_prompt(
        system_prompt=system_prompt,
        user_prompt=prompt
    )
    if grade_resp:
        # "Grade: 1" 같은 형식에서 숫자 추출
        grade_match = re.search(r"Grade:\s*([1-5])", grade_resp)
        grade = int(grade_match.group(1)) if grade_match else 3
        
        reason_match = re.search(r"Reasoning:\s*(.*)", grade_resp)
        reason = reason_match.group(1).strip() if reason_match else "No reasoning provided."
        
        graded_problems.append({
            'question': problem['question'],
            'answer': problem['correct_solution'],
            'model_response': problem['model_response'],
            'is_correct': problem['is_correct'],
            'grade': grade,
            'grade_reason': reason
        })
        
# 난이도 순(Grade 1 우선)으로 정렬 및 분류
graded_problems = sorted(graded_problems, key=lambda x: x['grade'])

Classifying error difficulty...
API call error: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kg3ze8fvejnrfvfgpxcmn4g3` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 8619, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


### 데이터 분석 및 few shot example 선정

In [11]:
graded_problems

[{'question': 'Greg has his own dog walking business. He charges $20 per dog plus $1 per minute per dog for walking the dog. If he walks one dog for 10 minutes, two dogs for 7 minutes and three dogs for 9 minutes, how much money, in dollars, does he earn?',
  'answer': 'Single dog is $20 + (10 minutes * $1 per minute) = $<<20+10*1=30>>30.\nTwo dogs are 2 * ($20 + (7 minutes * $1 per minute)) = $<<2*(20+7*1)=54>>54\nThree dogs are 3 * ($20 + (9 minutes * $1 per minute)) = $<<3*(20+9*1)=87>>87\nThe total is $30 + $54 + $87 = $<<30+54+87=171>>171.\n#### 171',
  'model_response': 'To find out how much money Greg earns, we need to calculate the total cost for each dog and then add them up.\n\n**Step 1: Calculate the cost for walking one dog for 10 minutes**\n\n- Cost per dog: $20\n- Additional cost per minute per dog: $1\n- Total minutes walked: 10 minutes\n- Additional cost for 10 minutes: 10 * $1 = $10\n- Total cost for one dog: $20 + $10 = $30\n\n**Step 2: Calculate the cost for walking 

In [12]:
grade_1_pool = [ex for ex in graded_problems if ex['grade'] == 1]
grade_2_pool = [ex for ex in graded_problems if ex['grade'] == 2]
grade_3_pool = [ex for ex in graded_problems if ex['grade'] == 3]
grade_4_pool = [ex for ex in graded_problems if ex['grade'] == 4]
grade_5_pool = [ex for ex in graded_problems if ex['grade'] == 5]

In [15]:
print(len(grade_1_pool), len(grade_2_pool), len(grade_3_pool), len(grade_4_pool), len(grade_5_pool))

0 7 40 0 2


In [25]:
few_shot_examples = []
few_shot_examples.append(grade_2_pool[3])
few_shot_examples.append(grade_3_pool[8])
few_shot_examples.append(grade_5_pool[0])
few_shot_examples.append(grade_3_pool[18])
few_shot_examples.append(grade_3_pool[26])

## Phase 2: CoT + 검증 프롬프트

In [27]:
### 자유롭게 수정해도 됩니다! 완전히 새로 함수를 만들어도 돼요.
def construct_my_prompt(example_list: List[dict], num_examples: int = 3):
    # 1. CoT + Verification
    prompt = """Instruction:
Solve the math problem efficiently. 
1. Think step-by-step but keep it concise. 
2. If you find an error, correct it once and move on. But do not keep rethinking.
3. You MUST end your response with the final answer in this format: "#### [number]".

Question: {question}
Answer:
"""
    if num_examples > 0:
        prompt += (
            "- Note: The following examples are complex for your reference. For simpler questions, "
            "keep your logic direct and efficient without unnecessary over-complication.\n\n"
        )

    # 2. 틀린 Examples 배치 (In-Context Learning)
    for i, ex in enumerate(example_list[:num_examples]):
        prompt += f"[Reference Example {i+1} (Hard)]\n"
        prompt += f"Question: {ex['question']}\n"
        # GSM8K answer 형식을 Reasoning과 Answer로 분리하여 학습 효과 증대
        reasoning = ex['answer'].split("####")[0].strip()
        final_ans = ex['answer'].split("####")[-1].strip()
        
        prompt += f"Reasoning:\n{reasoning}\n"
        prompt += f"Final Answer: #### {final_ans}\n\n"

    # 3. 실제 질문 템플릿
    prompt += (
        "--- Now solve the following question ---\n"
        "Question:\n{question}\n"
        "Reasoning:\n"
    )

    return prompt

In [None]:
# TODO: 만든 0 shot, 3 shot, 5 shot example과 프롬프트를 통해 벤치마크 테스트를 한 후, 각각 My_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 My_prompting_5.txt
# 항상 num_samples=50 입니다!

In [43]:
### 0 shot custom prompting
PROMPT = construct_my_prompt(few_shot_examples, 0)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "My_prompting_0.txt")

 10%|█████████▏                                                                                  | 5/50 [00:02<00:27,  1.62it/s]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██████████████████▏                                                                        | 10/50 [00:05<00:21,  1.89it/s]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|███████████████████████████▎                                                               | 15/50 [00:08<00:18,  1.87it/s]

Progress: [15/50]
Current Acc.: [86.67%]


 40%|████████████████████████████████████▍                                                      | 20/50 [00:18<01:07,  2.23s/it]

Progress: [20/50]
Current Acc.: [90.00%]


 50%|█████████████████████████████████████████████▌                                             | 25/50 [00:34<01:11,  2.86s/it]

Progress: [25/50]
Current Acc.: [88.00%]


 60%|██████████████████████████████████████████████████████▌                                    | 30/50 [00:50<01:04,  3.24s/it]

Progress: [30/50]
Current Acc.: [90.00%]


 70%|███████████████████████████████████████████████████████████████▋                           | 35/50 [01:05<00:43,  2.87s/it]

Progress: [35/50]
Current Acc.: [91.43%]


 80%|████████████████████████████████████████████████████████████████████████▊                  | 40/50 [01:21<00:32,  3.22s/it]

Progress: [40/50]
Current Acc.: [90.00%]


 90%|█████████████████████████████████████████████████████████████████████████████████▉         | 45/50 [01:38<00:17,  3.50s/it]

Progress: [45/50]
Current Acc.: [88.89%]


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:54<00:00,  2.29s/it]

Progress: [50/50]
Current Acc.: [88.00%]





In [30]:
### 3 shot custom prompting
PROMPT = construct_my_prompt(few_shot_examples, 3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "My_prompting_3.txt")

 10%|█████████▏                                                                                  | 5/50 [00:02<00:23,  1.89it/s]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|██████████████████▏                                                                        | 10/50 [01:02<06:55, 10.38s/it]

Progress: [10/50]
Current Acc.: [90.00%]


 30%|███████████████████████████▎                                                               | 15/50 [02:02<06:48, 11.66s/it]

Progress: [15/50]
Current Acc.: [86.67%]


 40%|████████████████████████████████████▍                                                      | 20/50 [03:08<06:57, 13.90s/it]

Progress: [20/50]
Current Acc.: [90.00%]


 50%|█████████████████████████████████████████████▌                                             | 25/50 [04:14<05:20, 12.80s/it]

Progress: [25/50]
Current Acc.: [88.00%]


 60%|██████████████████████████████████████████████████████▌                                    | 30/50 [05:13<03:58, 11.93s/it]

Progress: [30/50]
Current Acc.: [90.00%]


 70%|███████████████████████████████████████████████████████████████▋                           | 35/50 [06:11<02:55, 11.70s/it]

Progress: [35/50]
Current Acc.: [91.43%]


 80%|████████████████████████████████████████████████████████████████████████▊                  | 40/50 [07:11<02:00, 12.03s/it]

Progress: [40/50]
Current Acc.: [87.50%]


 90%|█████████████████████████████████████████████████████████████████████████████████▉         | 45/50 [08:12<01:01, 12.36s/it]

Progress: [45/50]
Current Acc.: [84.44%]


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50/50 [09:13<00:00, 11.07s/it]

Progress: [50/50]
Current Acc.: [84.00%]





In [32]:
### 5 shot custom prompting
PROMPT = construct_my_prompt(few_shot_examples, 5)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "My_prompting_5.txt")

 10%|█████████▏                                                                                  | 5/50 [01:05<11:03, 14.75s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|██████████████████▏                                                                        | 10/50 [02:26<10:36, 15.92s/it]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|███████████████████████████▎                                                               | 15/50 [03:45<09:15, 15.87s/it]

Progress: [15/50]
Current Acc.: [86.67%]


 40%|████████████████████████████████████▍                                                      | 20/50 [05:03<07:48, 15.63s/it]

Progress: [20/50]
Current Acc.: [85.00%]


 50%|█████████████████████████████████████████████▌                                             | 25/50 [06:22<06:27, 15.52s/it]

Progress: [25/50]
Current Acc.: [84.00%]


 60%|██████████████████████████████████████████████████████▌                                    | 30/50 [07:47<05:43, 17.15s/it]

Progress: [30/50]
Current Acc.: [86.67%]


 70%|███████████████████████████████████████████████████████████████▋                           | 35/50 [09:03<03:50, 15.39s/it]

Progress: [35/50]
Current Acc.: [88.57%]


 80%|████████████████████████████████████████████████████████████████████████▊                  | 40/50 [10:22<02:37, 15.78s/it]

Progress: [40/50]
Current Acc.: [85.00%]


 90%|█████████████████████████████████████████████████████████████████████████████████▉         | 45/50 [11:42<01:19, 15.90s/it]

Progress: [45/50]
Current Acc.: [84.44%]


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50/50 [13:09<00:00, 15.79s/it]

Progress: [50/50]
Current Acc.: [84.00%]





### 보고서 작성하기
#### 아래의 내용이 포함되면 됩니다!

1. Direct Prompting, CoT Prompting, My Prompting을 0 shot, 3 shot, 5 shot 정답률을 표로 보여주세요!
2. CoT Prompting이 Direct Prompting에 비해 왜 좋을 수 있는지에 대해서 서술해주세요!
3. 본인이 작성한 프롬프트 기법이 CoT에 비해서 왜 더 좋을 수 있는지에 대해서 설명해주세요!
4. 최종적으로, `PROMPTING.md`에 보고서를 작성해주세요!