In [2]:
!pip install groq python-dotenv numpy tqdm datasets



In [3]:
from groq import Groq
from dotenv import load_dotenv
from datasets import load_dataset

import os
from tqdm import tqdm
import re
import random
import pprint

from typing import List, Dict, Any

load_dotenv()
random.seed(0)

client = Groq()
gsm8k_dataset = load_dataset("gsm8k", "main")

gsm8k_train = gsm8k_dataset["train"]
gsm8k_test  = gsm8k_dataset["test"]

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama3-8b-8192"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            model=model,
            temperature=0.3, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content
    
    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

#### 응답 잘 나오는지 확인해보기

In [9]:
response = generate_response_using_Llama(
    prompt="Hello world!",
)
print(response)

Hello! I'm excited to help you with any math problems you have. What kind of math are you working on? Do you have a specific problem you're stuck on or a concept you're trying to understand? Let me know and I'll do my best to assist you!


#### GSM8K 데이터셋 확인해보기

In [6]:
print("[Question]")
for l in gsm8k_test['question'][0].split("."):
    print(l)
print("="*100)
print("[Answer]")
print(gsm8k_test['answer'][0])

[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


#### Util 함수들
- extract_final_answer: LLM의 응답을 parse하여 최종 결과만 추출 (정답과 비교하기 위해)
- run_benchmark_test: 벤치마크 테스트
- save_final_result: 결과물 제출을 위한 함수

In [12]:
### 수정해도 됩니다!
def extract_final_answer(response: str):
    regex = r"(?:Answer:|Model response:)\s*\$?([0-9,]+)\b|([0-9,]+)\s*(meters|cups|miles|minutes)"
    matches = re.finditer(regex, response, re.MULTILINE)
    results = [match.group(1) if match.group(1) else match.group(2).replace(",", "") for match in matches]

    if len(results) == 0:
        additional_regex = r"\$?([0-9,]+)"
        additional_matches = re.finditer(additional_regex, response, re.MULTILINE)
        results.extend([match.group(1).replace(",", "") for match in additional_matches])

    return results[-1] if results else None

In [14]:
### 수정해도 됩니다!
def run_benchmark_test(
        dataset,
        prompt: str,
        model: str = "llama3-8b-8192",
        num_samples: int = 50,
        VERBOSE: bool = False
    ):
    correct = 0
    total   = 0
    results = []

    for i in tqdm(range(min(num_samples, len(dataset)))):
        question = dataset[i]["question"]
        correct_answer = float(re.findall(r'\d+(?:\.\d+)?', dataset[i]["answer"].split('####')[-1])[0])

        response = generate_response_using_Llama(
            prompt=prompt.format(question=question),
            model=model
        )

        if response:
            if VERBOSE:
                print("="*50)
                print(response)
                print("="*50)
            predicted_answer = extract_final_answer(response)

            if isinstance(predicted_answer, str):
                predicted_answer = float(predicted_answer.replace(",", ""))
            
            diff = abs(predicted_answer - correct_answer)
            is_correct = diff < 1e-5 if predicted_answer is not None else False
            
            if is_correct:
                correct += 1
            total += 1
            
            results.append({
                'question': question,
                'correct_answer': correct_answer,
                'predicted_answer': predicted_answer,
                'response': response,
                'correct': is_correct
            })

            if (i + 1) % 5 == 0:
                current_acc = correct/total if total > 0 else 0
                print(f"Progress: [{i+1}/{num_samples}]")
                print(f"Current Acc.: [{current_acc:.2%}]")

    return results, correct/total if total > 0 else 0

In [10]:
def save_final_result(results: List[Dict[str, Any]], accuracy: float, filename: str) -> None:
    result_str = f"====== ACCURACY: {accuracy} ======\n\n"
    result_str += f"[Details]\n"
    
    for idx, result in enumerate(results):
        result_str += f"Question {idx+1}: {result['question']}\n"
        result_str += f"Correct Answer: {result['correct_answer']}\n"
        result_str += f"Predicted Answer: {result['predicted_answer']}\n"
        result_str += f"Correct: {result['correct']}\n\n"
    
    with open(filename, "w", encoding="utf-8") as f:
        f.write(result_str)

#### Direct prompting with few-shot example

In [1]:
def construct_direct_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    prompt = "Instruction:\nSolve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.\n"

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_answer = train_dataset['answer'][i].split("####")[-1].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [15]:
### 어떤 방식으로 저장되는지 확인해보세요!
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=10
)
save_final_result(results, accuracy, "example.txt")

 50%|█████     | 5/10 [00:02<00:02,  2.12it/s]

Progress: [5/10]
Current Acc.: [20.00%]


100%|██████████| 10/10 [00:04<00:00,  2.09it/s]

Progress: [10/10]
Current Acc.: [30.00%]





In [16]:
# TODO: 0 shot, 3 shot, 5 shot direct prompting을 통해 벤치마크 테스트를 한 후, 각각 direct_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 direct_prompting_5.txt
# 항상 num_samples=50 입니다!


#0-shot

PROMPT = construct_direct_prompt(0)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "direct_prompting_0.txt")



#3-shot

PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "direct_prompting_3.txt")



#5-shot

PROMPT = construct_direct_prompt(5)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "direct_prompting_5.txt")



 10%|█         | 5/50 [00:02<00:19,  2.32it/s]

Progress: [5/50]
Current Acc.: [20.00%]


 20%|██        | 10/50 [00:05<00:25,  1.56it/s]

Progress: [10/50]
Current Acc.: [10.00%]


 30%|███       | 15/50 [00:08<00:18,  1.85it/s]

Progress: [15/50]
Current Acc.: [6.67%]


 40%|████      | 20/50 [00:10<00:15,  1.96it/s]

Progress: [20/50]
Current Acc.: [20.00%]


 50%|█████     | 25/50 [00:16<00:35,  1.43s/it]

Progress: [25/50]
Current Acc.: [20.00%]


 60%|██████    | 30/50 [00:18<00:12,  1.57it/s]

Progress: [30/50]
Current Acc.: [16.67%]


 70%|███████   | 35/50 [00:32<00:35,  2.40s/it]

Progress: [35/50]
Current Acc.: [14.29%]


 80%|████████  | 40/50 [00:44<00:26,  2.67s/it]

Progress: [40/50]
Current Acc.: [15.00%]


 90%|█████████ | 45/50 [00:59<00:14,  2.85s/it]

Progress: [45/50]
Current Acc.: [17.78%]


100%|██████████| 50/50 [01:12<00:00,  1.46s/it]


Progress: [50/50]
Current Acc.: [20.00%]


 10%|█         | 5/50 [00:15<02:42,  3.60s/it]

Progress: [5/50]
Current Acc.: [20.00%]


 20%|██        | 10/50 [00:28<01:46,  2.65s/it]

Progress: [10/50]
Current Acc.: [30.00%]


 30%|███       | 15/50 [00:41<01:30,  2.59s/it]

Progress: [15/50]
Current Acc.: [20.00%]


 40%|████      | 20/50 [00:57<01:31,  3.04s/it]

Progress: [20/50]
Current Acc.: [20.00%]


 50%|█████     | 25/50 [01:10<01:05,  2.61s/it]

Progress: [25/50]
Current Acc.: [20.00%]


 60%|██████    | 30/50 [01:28<01:17,  3.86s/it]

Progress: [30/50]
Current Acc.: [23.33%]


 70%|███████   | 35/50 [01:41<00:41,  2.78s/it]

Progress: [35/50]
Current Acc.: [20.00%]


 80%|████████  | 40/50 [01:54<00:26,  2.64s/it]

Progress: [40/50]
Current Acc.: [17.50%]


 90%|█████████ | 45/50 [02:10<00:14,  2.90s/it]

Progress: [45/50]
Current Acc.: [17.78%]


100%|██████████| 50/50 [02:22<00:00,  2.86s/it]


Progress: [50/50]
Current Acc.: [20.00%]


 10%|█         | 5/50 [00:16<02:30,  3.34s/it]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [00:29<01:49,  2.75s/it]

Progress: [10/50]
Current Acc.: [20.00%]


 30%|███       | 15/50 [00:41<01:21,  2.33s/it]

Progress: [15/50]
Current Acc.: [13.33%]


 40%|████      | 20/50 [00:57<01:24,  2.82s/it]

Progress: [20/50]
Current Acc.: [20.00%]


 50%|█████     | 25/50 [01:10<01:07,  2.71s/it]

Progress: [25/50]
Current Acc.: [24.00%]


 60%|██████    | 30/50 [01:26<00:59,  3.00s/it]

Progress: [30/50]
Current Acc.: [23.33%]


 70%|███████   | 35/50 [01:40<00:40,  2.68s/it]

Progress: [35/50]
Current Acc.: [20.00%]


 80%|████████  | 40/50 [02:00<00:46,  4.66s/it]

Progress: [40/50]
Current Acc.: [17.50%]


 90%|█████████ | 45/50 [02:16<00:18,  3.61s/it]

Progress: [45/50]
Current Acc.: [17.78%]


100%|██████████| 50/50 [02:25<00:00,  2.92s/it]

Progress: [50/50]
Current Acc.: [18.00%]





### Chain-of-Thought prompting with few-shot example
```text
[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
====================================================================================================
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
```

[Answer] 아래의 정답을 도출하는 과정을 예시로 달아주면 CoT의 few shot이 되겠죠?

In [11]:
def construct_CoT_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )
    prompt = "Instruction:\nSolve the following mathematical question step by step, and then generate the final answer after a tag, 'Answer:'.\n" #TODO: 프롬프트를 작성해주세요!
    

    for i in range(num_examples):
        idx = sampled_indices[i]
        cur_question = train_dataset['question'][idx]
        cur_answer = train_dataset['answer'][idx].strip()
        
        #TODO: CoT example을 만들어주세요!
        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nLet's think step by step."

    return prompt

In [None]:
# TODO: 0 shot, 3 shot, 5 shot CoT prompting을 통해 벤치마크 테스트를 한 후, 각각 CoT_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 CoT_prompting_5.txt
# 항상 num_samples=50 입니다!


#0-shot

PROMPT = construct_CoT_prompt(0)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "CoT_prompting_0.txt")



#3-shot

PROMPT = construct_CoT_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "CoT_prompting_3.txt")



#5-shot

PROMPT = construct_CoT_prompt(5)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, "CoT_prompting_5.txt")





 10%|█         | 5/50 [00:03<00:44,  1.02it/s]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:06<00:22,  1.78it/s]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [00:08<00:17,  2.00it/s]

Progress: [15/50]
Current Acc.: [60.00%]


 40%|████      | 20/50 [00:12<00:20,  1.48it/s]

Progress: [20/50]
Current Acc.: [60.00%]


 50%|█████     | 25/50 [00:32<01:15,  3.01s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [00:46<00:57,  2.85s/it]

Progress: [30/50]
Current Acc.: [66.67%]


 70%|███████   | 35/50 [01:04<00:44,  2.96s/it]

Progress: [35/50]
Current Acc.: [71.43%]


 80%|████████  | 40/50 [01:21<00:37,  3.75s/it]

Progress: [40/50]
Current Acc.: [72.50%]


 90%|█████████ | 45/50 [01:45<00:22,  4.55s/it]

Progress: [45/50]
Current Acc.: [73.33%]


100%|██████████| 50/50 [02:01<00:00,  2.44s/it]


Progress: [50/50]
Current Acc.: [74.00%]


 10%|█         | 5/50 [00:47<05:26,  7.26s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [01:34<06:09,  9.23s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███       | 15/50 [02:20<05:06,  8.76s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [03:06<04:37,  9.24s/it]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|█████     | 25/50 [03:52<03:40,  8.81s/it]

Progress: [25/50]
Current Acc.: [64.00%]


 60%|██████    | 30/50 [04:33<02:44,  8.20s/it]

Progress: [30/50]
Current Acc.: [60.00%]


 70%|███████   | 35/50 [05:18<02:13,  8.88s/it]

Progress: [35/50]
Current Acc.: [65.71%]


 80%|████████  | 40/50 [05:59<01:20,  8.09s/it]

Progress: [40/50]
Current Acc.: [70.00%]


 90%|█████████ | 45/50 [06:48<00:49,  9.84s/it]

Progress: [45/50]
Current Acc.: [68.89%]


100%|██████████| 50/50 [07:32<00:00,  9.04s/it]


Progress: [50/50]
Current Acc.: [70.00%]


 10%|█         | 5/50 [01:02<09:24, 12.54s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [02:09<08:57, 13.45s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███       | 15/50 [03:18<08:09, 13.97s/it]

Progress: [15/50]
Current Acc.: [66.67%]


 40%|████      | 20/50 [04:23<06:26, 12.87s/it]

Progress: [20/50]
Current Acc.: [65.00%]


 50%|█████     | 25/50 [05:28<05:17, 12.70s/it]

Progress: [25/50]
Current Acc.: [68.00%]


 60%|██████    | 30/50 [06:33<04:18, 12.93s/it]

Progress: [30/50]
Current Acc.: [70.00%]


 70%|███████   | 35/50 [07:35<03:04, 12.28s/it]

Progress: [35/50]
Current Acc.: [74.29%]


 80%|████████  | 40/50 [08:40<02:11, 13.16s/it]

Progress: [40/50]
Current Acc.: [70.00%]


 90%|█████████ | 45/50 [09:44<01:04, 12.91s/it]

Progress: [45/50]
Current Acc.: [71.11%]


100%|██████████| 50/50 [10:50<00:00, 13.01s/it]

Progress: [50/50]
Current Acc.: [70.00%]





### Construct your prompt!!

목표: 본인만의 프롬프트를 통해 정답률을 더 끌어올려보기!
- gsm8k의 train 데이터셋에서 예시를 가져온 다음 (자유롭게!)
- 그 예시들에 대한 풀이 과정을 만들어주세요!
- 모든 것들이 자유입니다! Direct Prompting, CoT Prompting을 한 결과보다 정답률만 높으면 돼요.

In [12]:
### 자유롭게 수정해도 됩니다! 완전히 새로 함수를 만들어도 돼요.
#강의안에서 배웠던 CoT-SC를 구현했습니다

import re
import time
from collections import Counter
from tqdm import tqdm

# ✅ 모델 응답에서 정답 숫자만 추출 & 정규화
def extract_and_normalize_answer(response: str) -> str:
    regex = r"(?:Answer:|Model response:)\s*\$?([0-9,]+)\b|([0-9,]+)\s*(meters|cups|miles|minutes)"
    matches = re.finditer(regex, response, re.MULTILINE)

    results = []
    for match in matches:
        if match.group(1):
            results.append(match.group(1).replace(",", ""))
        elif match.group(2):
            results.append(match.group(2).replace(",", ""))

    if not results:
        fallback_regex = r"\$?([0-9,]+)"
        fallback_matches = re.finditer(fallback_regex, response, re.MULTILINE)
        results.extend([m.group(1).replace(",", "") for m in fallback_matches])

    return results[-1] if results else None


def run_sc_benchmark_test(
    dataset,
    prompt_template: str,
    num_samples: int = 50,
    num_generations: int = 5,
    VERBOSE: bool = False
):
    results = []
    correct = 0

    for i in tqdm(range(num_samples)):
        example = dataset[i]
        question = example["question"]
        gt_answer_text = example["answer"]
        gt_answer = extract_and_normalize_answer(gt_answer_text)

        prompt = prompt_template.format(question=question)
        generations = []
        parsed_answers = []

        for _ in range(num_generations):
            # 자동 재시도
            for attempt in range(3):
                try:
                    response = client.chat.completions.create(
                        model="llama3-70b-8192",
                        messages=[{"role": "user", "content": prompt}],
                        temperature=1.0
                    )
                    output = response.choices[0].message.content.strip()
                    generations.append(output)
                    final_ans = extract_and_normalize_answer(output)
                    if final_ans is not None:
                        parsed_answers.append(final_ans)
                    break
                except Exception as e:
                    print(f"[Attempt {attempt+1}] Error: {e}")
                    time.sleep(1)

        if parsed_answers:
            predicted_answer, count = Counter(parsed_answers).most_common(1)[0]
        else:
            predicted_answer = "N/A"

        is_correct = (predicted_answer == gt_answer)

        if is_correct:
            correct += 1

        results.append({
            "question": question,
            "correct_answer": gt_answer,
            "predicted_answer": predicted_answer,
            "correct": is_correct,
            "raw_generations": generations,
            "parsed_answers": parsed_answers
        })

        if VERBOSE:
            print(f"\n[{i}] GT: {gt_answer} | Votes: {parsed_answers} | Majority: {predicted_answer}")

    accuracy = correct / num_samples * 100
    return results, accuracy








In [None]:
# TODO: 만든 0 shot, 3 shot, 5 shot example과 프롬프트를 통해 벤치마크 테스트를 한 후, 각각 My_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 My_prompting_5.txt
# 항상 num_samples=50 입니다!


#0-shot


prompt_template = construct_CoT_prompt(0)  # 0CoT prompt
results, acc = run_sc_benchmark_test(
    dataset=gsm8k_test,
    prompt_template=prompt_template,
    num_samples=50,
    num_generations=5,  # self-consistency 횟수
    VERBOSE=True
)
save_final_result(results, acc, "my_prompting_0.txt")



  2%|▏         | 1/50 [00:05<04:24,  5.39s/it]


[0] GT: 18 | Votes: ['18', '18', '18', '18', '18'] | Majority: 18


  4%|▍         | 2/50 [00:15<06:21,  7.94s/it]


[1] GT: 3 | Votes: ['3', '3', '3', '3', '3'] | Majority: 3


  6%|▌         | 3/50 [00:31<09:08, 11.68s/it]


[2] GT: 70000 | Votes: ['70000', '70000', '70000', '70000', '70000'] | Majority: 70000


  8%|▊         | 4/50 [00:41<08:29, 11.08s/it]


[3] GT: 540 | Votes: ['540', '540', '540', '540', '540'] | Majority: 540


 10%|█         | 5/50 [00:59<10:07, 13.51s/it]


[4] GT: 20 | Votes: ['20', '20', '20', '20', '20'] | Majority: 20


 12%|█▏        | 6/50 [01:16<10:47, 14.72s/it]


[5] GT: 64 | Votes: ['64', '16', '16', '16', '16'] | Majority: 16


 14%|█▍        | 7/50 [01:27<09:47, 13.66s/it]


[6] GT: 260 | Votes: ['260', '260', '260', '260', '260'] | Majority: 260


 16%|█▌        | 8/50 [01:45<10:21, 14.81s/it]


[7] GT: 160 | Votes: ['120', '120', '160', '160', '120'] | Majority: 120


 18%|█▊        | 9/50 [02:08<11:58, 17.51s/it]


[8] GT: 45 | Votes: ['45', '45', '45', '45', '45'] | Majority: 45


 20%|██        | 10/50 [02:24<11:26, 17.17s/it]


[9] GT: 460 | Votes: ['460', '460', '460', '460', '460'] | Majority: 460


 22%|██▏       | 11/50 [02:40<10:46, 16.57s/it]


[10] GT: 366 | Votes: ['366', '366', '366', '366', '366'] | Majority: 366


 24%|██▍       | 12/50 [02:57<10:43, 16.94s/it]


[11] GT: 694 | Votes: ['694', '694', '694', '694', '694'] | Majority: 694


 26%|██▌       | 13/50 [03:18<11:03, 17.93s/it]


[12] GT: 13 | Votes: ['12', '12', '12', '12', '13'] | Majority: 12


 28%|██▊       | 14/50 [03:36<10:49, 18.05s/it]


[13] GT: 18 | Votes: ['18', '18', '9', '18', '18'] | Majority: 18


 30%|███       | 15/50 [03:51<10:04, 17.26s/it]


[14] GT: 60 | Votes: ['60', '60', '60', '60', '60'] | Majority: 60


 32%|███▏      | 16/50 [04:11<10:12, 18.02s/it]


[15] GT: 125 | Votes: ['125', '125', '125', '125', '125'] | Majority: 125


 34%|███▍      | 17/50 [04:25<09:11, 16.70s/it]


[16] GT: 230 | Votes: ['230', '230', '230', '230', '230'] | Majority: 230


 36%|███▌      | 18/50 [04:40<08:41, 16.30s/it]


[17] GT: 57500 | Votes: ['57500', '57500', '57500', '57500', '57500'] | Majority: 57500


 38%|███▊      | 19/50 [04:53<07:50, 15.19s/it]


[18] GT: 7 | Votes: ['4', '4', '4', '7', '7'] | Majority: 4


 40%|████      | 20/50 [05:14<08:29, 16.99s/it]


[19] GT: 6 | Votes: ['6', '6', '6', '6', '6'] | Majority: 6


 42%|████▏     | 21/50 [05:39<09:21, 19.35s/it]


[20] GT: 15 | Votes: ['5', '5', '15', '00', '24'] | Majority: 5


 44%|████▍     | 22/50 [05:51<08:04, 17.31s/it]


[21] GT: 14 | Votes: ['2', '14', '14', '14', '14'] | Majority: 14


 46%|████▌     | 23/50 [06:04<07:10, 15.93s/it]


[22] GT: 7 | Votes: ['7', '7', '7', '7', '7'] | Majority: 7


 48%|████▊     | 24/50 [06:15<06:16, 14.49s/it]


[23] GT: 8 | Votes: ['00', '8', '00', '8', '00'] | Majority: 00


 50%|█████     | 25/50 [06:28<05:47, 13.91s/it]


[24] GT: 26 | Votes: ['00', '00', '00', '00', '00'] | Majority: 00


 52%|█████▏    | 26/50 [06:43<05:41, 14.22s/it]


[25] GT: 2 | Votes: ['2', '2', '2', '2', '2'] | Majority: 2


 54%|█████▍    | 27/50 [07:00<05:47, 15.11s/it]


[26] GT: 243 | Votes: ['243', '243', '243', '243', '243'] | Majority: 243


 56%|█████▌    | 28/50 [07:14<05:27, 14.87s/it]


[27] GT: 16 | Votes: ['16', '16', '16', '16', '16'] | Majority: 16


 58%|█████▊    | 29/50 [07:26<04:55, 14.05s/it]


[28] GT: 25 | Votes: ['25', '25', '25', '25', '25'] | Majority: 25


 60%|██████    | 30/50 [07:39<04:31, 13.58s/it]


[29] GT: 104 | Votes: ['104', '104', '104', '104', '104'] | Majority: 104


 62%|██████▏   | 31/50 [07:54<04:25, 14.00s/it]


[30] GT: 109 | Votes: ['109', '109', '109', '109', '109'] | Majority: 109


 64%|██████▍   | 32/50 [08:09<04:18, 14.37s/it]


[31] GT: 80 | Votes: ['80', '80', '80', '80', '80'] | Majority: 80


 66%|██████▌   | 33/50 [08:20<03:49, 13.49s/it]


[32] GT: 35 | Votes: ['35', '35', '35', '35', '35'] | Majority: 35


 68%|██████▊   | 34/50 [08:34<03:38, 13.64s/it]


[33] GT: 70 | Votes: ['70', '70', '70', '70', '70'] | Majority: 70


 70%|███████   | 35/50 [08:45<03:12, 12.80s/it]


[34] GT: 23 | Votes: ['23', '23', '23', '23', '23'] | Majority: 23


 72%|███████▏  | 36/50 [08:59<03:03, 13.13s/it]


[35] GT: 40 | Votes: ['20', '20', '9', '9', '20'] | Majority: 20


 74%|███████▍  | 37/50 [09:14<02:56, 13.60s/it]


[36] GT: 75 | Votes: ['00', '00', '75', '75', '00'] | Majority: 00


 76%|███████▌  | 38/50 [09:33<03:01, 15.15s/it]


[37] GT: 2 | Votes: ['0', '11', '2', '0', '2'] | Majority: 0


 78%|███████▊  | 39/50 [09:49<02:51, 15.60s/it]


[38] GT: 10 | Votes: ['10', '10', '10', '10', '10'] | Majority: 10


 80%|████████  | 40/50 [10:10<02:52, 17.24s/it]


[39] GT: 18 | Votes: ['18', '18', '18', '18', '18'] | Majority: 18


 82%|████████▏ | 41/50 [10:22<02:19, 15.52s/it]


[40] GT: 8 | Votes: ['8', '8', '8', '8', '8'] | Majority: 8


 84%|████████▍ | 42/50 [10:39<02:07, 15.97s/it]


[41] GT: 200 | Votes: ['200', '200', '200', '200', '200'] | Majority: 200


 86%|████████▌ | 43/50 [10:52<01:45, 15.12s/it]


[42] GT: 26 | Votes: ['26', '26', '26', '26', '26'] | Majority: 26


 88%|████████▊ | 44/50 [11:10<01:35, 15.93s/it]


[43] GT: 48 | Votes: ['48', '48', '48', '48', '48'] | Majority: 48


 90%|█████████ | 45/50 [11:27<01:22, 16.42s/it]


[44] GT: 20 | Votes: ['20', '20', '20', '20', '20'] | Majority: 20


 92%|█████████▏| 46/50 [11:48<01:10, 17.65s/it]


[45] GT: 104 | Votes: ['104', '104', '104', '104', '104'] | Majority: 104


 94%|█████████▍| 47/50 [12:08<00:54, 18.29s/it]


[46] GT: 163 | Votes: ['163', '37', '163', '163', '163'] | Majority: 163


 96%|█████████▌| 48/50 [12:25<00:36, 18.07s/it]


[47] GT: 800 | Votes: ['800', '800', '800', '800', '800'] | Majority: 800


 98%|█████████▊| 49/50 [12:37<00:16, 16.30s/it]


[48] GT: 8 | Votes: ['8', '8', '8', '8', '8'] | Majority: 8


100%|██████████| 50/50 [12:51<00:00, 15.43s/it]



[49] GT: 30 | Votes: ['30', '30', '30', '30', '30'] | Majority: 30


  2%|▏         | 1/50 [00:39<32:10, 39.39s/it]


[0] GT: 18 | Votes: ['18', '18', '18', '18', '18'] | Majority: 18


  4%|▍         | 2/50 [01:15<30:11, 37.74s/it]


[1] GT: 3 | Votes: ['3', '3', '3', '3', '3'] | Majority: 3


  6%|▌         | 3/50 [01:57<31:04, 39.67s/it]


[2] GT: 70000 | Votes: ['70000', '70000', '70000', '70000', '70000'] | Majority: 70000


  8%|▊         | 4/50 [02:35<29:37, 38.64s/it]


[3] GT: 540 | Votes: ['540', '540', '540', '540', '540'] | Majority: 540


 10%|█         | 5/50 [03:19<30:29, 40.65s/it]


[4] GT: 20 | Votes: ['20', '20', '20', '20', '20'] | Majority: 20


 12%|█▏        | 6/50 [04:01<30:18, 41.33s/it]


[5] GT: 64 | Votes: ['64', '64', '64', '64', '64'] | Majority: 64


 14%|█▍        | 7/50 [04:40<29:00, 40.47s/it]


[6] GT: 260 | Votes: ['260', '260', '260', '260', '260'] | Majority: 260


 16%|█▌        | 8/50 [05:25<29:16, 41.81s/it]


[7] GT: 160 | Votes: ['80', '160', '120', '120', '120'] | Majority: 120


 18%|█▊        | 9/50 [06:13<30:03, 43.98s/it]


[8] GT: 45 | Votes: ['135', '45', '45', '45', '45'] | Majority: 45


 20%|██        | 10/50 [06:55<28:53, 43.33s/it]


[9] GT: 460 | Votes: ['460', '460', '460', '460', '460'] | Majority: 460


 22%|██▏       | 11/50 [07:37<27:50, 42.83s/it]


[10] GT: 366 | Votes: ['366', '366', '366', '366', '366'] | Majority: 366


 24%|██▍       | 12/50 [08:19<27:00, 42.65s/it]


[11] GT: 694 | Votes: ['694', '694', '694', '694', '694'] | Majority: 694


 26%|██▌       | 13/50 [09:07<27:18, 44.27s/it]


[12] GT: 13 | Votes: ['5', '11', '12', '13', '12'] | Majority: 12


 28%|██▊       | 14/50 [10:05<29:02, 48.41s/it]


[13] GT: 18 | Votes: ['18', '18', '18', '12', '18'] | Majority: 18


 30%|███       | 15/50 [10:47<27:06, 46.48s/it]


[14] GT: 60 | Votes: ['60', '60', '60', '60', '60'] | Majority: 60


 32%|███▏      | 16/50 [11:35<26:31, 46.80s/it]


[15] GT: 125 | Votes: ['125', '125', '29', '125', '125'] | Majority: 125


 34%|███▍      | 17/50 [12:15<24:37, 44.78s/it]


[16] GT: 230 | Votes: ['230', '230', '230', '170', '230'] | Majority: 230


 36%|███▌      | 18/50 [12:57<23:24, 43.88s/it]


[17] GT: 57500 | Votes: ['57500', '57500', '57500', '57500', '57500'] | Majority: 57500


 38%|███▊      | 19/50 [13:36<21:54, 42.40s/it]


[18] GT: 7 | Votes: ['7', '7', '7', '7', '7'] | Majority: 7


 40%|████      | 20/50 [14:20<21:29, 42.98s/it]


[19] GT: 6 | Votes: ['6', '6', '6', '6', '6'] | Majority: 6


 42%|████▏     | 21/50 [15:05<21:02, 43.52s/it]


[20] GT: 15 | Votes: ['15', '14', '15', '15', '15'] | Majority: 15


 44%|████▍     | 22/50 [15:45<19:48, 42.44s/it]


[21] GT: 14 | Votes: ['14', '14', '14', '14', '14'] | Majority: 14


 46%|████▌     | 23/50 [16:25<18:50, 41.88s/it]


[22] GT: 7 | Votes: ['7', '7', '7', '7', '7'] | Majority: 7


 48%|████▊     | 24/50 [17:03<17:32, 40.49s/it]


[23] GT: 8 | Votes: ['8', '8', '8', '8', '8'] | Majority: 8


 50%|█████     | 25/50 [17:43<16:55, 40.63s/it]


[24] GT: 26 | Votes: ['26', '26', '26', '26', '26'] | Majority: 26


 52%|█████▏    | 26/50 [18:26<16:29, 41.22s/it]


[25] GT: 2 | Votes: ['2', '2', '2', '2', '2'] | Majority: 2


 54%|█████▍    | 27/50 [19:10<16:07, 42.05s/it]


[26] GT: 243 | Votes: ['243', '243', '243', '243', '243'] | Majority: 243


 56%|█████▌    | 28/50 [19:50<15:09, 41.34s/it]


[27] GT: 16 | Votes: ['16', '16', '16', '16', '16'] | Majority: 16


 58%|█████▊    | 29/50 [20:29<14:14, 40.67s/it]


[28] GT: 25 | Votes: ['25', '15', '25', '25', '25'] | Majority: 25


 60%|██████    | 30/50 [21:08<13:23, 40.18s/it]


[29] GT: 104 | Votes: ['104', '104', '104', '104', '104'] | Majority: 104


 62%|██████▏   | 31/50 [21:49<12:46, 40.34s/it]


[30] GT: 109 | Votes: ['109', '109', '109', '109', '109'] | Majority: 109


 64%|██████▍   | 32/50 [22:31<12:17, 40.98s/it]


[31] GT: 80 | Votes: ['80', '80', '80', '80', '80'] | Majority: 80


 66%|██████▌   | 33/50 [23:09<11:19, 40.00s/it]


[32] GT: 35 | Votes: ['35', '35', '35', '35', '35'] | Majority: 35


 68%|██████▊   | 34/50 [23:48<10:37, 39.85s/it]


[33] GT: 70 | Votes: ['70', '70', '70', '70', '70'] | Majority: 70


 70%|███████   | 35/50 [24:26<09:50, 39.35s/it]


[34] GT: 23 | Votes: ['23', '23', '23', '23', '23'] | Majority: 23


 72%|███████▏  | 36/50 [25:07<09:17, 39.81s/it]


[35] GT: 40 | Votes: ['9', '9', '9', '9', '9'] | Majority: 9


 74%|███████▍  | 37/50 [25:47<08:35, 39.68s/it]


[36] GT: 75 | Votes: ['75', '75', '75', '75', '75'] | Majority: 75


 76%|███████▌  | 38/50 [26:30<08:08, 40.70s/it]


[37] GT: 2 | Votes: ['2', '11', '2', '2', '2'] | Majority: 2


 78%|███████▊  | 39/50 [27:12<07:33, 41.26s/it]


[38] GT: 10 | Votes: ['10', '11', '10', '10', '67'] | Majority: 10


 80%|████████  | 40/50 [27:59<07:09, 42.93s/it]


[39] GT: 18 | Votes: ['18', '18', '18', '18', '18'] | Majority: 18


 82%|████████▏ | 41/50 [28:37<06:13, 41.51s/it]


[40] GT: 8 | Votes: ['8', '8', '8', '8', '8'] | Majority: 8


 84%|████████▍ | 42/50 [29:20<05:33, 41.70s/it]


[41] GT: 200 | Votes: ['200', '200', '200', '200', '200'] | Majority: 200


 86%|████████▌ | 43/50 [30:01<04:51, 41.59s/it]


[42] GT: 26 | Votes: ['26', '26', '26', '26', '26'] | Majority: 26


 88%|████████▊ | 44/50 [30:45<04:14, 42.34s/it]


[43] GT: 48 | Votes: ['48', '48', '240', '48', '48'] | Majority: 48


 90%|█████████ | 45/50 [31:27<03:31, 42.34s/it]


[44] GT: 20 | Votes: ['20', '20', '20', '20', '20'] | Majority: 20


 92%|█████████▏| 46/50 [32:12<02:51, 42.97s/it]


[45] GT: 104 | Votes: ['104', '104', '104', '104', '104'] | Majority: 104


 94%|█████████▍| 47/50 [32:55<02:09, 43.01s/it]


[46] GT: 163 | Votes: ['163', '163', '163', '163', '163'] | Majority: 163


 96%|█████████▌| 48/50 [33:37<01:25, 42.86s/it]


[47] GT: 800 | Votes: ['800', '800', '800', '800', '800'] | Majority: 800


 98%|█████████▊| 49/50 [34:15<00:41, 41.30s/it]


[48] GT: 8 | Votes: ['8', '8', '8', '8', '8'] | Majority: 8


100%|██████████| 50/50 [34:55<00:00, 41.90s/it]



[49] GT: 30 | Votes: ['30', '30', '30', '30', '30'] | Majority: 30


  2%|▏         | 1/50 [00:44<36:08, 44.25s/it]


[0] GT: 18 | Votes: ['18', '18', '18', '18', '18'] | Majority: 18


  4%|▍         | 2/50 [01:24<33:29, 41.86s/it]


[1] GT: 3 | Votes: ['3', '3', '3', '3', '3'] | Majority: 3


  6%|▌         | 3/50 [02:08<33:32, 42.82s/it]


[2] GT: 70000 | Votes: ['70000', '70000', '70000', '65000', '54000'] | Majority: 70000


  8%|▊         | 4/50 [02:50<32:38, 42.58s/it]


[3] GT: 540 | Votes: ['540', '540', '540', '540', '540'] | Majority: 540


 10%|█         | 5/50 [03:38<33:28, 44.63s/it]


[4] GT: 20 | Votes: ['20', '20', '20', '20', '20'] | Majority: 20


 12%|█▏        | 6/50 [04:28<33:58, 46.33s/it]


[5] GT: 64 | Votes: ['64', '64', '64', '64', '128'] | Majority: 64


 14%|█▍        | 7/50 [05:10<32:15, 45.02s/it]


[6] GT: 260 | Votes: ['260', '260', '260', '260', '260'] | Majority: 260
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500223, Requested 641. Please try again in 2m29.4028s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500216, Requested 641. Please try again in 2m28.1948s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for m

 16%|█▌        | 8/50 [05:35<26:55, 38.46s/it]


[7] GT: 160 | Votes: ['160'] | Majority: 160
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500135, Requested 671. Please try again in 2m19.2868s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500127, Requested 671. Please try again in 2m18.0498s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in or

 18%|█▊        | 9/50 [05:54<22:05, 32.33s/it]


[8] GT: 45 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500028, Requested 626. Please try again in 1m53.1348s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500021, Requested 626. Please try again in 1m51.8738s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiza

 20%|██        | 10/50 [06:11<18:34, 27.87s/it]


[9] GT: 460 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499922, Requested 637. Please try again in 1m36.5696s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499915, Requested 637. Please try again in 1m35.3516s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiz

 22%|██▏       | 11/50 [06:30<16:18, 25.10s/it]


[10] GT: 366 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499816, Requested 629. Please try again in 1m16.7622s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499809, Requested 629. Please try again in 1m15.5352s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organi

 24%|██▍       | 12/50 [07:51<26:31, 41.89s/it]


[11] GT: 694 | Votes: ['694'] | Majority: 694
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500266, Requested 634. Please try again in 2m35.653599999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500259, Requested 634. Please try again in 2m34.3866s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192`

 26%|██▌       | 13/50 [08:10<21:34, 34.98s/it]


[12] GT: 13 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500159, Requested 629. Please try again in 2m16.1856s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500152, Requested 629. Please try again in 2m14.9786s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiz

 28%|██▊       | 14/50 [08:29<18:09, 30.27s/it]


[13] GT: 18 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500049, Requested 624. Please try again in 1m56.3386s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500042, Requested 624. Please try again in 1m55.1466s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiz

 30%|███       | 15/50 [08:47<15:27, 26.51s/it]


[14] GT: 60 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499943, Requested 669. Please try again in 1m45.7236s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499936, Requested 669. Please try again in 1m44.4616s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiz

 32%|███▏      | 16/50 [09:06<13:44, 24.26s/it]


[15] GT: 125 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499835, Requested 625. Please try again in 1m19.4864s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499828, Requested 625. Please try again in 1m18.2154s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organi

 34%|███▍      | 17/50 [09:24<12:14, 22.27s/it]


[16] GT: 230 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500266, Requested 617. Please try again in 2m32.662s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500259, Requested 617. Please try again in 2m31.462s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiza

 36%|███▌      | 18/50 [10:42<20:51, 39.12s/it]


[17] GT: 57500 | Votes: ['57500'] | Majority: 57500
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500182, Requested 596. Please try again in 2m14.4812s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500174, Requested 596. Please try again in 2m13.156199999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b

 38%|███▊      | 19/50 [11:00<16:55, 32.76s/it]


[18] GT: 7 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500075, Requested 633. Please try again in 2m2.345799999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500067, Requested 633. Please try again in 2m1.122799999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in 

 40%|████      | 20/50 [11:18<14:15, 28.52s/it]


[19] GT: 6 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499970, Requested 630. Please try again in 1m43.5224s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499962, Requested 630. Please try again in 1m42.2624s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiza

 42%|████▏     | 21/50 [11:37<12:22, 25.59s/it]


[20] GT: 15 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499864, Requested 614. Please try again in 1m22.4876s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499857, Requested 614. Please try again in 1m21.2956s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiz

 44%|████▍     | 22/50 [11:55<10:51, 23.27s/it]


[21] GT: 14 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499757, Requested 622. Please try again in 1m5.391999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499750, Requested 622. Please try again in 1m4.188999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in

 46%|████▌     | 23/50 [13:16<18:13, 40.51s/it]


[22] GT: 7 | Votes: ['7'] | Majority: 7
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500171, Requested 605. Please try again in 2m14.1796s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500164, Requested 605. Please try again in 2m12.9276s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiz

 48%|████▊     | 24/50 [13:34<14:36, 33.70s/it]


[23] GT: 8 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500064, Requested 606. Please try again in 1m55.9174s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 500057, Requested 606. Please try again in 1m54.673399999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in org

 50%|█████     | 25/50 [13:53<12:12, 29.31s/it]


[24] GT: 26 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499957, Requested 627. Please try again in 1m40.8742s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499950, Requested 627. Please try again in 1m39.606199999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in or

 52%|█████▏    | 26/50 [14:11<10:26, 26.11s/it]


[25] GT: 2 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499852, Requested 627. Please try again in 1m22.672199999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499845, Requested 627. Please try again in 1m21.4372s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in org

 54%|█████▍    | 27/50 [14:29<09:04, 23.66s/it]


[26] GT: 243 | Votes: [] | Majority: N/A
[Attempt 1] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499745, Requested 621. Please try again in 1m3.0924s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 2] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01k1gtn3fcfkw95b53fvs24h2z` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499738, Requested 621. Please try again in 1m1.8684s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[Attempt 3] Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organiza

 54%|█████▍    | 27/50 [15:05<12:51, 33.55s/it]


KeyboardInterrupt: 

In [None]:

#3-shot



prompt_template = construct_CoT_prompt(3)  # 0CoT prompt
results, acc = run_sc_benchmark_test(
    dataset=gsm8k_test,
    prompt_template=prompt_template,
    num_samples=50,
    num_generations=5,  # self-consistency 횟수
    VERBOSE=True
)
save_final_result(results, acc, "my_prompting_3.txt")



In [13]:


#5-shot


prompt_template = construct_CoT_prompt(5)  # CoT prompt
results, acc = run_sc_benchmark_test(
    dataset=gsm8k_test,
    prompt_template=prompt_template,
    num_samples=50,
    num_generations=5,  # self-consistency 횟수
    VERBOSE=True
)
save_final_result(results, acc, "my_prompting_5.txt")








  2%|▏         | 1/50 [00:07<06:23,  7.82s/it]


[0] GT: 18 | Votes: ['18', '18', '18', '18', '18'] | Majority: 18


  4%|▍         | 2/50 [01:15<34:24, 43.01s/it]


[1] GT: 3 | Votes: ['3', '3', '3', '3', '3'] | Majority: 3


  6%|▌         | 3/50 [02:26<43:37, 55.69s/it]


[2] GT: 70000 | Votes: ['70000', '70000', '70000', '70000', '70000'] | Majority: 70000


  8%|▊         | 4/50 [03:35<46:46, 61.01s/it]


[3] GT: 540 | Votes: ['540', '540', '540', '540', '540'] | Majority: 540


 10%|█         | 5/50 [04:49<49:14, 65.65s/it]


[4] GT: 20 | Votes: ['20', '20', '20', '20', '20'] | Majority: 20


 12%|█▏        | 6/50 [06:01<49:49, 67.94s/it]


[5] GT: 64 | Votes: ['50', '64', '64', '92', '64'] | Majority: 64


 14%|█▍        | 7/50 [07:10<49:00, 68.38s/it]


[6] GT: 260 | Votes: ['260', '260', '260', '260', '260'] | Majority: 260


 16%|█▌        | 8/50 [08:25<49:17, 70.41s/it]


[7] GT: 160 | Votes: ['160', '120', '120', '160', '120'] | Majority: 120


 18%|█▊        | 9/50 [09:46<50:17, 73.59s/it]


[8] GT: 45 | Votes: ['45', '45', '45', '45', '45'] | Majority: 45


 20%|██        | 10/50 [10:57<48:39, 73.00s/it]


[9] GT: 460 | Votes: ['460', '460', '460', '460', '460'] | Majority: 460


 22%|██▏       | 11/50 [12:08<46:57, 72.24s/it]


[10] GT: 366 | Votes: ['366', '366', '366', '366', '366'] | Majority: 366


 24%|██▍       | 12/50 [13:20<45:40, 72.12s/it]


[11] GT: 694 | Votes: ['694', '694', '694', '694', '694'] | Majority: 694


 26%|██▌       | 13/50 [14:38<45:34, 73.91s/it]


[12] GT: 13 | Votes: ['12', '12', '12', '13', '12'] | Majority: 12


 28%|██▊       | 14/50 [15:52<44:25, 74.04s/it]


[13] GT: 18 | Votes: ['18', '18', '36', '18', '18'] | Majority: 18


 30%|███       | 15/50 [17:06<43:07, 73.94s/it]


[14] GT: 60 | Votes: ['60', '60', '60', '60', '60'] | Majority: 60


 32%|███▏      | 16/50 [18:22<42:11, 74.45s/it]


[15] GT: 125 | Votes: ['125', '96', '125', '125', '125'] | Majority: 125


 34%|███▍      | 17/50 [19:32<40:13, 73.14s/it]


[16] GT: 230 | Votes: ['230', '230', '230', '230', '230'] | Majority: 230


 36%|███▌      | 18/50 [20:44<38:49, 72.80s/it]


[17] GT: 57500 | Votes: ['57500', '57500', '57500', '57500', '57500'] | Majority: 57500


 38%|███▊      | 19/50 [21:53<37:01, 71.66s/it]


[18] GT: 7 | Votes: ['7', '7', '7', '7', '7'] | Majority: 7


 40%|████      | 20/50 [23:08<36:26, 72.89s/it]


[19] GT: 6 | Votes: ['6', '6', '6', '6', '4'] | Majority: 6


 42%|████▏     | 21/50 [24:28<36:09, 74.80s/it]


[20] GT: 15 | Votes: ['15', '15', '15', '15', '15'] | Majority: 15


 44%|████▍     | 22/50 [25:38<34:17, 73.49s/it]


[21] GT: 14 | Votes: ['14', '14', '14', '14', '14'] | Majority: 14


 46%|████▌     | 23/50 [26:46<32:20, 71.87s/it]


[22] GT: 7 | Votes: ['7', '7', '7', '7', '7'] | Majority: 7


 48%|████▊     | 24/50 [27:53<30:25, 70.23s/it]


[23] GT: 8 | Votes: ['8', '8', '8', '8', '8'] | Majority: 8


 50%|█████     | 25/50 [29:02<29:09, 69.98s/it]


[24] GT: 26 | Votes: ['26', '26', '26', '26', '26'] | Majority: 26


 52%|█████▏    | 26/50 [30:18<28:44, 71.85s/it]


[25] GT: 2 | Votes: ['2', '2', '2', '2', '2'] | Majority: 2


 54%|█████▍    | 27/50 [31:30<27:35, 71.96s/it]


[26] GT: 243 | Votes: ['243', '243', '243', '243', '243'] | Majority: 243


 56%|█████▌    | 28/50 [32:40<26:10, 71.38s/it]


[27] GT: 16 | Votes: ['16', '16', '16', '16', '16'] | Majority: 16


 58%|█████▊    | 29/50 [33:51<24:53, 71.10s/it]


[28] GT: 25 | Votes: ['25', '25', '25', '25', '25'] | Majority: 25


 60%|██████    | 30/50 [35:02<23:40, 71.03s/it]


[29] GT: 104 | Votes: ['104', '104', '104', '104', '104'] | Majority: 104


 62%|██████▏   | 31/50 [36:13<22:30, 71.10s/it]


[30] GT: 109 | Votes: ['109', '109', '109', '109', '109'] | Majority: 109


 64%|██████▍   | 32/50 [37:25<21:25, 71.43s/it]


[31] GT: 80 | Votes: ['80', '80', '80', '80', '80'] | Majority: 80


 66%|██████▌   | 33/50 [38:34<20:00, 70.60s/it]


[32] GT: 35 | Votes: ['35', '35', '35', '35', '35'] | Majority: 35


 68%|██████▊   | 34/50 [39:42<18:39, 69.99s/it]


[33] GT: 70 | Votes: ['70', '70', '70', '70', '70'] | Majority: 70


 70%|███████   | 35/50 [40:51<17:24, 69.61s/it]


[34] GT: 23 | Votes: ['23', '23', '23', '23', '23'] | Majority: 23


 72%|███████▏  | 36/50 [42:02<16:19, 69.99s/it]


[35] GT: 40 | Votes: ['9', '9', '9', '9', '9'] | Majority: 9


 74%|███████▍  | 37/50 [43:12<15:10, 70.05s/it]


[36] GT: 75 | Votes: ['75', '75', '75', '75', '75'] | Majority: 75


 76%|███████▌  | 38/50 [44:25<14:11, 70.98s/it]


[37] GT: 2 | Votes: ['0', '2', '2', '2', '11'] | Majority: 2


 78%|███████▊  | 39/50 [45:36<13:00, 70.95s/it]


[38] GT: 10 | Votes: ['10', '10', '10', '10', '10'] | Majority: 10


 80%|████████  | 40/50 [46:53<12:07, 72.75s/it]


[39] GT: 18 | Votes: ['18', '18', '18', '18', '18'] | Majority: 18


 82%|████████▏ | 41/50 [48:02<10:43, 71.46s/it]


[40] GT: 8 | Votes: ['8', '8', '8', '8', '8'] | Majority: 8


 84%|████████▍ | 42/50 [49:15<09:36, 72.01s/it]


[41] GT: 200 | Votes: ['600', '200', '200', '200', '200'] | Majority: 200


 86%|████████▌ | 43/50 [50:25<08:20, 71.57s/it]


[42] GT: 26 | Votes: ['26', '26', '26', '26', '26'] | Majority: 26


 88%|████████▊ | 44/50 [51:38<07:11, 71.99s/it]


[43] GT: 48 | Votes: ['48', '48', '48', '48', '47'] | Majority: 48


 90%|█████████ | 45/50 [52:51<06:01, 72.28s/it]


[44] GT: 20 | Votes: ['20', '20', '20', '20', '20'] | Majority: 20


 92%|█████████▏| 46/50 [54:06<04:51, 72.99s/it]


[45] GT: 104 | Votes: ['104', '104', '104', '104', '104'] | Majority: 104


 94%|█████████▍| 47/50 [55:19<03:38, 72.91s/it]


[46] GT: 163 | Votes: ['163', '163', '163', '163', '163'] | Majority: 163


 96%|█████████▌| 48/50 [56:30<02:24, 72.44s/it]


[47] GT: 800 | Votes: ['800', '800', '800', '800', '800'] | Majority: 800


 98%|█████████▊| 49/50 [57:39<01:11, 71.50s/it]


[48] GT: 8 | Votes: ['8', '8', '8', '8', '8'] | Majority: 8


100%|██████████| 50/50 [58:50<00:00, 70.61s/it]


[49] GT: 30 | Votes: ['30', '30', '30', '30', '30'] | Majority: 30





### 보고서 작성하기
#### 아래의 내용이 포함되면 됩니다!

1. Direct Prompting, CoT Prompting, My Prompting을 0 shot, 3 shot, 5 shot 정답률을 표로 보여주세요!
2. CoT Prompting이 Direct Prompting에 비해 왜 좋을 수 있는지에 대해서 서술해주세요!
3. 본인이 작성한 프롬프트 기법이 CoT에 비해서 왜 더 좋을 수 있는지에 대해서 설명해주세요!
4. 최종적으로, `PROMPTING.md`에 보고서를 작성해주세요!