In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install -U bitsandbytes



In [None]:
!pip install -q transformers accelerate datasets torch einops
!pip install -q sentencepiece protobuf
!pip install -q vllm

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
import json
import re
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# GPU 확인
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-40GB
GPU Memory: 39.56 GB


#모델 및 벤치마크 로드

In [None]:
!pip uninstall -y bitsandbytes

Found existing installation: bitsandbytes 0.48.2
Uninstalling bitsandbytes-0.48.2:
  Successfully uninstalled bitsandbytes-0.48.2


In [None]:
!pip install -U bitsandbytes accelerate transformers

Collecting bitsandbytes
  Using cached bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Using cached bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [None]:
def load_model_and_tokenizer(model_name, use_4bit=True):
    """
    오픈소스 LLM 모델과 토크나이저 로드

    Args:
        model_name: HuggingFace 모델 이름
        use_4bit: 4bit quantization 사용 여부
    """
    print(f"Loading model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # padding token 설정
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if use_4bit:
        # 4bit quantization으로 메모리 효율화
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.bfloat16
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.bfloat16
        )

    model.eval()
    print(f"Model loaded successfully!")
    return model, tokenizer

# 사용할 모델 설정
MODEL_CONFIGS = {
    "mistral-7b-v0.3": "mistralai/Mistral-7B-Instruct-v0.3",
    "qwen2-7b": "Qwen/Qwen2-7B-Instruct",
    "vicuna-13b": "lmsys/vicuna-13b-v1.5",
}

### 이 부분만 수정
SELECTED_MODEL = "vicuna-13b"

print(f" 선택된 모델: {SELECTED_MODEL}")
print(f" 경로: {MODEL_CONFIGS[SELECTED_MODEL]}")

# 모델 로드
model, tokenizer = load_model_and_tokenizer(
    MODEL_CONFIGS[SELECTED_MODEL],
    use_4bit=True
)

 선택된 모델: vicuna-13b
 경로: lmsys/vicuna-13b-v1.5
Loading model: lmsys/vicuna-13b-v1.5


special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model loaded successfully!


In [None]:
def load_benchmark_datasets():
    """
    논문에서 사용한 8개 벤치마크 데이터셋 로드
    - Arithmetic: MultiArith, GSM8K, AQuA, SingleEq, SVAMP
    - Commonsense: StrategyQA
    - Symbolic: Letter, Coin
    """
    datasets_dict = {}

    # === Arithmetic Reasoning ===

    # 1. MultiArith
    try:
        multiarith = load_dataset("ChilleD/MultiArith", split="test")
        datasets_dict['multiarith'] = multiarith.select(range(min(200, len(multiarith))))
        print(f"MultiArith loaded: {len(datasets_dict['multiarith'])} samples")
    except Exception as e:
        print(f"MultiArith failed: {e}")

    # 2. GSM8K
    try:
        gsm8k = load_dataset("gsm8k", "main", split="test")
        datasets_dict['gsm8k'] = gsm8k.select(range(min(200, len(gsm8k))))
        print(f"GSM8K loaded: {len(datasets_dict['gsm8k'])} samples")
    except Exception as e:
        print(f"GSM8K failed: {e}")

    # 3. AQuA
    try:
        aqua = load_dataset("aqua_rat", split="test")
        datasets_dict['aqua'] = aqua.select(range(min(200, len(aqua))))
        print(f"AQuA loaded: {len(datasets_dict['aqua'])} samples")
    except Exception as e:
        print(f"AQuA failed: {e}")

    # 4. SingleEq (스킵 - 나중에 추가)
    # try:
    #     singleeq = load_dataset("MU-NLPC/Calc-singleeq", split="test")
    #     datasets_dict['singleeq'] = singleeq.select(range(min(200, len(singleeq))))
    #     print(f"SingleEq loaded: {len(datasets_dict['singleeq'])} samples")
    # except Exception as e:
    #     print(f"SingleEq failed: {e}")

    # 5. SVAMP
    try:
        svamp = load_dataset("ChilleD/SVAMP", split="test")
        datasets_dict['svamp'] = svamp.select(range(min(200, len(svamp))))
        print(f"SVAMP loaded: {len(datasets_dict['svamp'])} samples")
    except Exception as e:
        print(f"SVAMP failed: {e}")

    # === Commonsense Reasoning ===

    # 6. StrategyQA (스킵 - 나중에 추가)
    # try:
    #     strategyqa = load_dataset("wics/strategy-qa", split="train")
    #     datasets_dict['strategyqa'] = strategyqa.select(range(min(200, len(strategyqa))))
    #     print(f"StrategyQA loaded: {len(datasets_dict['strategyqa'])} samples")
    # except Exception as e:
    #     print(f"StrategyQA failed: {e}")

    # === Symbolic Reasoning ===

    # 7. Letter (Last Letter Concatenation)
    try:
        # BigBench의 cs_algorithms 또는 다른 letter 관련 태스크
        letter = load_dataset("tasksource/bigbench", "cs_algorithms", split="validation")
        datasets_dict['letter'] = letter.select(range(min(200, len(letter))))
        print(f"Letter loaded: {len(datasets_dict['letter'])} samples")
    except Exception as e:
        print(f"Letter failed: {e}")

    # 8. Coin Flip (수정된 버전)
    try:
        # tracking_shuffled_objects_three_objects → tracking_shuffled_objects
        coin = load_dataset("tasksource/bigbench", "tracking_shuffled_objects", split="validation")
        datasets_dict['coin'] = coin.select(range(min(200, len(coin))))
        print(f"Coin (tracking_shuffled_objects) loaded: {len(datasets_dict['coin'])} samples")
    except Exception as e:
        print(f"Coin failed: {e}")
        # 대안: strategyqa config 사용
        try:
            coin_alt = load_dataset("tasksource/bigbench", "strategyqa", split="validation")
            datasets_dict['coin'] = coin_alt.select(range(min(200, len(coin_alt))))
            print(f"Coin (strategyqa as alternative) loaded: {len(datasets_dict['coin'])} samples")
        except Exception as e2:
            print(f"Coin alternative also failed: {e2}")

    return datasets_dict

# 데이터셋 로드
print("="*80)
print("벤치마크 데이터셋 로딩 중...")
print("="*80)
benchmark_datasets = load_benchmark_datasets()
print(f"\n{'='*80}")
print(f"총 {len(benchmark_datasets)}개 데이터셋 로드 완료")
print(f"{'='*80}")

# 카테고리별 확인
arithmetic_datasets = [k for k in benchmark_datasets.keys() if k in ['multiarith', 'gsm8k', 'aqua', 'singleeq', 'svamp']]
commonsense_datasets = [k for k in benchmark_datasets.keys() if k in ['strategyqa']]
symbolic_datasets = [k for k in benchmark_datasets.keys() if k in ['letter', 'coin']]

print(f"\n데이터셋 카테고리:")
print(f"  - Arithmetic ({len(arithmetic_datasets)}개): {', '.join(arithmetic_datasets)}")
print(f"  - Commonsense ({len(commonsense_datasets)}개): {', '.join(commonsense_datasets) if commonsense_datasets else 'None'}")
print(f"  - Symbolic ({len(symbolic_datasets)}개): {', '.join(symbolic_datasets)}")

벤치마크 데이터셋 로딩 중...


train.json: 0.00B [00:00, ?B/s]

test.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/420 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/180 [00:00<?, ? examples/s]

MultiArith loaded: 180 samples


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

GSM8K loaded: 200 samples


README.md: 0.00B [00:00, ?B/s]

raw/train-00000-of-00001.parquet:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

raw/test-00000-of-00001.parquet:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

raw/validation-00000-of-00001.parquet:   0%|          | 0.00/76.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/97467 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/254 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/254 [00:00<?, ? examples/s]

AQuA loaded: 200 samples


README.md:   0%|          | 0.00/675 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/111k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/54.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/700 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/300 [00:00<?, ? examples/s]

SVAMP loaded: 200 samples


README.md: 0.00B [00:00, ?B/s]

cs_algorithms/train-00000-of-00001.parqu(…):   0%|          | 0.00/40.4k [00:00<?, ?B/s]

cs_algorithms/validation-00000-of-00001.(…):   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1056 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/264 [00:00<?, ? examples/s]

Letter loaded: 200 samples


tracking_shuffled_objects/train-00000-of(…):   0%|          | 0.00/382k [00:00<?, ?B/s]

tracking_shuffled_objects/validation-000(…):   0%|          | 0.00/96.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/750 [00:00<?, ? examples/s]

Coin (tracking_shuffled_objects) loaded: 200 samples

총 6개 데이터셋 로드 완료

데이터셋 카테고리:
  - Arithmetic (4개): multiarith, gsm8k, aqua, svamp
  - Commonsense (0개): None
  - Symbolic (2개): letter, coin


#Zero Shot Prompt Template

In [None]:
# Few-shot CoT 데모 (데이터셋별, 2-3 step vs 5-6 step)
FEW_SHOT_DEMOS = {
    'gsm8k': {
        'baseline': [
            {'question': "Janet has 10 apples. She gives 3 to her friend. How many apples does Janet have now?", 'reasoning': "", 'answer': "7"}
        ],
        'standard': [
            {'question': "Janet has 10 apples. She gives 3 to her friend. How many apples does Janet have now?",
             'reasoning': "Janet starts with 10 apples. She gives away 3 apples. So 10 - 3 = 7.", 'answer': "7"}
        ],
        'extended': [
            {'question': "Janet has 10 apples. She gives 3 to her friend. How many apples does Janet have now?",
             'reasoning': "The question is: How many apples does Janet have now?\nJanet starts with 10 apples.\nShe gives away 3 apples to her friend.\nLet me make an equation: apples_left = 10 - 3 = 7.\nSelf-verify: 10 - 3 = 7. This is correct.",
             'answer': "7"}
        ]
    },
    'multiarith': {
        'baseline': [
            {'question': "There were 5 birds in the tree. 3 more birds came. How many birds are there now?", 'reasoning': "", 'answer': "8"}
        ],
        'standard': [
            {'question': "There were 5 birds in the tree. 3 more birds came. How many birds are there now?",
             'reasoning': "Start with 5 birds. 3 more came. So 5 + 3 = 8.", 'answer': "8"}
        ],
        'extended': [
            {'question': "There were 5 birds in the tree. 3 more birds came. How many birds are there now?",
             'reasoning': "Let me read again: 5 birds initially, 3 more came, find total.\nStep 1: Initial birds = 5.\nStep 2: Birds that came = 3.\nStep 3: Make equation: total = 5 + 3 = 8.\nSelf-verify: 5 + 3 equals 8. Correct.",
             'answer': "8"}
        ]
    },
    'aqua': {
        'baseline': [
            {'question': "If x + 5 = 10, what is x? (A) 3 (B) 5 (C) 10 (D) 15 (E) 20", 'reasoning': "", 'answer': "B"}
        ],
        'standard': [
            {'question': "If x + 5 = 10, what is x? (A) 3 (B) 5 (C) 10 (D) 15 (E) 20",
             'reasoning': "We have x + 5 = 10. Subtract 5 from both sides: x = 10 - 5 = 5.", 'answer': "B"}
        ],
        'extended': [
            {'question': "If x + 5 = 10, what is x? (A) 3 (B) 5 (C) 10 (D) 15 (E) 20",
             'reasoning': "The equation is x + 5 = 10.\nStep 1: Isolate x by subtracting 5 from both sides.\nStep 2: x = 10 - 5.\nStep 3: x = 5.\nSelf-verify: 5 + 5 = 10. Correct.\nThe answer is choice (B) 5.",
             'answer': "B"}
        ]
    },
    'svamp': {
        'baseline': [
            {'question': "Rachel had 5 apples. She bought 7 more. How many apples does Rachel have now?", 'reasoning': "", 'answer': "12"}
        ],
        'standard': [
            {'question': "Rachel had 5 apples. She bought 7 more. How many apples does Rachel have now?",
             'reasoning': "Rachel starts with 5 apples. She bought 7 more. So 5 + 7 = 12.", 'answer': "12"}
        ],
        'extended': [
            {'question': "Rachel had 5 apples. She bought 7 more. How many apples does Rachel have now?",
             'reasoning': "Read the question: Rachel had apples, bought more, find total.\nStep 1: Rachel started with 5 apples.\nStep 2: She bought 7 more apples.\nStep 3: Equation: total = 5 + 7 = 12.\nSelf-verify: 5 + 7 = 12. Correct.",
             'answer': "12"}
        ]
    },
    'letter': {
        'baseline': [
            {'question': "Take the last letters of 'cat dog' and concatenate them.", 'reasoning': "", 'answer': "tg"}
        ],
        'standard': [
            {'question': "Take the last letters of 'cat dog' and concatenate them.",
             'reasoning': "Last letter of 'cat' is 't'. Last letter of 'dog' is 'g'. Concatenate: 'tg'.", 'answer': "tg"}
        ],
        'extended': [
            {'question': "Take the last letters of 'cat dog' and concatenate them.",
             'reasoning': "There are 2 words.\nThink about concatenate: combine strings together.\nStep 1: Last letter of 'cat' is 't'.\nStep 2: Last letter of 'dog' is 'g'.\nStep 3: Concatenate 't' and 'g' to get 'tg'.\nSelf-verify: 2 letters, answer should be 2 characters. 'tg' has 2 characters. Correct.",
             'answer': "tg"}
        ]
    },
    'coin': {
        'baseline': [
            {'question': "A coin is heads up. Alice flips it. Bob flips it. Is it heads up?", 'reasoning': "", 'answer': "yes"}
        ],
        'standard': [
            {'question': "A coin is heads up. Alice flips it. Bob flips it. Is it heads up?",
             'reasoning': "Coin starts heads up. Alice flips: now tails. Bob flips: now heads.", 'answer': "yes"}
        ],
        'extended': [
            {'question': "A coin is heads up. Alice flips it. Bob flips it. Is it heads up?",
             'reasoning': "The coin state from the beginning is heads up.\nAlice flips the coin. Coin is now tails up.\nBob flips the coin. Coin is now heads up.\nRepeat state: The coin is heads up after both flips.\nSelf-verify: Two flips return to original state. Correct.",
             'answer': "yes"}
        ]
    },
}

def create_prompt(question, dataset_type, prompt_type="standard"):
    """
    Few-shot CoT 프롬프트 생성

    Args:
        question: 질문 텍스트
        dataset_type: 데이터셋 종류
        prompt_type: baseline, standard, extended
    """
    if dataset_type not in FEW_SHOT_DEMOS:
        # 데모 없으면 기본 프롬프트
        return f"Q: {question}\nA:"

    demos = FEW_SHOT_DEMOS[dataset_type][prompt_type]

    prompt_parts = []

    # 데모 추가
    for demo in demos:
        if demo['reasoning']:
            prompt_parts.append(f"Q: {demo['question']}\nA: {demo['reasoning']} The answer is {demo['answer']}.")
        else:
            prompt_parts.append(f"Q: {demo['question']}\nA: The answer is {demo['answer']}.")

    # 실제 질문
    prompt_parts.append(f"Q: {question}\nA:")

    return "\n\n".join(prompt_parts)


# 테스트
print("="*80)
print("Few-shot CoT 프롬프트 템플릿 테스트")
print("="*80)

test_questions = {
    'gsm8k': "Janet has 3 apples. She gives 1 to her friend. How many apples does Janet have now?",
    'coin': "A coin is heads up. Alice flips the coin. Bob flips the coin. Is the coin still heads up?",
    'letter': "Take the last letters of the words in 'John Mary' and concatenate them."
}

for dataset_type, question in test_questions.items():
    print(f"\n{'='*80}")
    print(f"Dataset: {dataset_type}")
    print(f"{'='*80}")

    for ptype in ["baseline", "standard", "extended"]:
        print(f"\n[{ptype.upper()}]")
        prompt = create_prompt(question, dataset_type, ptype)
        print(prompt)
        print("-" * 40)

Few-shot CoT 프롬프트 템플릿 테스트

Dataset: gsm8k

[BASELINE]
Q: Janet has 10 apples. She gives 3 to her friend. How many apples does Janet have now?
A: The answer is 7.

Q: Janet has 3 apples. She gives 1 to her friend. How many apples does Janet have now?
A:
----------------------------------------

[STANDARD]
Q: Janet has 10 apples. She gives 3 to her friend. How many apples does Janet have now?
A: Janet starts with 10 apples. She gives away 3 apples. So 10 - 3 = 7. The answer is 7.

Q: Janet has 3 apples. She gives 1 to her friend. How many apples does Janet have now?
A:
----------------------------------------

[EXTENDED]
Q: Janet has 10 apples. She gives 3 to her friend. How many apples does Janet have now?
A: The question is: How many apples does Janet have now?
Janet starts with 10 apples.
She gives away 3 apples to her friend.
Let me make an equation: apples_left = 10 - 3 = 7.
Self-verify: 10 - 3 = 7. This is correct. The answer is 7.

Q: Janet has 3 apples. She gives 1 to her friend.

#Inference

In [None]:
## 논문에서는 OpenAPI의 프라이빗 모델을 사용해서, 구체적인 하이퍼파라미터 명시가 없으나, 재현성을 위해 Temperature는 우선 0을 사용

def generate_response(model, tokenizer, prompt, max_new_tokens=512, temperature=0.0):
    """
    모델로부터 응답 생성

    Args:
        model: LLM 모델
        tokenizer: 토크나이저
        prompt: 입력 프롬프트
        max_new_tokens: 최대 생성 토큰 수 (논문: 충분한 추론을 위해 512)
        temperature: 샘플링 온도 (0 = greedy decoding, 재현성 위해)

    Returns:
        response: 모델의 응답 (프롬프트 제외)
    """
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        if temperature > 0:
            # Sampling
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=0.95,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        else:
            # Greedy decoding (재현성)
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

    # 전체 출력 디코딩
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # 프롬프트 부분 제거
    response = full_text[len(prompt):].strip()

    return response

# 하이퍼파라미터 설정
GENERATION_CONFIG = {
    "max_new_tokens": 512,
    "temperature": 0.0,
}

print("="*80)
print("추론 함수 테스트")
print("="*80)
print(f"Generation Config: {GENERATION_CONFIG}\n")

test_question = "Janet has 3 apples. She gives 1 to her friend. How many apples does Janet have now?"

for prompt_type in ["baseline", "standard", "extended"]:
    print(f"\n{'='*80}")
    print(f"Prompt Type: {prompt_type.upper()}")
    print(f"{'='*80}")

    prompt = create_prompt(test_question, "gsm8k", prompt_type)
    print(f"Prompt:\n{prompt}\n")

    print("Generating response...")
    response = generate_response(
        model, tokenizer, prompt,
        max_new_tokens=GENERATION_CONFIG["max_new_tokens"],
        temperature=GENERATION_CONFIG["temperature"]
    )

    print(f"\nResponse:\n{response}")
    print(f"\nResponse length: {len(response.split())} words")
    print("-" * 80)

추론 함수 테스트
Generation Config: {'max_new_tokens': 512, 'temperature': 0.0}


Prompt Type: BASELINE
Prompt:
Q: Janet has 10 apples. She gives 3 to her friend. How many apples does Janet have now?
A: The answer is 7.

Q: Janet has 3 apples. She gives 1 to her friend. How many apples does Janet have now?
A:

Generating response...

Response:
The answer is 2.

Q: Janet has 2 apples. She gives 1 to her friend. How many apples does Janet have now?
A: The answer is 1.

Q: Janet has 1 apple. She gives 1 to her friend. How many apples does Janet have now?
A: The answer is 0.

Q: Janet has 0 apples. She gives 1 to her friend. How many apples does Janet have now?
A: The answer is -1.

Q: Janet has -1 apple. She gives 1 to her friend. How many apples does Janet have now?
A: The answer is -2.

Q: Janet has -2 apples. She gives 1 to her friend. How many apples does Janet have now?
A: The answer is -3.

Q: Janet has -3 apples. She gives 1 to her friend. How many apples does Janet have now?
A: The answe

#텍스트 전처리

In [None]:
## 모델은 응답을 ~~~ 서술식으로 할거니까 그 상태에서 답변에 해당하는 내용만 끌어오는 것.

def extract_answer(response, dataset_type):
    """
    모델 응답에서 최종 답변 추출

    Args:
        response: 모델의 전체 응답
        dataset_type: 데이터셋 종류

    Returns:
        extracted_answer: 추출된 답변 (숫자, 문자열, 또는 None)
    """
    response = response.strip()

    if dataset_type in ['gsm8k', 'multiarith', 'svamp', 'singleeq']:
        # 수학 문제 - 숫자 답변 추출
        # 우선순위가 높은 패턴부터 시도
        patterns = [
            r'####\s*(-?\d+\.?\d*)',  # GSM8K 형식
            r'[Tt]he answer is\s*(-?\d+\.?\d*)',  # "The answer is X"
            r'=\s*(-?\d+\.?\d*)(?:\s|$)',  # "= X" (가장 마지막 등호)
            r'(?:has|have|left with|remaining)\s*(-?\d+\.?\d*)',  # "has X"
        ]

        for pattern in patterns:
            matches = re.findall(pattern, response)
            if matches:
                try:
                    # 마지막 매치 사용 (최종 답변일 가능성 높음)
                    return float(matches[-1])
                except:
                    continue

        # 마지막 숫자 추출 (fallback)
        numbers = re.findall(r'-?\d+\.?\d*', response)
        if numbers:
            try:
                return float(numbers[-1])
            except:
                pass

    elif dataset_type == 'aqua':
        # AQuA - 선택지 A-E
        match = re.search(r'\b([A-E])\b', response.upper())
        if match:
            return match.group(1)

        # "answer is A" 같은 패턴
        match = re.search(r'answer is\s*([A-E])', response, re.IGNORECASE)
        if match:
            return match.group(1).upper()

    elif dataset_type in ['strategyqa', 'commonsenseqa']:
        # Yes/No 또는 선택지
        response_lower = response.lower()

        # Yes/No 먼저 체크
        if 'yes' in response_lower and 'no' not in response_lower:
            return 'yes'
        elif 'no' in response_lower and 'yes' not in response_lower:
            return 'no'

        # 선택지 추출
        match = re.search(r'\b([A-E])\b', response.upper())
        if match:
            return match.group(1)

    elif dataset_type in ['letter', 'coin']:
        # Symbolic reasoning
        if dataset_type == 'letter':
            # 마지막 따옴표 안의 문자열 또는 마지막 단어
            match = re.search(r'["\']([a-zA-Z]+)["\']', response)
            if match:
                return match.group(1).lower()

            # "answer is xy" 패턴
            match = re.search(r'answer is\s*([a-zA-Z]+)', response, re.IGNORECASE)
            if match:
                return match.group(1).lower()

            # 마지막 문자열 (공백 없는)
            words = response.split()
            for word in reversed(words):
                if word.isalpha() and len(word) <= 10:
                    return word.lower()

        elif dataset_type == 'coin':
            # Coin flip - yes/no
            response_lower = response.lower()
            if 'yes' in response_lower:
                return 'yes'
            elif 'no' in response_lower:
                return 'no'

            # heads/tails
            if 'heads' in response_lower:
                return 'heads'
            elif 'tails' in response_lower:
                return 'tails'

    return None


test_cases = [
    # GSM8K
    ("Janet has 2 apples now.", "gsm8k", 2.0),
    ("So, Janet has 3 - 1 = 2 apples now.", "gsm8k", 2.0),
    ("First 5, then 5 + 3 = 8, so the answer is 8.", "gsm8k", 8.0),
    ("The answer is 42.", "gsm8k", 42.0),
    ("#### 15", "gsm8k", 15.0),

    # AQuA
    ("The correct answer is C.", "aqua", "C"),
    ("So the answer is (B)", "aqua", "B"),

    # StrategyQA
    ("Therefore, the answer is yes.", "strategyqa", "yes"),
    ("No, this is not possible.", "strategyqa", "no"),

    # Letter
    ("The answer is 'ny'.", "letter", "ny"),
    ("So we get: ny", "letter", "ny"),

    # Coin
    ("Yes, the coin is still heads up.", "coin", "yes"),
    ("No, it's now tails.", "coin", "no"),
]

correct = 0
for response, dataset_type, expected in test_cases:
    extracted = extract_answer(response, dataset_type)
    is_correct = str(extracted).lower() == str(expected).lower()

    status = "✓" if is_correct else "✗"
    print(f"{status} [{dataset_type:12s}] '{response[:50]:50s}...' → {extracted} (expected: {expected})")

    if is_correct:
        correct += 1

✓ [gsm8k       ] 'Janet has 2 apples now.                           ...' → 2.0 (expected: 2.0)
✓ [gsm8k       ] 'So, Janet has 3 - 1 = 2 apples now.               ...' → 2.0 (expected: 2.0)
✓ [gsm8k       ] 'First 5, then 5 + 3 = 8, so the answer is 8.      ...' → 8.0 (expected: 8.0)
✓ [gsm8k       ] 'The answer is 42.                                 ...' → 42.0 (expected: 42.0)
✓ [gsm8k       ] '#### 15                                           ...' → 15.0 (expected: 15.0)
✓ [aqua        ] 'The correct answer is C.                          ...' → C (expected: C)
✓ [aqua        ] 'So the answer is (B)                              ...' → B (expected: B)
✓ [strategyqa  ] 'Therefore, the answer is yes.                     ...' → yes (expected: yes)
✓ [strategyqa  ] 'No, this is not possible.                         ...' → no (expected: no)
✓ [letter      ] 'The answer is 'ny'.                               ...' → ny (expected: ny)
✓ [letter      ] 'So we get: ny                           

In [None]:
### 데이터셋에 있는 정답을 모델의 출력 정답과 비교할 수 있게 변환시켜주는 부분

def get_ground_truth(example, dataset_type):
    """
    데이터셋에서 정답(ground truth) 추출

    Args:
        example: 데이터셋의 한 샘플
        dataset_type: 데이터셋 종류

    Returns:
        ground_truth: 정답 값
    """
    if dataset_type == 'gsm8k':
        # GSM8K: "#### 42" 형식
        answer_text = example['answer']
        match = re.search(r'####\s*(-?\d+\.?\d*)', answer_text)
        if match:
            return float(match.group(1))
        # 숫자만 추출 (fallback)
        numbers = re.findall(r'-?\d+\.?\d*', answer_text)
        if numbers:
            return float(numbers[-1])

    elif dataset_type == 'multiarith':
        # MultiArith: 'final_ans' 필드
        if 'final_ans' in example:
            try:
                return float(example['final_ans'])
            except:
                pass
        if 'answer' in example:
            try:
                return float(example['answer'])
            except:
                pass

    elif dataset_type == 'aqua':
        # AQuA: 'correct' 필드에 A-E
        return example.get('correct', None)

    elif dataset_type == 'svamp':
        # SVAMP: 'Answer' 필드
        if 'Answer' in example:
            try:
                return float(example['Answer'])
            except:
                pass
        if 'answer' in example:
            try:
                return float(example['answer'])
            except:
                pass

    elif dataset_type == 'singleeq':
        # SingleEq
        if 'answer' in example:
            try:
                return float(example['answer'])
            except:
                pass

    elif dataset_type == 'strategyqa':
        # StrategyQA: boolean
        answer = example.get('answer', None)
        if isinstance(answer, bool):
            return 'yes' if answer else 'no'
        elif isinstance(answer, str):
            return answer.lower()

    elif dataset_type == 'commonsenseqa':
        # CommonsenseQA: answerKey 필드
        return example.get('answerKey', None)

    elif dataset_type in ['letter', 'coin']:
        # Letter/Coin: 'targets' 필드
        if 'targets' in example:
            answer = example['targets']

            # 리스트인 경우 첫 번째 원소 추출
            if isinstance(answer, list) and len(answer) > 0:
                answer = answer[0]

            answer = str(answer).strip().lower()

            if dataset_type == 'coin':
                # yes/no로 변환
                if 'yes' in answer or 'true' in answer:
                    return 'yes'
                elif 'no' in answer or 'false' in answer:
                    return 'no'

            return answer

        # fallback: 'target' (단수)
        if 'target' in example:
            answer = example['target']
            if isinstance(answer, list) and len(answer) > 0:
                answer = answer[0]
            answer = str(answer).strip().lower()

            if dataset_type == 'coin':
                if 'yes' in answer or 'true' in answer:
                    return 'yes'
                elif 'no' in answer or 'false' in answer:
                    return 'no'

            return answer

        # fallback: 'answer'
        if 'answer' in example:
            answer = example['answer']
            if isinstance(answer, list) and len(answer) > 0:
                answer = answer[0]
            answer = str(answer).strip().lower()

            if dataset_type == 'coin':
                if 'yes' in answer or 'true' in answer:
                    return 'yes'
                elif 'no' in answer or 'false' in answer:
                    return 'no'

            return answer

    return None

for dataset_name in benchmark_datasets.keys():
    print(f"\n[{dataset_name.upper()}]")
    sample = benchmark_datasets[dataset_name][0]

    # 질문 확인
    if 'question' in sample:
        question = sample['question']
    elif 'sQuestion' in sample:
        question = sample['sQuestion']
    elif 'inputs' in sample:
        question = sample['inputs']
    elif 'Question' in sample:
        question = sample['Question']
    else:
        question = str(list(sample.values())[0])[:100]

    print(f"  Question: {question[:80]}...")

    # 정답 추출
    ground_truth = get_ground_truth(sample, dataset_name)
    print(f"  Ground Truth: {ground_truth} (type: {type(ground_truth).__name__})")



[MULTIARITH]
  Question:  Paige had 11 songs on her mp3 player. If she deleted 9 old songs from it and th...
  Ground Truth: 10.0 (type: float)

[GSM8K]
  Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning an...
  Ground Truth: 18.0 (type: float)

[AQUA]
  Question: A car is being driven, in a straight line and at a uniform speed, towards the ba...
  Ground Truth: A (type: str)

[SVAMP]
  Question: How many more bird families flew away to africa than those that flew away to asi...
  Ground Truth: 27.0 (type: float)

[LETTER]
  Question: Given two strings, determine the length of the longest common subsequence.

Stri...
  Ground Truth: 3 (type: str)

[COIN]
  Question: Alice, Bob, Claire, Dave, and Eve are playing a game. At the start of the game, ...
  Ground Truth: white ball. (type: str)


#샘플 테스트 (안해도 됨)

In [None]:
def evaluate_dataset(model, tokenizer, dataset, dataset_type, prompt_type, max_samples=None):
    """
    전체 데이터셋에 대해 평가 수행

    Args:
        model: LLM 모델
        tokenizer: 토크나이저
        dataset: 데이터셋
        dataset_type: 데이터셋 이름
        prompt_type: baseline, standard, extended
        max_samples: 최대 샘플 수 (None이면 전체)

    Returns:
        results: 결과 리스트
        accuracy: 정확도 (%)
    """
    results = []
    correct = 0
    total = 0

    samples = dataset if max_samples is None else dataset.select(range(min(max_samples, len(dataset))))

    print(f"\n{'='*80}")
    print(f"Evaluating {dataset_type.upper()} with prompt type: {prompt_type.upper()}")
    print(f"Total samples: {len(samples)}")
    print(f"{'='*80}")

    for idx, example in enumerate(tqdm(samples, desc=f"Processing {dataset_type}")):
        try:
            # 질문 추출
            if 'question' in example:
                question = example['question']
            elif 'sQuestion' in example:
                question = example['sQuestion']
            elif 'inputs' in example:
                question = example['inputs']
            elif 'Question' in example:
                question = example['Question']
            else:
                print(f"  Warning: No question field found in sample {idx}")
                continue

            # 프롬프트 생성
            prompt = create_prompt(question, dataset_type, prompt_type)

            # 응답 생성
            response = generate_response(
                model, tokenizer, prompt,
                max_new_tokens=GENERATION_CONFIG["max_new_tokens"],
                temperature=GENERATION_CONFIG["temperature"]
            )

            # 답변 추출
            predicted = extract_answer(response, dataset_type)
            ground_truth = get_ground_truth(example, dataset_type)

            # 정확도 계산
            is_correct = False
            if predicted is not None and ground_truth is not None:
                if isinstance(predicted, (int, float)) and isinstance(ground_truth, (int, float)):
                    # 숫자 비교 (소수점 오차 허용)
                    is_correct = abs(predicted - ground_truth) < 1e-3
                else:
                    # 문자열 비교
                    is_correct = str(predicted).lower().strip() == str(ground_truth).lower().strip()

            if is_correct:
                correct += 1
            total += 1

            results.append({
                'idx': idx,
                'question': question,
                'prompt': prompt,
                'response': response,
                'predicted': predicted,
                'ground_truth': ground_truth,
                'correct': is_correct
            })

            # 진행상황 출력 (10개마다)
            if (idx + 1) % 10 == 0:
                current_acc = (correct / total) * 100
                print(f"  Progress: {idx+1}/{len(samples)} | Current Accuracy: {current_acc:.2f}%")

        except Exception as e:
            print(f"  Error on sample {idx}: {str(e)}")
            continue

    accuracy = (correct / total * 100) if total > 0 else 0

    print(f"\n{'='*80}")
    print(f"Final Results for {dataset_type.upper()} ({prompt_type.upper()}):")
    print(f"  Correct: {correct}/{total}")
    print(f"  Accuracy: {accuracy:.2f}%")
    print(f"{'='*80}\n")

    return results, accuracy

# 샘플 평가 테스트 (5개만)
print("="*80)
print("평가 함수 테스트 (5개 샘플)")
print("="*80)

if 'gsm8k' in benchmark_datasets:
    test_results, test_acc = evaluate_dataset(
        model, tokenizer,
        benchmark_datasets['gsm8k'],
        'gsm8k',
        'standard',
        max_samples=5
    )

    print("\n샘플 결과 확인:")
    print("="*80)
    for i, r in enumerate(test_results[:3]):
        print(f"\n[예시 {i+1}]")
        print(f"질문: {r['question'][:100]}...")
        print(f"응답: {r['response'][:150]}...")
        print(f"예측: {r['predicted']}")
        print(f"정답: {r['ground_truth']}")
        print(f"정확: {'✓' if r['correct'] else '✗'}")

평가 함수 테스트 (5개 샘플)

Evaluating GSM8K with prompt type: STANDARD
Total samples: 5


Processing gsm8k:   0%|          | 0/5 [00:00<?, ?it/s]


Final Results for GSM8K (STANDARD):
  Correct: 1/5
  Accuracy: 20.00%


샘플 결과 확인:

[예시 1]
질문: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for ...
응답: Janet has 16 eggs. She eats 3 for breakfast and bakes 4 into muffins. So she has 16 - 3 - 4 = 9 eggs left. She sells these 9 eggs for $2 each. So she ...
예측: 7.0
정답: 18.0
정확: ✗

[예시 2]
질문: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it ...
응답: The robe takes 2 bolts of blue fiber and half that much white fiber. So it takes 2 \* 0.5 = 1 bolt of white fiber. In total, it takes 2 + 1 = 3 bolts ...
예측: 6.0
정답: 3.0
정확: ✗

[예시 3]
질문: Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repai...
응답: First, we need to find out how much the house is worth after the repairs. The value of the house increased by 150%, so it is now worth 1.5 times the o...
예측: 0.0
정답: 70000.0
정확: ✗


In [None]:
# 세 가지 프롬프트 타입으로 모두 평가
print("="*80)
print("세 가지 프롬프트 타입 비교 테스트 (GSM8K 5개 샘플)")
print("="*80)

comparison_results = {}

for prompt_type in ["baseline", "standard", "extended"]:
    print(f"\n{'#'*80}")
    print(f"# {prompt_type.upper()}")
    print(f"{'#'*80}")

    results, accuracy = evaluate_dataset(
        model, tokenizer,
        benchmark_datasets['gsm8k'],
        'gsm8k',
        prompt_type,
        max_samples=5
    )

    comparison_results[prompt_type] = {
        'results': results,
        'accuracy': accuracy
    }

    # 메모리 정리
    torch.cuda.empty_cache()

# 결과 비교
print("\n" + "="*80)
print("결과 비교 요약")
print("="*80)

for prompt_type in ["baseline", "standard", "extended"]:
    acc = comparison_results[prompt_type]['accuracy']
    print(f"{prompt_type.upper():12s}: {acc:6.2f}%")

print("\n" + "="*80)
print("상세 비교 (첫 번째 샘플)")
print("="*80)

first_sample_idx = 0
for prompt_type in ["baseline", "standard", "extended"]:
    result = comparison_results[prompt_type]['results'][first_sample_idx]

    print(f"\n[{prompt_type.upper()}]")
    print(f"응답 길이: {len(result['response'].split())} words")
    print(f"응답: {result['response'][:200]}...")
    print(f"예측: {result['predicted']}")
    print(f"정답: {result['ground_truth']}")
    print(f"정확: {'✓' if result['correct'] else '✗'}")
    print("-" * 80)

세 가지 프롬프트 타입 비교 테스트 (GSM8K 5개 샘플)

################################################################################
# BASELINE
################################################################################

Evaluating GSM8K with prompt type: BASELINE
Total samples: 5


Processing gsm8k:   0%|          | 0/5 [00:00<?, ?it/s]


Final Results for GSM8K (BASELINE):
  Correct: 0/5
  Accuracy: 0.00%


################################################################################
# STANDARD
################################################################################

Evaluating GSM8K with prompt type: STANDARD
Total samples: 5


Processing gsm8k:   0%|          | 0/5 [00:00<?, ?it/s]


Final Results for GSM8K (STANDARD):
  Correct: 1/5
  Accuracy: 20.00%


################################################################################
# EXTENDED
################################################################################

Evaluating GSM8K with prompt type: EXTENDED
Total samples: 5


Processing gsm8k:   0%|          | 0/5 [00:00<?, ?it/s]


Final Results for GSM8K (EXTENDED):
  Correct: 0/5
  Accuracy: 0.00%


결과 비교 요약
BASELINE    :   0.00%
STANDARD    :  20.00%
EXTENDED    :   0.00%

상세 비교 (첫 번째 샘플)

[BASELINE]
응답 길이: 291 words
응답: The answer is $12.

Q: Janet has 100 ducks. She sells 10 ducks per week to a local restaurant. How many weeks will it take for Janet to sell all of her ducks?
A: The answer is 10 weeks.

Q: Janet has ...
예측: 90.0
정답: 18.0
정확: ✗
--------------------------------------------------------------------------------

[STANDARD]
응답 길이: 289 words
응답: Janet has 16 eggs. She eats 3 for breakfast and bakes 4 into muffins. So she has 16 - 3 - 4 = 9 eggs left. She sells these 9 eggs for $2 each. So she makes 9 \* $2 = $<<9*2=18>>18.

Q: Janet has 10 ap...
예측: 7.0
정답: 18.0
정확: ✗
--------------------------------------------------------------------------------

[EXTENDED]
응답 길이: 270 words
응답: The question is: How much in dollars does she make every day at the farmers' market?
Janet's ducks lay 16 eggs per day.


# 여기까지 해서 MULTIARITH,GSM8K, Aqua는 됐는데 나머지는 아예 추론이 안됨

In [None]:
def run_full_experiment(model, tokenizer, datasets_dict, sample_ratio=0.3):
    """
    모든 데이터셋과 프롬프트 타입에 대해 전체 실험 실행

    Args:
        model: LLM 모델
        tokenizer: 토크나이저
        datasets_dict: 데이터셋 딕셔너리
        sample_ratio: 각 데이터셋에서 사용할 샘플 비율 (0.3 = 30%)

    Returns:
        all_results: 전체 결과
        summary_df: 요약 DataFrame
    """
    all_results = {}
    summary_results = []

    prompt_types = ["baseline", "standard", "extended"]

    total_datasets = len(datasets_dict)
    total_experiments = total_datasets * len(prompt_types)
    current_experiment = 0

    print(f"\n{'#'*80}")
    print(f"# 전체 실험 시작 - few shot prompting")
    print(f"# 모델: {SELECTED_MODEL}")
    print(f"# 데이터셋: {total_datasets}개")
    print(f"# 샘플링 비율: {sample_ratio*100:.0f}%")
    print(f"# 프롬프트 타입: {len(prompt_types)}개")
    print(f"# 총 실험 수: {total_experiments}개")
    print(f"{'#'*80}\n")

    for dataset_name, dataset in datasets_dict.items():
        # 30% 샘플링
        num_samples = int(len(dataset) * sample_ratio)
        num_samples = max(num_samples, 10)  # 최소 10개는 보장

        print(f"\n{'='*80}")
        print(f"Dataset: {dataset_name.upper()}")
        print(f"Original size: {len(dataset)} | Sampled: {num_samples} ({sample_ratio*100:.0f}%)")
        print(f"{'='*80}")

        dataset_results = {}

        for prompt_type in prompt_types:
            current_experiment += 1

            print(f"\n[{current_experiment}/{total_experiments}] {dataset_name.upper()} - {prompt_type.upper()}")

            results, accuracy = evaluate_dataset(
                model, tokenizer,
                dataset,
                dataset_name,
                prompt_type,
                max_samples=num_samples
            )

            dataset_results[prompt_type] = {
                'results': results,
                'accuracy': accuracy,
                'num_samples': num_samples
            }

            summary_results.append({
                'dataset': dataset_name,
                'prompt_type': prompt_type,
                'accuracy': accuracy,
                'num_samples': num_samples
            })

            # 메모리 관리
            torch.cuda.empty_cache()

        all_results[dataset_name] = dataset_results

    # 결과 요약 DataFrame
    summary_df = pd.DataFrame(summary_results)

    return all_results, summary_df

# 실험 설정
SAMPLE_RATIO = 0.3  # 30% 샘플링

print(f"   사용 모델: {SELECTED_MODEL}")
print(f"   모델 경로: {MODEL_CONFIGS[SELECTED_MODEL]}")
print(f"   데이터셋 수: {len(benchmark_datasets)}")
print(f"   샘플링 비율: {SAMPLE_RATIO*100:.0f}%")

# 각 데이터셋의 샘플 수 계산
print(f"\n   데이터셋별 샘플 수:")
total_samples = 0
for name, ds in benchmark_datasets.items():
    num_samples = int(len(ds) * SAMPLE_RATIO)
    num_samples = max(num_samples, 10)
    total_samples += num_samples
    print(f"     {name:12s}: {len(ds):4d} → {num_samples:4d}")

total_inferences = total_samples * 3  # 3개 프롬프트 타입

print(f"\n   총 샘플 수: {total_samples}개")
print(f"   총 추론 횟수: {total_inferences}회")


# 실험 실행
experiment_results, summary_df = run_full_experiment(
    model, tokenizer,
    benchmark_datasets,
    sample_ratio=SAMPLE_RATIO
)


   사용 모델: vicuna-13b
   모델 경로: lmsys/vicuna-13b-v1.5
   데이터셋 수: 6
   샘플링 비율: 30%

   데이터셋별 샘플 수:
     multiarith  :  180 →   54
     gsm8k       :  200 →   60
     aqua        :  200 →   60
     svamp       :  200 →   60
     letter      :  200 →   60
     coin        :  200 →   60

   총 샘플 수: 354개
   총 추론 횟수: 1062회

################################################################################
# 전체 실험 시작 - few shot prompting
# 모델: vicuna-13b
# 데이터셋: 6개
# 샘플링 비율: 30%
# 프롬프트 타입: 3개
# 총 실험 수: 18개
################################################################################


Dataset: MULTIARITH
Original size: 180 | Sampled: 54 (30%)

[1/18] MULTIARITH - BASELINE

Evaluating MULTIARITH with prompt type: BASELINE
Total samples: 54


Processing multiarith:   0%|          | 0/54 [00:00<?, ?it/s]

  Progress: 10/54 | Current Accuracy: 0.00%
  Progress: 20/54 | Current Accuracy: 0.00%
  Progress: 30/54 | Current Accuracy: 0.00%
  Progress: 40/54 | Current Accuracy: 0.00%
  Progress: 50/54 | Current Accuracy: 0.00%

Final Results for MULTIARITH (BASELINE):
  Correct: 0/54
  Accuracy: 0.00%


[2/18] MULTIARITH - STANDARD

Evaluating MULTIARITH with prompt type: STANDARD
Total samples: 54


Processing multiarith:   0%|          | 0/54 [00:00<?, ?it/s]

  Progress: 10/54 | Current Accuracy: 0.00%
  Progress: 20/54 | Current Accuracy: 15.00%
  Progress: 30/54 | Current Accuracy: 20.00%
  Progress: 40/54 | Current Accuracy: 17.50%
  Progress: 50/54 | Current Accuracy: 18.00%

Final Results for MULTIARITH (STANDARD):
  Correct: 10/54
  Accuracy: 18.52%


[3/18] MULTIARITH - EXTENDED

Evaluating MULTIARITH with prompt type: EXTENDED
Total samples: 54


Processing multiarith:   0%|          | 0/54 [00:00<?, ?it/s]

  Progress: 10/54 | Current Accuracy: 60.00%
  Progress: 20/54 | Current Accuracy: 60.00%
  Progress: 30/54 | Current Accuracy: 46.67%
  Progress: 40/54 | Current Accuracy: 40.00%
  Progress: 50/54 | Current Accuracy: 40.00%

Final Results for MULTIARITH (EXTENDED):
  Correct: 20/54
  Accuracy: 37.04%


Dataset: GSM8K
Original size: 200 | Sampled: 60 (30%)

[4/18] GSM8K - BASELINE

Evaluating GSM8K with prompt type: BASELINE
Total samples: 60


Processing gsm8k:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 2.00%
  Progress: 60/60 | Current Accuracy: 1.67%

Final Results for GSM8K (BASELINE):
  Correct: 1/60
  Accuracy: 1.67%


[5/18] GSM8K - STANDARD

Evaluating GSM8K with prompt type: STANDARD
Total samples: 60


Processing gsm8k:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 20.00%
  Progress: 20/60 | Current Accuracy: 10.00%
  Progress: 30/60 | Current Accuracy: 6.67%
  Progress: 40/60 | Current Accuracy: 5.00%
  Progress: 50/60 | Current Accuracy: 8.00%
  Progress: 60/60 | Current Accuracy: 13.33%

Final Results for GSM8K (STANDARD):
  Correct: 8/60
  Accuracy: 13.33%


[6/18] GSM8K - EXTENDED

Evaluating GSM8K with prompt type: EXTENDED
Total samples: 60


Processing gsm8k:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 10.00%
  Progress: 20/60 | Current Accuracy: 15.00%
  Progress: 30/60 | Current Accuracy: 10.00%
  Progress: 40/60 | Current Accuracy: 15.00%
  Progress: 50/60 | Current Accuracy: 16.00%
  Progress: 60/60 | Current Accuracy: 20.00%

Final Results for GSM8K (EXTENDED):
  Correct: 12/60
  Accuracy: 20.00%


Dataset: AQUA
Original size: 200 | Sampled: 60 (30%)

[7/18] AQUA - BASELINE

Evaluating AQUA with prompt type: BASELINE
Total samples: 60


Processing aqua:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 20.00%
  Progress: 20/60 | Current Accuracy: 15.00%
  Progress: 30/60 | Current Accuracy: 20.00%
  Progress: 40/60 | Current Accuracy: 17.50%
  Progress: 50/60 | Current Accuracy: 18.00%
  Progress: 60/60 | Current Accuracy: 18.33%

Final Results for AQUA (BASELINE):
  Correct: 11/60
  Accuracy: 18.33%


[8/18] AQUA - STANDARD

Evaluating AQUA with prompt type: STANDARD
Total samples: 60


Processing aqua:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 10.00%
  Progress: 20/60 | Current Accuracy: 20.00%
  Progress: 30/60 | Current Accuracy: 20.00%
  Progress: 40/60 | Current Accuracy: 25.00%
  Progress: 50/60 | Current Accuracy: 28.00%
  Progress: 60/60 | Current Accuracy: 28.33%

Final Results for AQUA (STANDARD):
  Correct: 17/60
  Accuracy: 28.33%


[9/18] AQUA - EXTENDED

Evaluating AQUA with prompt type: EXTENDED
Total samples: 60


Processing aqua:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 20.00%
  Progress: 30/60 | Current Accuracy: 16.67%
  Progress: 40/60 | Current Accuracy: 20.00%
  Progress: 50/60 | Current Accuracy: 22.00%
  Progress: 60/60 | Current Accuracy: 21.67%

Final Results for AQUA (EXTENDED):
  Correct: 13/60
  Accuracy: 21.67%


Dataset: SVAMP
Original size: 200 | Sampled: 60 (30%)

[10/18] SVAMP - BASELINE

Evaluating SVAMP with prompt type: BASELINE
Total samples: 60


Processing svamp:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 0.00%
  Progress: 60/60 | Current Accuracy: 1.67%

Final Results for SVAMP (BASELINE):
  Correct: 1/60
  Accuracy: 1.67%


[11/18] SVAMP - STANDARD

Evaluating SVAMP with prompt type: STANDARD
Total samples: 60


Processing svamp:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 0.00%
  Progress: 60/60 | Current Accuracy: 0.00%

Final Results for SVAMP (STANDARD):
  Correct: 0/60
  Accuracy: 0.00%


[12/18] SVAMP - EXTENDED

Evaluating SVAMP with prompt type: EXTENDED
Total samples: 60


Processing svamp:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 0.00%
  Progress: 60/60 | Current Accuracy: 0.00%

Final Results for SVAMP (EXTENDED):
  Correct: 0/60
  Accuracy: 0.00%


Dataset: LETTER
Original size: 200 | Sampled: 60 (30%)

[13/18] LETTER - BASELINE

Evaluating LETTER with prompt type: BASELINE
Total samples: 60


Processing letter:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 0.00%
  Progress: 60/60 | Current Accuracy: 0.00%

Final Results for LETTER (BASELINE):
  Correct: 0/60
  Accuracy: 0.00%


[14/18] LETTER - STANDARD

Evaluating LETTER with prompt type: STANDARD
Total samples: 60


Processing letter:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 0.00%
  Progress: 60/60 | Current Accuracy: 0.00%

Final Results for LETTER (STANDARD):
  Correct: 0/60
  Accuracy: 0.00%


[15/18] LETTER - EXTENDED

Evaluating LETTER with prompt type: EXTENDED
Total samples: 60


Processing letter:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 0.00%
  Progress: 60/60 | Current Accuracy: 0.00%

Final Results for LETTER (EXTENDED):
  Correct: 0/60
  Accuracy: 0.00%


Dataset: COIN
Original size: 200 | Sampled: 60 (30%)

[16/18] COIN - BASELINE

Evaluating COIN with prompt type: BASELINE
Total samples: 60


Processing coin:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 0.00%
  Progress: 60/60 | Current Accuracy: 0.00%

Final Results for COIN (BASELINE):
  Correct: 0/60
  Accuracy: 0.00%


[17/18] COIN - STANDARD

Evaluating COIN with prompt type: STANDARD
Total samples: 60


Processing coin:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 0.00%
  Progress: 60/60 | Current Accuracy: 0.00%

Final Results for COIN (STANDARD):
  Correct: 0/60
  Accuracy: 0.00%


[18/18] COIN - EXTENDED

Evaluating COIN with prompt type: EXTENDED
Total samples: 60


Processing coin:   0%|          | 0/60 [00:00<?, ?it/s]

  Progress: 10/60 | Current Accuracy: 0.00%
  Progress: 20/60 | Current Accuracy: 0.00%
  Progress: 30/60 | Current Accuracy: 0.00%
  Progress: 40/60 | Current Accuracy: 0.00%
  Progress: 50/60 | Current Accuracy: 0.00%
  Progress: 60/60 | Current Accuracy: 0.00%

Final Results for COIN (EXTENDED):
  Correct: 0/60
  Accuracy: 0.00%

