In [51]:
# !pip install git+https://github.com/ImalshaD/DimenTuna.git

In [52]:
import torch

In [53]:
# Switch PyTorch's global float32 matmul precision mode to 'high'.
# Per the PyTorch docs this allows float32 matrix multiplications to use faster,
# reduced-precision internals (e.g. TF32) when the hardware offers them.
torch.set_float32_matmul_precision('high')

In [54]:
from dimentuna import DTHfEncoder, LayerWrappebleDTHfLLM, LayerWrappebleQwen, DTConfig

In [55]:
from dimentuna import LinearMapper, DTLayerWrapper, LinearWrapper, FeedForwardProjector, LSTMProjector

In [56]:
from dimentuna import TwoPhasedTS, TwoPhasedSeq2SeqTS

In [57]:
from dimentuna import DTMsgm

In [58]:
from datasets import load_dataset

In [59]:
from torch.utils.data import Dataset, DataLoader

# Parameters

In [60]:
# Run-wide hyper-parameters: base DataLoader batch size and the epoch count
# handed to the trainer's train() call.
batch_size, epochs = 4, 10

In [61]:
# Dataset cache location. Written as a raw string so the backslash is a literal
# path separator instead of the invalid escape sequence "\d" (which newer Python
# versions warn about). The resulting value is unchanged.
# NOTE(review): unlike model_cache this path has no leading ".", so it is not
# relative to the working directory — confirm that is intended.
data_cache = r'\data_cache'

In [62]:
# Model cache directory, relative to the working directory. A raw string keeps the
# backslash literal rather than relying on the invalid escape sequence "\m";
# the resulting value is unchanged.
model_cache = r".\model_cache"

In [63]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Datasets and Preprocessing

In [64]:
# Instantiate dimentuna's DTMsgm helper. Later cells read its `system_prompt`
# attribute and call its `evaluate` method on the LLM.
mgsm = DTMsgm()

In [65]:
# Load the GSM8K instruction dataset by its Hugging Face Hub id. The result is
# indexed by split name ('train' / 'test') in the next cell.
# NOTE(review): `data_cache` is defined above but not passed here, so the
# library's default cache location applies — confirm that is intended.
ds = load_dataset("nthakur/GSM8KInstruct-Parallel-instruct")

In [66]:
# Pull the training and test splits out of the loaded dataset.
train_ds, test_ds = ds['train'], ds['test']

In [67]:
# Convert each split to a pandas DataFrame, keeping only the raw prompt text and
# the language code column.
prompt_cols = ["prompt", "code"]
train_df = train_ds.to_pandas()[prompt_cols]
test_df = test_ds.to_pandas()[prompt_cols]

In [68]:
# Split every prompt at the '### Instruction:' marker into the text before it
# (stored as "System_prompt") and the text after it (stored as "Instruction").
# n=1 stops after the first marker so the expanded result never has more than two
# columns; without it, a prompt containing the marker twice would produce a third
# column and the two-column assignment would raise.
train_df[["System_prompt", "Instruction"]] = train_df['prompt'].str.split('### Instruction:', n=1, expand=True)
test_df[["System_prompt", "Instruction"]] = test_df['prompt'].str.split('### Instruction:', n=1, expand=True)

In [69]:
# From here on only the instruction text and the language code are needed;
# the raw prompt and the system-prompt part are dropped.
instruction_cols = ["Instruction", "code"]
train_df, test_df = train_df[instruction_cols], test_df[instruction_cols]

In [70]:
# Trim leading and trailing whitespace from every instruction in both splits.
for frame in (train_df, test_df):
    frame["Instruction"] = frame["Instruction"].str.strip()

In [71]:
# Keep the text up to the last '?' and put the question mark back, discarding
# whatever followed it. Instructions without any '?' simply get one appended.
for frame in (train_df, test_df):
    question_body = frame["Instruction"].str.rsplit('?', n=1).str[0]
    frame["Instruction"] = question_body + '?'

In [72]:
# Remove unusually long instructions from each split.
# A row is kept when its character length is at or below the 97th percentile of
# the lengths within its own split.
length_quantile = 0.97

# Step 1: Character length of every instruction, held in standalone Series so
# no temporary column has to be added to (and later dropped from) the frames.
train_lengths = train_df["Instruction"].apply(len)
test_lengths = test_df["Instruction"].apply(len)

# Step 2: Per-split length cut-off at the chosen quantile.
train_threshold = train_lengths.quantile(length_quantile)
test_threshold = test_lengths.quantile(length_quantile)

# Step 3: Keep only the rows whose length does not exceed the cut-off.
train_df = train_df[train_lengths <= train_threshold]
test_df = test_df[test_lengths <= test_threshold]

In [74]:
# English-only rows of each split (language code 'en'), renumbered from 0.
train_is_english = train_df['code'].eq('en')
test_is_english = test_df['code'].eq('en')
train_df_english = train_df.loc[train_is_english].reset_index(drop=True)
test_df_english = test_df.loc[test_is_english].reset_index(drop=True)

In [75]:
# Thai-only rows of each split (language code 'th'), renumbered from 0.
train_is_thai = train_df['code'].eq('th')
test_is_thai = test_df['code'].eq('th')
train_df_thai = train_df.loc[train_is_thai].reset_index(drop=True)
test_df_thai = test_df.loc[test_is_thai].reset_index(drop=True)

In [76]:
# Show the English training subset (pandas elides the middle rows in the rendering).
train_df_english

Unnamed: 0,Instruction,code
0,John has 2 umbrellas in his house and 1 in the...,en
1,Brad wanted to set up a lemonade stand to earn...,en
2,"Angus, Patrick, and Ollie went fishing for tro...",en
3,Frank has an apple tree in his backyard. 5 app...,en
4,"Ann, Bill, Cate, and Dale each buy personal pa...",en
...,...,...
6895,If Jade earns $1600 per month and spent 75% of...,en
6896,John builds a toy bridge to support various we...,en
6897,"Coral reads 30 pages of a book on night 1, and...",en
6898,Janet has 24 dresses. Half of them have pocke...,en


In [77]:
# Show the English test subset (pandas elides the middle rows in the rendering).
test_df_english

Unnamed: 0,Instruction,code
0,Julian has 400 legos and wants to make lego mo...,en
1,"Mark has 30 candies, Peter has 25 candies, and...",en
2,"Marj has two $20 bills, three $5 bills, and $4...",en
3,Hunter saw 5 frogs sitting on top lily pads in...,en
4,A portable computer drive has enough kilobytes...,en
...,...,...
360,"Cappuccinos cost $2, iced teas cost $3, cafe l...",en
361,Jamal works at a library shelving books. He ha...,en
362,"Out of the 200 students in a class, thirty per...",en
363,Albert has to run 99 meters in gym class. The ...,en


In [78]:
# Full text of the first English test instruction (index label 0 after the reset).
test_df_english.loc[0, 'Instruction']

'Julian has 400 legos and wants to make lego models of two identical airplanes. If each airplane model requires 240 legos, how many more legos does Julian need?'

In [79]:
# Show the Thai training subset (pandas elides the middle rows in the rendering).
train_df_thai

Unnamed: 0,Instruction,code
0,ควินซีเพิ่งซื้อรถยนต์โดยใช้สินเชื่อ 5 ปีโดยไม่...,th
1,พีเตอร์กำลังเก็บใบไม้ร่วง ใช้เวลา 15 นาทีในการ...,th
2,บิลได้รับทีวีจอแบนที่มีขนาด 48 นิ้ว x 100 นิ้ว...,th
3,พอลได้รับเงิน 12.50 ดอลลาร์ต่อชั่วโมงที่เขาทำง...,th
4,เจมส์ซื้อเนื้อสเต็กแบบซื้อ 1 แถม 1 ราคาคือ 15 ...,th
...,...,...
7017,Jeanette กำลังฝึกการโยนลูกบอล ในแต่ละสัปดาห์เธ...,th
7018,ลอร่าต้องซื้อวัสดุประดับหน้าต่างสำหรับหน้าต่าง...,th
7019,โทมัสกำลังหวังที่จะวิ่งมาราธอนในปีหน้า ซึ่งมีร...,th
7020,เพจซื้อสติกเกอร์ใหม่และต้องการแบ่งปันกับเพื่อน...,th


In [80]:
# Full text of the first Thai test instruction (index label 0 after the reset).
test_df_thai.loc[0, 'Instruction']

'โครงการคอนโดใหม่นี้มีชั้นทั้งหมด 23 ชั้น ชั้นปกติมีหน่วยที่อยู่ 12 หน่วย ในขณะที่ชั้นพานเพนท์เฮาส์มีหน่วยเพียง 2 หน่วย เมื่อมีการจัดสรรชั้นบนสุด 2 ชั้นสำหรับหน่วยพานท์เฮาส์ จำนวนหน่วยทั้งหมดในคอนโดนี้เป็นเท่าใด?'

**DataLoaders**

In [81]:
# Batch the English instruction strings, `batch_size` per batch. No shuffle
# argument is passed, so batches follow the frame order.
train_english_loader = DataLoader(
    dataset=train_df_english['Instruction'],
    batch_size=batch_size,
)
test_english_loader = DataLoader(
    dataset=test_df_english["Instruction"],
    batch_size=batch_size,
)

In [82]:
# Batch the Thai instruction strings at half the English batch size.
thai_batch_size = batch_size // 2
train_thai_loader = DataLoader(
    dataset=train_df_thai['Instruction'],
    batch_size=thai_batch_size,
)
test_thai_loader = DataLoader(
    dataset=test_df_thai['Instruction'],
    batch_size=thai_batch_size,
)

In [83]:
# Configuration for the XLM-RoBERTa encoder wrapper.
# The device is taken from the notebook-level `device` variable (defined in the
# parameters section with the same CUDA-or-CPU rule) instead of being recomputed
# here, so every component is guaranteed to agree on one device.
# NOTE(review): `cache_dir` is hard-coded to "./cache" while `model_cache` is
# defined above but unused — confirm which location is intended.
xlmr_config = DTConfig(
    model_name="FacebookAI/xlm-roberta-base",
    cache_dir="./cache",          # Path to cache directory
    max_tokens=512,               # Maximum token limit
    temperature=0.7,              # Sampling temperature
    max_generation_length=256,    # Maximum length for generated text
    do_sample=True,               # Enable sampling
    truncation=True,              # Enable truncation
    padding=True,                 # Enable padding
    device=device,                # Shared notebook device (CUDA if available, else CPU)
    use_best_config=False,        # Use best configuration (custom logic)
    padding_side='right',         # Padding side
    output_hidden_states=False,   # Output hidden states
    output_attentions=False,      # Output attention values
    return_dict=True,             # Return output as dictionary
)

In [84]:
# Build the encoder wrapper from the XLM-RoBERTa configuration above.
xlmr = DTHfEncoder(xlmr_config)

In [85]:
# Three short sample sentences of different lengths, used below as a smoke test
# for the encoder and for the LLM layer-output call.
test = ["This is test Sentence.", "This is test sentence 2.", "This is the longest test sentence 3."]

In [86]:
# Smoke-test the encoder on the sample sentences.
# NOTE(review): `input_ten` is not referenced again in the cells visible here.
input_ten = xlmr.encode(test)

# Models

In [87]:
# Mapper that is later wrapped around LLM layer 1.
# 1536 equals the last dimension of the layer-1 output printed further down
# (torch.Size([3, 41, 1536])).
# NOTE(review): presumably the arguments are (input size, output size, hidden size) —
# confirm against LinearMapper's signature.
mapper = LinearMapper(1536, 1536, 1024)

In [88]:
# Create the layer-wrappable Qwen LLM with its default constructor arguments.
qwen_model = LayerWrappebleQwen()

In [89]:
# Give the LLM the system prompt supplied by the DTMsgm helper.
qwen_model.set_system_prompt(mgsm.system_prompt)

In [90]:
# Print the model's "best config"; the captured output is {'max_length': 512}.
qwen_model.print_best_config()

{'max_length': 512}


In [91]:
# Embedding size reported by the LLM; passed to the projector below.
qwen_ed = qwen_model.get_embeddings_size()

In [92]:
# Embedding size reported by the encoder; passed to the projector below.
xlmr_ed = xlmr.get_embeddings_size()

In [93]:
# Projector handed to the two-phase trainer, sized from the two embedding sizes.
# NOTE(review): the positional order (LLM size, encoder size, 1024) and the meaning
# of 1024 (presumably a hidden size) should be confirmed against LSTMProjector.
projector = LSTMProjector(qwen_ed, xlmr_ed, 1024)

In [94]:
# Inspect the shape of the layer-1 output for the three sample sentences;
# the captured result is torch.Size([3, 41, 1536]).
qwen_model.get_Layer_output(test, 1).shape

torch.Size([3, 41, 1536])

In [95]:
# Wrap the mapper so it can be installed in place of an LLM layer.
layer_1_wrapper= LinearWrapper(mapper)

In [96]:
# Install the wrapper at layer index 1; the status listing below then shows
# "LayerWrapper_1" in place of a plain layer entry.
qwen_model.replace_layer(1, layer_1_wrapper)

In [97]:
# Print per-layer frozen/engaged status right after installing the wrapper
# (everything is still unfrozen at this point, per the output).
qwen_model.print_status()

-LLM
--Layer_0
---Layer_Frozen: False
--LayerWrapper_1
---Layer_Frozen: False
---Mapper_Frozen: False
---Engaged: True
---Alpha_Frozen: False Value: 0.5 Not Used
--Layer_2
---Layer_Frozen: False
--Layer_3
---Layer_Frozen: False
--Layer_4
---Layer_Frozen: False
--Layer_5
---Layer_Frozen: False
--Layer_6
---Layer_Frozen: False
--Layer_7
---Layer_Frozen: False
--Layer_8
---Layer_Frozen: False
--Layer_9
---Layer_Frozen: False
--Layer_10
---Layer_Frozen: False
--Layer_11
---Layer_Frozen: False
--Layer_12
---Layer_Frozen: False
--Layer_13
---Layer_Frozen: False
--Layer_14
---Layer_Frozen: False
--Layer_15
---Layer_Frozen: False
--Layer_16
---Layer_Frozen: False
--Layer_17
---Layer_Frozen: False
--Layer_18
---Layer_Frozen: False
--Layer_19
---Layer_Frozen: False
--Layer_20
---Layer_Frozen: False
--Layer_21
---Layer_Frozen: False
--Layer_22
---Layer_Frozen: False
--Layer_23
---Layer_Frozen: False
--Layer_24
---Layer_Frozen: False
--Layer_25
---Layer_Frozen: False
--Layer_26
---Layer_Frozen: Fa

In [98]:
# Wire up the two-phase trainer. Judging by the batch counts in the training log,
# the English loaders feed the first phase and the Thai "mapper" loaders feed the
# second phase.
two_phased_ts = TwoPhasedSeq2SeqTS(
    # Models taking part in training.
    llm=qwen_model,
    encoder=xlmr,
    projector=projector,
    # Wrapped LLM layer(s) the trainer targets.
    target_layers=[1],
    layer_shift=3,  # NOTE(review): semantics not visible in this notebook — confirm
    # English instruction batches.
    train_loader=train_english_loader,
    val_loader=test_english_loader,
    # Thai instruction batches.
    mapper_train_loader=train_thai_loader,
    mapper_val_loader=test_thai_loader,
    # Optimisation settings.
    lr=0.001,
    device=device,
)

In [99]:
# Freeze the LLM; the status printed next reports every layer, the mapper and
# alpha as frozen.
qwen_model.freeze()

In [100]:
# Confirm the frozen state of every layer after freeze().
qwen_model.print_status()

-LLM
--Layer_0
---Layer_Frozen: True
--LayerWrapper_1
---Layer_Frozen: True
---Mapper_Frozen: True
---Engaged: True
---Alpha_Frozen: True Value: 0.5 Not Used
--Layer_2
---Layer_Frozen: True
--Layer_3
---Layer_Frozen: True
--Layer_4
---Layer_Frozen: True
--Layer_5
---Layer_Frozen: True
--Layer_6
---Layer_Frozen: True
--Layer_7
---Layer_Frozen: True
--Layer_8
---Layer_Frozen: True
--Layer_9
---Layer_Frozen: True
--Layer_10
---Layer_Frozen: True
--Layer_11
---Layer_Frozen: True
--Layer_12
---Layer_Frozen: True
--Layer_13
---Layer_Frozen: True
--Layer_14
---Layer_Frozen: True
--Layer_15
---Layer_Frozen: True
--Layer_16
---Layer_Frozen: True
--Layer_17
---Layer_Frozen: True
--Layer_18
---Layer_Frozen: True
--Layer_19
---Layer_Frozen: True
--Layer_20
---Layer_Frozen: True
--Layer_21
---Layer_Frozen: True
--Layer_22
---Layer_Frozen: True
--Layer_23
---Layer_Frozen: True
--Layer_24
---Layer_Frozen: True
--Layer_25
---Layer_Frozen: True
--Layer_26
---Layer_Frozen: True
--Layer_27
---Layer_Froze

In [101]:
# Run training with `epochs` epochs per phase. The log below shows a first phase
# with the projector unfrozen and the layer wrapper disengaged, followed by a
# second phase with the mapper unfrozen and the wrapper engaged.
# NOTE(review): in the second-phase log the loss rises after epoch 0 and then stays
# at about 0.00204 (eval about 0.00202) from epoch 3 onward — worth checking the
# learning rate / mapper training for a collapse.
two_phased_ts.train(epochs)

Encoder_Frozen: True
Projector_Frozen: False
-LLM
--Layer_0
---Layer_Frozen: True
--LayerWrapper_1
---Layer_Frozen: True
---Mapper_Frozen: True
---Engaged: False
---Alpha_Frozen: True Value: 0.5 Not Used
--Layer_2
---Layer_Frozen: True
--Layer_3
---Layer_Frozen: True
--Layer_4
---Layer_Frozen: True
--Layer_5
---Layer_Frozen: True
--Layer_6
---Layer_Frozen: True
--Layer_7
---Layer_Frozen: True
--Layer_8
---Layer_Frozen: True
--Layer_9
---Layer_Frozen: True
--Layer_10
---Layer_Frozen: True
--Layer_11
---Layer_Frozen: True
--Layer_12
---Layer_Frozen: True
--Layer_13
---Layer_Frozen: True
--Layer_14
---Layer_Frozen: True
--Layer_15
---Layer_Frozen: True
--Layer_16
---Layer_Frozen: True
--Layer_17
---Layer_Frozen: True
--Layer_18
---Layer_Frozen: True
--Layer_19
---Layer_Frozen: True
--Layer_20
---Layer_Frozen: True
--Layer_21
---Layer_Frozen: True
--Layer_22
---Layer_Frozen: True
--Layer_23
---Layer_Frozen: True
--Layer_24
---Layer_Frozen: True
--Layer_25
---Layer_Frozen: True
--Layer_26
-

Epoch 0: 100%|██████████| 1725/1725 [01:04<00:00, 26.85it/s]
100%|██████████| 92/92 [00:03<00:00, 30.08it/s]


Epoch 0 : Loss 0.002681309810394178 Eval Loss 0.001620857551208009


Epoch 1: 100%|██████████| 1725/1725 [01:04<00:00, 26.87it/s]
100%|██████████| 92/92 [00:03<00:00, 30.07it/s]


Epoch 1 : Loss 0.0016492022191970676 Eval Loss 0.0015394819733362806


Epoch 2: 100%|██████████| 1725/1725 [01:04<00:00, 26.87it/s]
100%|██████████| 92/92 [00:03<00:00, 30.10it/s]


Epoch 2 : Loss 0.001472937825669035 Eval Loss 0.0014436513833377671


Epoch 3: 100%|██████████| 1725/1725 [01:04<00:00, 26.86it/s]
100%|██████████| 92/92 [00:03<00:00, 30.03it/s]


Epoch 3 : Loss 0.0013565635917019909 Eval Loss 0.0013305933996731335


Epoch 4: 100%|██████████| 1725/1725 [01:04<00:00, 26.85it/s]
100%|██████████| 92/92 [00:03<00:00, 30.07it/s]


Epoch 4 : Loss 0.001275634329624312 Eval Loss 0.0013066506944596767


Epoch 5: 100%|██████████| 1725/1725 [01:04<00:00, 26.87it/s]
100%|██████████| 92/92 [00:03<00:00, 30.07it/s]


Epoch 5 : Loss 0.0012226568197197132 Eval Loss 0.001245168379366236


Epoch 6: 100%|██████████| 1725/1725 [01:04<00:00, 26.85it/s]
100%|██████████| 92/92 [00:03<00:00, 29.99it/s]


Epoch 6 : Loss 0.0011860210895943252 Eval Loss 0.0012419713868597603


Epoch 7: 100%|██████████| 1725/1725 [01:04<00:00, 26.88it/s]
100%|██████████| 92/92 [00:03<00:00, 30.07it/s]


Epoch 7 : Loss 0.0011673831097120284 Eval Loss 0.0012836784273689693


Epoch 8: 100%|██████████| 1725/1725 [01:04<00:00, 26.85it/s]
100%|██████████| 92/92 [00:03<00:00, 29.95it/s]


Epoch 8 : Loss 0.0011496978247727172 Eval Loss 0.0012700733280760924


Epoch 9: 100%|██████████| 1725/1725 [01:04<00:00, 26.73it/s]
100%|██████████| 92/92 [00:03<00:00, 30.03it/s]


Epoch 9 : Loss 0.0011445148957663796 Eval Loss 0.0012826435040156155
Encoder_Frozen: True
Projector_Frozen: True
-LLM
--Layer_0
---Layer_Frozen: True
--LayerWrapper_1
---Layer_Frozen: True
---Mapper_Frozen: False
---Engaged: True
---Alpha_Frozen: False Value: 0.5 Not Used
--Layer_2
---Layer_Frozen: True
--Layer_3
---Layer_Frozen: True
--Layer_4
---Layer_Frozen: True
--Layer_5
---Layer_Frozen: True
--Layer_6
---Layer_Frozen: True
--Layer_7
---Layer_Frozen: True
--Layer_8
---Layer_Frozen: True
--Layer_9
---Layer_Frozen: True
--Layer_10
---Layer_Frozen: True
--Layer_11
---Layer_Frozen: True
--Layer_12
---Layer_Frozen: True
--Layer_13
---Layer_Frozen: True
--Layer_14
---Layer_Frozen: True
--Layer_15
---Layer_Frozen: True
--Layer_16
---Layer_Frozen: True
--Layer_17
---Layer_Frozen: True
--Layer_18
---Layer_Frozen: True
--Layer_19
---Layer_Frozen: True
--Layer_20
---Layer_Frozen: True
--Layer_21
---Layer_Frozen: True
--Layer_22
---Layer_Frozen: True
--Layer_23
---Layer_Frozen: True
--Layer_2

Epoch 0: 100%|██████████| 3511/3511 [02:13<00:00, 26.31it/s]
100%|██████████| 185/185 [00:05<00:00, 31.62it/s]


Epoch 0 : Loss 0.001957484500016354 Eval Loss 0.0017693590032993943


Epoch 1: 100%|██████████| 3511/3511 [02:13<00:00, 26.37it/s]
100%|██████████| 185/185 [00:05<00:00, 31.67it/s]


Epoch 1 : Loss 0.003019584584587294 Eval Loss 0.0034073756919620005


Epoch 2: 100%|██████████| 3511/3511 [02:13<00:00, 26.38it/s]
100%|██████████| 185/185 [00:05<00:00, 31.73it/s]


Epoch 2 : Loss 0.003464864708172336 Eval Loss 0.004851292869126475


Epoch 3: 100%|██████████| 3511/3511 [02:13<00:00, 26.37it/s]
100%|██████████| 185/185 [00:05<00:00, 31.84it/s]


Epoch 3 : Loss 0.0020969807780359703 Eval Loss 0.002018752437386964


Epoch 4: 100%|██████████| 3511/3511 [02:13<00:00, 26.37it/s]
100%|██████████| 185/185 [00:05<00:00, 31.61it/s]


Epoch 4 : Loss 0.0020410536476857884 Eval Loss 0.002018752437386964


Epoch 5: 100%|██████████| 3511/3511 [02:13<00:00, 26.35it/s]
100%|██████████| 185/185 [00:05<00:00, 31.65it/s]


Epoch 5 : Loss 0.0020410536301124133 Eval Loss 0.002018752262449345


Epoch 6: 100%|██████████| 3511/3511 [02:13<00:00, 26.35it/s]
100%|██████████| 185/185 [00:05<00:00, 31.69it/s]


Epoch 6 : Loss 0.002041053565289869 Eval Loss 0.0020187522851031372


Epoch 7: 100%|██████████| 3511/3511 [02:13<00:00, 26.31it/s]
100%|██████████| 185/185 [00:05<00:00, 31.62it/s]


Epoch 7 : Loss 0.0020408072571273615 Eval Loss 0.0020182937100479328


Epoch 8: 100%|██████████| 3511/3511 [02:13<00:00, 26.37it/s]
100%|██████████| 185/185 [00:05<00:00, 31.69it/s]


Epoch 8 : Loss 0.0020406021785568757 Eval Loss 0.0020182937100479328


Epoch 9: 100%|██████████| 3511/3511 [02:13<00:00, 26.39it/s]
100%|██████████| 185/185 [00:05<00:00, 31.59it/s]


Epoch 9 : Loss 0.0020406021785568757 Eval Loss 0.0020182937100479328


In [102]:
# Status after training: the mapper and alpha of the layer-1 wrapper are left
# unfrozen and the wrapper is engaged, per the output.
qwen_model.print_status()

-LLM
--Layer_0
---Layer_Frozen: True
--LayerWrapper_1
---Layer_Frozen: True
---Mapper_Frozen: False
---Engaged: True
---Alpha_Frozen: False Value: 0.5 Not Used
--Layer_2
---Layer_Frozen: True
--Layer_3
---Layer_Frozen: True
--Layer_4
---Layer_Frozen: True
--Layer_5
---Layer_Frozen: True
--Layer_6
---Layer_Frozen: True
--Layer_7
---Layer_Frozen: True
--Layer_8
---Layer_Frozen: True
--Layer_9
---Layer_Frozen: True
--Layer_10
---Layer_Frozen: True
--Layer_11
---Layer_Frozen: True
--Layer_12
---Layer_Frozen: True
--Layer_13
---Layer_Frozen: True
--Layer_14
---Layer_Frozen: True
--Layer_15
---Layer_Frozen: True
--Layer_16
---Layer_Frozen: True
--Layer_17
---Layer_Frozen: True
--Layer_18
---Layer_Frozen: True
--Layer_19
---Layer_Frozen: True
--Layer_20
---Layer_Frozen: True
--Layer_21
---Layer_Frozen: True
--Layer_22
---Layer_Frozen: True
--Layer_23
---Layer_Frozen: True
--Layer_24
---Layer_Frozen: True
--Layer_25
---Layer_Frozen: True
--Layer_26
---Layer_Frozen: True
--Layer_27
---Layer_Fro

In [103]:
# Set every layer wrapper to disengaged before the evaluation that follows.
# NOTE(review): presumably a disengaged wrapper bypasses the trained mapper, which
# would make the next evaluation a baseline run — confirm that this is intended.
qwen_model.engage_all_layer_wrappers(False)

In [104]:
# Evaluate the LLM on the English and Thai sets in batches of `batch_size`.
# With logging=True each predicted answer is printed next to the expected one,
# as seen in the output below.
mgsm.evaluate(qwen_model,['en', 'th'], batch_size=batch_size, logging=True)

Evaluating en:   2%|▏         | 1/63 [00:03<04:01,  3.90s/it]

answer: 1.5, correct: 3 correctness: False
answer: 280, correct: 320 correctness: False
answer: 500, correct: 500 correctness: True
answer: 3.43, correct: 6 correctness: False


Evaluating en:   3%|▎         | 2/63 [00:06<02:55,  2.88s/it]

answer: 120, correct: 90 correctness: False
answer: Empty Answer, correct: 57 correctness: False
answer: 28, correct: 28 correctness: True
answer: 140, correct: 140 correctness: True


Evaluating en:   5%|▍         | 3/63 [00:09<03:06,  3.11s/it]

answer: 50, correct: 50 correctness: True
answer: 104, correct: 104 correctness: True
answer: 14, correct: 14 correctness: True
answer: 50, correct: 50 correctness: True


Evaluating en:   6%|▋         | 4/63 [00:13<03:20,  3.40s/it]

answer: 6277, correct: 6277 correctness: True
answer: 140, correct: 140 correctness: True
answer: 125, correct: 125 correctness: True
answer: 10, correct: 10 correctness: True


Evaluating en:   8%|▊         | 5/63 [00:16<03:12,  3.32s/it]

answer: 104, correct: 104 correctness: True
answer: 70, correct: 70 correctness: True
answer: 50, correct: 40 correctness: False
answer: 27, correct: 27 correctness: True


Evaluating en:  10%|▉         | 6/63 [00:21<03:48,  4.02s/it]

answer: 8, correct: 8 correctness: True
answer: 400, correct: 400 correctness: True
answer: 66.67, correct: 25 correctness: False
answer: 120, correct: 120 correctness: True


Evaluating en:  11%|█         | 7/63 [00:25<03:31,  3.78s/it]

answer: 15, correct: 15 correctness: True
answer: 360, correct: 40 correctness: False
answer: 33, correct: 83 correctness: False
answer: 30, correct: 30 correctness: True


Evaluating en:  13%|█▎        | 8/63 [00:28<03:16,  3.58s/it]

answer: 109, correct: 109 correctness: True
answer: 0, correct: 90 correctness: False
answer: 60, correct: 60 correctness: True
answer:  20 \text{ miles} - 15 \text{ miles} = 5 \text{ miles} \, correct: 25 correctness: False


Evaluating en:  14%|█▍        | 9/63 [00:32<03:21,  3.73s/it]

answer: 180, correct: 180 correctness: True
answer: 1970, correct: 80 correctness: False
answer: 225, correct: 225 correctness: True
answer: 106, correct: 106 correctness: True


Evaluating en:  16%|█▌        | 10/63 [00:37<03:43,  4.22s/it]

answer: 120, correct: 98 correctness: False
answer: 29, correct: 29 correctness: True
answer: 20, correct: 20 correctness: True
answer: 54, correct: 50 correctness: False


Evaluating en:  17%|█▋        | 11/63 [00:43<03:58,  4.59s/it]

answer: 32, correct: 32 correctness: True
answer: Empty Answer, correct: 92 correctness: False
answer: 55\, correct: 55 correctness: False
answer: 221, correct: 221 correctness: True


Evaluating en:  19%|█▉        | 12/63 [00:46<03:40,  4.32s/it]

answer: 1000, correct: 4000 correctness: False
answer: 460, correct: 460 correctness: True
answer: 8, correct: 8 correctness: True
answer: 280, correct: 280 correctness: True


Evaluating en:  21%|██        | 13/63 [00:50<03:28,  4.17s/it]

answer: 309, correct: 45 correctness: False
answer: 12, correct: 13 correctness: False
answer:  \text{Total distance covered} = 80 \text{ miles} + 150 \text{ miles} = 230 \text{ miles} \, correct: 230 correctness: False
answer: 4, correct: 4 correctness: True


Evaluating en:  22%|██▏       | 14/63 [00:55<03:34,  4.38s/it]

answer: 10, correct: 1 correctness: False
answer: 60, correct: 75 correctness: False
answer: 122, correct: 122 correctness: True
answer: 54, correct: 68 correctness: False


Evaluating en:  24%|██▍       | 15/63 [01:00<03:42,  4.64s/it]

answer: Empty Answer, correct: 4 correctness: False
answer: 14, correct: 7 correctness: False
answer: 880, correct: 880 correctness: True
answer: 16, correct: 16 correctness: True


Evaluating en:  25%|██▌       | 16/63 [01:04<03:21,  4.29s/it]

answer: Empty Answer, correct: 48 correctness: False
answer: 240, correct: 240 correctness: True
answer: 940, correct: 940 correctness: True
answer: 13, correct: 13 correctness: True


Evaluating en:  27%|██▋       | 17/63 [01:07<03:05,  4.02s/it]

answer: 800, correct: 800 correctness: True
answer: 595, correct: 595 correctness: True
answer: 12, correct: 6 correctness: False
answer: 5, correct: 5 correctness: True


Evaluating en:  29%|██▊       | 18/63 [01:12<03:12,  4.28s/it]

answer: 20, correct: 20 correctness: True
answer: 2, correct: 8 correctness: False
answer: 160, correct: 160 correctness: True
answer: 7, correct: 7 correctness: True


Evaluating en:  30%|███       | 19/63 [01:15<02:55,  3.99s/it]

answer: 5, correct: 5 correctness: True
answer: 15.67, correct: 15 correctness: False
answer: 6, correct: 12 correctness: False
answer: 44, correct: 44 correctness: True


Evaluating en:  32%|███▏      | 20/63 [01:21<03:09,  4.40s/it]

answer: Empty Answer, correct: 36 correctness: False
answer: 75, correct: 75 correctness: True
answer: 10, correct: 10 correctness: True
answer: 10, correct: 10 correctness: True


Evaluating en:  33%|███▎      | 21/63 [01:25<03:08,  4.48s/it]

answer: 26, correct: 26 correctness: True
answer: 5, correct: 2 correctness: False
answer: 1.67, correct: 5 correctness: False
answer: 40, correct: 58 correctness: False


Evaluating en:  35%|███▍      | 22/63 [01:29<02:51,  4.19s/it]

answer: 34, correct: 34 correctness: True
answer: 50, correct: 50 correctness: True
answer: 54, correct: 54 correctness: True
answer: 30, correct: 30 correctness: True


Evaluating en:  37%|███▋      | 23/63 [01:34<03:01,  4.54s/it]

answer: 2125, correct: 2125 correctness: True
answer: Empty Answer, correct: 95200 correctness: False
answer: 8,000, correct: 8000 correctness: False
answer: Empty Answer, correct: 360 correctness: False


Evaluating en:  38%|███▊      | 24/63 [01:37<02:40,  4.11s/it]

answer: 4, correct: 4 correctness: True
answer: 95, correct: 95 correctness: True
answer: 300, correct: 300 correctness: True
answer: 7, correct: 7 correctness: True


Evaluating en:  40%|███▉      | 25/63 [01:41<02:30,  3.96s/it]

answer: 29, correct: 48 correctness: False
answer: 50, correct: 50 correctness: True
answer: 19, correct: 17 correctness: False
answer: 10, correct: 10 correctness: True


Evaluating en:  41%|████▏     | 26/63 [01:44<02:17,  3.70s/it]

answer: 32, correct: 32 correctness: True
answer: 3, correct: 3 correctness: True
answer: 1, correct: 8 correctness: False
answer: 98, correct: 98 correctness: True


Evaluating en:  43%|████▎     | 27/63 [01:49<02:26,  4.06s/it]

answer: 1400, correct: 1400 correctness: True
answer: 60, correct: 60 correctness: True
answer: 14, correct: 30 correctness: False
answer: Empty Answer, correct: 24 correctness: False


Evaluating en:  44%|████▍     | 28/63 [01:53<02:20,  4.00s/it]

answer: 0.85, correct: 3 correctness: False
answer: 243, correct: 243 correctness: True
answer: 500, correct: 250 correctness: False
answer: 15, correct: 5 correctness: False


Evaluating en:  46%|████▌     | 29/63 [01:57<02:16,  4.03s/it]

answer: 220, correct: 45 correctness: False
answer: 150, correct: 150 correctness: True
answer: 200, correct: 300 correctness: False
answer: 140, correct: 140 correctness: True


Evaluating en:  48%|████▊     | 30/63 [02:00<02:05,  3.80s/it]

answer: 45, correct: 5 correctness: False
answer: 4, correct: 4 correctness: True
answer: 74995, correct: 145 correctness: False
answer: 57500, correct: 57500 correctness: True


Evaluating en:  49%|████▉     | 31/63 [02:04<01:58,  3.72s/it]

answer: 623, correct: 623 correctness: True
answer: 1512, correct: 1596 correctness: False
answer: 130, correct: 130 correctness: True
answer: 20, correct: 20 correctness: True


Evaluating en:  51%|█████     | 32/63 [02:06<01:42,  3.29s/it]

answer: 112, correct: 272 correctness: False
answer: 600, correct: 600 correctness: True
answer: 5, correct: 5 correctness: True
answer: 40, correct: 40 correctness: True


Evaluating en:  52%|█████▏    | 33/63 [02:11<01:53,  3.79s/it]

answer: 1210, correct: 1210 correctness: True
answer: x=18, correct: 18 correctness: False
answer: 1430, correct: 1430 correctness: True
answer: 144, correct: 72 correctness: False


Evaluating en:  54%|█████▍    | 34/63 [02:15<01:48,  3.74s/it]

answer: 77, correct: 77 correctness: True
answer: 15, correct: 15 correctness: True
answer: 18, correct: 18 correctness: True
answer: 6000, correct: 18000 correctness: False


Evaluating en:  56%|█████▌    | 35/63 [02:18<01:42,  3.67s/it]

answer: 14, correct: 21 correctness: False
answer: 26, correct: 26 correctness: True
answer: 9, correct: 9 correctness: True
answer: 61, correct: 34 correctness: False


Evaluating en:  57%|█████▋    | 36/63 [02:23<01:51,  4.14s/it]

answer: 25, correct: 25 correctness: True
answer: 384, correct: 366 correctness: False
answer: 350, correct: 350 correctness: True
answer: Empty Answer, correct: 10000 correctness: False


Evaluating en:  59%|█████▊    | 37/63 [02:27<01:42,  3.96s/it]

answer: 30, correct: 31 correctness: False
answer: 82, correct: 82 correctness: True
answer: 36, correct: 36 correctness: True
answer: 88.67, correct: 88 correctness: False


Evaluating en:  60%|██████    | 38/63 [02:31<01:38,  3.95s/it]

answer: 276000, correct: 276000 correctness: True
answer: 15.4, correct: 16 correctness: False
answer: 91, correct: 91 correctness: True
answer: 7.5, correct: 60 correctness: False


Evaluating en:  62%|██████▏   | 39/63 [02:34<01:30,  3.76s/it]

answer: 70, correct: 70 correctness: True
answer: 15, correct: 15 correctness: True
answer: 705, correct: 175 correctness: False
answer: 16.8, correct: 16 correctness: False


Evaluating en:  63%|██████▎   | 40/63 [02:38<01:24,  3.69s/it]

answer: 20, correct: 20 correctness: True
answer: 31, correct: 31 correctness: True
answer: 100, correct: 100 correctness: True
answer: 2600, correct: 2600 correctness: True


Evaluating en:  65%|██████▌   | 41/63 [02:40<01:14,  3.39s/it]

answer: 60, correct: 60 correctness: True
answer: 260, correct: 100 correctness: False
answer: 48, correct: 14 correctness: False
answer: 12, correct: 12 correctness: True


Evaluating en:  67%|██████▋   | 42/63 [02:44<01:10,  3.35s/it]

answer: 114200, correct: 114200 correctness: True
answer: 64, correct: 64 correctness: True
answer: 255, correct: 255 correctness: True
answer: 36, correct: 20 correctness: False


Evaluating en:  68%|██████▊   | 43/63 [02:48<01:11,  3.58s/it]

answer: 8.75, correct: 5 correctness: False
answer: 20, correct: 20 correctness: True
answer: 720, correct: 720 correctness: True
answer: 50, correct: 10 correctness: False


Evaluating en:  70%|██████▉   | 44/63 [02:52<01:11,  3.76s/it]

answer: 1900000, correct: 25000 correctness: False
answer: 18, correct: 23 correctness: False
answer: 348, correct: 348 correctness: True
answer: 8, correct: 24 correctness: False


Evaluating en:  71%|███████▏  | 45/63 [02:57<01:15,  4.21s/it]

answer: Empty Answer, correct: 6 correctness: False
answer: Empty Answer, correct: 163 correctness: False
answer: 17, correct: 17 correctness: True
answer: 187, correct: 160 correctness: False


Evaluating en:  73%|███████▎  | 46/63 [03:02<01:15,  4.42s/it]

answer: 21, correct: 21 correctness: True
answer: 6, correct: 6 correctness: True
answer: 60, correct: 60 correctness: True
answer: 694, correct: 694 correctness: True


Evaluating en:  75%|███████▍  | 47/63 [03:06<01:09,  4.32s/it]

answer: /number, correct: 3 correctness: False
answer: Empty Answer, correct: 16 correctness: False
answer: 70000, correct: 70000 correctness: True
answer: 860, correct: 860 correctness: True


Evaluating en:  76%|███████▌  | 48/63 [03:10<01:02,  4.18s/it]

answer: 80, correct: 80 correctness: True
answer: Empty Answer, correct: 360 correctness: False
answer: 10, correct: 10 correctness: True
answer: 9, correct: 9 correctness: True


Evaluating en:  78%|███████▊  | 49/63 [03:13<00:55,  3.97s/it]

answer: 75, correct: 75 correctness: True
answer: 10, correct: 10 correctness: True
answer: 45, correct: 45 correctness: True
answer: 105, correct: 105 correctness: True


Evaluating en:  79%|███████▉  | 50/63 [03:18<00:53,  4.12s/it]

answer: 5600, correct: 5600 correctness: True
answer: 9, correct: 9 correctness: True
answer: 20, correct: 20 correctness: True
answer: 16, correct: 16 correctness: True


Evaluating en:  81%|████████  | 51/63 [03:20<00:42,  3.52s/it]

answer: 12, correct: 12 correctness: True
answer: 23, correct: 23 correctness: True
answer: 40, correct: 40 correctness: True
answer: 187, correct: 187 correctness: True


Evaluating en:  83%|████████▎ | 52/63 [03:25<00:44,  4.06s/it]

answer: Empty Answer, correct: 5 correctness: False
answer: 200, correct: 48 correctness: False
answer: 70, correct: 70 correctness: True
answer: 260, correct: 260 correctness: True


Evaluating en:  84%|████████▍ | 53/63 [03:29<00:38,  3.85s/it]

answer: 3, correct: 6 correctness: False
answer: 29, correct: 51 correctness: False
answer: 250, correct: 250 correctness: True
answer: 800, correct: 200 correctness: False


Evaluating en:  86%|████████▌ | 54/63 [03:31<00:30,  3.41s/it]

answer: 30, correct: 30 correctness: True
answer: 180, correct: 90 correctness: False
answer: 17, correct: 15 correctness: False
answer: 36, correct: 33 correctness: False


Evaluating en:  87%|████████▋ | 55/63 [03:35<00:27,  3.44s/it]

answer: 70, correct: 70 correctness: True
answer: 21, correct: 21 correctness: True
answer: 36, correct: 36 correctness: True
answer: 6, correct: 6 correctness: True


Evaluating en:  89%|████████▉ | 56/63 [03:37<00:22,  3.24s/it]

answer: 120, correct: 120 correctness: True
answer: 4, correct: 4 correctness: True
answer: 15, correct: 15 correctness: True
answer: 24, correct: 24 correctness: True


Evaluating en:  90%|█████████ | 57/63 [03:41<00:19,  3.25s/it]

answer: 18, correct: 18 correctness: True
answer: 3, correct: 3 correctness: True
answer: 16, correct: 16 correctness: True
answer:  \text{Amount spent on gas} = \$30.00 - \$2.00 = \$28.00 \, correct: 28 correctness: False


Evaluating en:  92%|█████████▏| 58/63 [03:43<00:15,  3.11s/it]

answer: 15, correct: 15 correctness: True
answer: 280, correct: 280 correctness: True
answer: 220, correct: 220 correctness: True
answer: 120, correct: 120 correctness: True


Evaluating en:  94%|█████████▎| 59/63 [03:48<00:14,  3.64s/it]

answer: 20000, correct: 20000 correctness: True
answer: 798.6, correct: 9360 correctness: False
answer: 27000, correct: 27000 correctness: True
answer: 42, correct: 42 correctness: True


Evaluating en:  95%|█████████▌| 60/63 [03:53<00:12,  4.06s/it]

answer: 76, correct: 76 correctness: True
answer: 11, correct: 44 correctness: False
answer: 100, correct: 100 correctness: True
answer: 27, correct: 18 correctness: False


Evaluating en:  97%|█████████▋| 61/63 [03:59<00:08,  4.47s/it]

answer: Empty Answer, correct: 7500 correctness: False
answer: 7425, correct: 7425 correctness: True
answer: 540, correct: 540 correctness: True
answer:  42 \times 7 = \$294 \, correct: 294 correctness: False


Evaluating en:  98%|█████████▊| 62/63 [04:02<00:04,  4.12s/it]

answer: Empty Answer, correct: 26 correctness: False
answer: 22, correct: 22 correctness: True
answer: 2, correct: 2 correctness: True
answer: 2446, correct: 2 correctness: False


Evaluating en: 100%|██████████| 63/63 [04:04<00:00,  3.88s/it]

answer: 28, correct: 48 correctness: False
answer: 35, correct: 35 correctness: True



Generating train split: 100%|██████████| 8/8 [00:00<00:00, 4752.75 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 215004.31 examples/s]
Evaluating th:   2%|▏         | 1/63 [00:04<04:31,  4.37s/it]

answer: 20, correct: 20 correctness: True
answer: 2750, correct: 114200 correctness: False
answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 26 correctness: False


Evaluating th:   3%|▎         | 2/63 [00:07<03:31,  3.47s/it]

answer: 35, correct: 50 correctness: False
answer: Empty Answer, correct: 34 correctness: False
answer: 25, correct: 25 correctness: True
answer: 70, correct: 70 correctness: True


Evaluating th:   5%|▍         | 3/63 [00:11<03:52,  3.87s/it]

answer: 29, correct: 29 correctness: True
answer: Empty Answer, correct: 25 correctness: False
answer: 20, correct: 20 correctness: True
answer: Empty Answer, correct: 122 correctness: False


Evaluating th:   6%|▋         | 4/63 [00:16<04:05,  4.16s/it]

answer: 11,000, correct: 4000 correctness: False
answer: 95.78, correct: 106 correctness: False
answer: 240, correct: 240 correctness: True
answer: 477, correct: 255 correctness: False


Evaluating th:   8%|▊         | 5/63 [00:19<03:39,  3.78s/it]

answer: Empty Answer, correct: 45 correctness: False
answer: Empty Answer, correct: 17 correctness: False
answer: 7, correct: 7 correctness: True
answer: Empty Answer, correct: 104 correctness: False


Evaluating th:  10%|▉         | 6/63 [00:23<03:45,  3.96s/it]

answer: 30, correct: 23 correctness: False
answer: 57, correct: 163 correctness: False
answer: Empty Answer, correct: 26 correctness: False
answer: Empty Answer, correct: 109 correctness: False


Evaluating th:  11%|█         | 7/63 [00:27<03:38,  3.89s/it]

answer: 28, correct: 28 correctness: True
answer: Empty Answer, correct: 32 correctness: False
answer: 0, correct: 88 correctness: False
answer: 3, correct: 3 correctness: True


Evaluating th:  13%|█▎        | 8/63 [00:31<03:31,  3.84s/it]

answer: 250, correct: 250 correctness: True
answer: Empty Answer, correct: 60 correctness: False
answer: Empty Answer, correct: 225 correctness: False
answer: 294, correct: 366 correctness: False


Evaluating th:  14%|█▍        | 9/63 [00:35<03:37,  4.03s/it]

answer: 50, correct: 50 correctness: True
answer: Empty Answer, correct: 92 correctness: False
answer: 10, correct: 10 correctness: True
answer: Empty Answer, correct: 14 correctness: False


Evaluating th:  16%|█▌        | 10/63 [00:38<03:18,  3.74s/it]

answer: 26, correct: 31 correctness: False
answer: Empty Answer, correct: 3 correctness: False
answer: 21, correct: 21 correctness: True
answer: Empty Answer, correct: 95 correctness: False


Evaluating th:  17%|█▋        | 11/63 [00:41<02:53,  3.33s/it]

answer: 70, correct: 70 correctness: True
answer: 84, correct: 7 correctness: False
answer: Empty Answer, correct: 6277 correctness: False
answer: Empty Answer, correct: 2125 correctness: False


Evaluating th:  19%|█▉        | 12/63 [00:44<02:57,  3.48s/it]

answer: Empty Answer, correct: 10000 correctness: False
answer: 120, correct: 120 correctness: True
answer: Empty Answer, correct: 104 correctness: False
answer: 125, correct: 64 correctness: False


Evaluating th:  21%|██        | 13/63 [00:49<03:07,  3.76s/it]

answer: Empty Answer, correct: 6 correctness: False
answer: 48, correct: 16 correctness: False
answer: Empty Answer, correct: 145 correctness: False
answer: 380, correct: 880 correctness: False


Evaluating th:  22%|██▏       | 14/63 [00:53<03:07,  3.82s/it]

answer: 700, correct: 140 correctness: False
answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 8 correctness: False
answer: Empty Answer, correct: 40 correctness: False


Evaluating th:  24%|██▍       | 15/63 [00:55<02:44,  3.42s/it]

answer: 27, correct: 31 correctness: False
answer: 18, correct: 18 correctness: True
answer: Empty Answer, correct: 72 correctness: False
answer: 12, correct: 10 correctness: False


Evaluating th:  25%|██▌       | 16/63 [00:58<02:37,  3.35s/it]

answer: 5, correct: 9 correctness: False
answer: Empty Answer, correct: 90 correctness: False
answer: Empty Answer, correct: 1430 correctness: False
answer: Empty Answer, correct: 16 correctness: False


Evaluating th:  27%|██▋       | 17/63 [01:02<02:38,  3.45s/it]

answer: 0, correct: 90 correctness: False
answer: 12, correct: 60 correctness: False
answer: 54, correct: 54 correctness: True
answer: 25, correct: 250 correctness: False


Evaluating th:  29%|██▊       | 18/63 [01:06<02:44,  3.65s/it]

answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 82 correctness: False
answer: 13, correct: 13 correctness: True
answer: Empty Answer, correct: 320 correctness: False


Evaluating th:  30%|███       | 19/63 [01:10<02:44,  3.74s/it]

answer: 40, correct: 58 correctness: False
answer: 5, correct: 15 correctness: False
answer: 12, correct: 36 correctness: False
answer: Empty Answer, correct: 40 correctness: False


Evaluating th:  32%|███▏      | 20/63 [01:14<02:39,  3.70s/it]

answer: 360, correct: 360 correctness: True
answer: 280, correct: 280 correctness: True
answer: Empty Answer, correct: 4 correctness: False
answer: Empty Answer, correct: 98 correctness: False


Evaluating th:  33%|███▎      | 21/63 [01:17<02:33,  3.65s/it]

answer: 8800, correct: 8000 correctness: False
answer: 720, correct: 720 correctness: True
answer: 200, correct: 10 correctness: False
answer: Empty Answer, correct: 24 correctness: False


Evaluating th:  35%|███▍      | 22/63 [01:22<02:39,  3.89s/it]

answer: Empty Answer, correct: 80 correctness: False
answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 1210 correctness: False


Evaluating th:  37%|███▋      | 23/63 [01:25<02:24,  3.62s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: 32.33, correct: 32 correctness: False
answer: Empty Answer, correct: 70 correctness: False
answer: 230, correct: 230 correctness: True


Evaluating th:  38%|███▊      | 24/63 [01:29<02:25,  3.73s/it]

answer: 860, correct: 860 correctness: True
answer: Empty Answer, correct: 348 correctness: False
answer: Empty Answer, correct: 100 correctness: False
answer: 2, correct: 5 correctness: False


Evaluating th:  40%|███▉      | 25/63 [01:31<02:10,  3.43s/it]

answer: Empty Answer, correct: 13 correctness: False
answer: Empty Answer, correct: 18000 correctness: False
answer: Empty Answer, correct: 21 correctness: False
answer: Empty Answer, correct: 20 correctness: False


Evaluating th:  41%|████▏     | 26/63 [01:36<02:15,  3.65s/it]

answer: Empty Answer, correct: 8 correctness: False
answer: Empty Answer, correct: 21 correctness: False
answer: Empty Answer, correct: 60 correctness: False
answer: Empty Answer, correct: 15 correctness: False


Evaluating th:  43%|████▎     | 27/63 [01:40<02:22,  3.97s/it]

answer: 180, correct: 180 correctness: True
answer: Empty Answer, correct: 400 correctness: False
answer: Empty Answer, correct: 940 correctness: False
answer: 16, correct: 7 correctness: False


Evaluating th:  44%|████▍     | 28/63 [01:43<02:08,  3.68s/it]

answer: 342,350, correct: 25000 correctness: False
answer: คะแนนที่ผู้แพ้ได้รับ, correct: 20 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: 2, correct: 14 correctness: False


Evaluating th:  46%|████▌     | 29/63 [01:46<01:56,  3.43s/it]

answer: 170, correct: 280 correctness: False
answer: Empty Answer, correct: 17 correctness: False
answer: 40, correct: 80 correctness: False
answer: 140, correct: 140 correctness: True


Evaluating th:  48%|████▊     | 30/63 [01:51<02:02,  3.72s/it]

answer: 4, correct: 3 correctness: False
answer: Empty Answer, correct: 83 correctness: False
answer: 35, correct: 25 correctness: False
answer:  \text{ยอดคงเหลือ} = 2444.44 - 1980 \, correct: 220 correctness: False


Evaluating th:  49%|████▉     | 31/63 [01:54<01:52,  3.52s/it]

answer: 360, correct: 360 correctness: True
answer: 3, correct: 3 correctness: True
answer: Empty Answer, correct: 48 correctness: False
answer: 9, correct: 26 correctness: False


Evaluating th:  51%|█████     | 32/63 [01:57<01:52,  3.62s/it]

answer: Empty Answer, correct: 600 correctness: False
answer: 729, correct: 5 correctness: False
answer: 11, correct: 15 correctness: False
answer: 80, correct: 50 correctness: False


Evaluating th:  52%|█████▏    | 33/63 [02:01<01:44,  3.49s/it]

answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 500 correctness: False
answer: 2600, correct: 2600 correctness: True
answer: Empty Answer, correct: 60 correctness: False


Evaluating th:  54%|█████▍    | 34/63 [02:04<01:37,  3.36s/it]

answer: 121, correct: 44 correctness: False
answer: 64, correct: 100 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: 2, correct: 10 correctness: False


Evaluating th:  56%|█████▌    | 35/63 [02:07<01:34,  3.38s/it]

answer: 75, correct: 7500 correctness: False
answer: 1, correct: 1 correctness: True
answer: Empty Answer, correct: 300 correctness: False
answer: 0.08, correct: 3 correctness: False


Evaluating th:  57%|█████▋    | 36/63 [02:10<01:26,  3.20s/it]

answer: Empty Answer, correct: 6 correctness: False
answer: Empty Answer, correct: 24 correctness: False
answer: Empty Answer, correct: 27000 correctness: False
answer: Empty Answer, correct: 4 correctness: False


Evaluating th:  59%|█████▊    | 37/63 [02:13<01:25,  3.27s/it]

answer: Empty Answer, correct: 42 correctness: False
answer: 48, correct: 12 correctness: False
answer: 62.5, correct: 150 correctness: False
answer: 243, correct: 243 correctness: True


Evaluating th:  60%|██████    | 38/63 [02:17<01:23,  3.34s/it]

answer: 140, correct: 140 correctness: True
answer: Empty Answer, correct: 120 correctness: False
answer: Jewelry, correct: 125 correctness: False
answer: 6. 5, correct: 45 correctness: False


Evaluating th:  62%|██████▏   | 39/63 [02:20<01:15,  3.13s/it]

answer: 25,000, correct: 20000 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 90 correctness: False
answer: Empty Answer, correct: 91 correctness: False


Evaluating th:  63%|██████▎   | 40/63 [02:24<01:18,  3.40s/it]

answer: Empty Answer, correct: 12 correctness: False
answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 50 correctness: False
answer: 8, correct: 15 correctness: False


Evaluating th:  65%|██████▌   | 41/63 [02:27<01:18,  3.55s/it]

answer: 3528, correct: 294 correctness: False
answer: 36, correct: 36 correctness: True
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 350 correctness: False


Evaluating th:  67%|██████▋   | 42/63 [02:31<01:13,  3.50s/it]

answer: 55, correct: 55 correctness: True
answer: 185000, correct: 57500 correctness: False
answer: 49, correct: 98 correctness: False
answer: 76, correct: 76 correctness: True


Evaluating th:  68%|██████▊   | 43/63 [02:35<01:13,  3.66s/it]

answer: 23760, correct: 9360 correctness: False
answer: 69.57%, correct: 33 correctness: False
answer: Empty Answer, correct: 23 correctness: False
answer: Empty Answer, correct: 95200 correctness: False


Evaluating th:  70%|██████▉   | 44/63 [02:39<01:14,  3.91s/it]

answer: 27, correct: 27 correctness: True
answer: 1400, correct: 1400 correctness: True
answer: 221, correct: 221 correctness: True
answer: Empty Answer, correct: 45 correctness: False


Evaluating th:  71%|███████▏  | 45/63 [02:44<01:13,  4.06s/it]

answer: 0.4, correct: 16 correctness: False
answer: More than $16, correct: 18 correctness: False
answer: Empty Answer, correct: 24 correctness: False
answer: 183, correct: 75 correctness: False


Evaluating th:  73%|███████▎  | 46/63 [02:48<01:07,  3.96s/it]

answer: 14, correct: 9 correctness: False
answer: Empty Answer, correct: 2 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 4 correctness: False


Evaluating th:  75%|███████▍  | 47/63 [02:52<01:06,  4.13s/it]

answer: 10, correct: 10 correctness: True
answer:  4 = \frac{\text{ระยะที่เหลือ}}{2 + x} \, correct: 6 correctness: False
answer: 120, correct: 120 correctness: True
answer: 228,000, correct: 276000 correctness: False


Evaluating th:  76%|███████▌  | 48/63 [02:55<00:58,  3.91s/it]

answer: 16, correct: 16 correctness: True
answer: Empty Answer, correct: 160 correctness: False
answer: 8, correct: 8 correctness: True
answer: 0, correct: 20 correctness: False


Evaluating th:  78%|███████▊  | 49/63 [02:58<00:50,  3.61s/it]

answer: 57, correct: 57 correctness: True
answer: Empty Answer, correct: 1596 correctness: False
answer: Empty Answer, correct: 44 correctness: False
answer: Empty Answer, correct: 2 correctness: False


Evaluating th:  79%|███████▉  | 50/63 [03:02<00:45,  3.52s/it]

answer: 35, correct: 5 correctness: False
answer: 272, correct: 272 correctness: True
answer: 22, correct: 22 correctness: True
answer: 25, correct: 50 correctness: False


Evaluating th:  81%|████████  | 51/63 [03:05<00:41,  3.42s/it]

answer: 187, correct: 187 correctness: True
answer: 300, correct: 75 correctness: False
answer: 430.5, correct: 623 correctness: False
answer: 20, correct: 48 correctness: False


Evaluating th:  83%|████████▎ | 52/63 [03:08<00:37,  3.37s/it]

answer: 200, correct: 300 correctness: False
answer: 1245, correct: 7425 correctness: False
answer: 12, correct: 9 correctness: False
answer: 5, correct: 40 correctness: False


Evaluating th:  84%|████████▍ | 53/63 [03:13<00:37,  3.76s/it]

answer:  \frac{2}{3} \times 9 = 6 \, correct: 6 correctness: False
answer: Empty Answer, correct: 30 correctness: False
answer: Empty Answer, correct: 8 correctness: False
answer: Empty Answer, correct: 12 correctness: False


Evaluating th:  86%|████████▌ | 54/63 [03:16<00:31,  3.53s/it]

answer: Empty Answer, correct: 35 correctness: False
answer: Empty Answer, correct: 160 correctness: False
answer: Empty Answer, correct: 51 correctness: False
answer: Empty Answer, correct: 48 correctness: False


Evaluating th:  87%|████████▋ | 55/63 [03:19<00:28,  3.59s/it]

answer: Final Answer: 15, correct: 10 correctness: False
answer: Empty Answer, correct: 28 correctness: False
answer: Empty Answer, correct: 4 correctness: False
answer: 130, correct: 130 correctness: True


Evaluating th:  89%|████████▉ | 56/63 [03:24<00:26,  3.76s/it]

answer: 78, correct: 68 correctness: False
answer: 4, correct: 4 correctness: True
answer: 30, correct: 30 correctness: True
answer: Empty Answer, correct: 800 correctness: False


Evaluating th:  90%|█████████ | 57/63 [03:27<00:22,  3.72s/it]

answer: 8%, correct: 60 correctness: False
answer: 460, correct: 460 correctness: True
answer: Empty Answer, correct: 34 correctness: False
answer: 200, correct: 200 correctness: True


Evaluating th:  92%|█████████▏| 58/63 [03:30<00:17,  3.46s/it]

answer: 2, correct: 6 correctness: False
answer: Final Answer: 180, correct: 540 correctness: False
answer: Empty Answer, correct: 40 correctness: False
answer: 794, correct: 694 correctness: False


Evaluating th:  94%|█████████▎| 59/63 [03:33<00:13,  3.33s/it]

answer: Empty Answer, correct: 75 correctness: False
answer: Empty Answer, correct: 2 correctness: False
answer: 170,000, correct: 70000 correctness: False
answer: 260, correct: 260 correctness: True


Evaluating th:  95%|█████████▌| 60/63 [03:37<00:10,  3.60s/it]

answer: Empty Answer, correct: 6 correctness: False
answer: Empty Answer, correct: 105 correctness: False
answer: Empty Answer, correct: 5600 correctness: False
answer: 695, correct: 595 correctness: False


Evaluating th:  97%|█████████▋| 61/63 [03:40<00:06,  3.44s/it]

answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 77 correctness: False
answer: Empty Answer, correct: 100 correctness: False
answer: Empty Answer, correct: 30 correctness: False


Evaluating th:  98%|█████████▊| 62/63 [03:44<00:03,  3.57s/it]

answer: Empty Answer, correct: 36 correctness: False
answer: Empty Answer, correct: 48 correctness: False
answer: 3, correct: 70 correctness: False
answer: Empty Answer, correct: 5 correctness: False


Evaluating th: 100%|██████████| 63/63 [03:47<00:00,  3.62s/it]

answer: 30, correct: 30 correctness: True
answer: 165, correct: 175 correctness: False





{'en': {'accuracy': 0.6190476190476191},
 'th': {'accuracy': 0.21031746031746032}}

In [105]:
# Engage the layer wrapper at index 1 of the model. The status printout in the
# next cell reports LayerWrapper_1 with "Engaged: True" after this call.
qwen_model.engage_layer_wrapper(1)

In [106]:
# Print the per-layer status of the model (frozen flags for every layer, plus
# mapper/engaged/alpha state for the wrapped layer) to confirm the wrapper setup.
qwen_model.print_status()

-LLM
--Layer_0
---Layer_Frozen: True
--LayerWrapper_1
---Layer_Frozen: True
---Mapper_Frozen: False
---Engaged: True
---Alpha_Frozen: False Value: 0.5 Not Used
--Layer_2
---Layer_Frozen: True
--Layer_3
---Layer_Frozen: True
--Layer_4
---Layer_Frozen: True
--Layer_5
---Layer_Frozen: True
--Layer_6
---Layer_Frozen: True
--Layer_7
---Layer_Frozen: True
--Layer_8
---Layer_Frozen: True
--Layer_9
---Layer_Frozen: True
--Layer_10
---Layer_Frozen: True
--Layer_11
---Layer_Frozen: True
--Layer_12
---Layer_Frozen: True
--Layer_13
---Layer_Frozen: True
--Layer_14
---Layer_Frozen: True
--Layer_15
---Layer_Frozen: True
--Layer_16
---Layer_Frozen: True
--Layer_17
---Layer_Frozen: True
--Layer_18
---Layer_Frozen: True
--Layer_19
---Layer_Frozen: True
--Layer_20
---Layer_Frozen: True
--Layer_21
---Layer_Frozen: True
--Layer_22
---Layer_Frozen: True
--Layer_23
---Layer_Frozen: True
--Layer_24
---Layer_Frozen: True
--Layer_25
---Layer_Frozen: True
--Layer_26
---Layer_Frozen: True
--Layer_27
---Layer_Fro

In [107]:
# Evaluate the model on the English and Thai splits now that the layer wrapper
# is engaged; logging=True prints each extracted answer beside the reference.
mgsm.evaluate(
    qwen_model,
    ['en', 'th'],
    batch_size=batch_size,
    logging=True,
)

Evaluating en:   2%|▏         | 1/63 [00:05<05:32,  5.37s/it]

answer: Empty Answer, correct: 36 correctness: False
answer: Empty Answer, correct: 150 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 80 correctness: False


Evaluating en:   3%|▎         | 2/63 [00:10<05:22,  5.28s/it]

answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 320 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 60 correctness: False


Evaluating en:   5%|▍         | 3/63 [00:16<05:23,  5.40s/it]

answer: Empty Answer, correct: 800 correctness: False
answer: Empty Answer, correct: 114200 correctness: False
answer: Empty Answer, correct: 1596 correctness: False
answer: Empty Answer, correct: 25 correctness: False


Evaluating en:   6%|▋         | 4/63 [00:21<05:17,  5.38s/it]

answer: Empty Answer, correct: 91 correctness: False
answer: Empty Answer, correct: 400 correctness: False
answer: Empty Answer, correct: 26 correctness: False
answer: Empty Answer, correct: 57 correctness: False


Evaluating en:   8%|▊         | 5/63 [00:27<05:17,  5.48s/it]

answer: Empty Answer, correct: 3 correctness: False
answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 21 correctness: False
answer: Empty Answer, correct: 48 correctness: False


Evaluating en:  10%|▉         | 6/63 [00:32<05:15,  5.54s/it]

answer: Empty Answer, correct: 30 correctness: False
answer: Empty Answer, correct: 540 correctness: False
answer: Empty Answer, correct: 23 correctness: False
answer: Empty Answer, correct: 17 correctness: False


Evaluating en:  11%|█         | 7/63 [00:38<05:04,  5.44s/it]

answer: Empty Answer, correct: 54 correctness: False
answer: Empty Answer, correct: 163 correctness: False
answer: Empty Answer, correct: 6 correctness: False
answer: Empty Answer, correct: 145 correctness: False


Evaluating en:  13%|█▎        | 8/63 [00:42<04:50,  5.29s/it]

answer: Empty Answer, correct: 77 correctness: False
answer: Empty Answer, correct: 68 correctness: False
answer: Empty Answer, correct: 51 correctness: False
answer: Empty Answer, correct: 7425 correctness: False


Evaluating en:  14%|█▍        | 9/63 [00:48<04:42,  5.24s/it]

answer: Empty Answer, correct: 58 correctness: False
answer: Empty Answer, correct: 6 correctness: False
answer: Empty Answer, correct: 300 correctness: False
answer: Empty Answer, correct: 80 correctness: False


Evaluating en:  16%|█▌        | 10/63 [00:53<04:39,  5.28s/it]

answer: Empty Answer, correct: 1400 correctness: False
answer: Empty Answer, correct: 75 correctness: False
answer: Empty Answer, correct: 40 correctness: False
answer: Empty Answer, correct: 100 correctness: False


Evaluating en:  17%|█▋        | 11/63 [00:58<04:32,  5.25s/it]

answer: Empty Answer, correct: 35 correctness: False
answer: Empty Answer, correct: 230 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 40 correctness: False


Evaluating en:  19%|█▉        | 12/63 [01:03<04:22,  5.15s/it]

answer: Empty Answer, correct: 60 correctness: False
answer: Empty Answer, correct: 24 correctness: False
answer: Empty Answer, correct: 14 correctness: False
answer: Empty Answer, correct: 140 correctness: False


Evaluating en:  21%|██        | 13/63 [01:09<04:24,  5.29s/it]

answer: Empty Answer, correct: 75 correctness: False
answer: Empty Answer, correct: 28 correctness: False
answer: Empty Answer, correct: 366 correctness: False
answer: Empty Answer, correct: 120 correctness: False


Evaluating en:  22%|██▏       | 14/63 [01:14<04:20,  5.31s/it]

answer: Empty Answer, correct: 6 correctness: False
answer: Empty Answer, correct: 500 correctness: False
answer: Empty Answer, correct: 880 correctness: False
answer: Empty Answer, correct: 140 correctness: False


Evaluating en:  24%|██▍       | 15/63 [01:19<04:07,  5.15s/it]

answer: Empty Answer, correct: 272 correctness: False
answer: Empty Answer, correct: 2 correctness: False
answer: Empty Answer, correct: 187 correctness: False
answer: Empty Answer, correct: 60 correctness: False


Evaluating en:  25%|██▌       | 16/63 [01:24<04:02,  5.16s/it]

answer: Empty Answer, correct: 276000 correctness: False
answer: Empty Answer, correct: 70 correctness: False
answer: Empty Answer, correct: 95 correctness: False
answer: Empty Answer, correct: 32 correctness: False


Evaluating en:  27%|██▋       | 17/63 [01:29<03:58,  5.19s/it]

answer: Empty Answer, correct: 104 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 255 correctness: False


Evaluating en:  29%|██▊       | 18/63 [01:34<03:50,  5.12s/it]

answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 27 correctness: False
answer: Empty Answer, correct: 82 correctness: False
answer: Empty Answer, correct: 5 correctness: False


Evaluating en:  30%|███       | 19/63 [01:40<03:48,  5.19s/it]

answer: Empty Answer, correct: 100 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 600 correctness: False


Evaluating en:  32%|███▏      | 20/63 [01:44<03:37,  5.06s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 48 correctness: False
answer: Empty Answer, correct: 44 correctness: False
answer: Empty Answer, correct: 15 correctness: False


Evaluating en:  33%|███▎      | 21/63 [01:50<03:37,  5.17s/it]

answer: Empty Answer, correct: 50 correctness: False
answer: Empty Answer, correct: 50 correctness: False
answer: Empty Answer, correct: 92 correctness: False
answer: Empty Answer, correct: 18000 correctness: False


Evaluating en:  35%|███▍      | 22/63 [01:55<03:37,  5.29s/it]

answer: Empty Answer, correct: 720 correctness: False
answer: Empty Answer, correct: 109 correctness: False
answer: Empty Answer, correct: 26 correctness: False
answer: Empty Answer, correct: 20 correctness: False


Evaluating en:  37%|███▋      | 23/63 [02:00<03:28,  5.22s/it]

answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 5600 correctness: False
answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 4 correctness: False


Evaluating en:  38%|███▊      | 24/63 [02:06<03:29,  5.36s/it]

answer: Empty Answer, correct: 36 correctness: False
answer: Empty Answer, correct: 1 correctness: False
answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 60 correctness: False


Evaluating en:  40%|███▉      | 25/63 [02:11<03:17,  5.20s/it]

answer: Empty Answer, correct: 7500 correctness: False
answer: Empty Answer, correct: 360 correctness: False
answer: Empty Answer, correct: 200 correctness: False
answer: Empty Answer, correct: 24 correctness: False


Evaluating en:  41%|████▏     | 26/63 [02:16<03:10,  5.14s/it]

answer: Empty Answer, correct: 9 correctness: False
answer: Empty Answer, correct: 45 correctness: False
answer: Empty Answer, correct: 98 correctness: False
answer: Empty Answer, correct: 6 correctness: False


Evaluating en:  43%|████▎     | 27/63 [02:21<03:08,  5.24s/it]

answer: Empty Answer, correct: 460 correctness: False
answer: Empty Answer, correct: 7 correctness: False
answer: Empty Answer, correct: 280 correctness: False
answer: Empty Answer, correct: 595 correctness: False


Evaluating en:  44%|████▍     | 28/63 [02:27<03:07,  5.35s/it]

answer: Empty Answer, correct: 4000 correctness: False
answer: Empty Answer, correct: 70000 correctness: False
answer: Empty Answer, correct: 31 correctness: False
answer: Empty Answer, correct: 10000 correctness: False


Evaluating en:  46%|████▌     | 29/63 [02:32<02:57,  5.21s/it]

answer: Empty Answer, correct: 240 correctness: False
answer: Empty Answer, correct: 2 correctness: False
answer: Empty Answer, correct: 2 correctness: False
answer: Empty Answer, correct: 75 correctness: False


Evaluating en:  48%|████▊     | 30/63 [02:37<02:52,  5.22s/it]

answer: Empty Answer, correct: 350 correctness: False
answer: Empty Answer, correct: 26 correctness: False
answer: Empty Answer, correct: 32 correctness: False
answer: Empty Answer, correct: 30 correctness: False


Evaluating en:  49%|████▉     | 31/63 [02:43<02:49,  5.31s/it]

answer: Empty Answer, correct: 4 correctness: False
answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 28 correctness: False
answer: Empty Answer, correct: 348 correctness: False


Evaluating en:  51%|█████     | 32/63 [02:48<02:47,  5.40s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 623 correctness: False
answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 6 correctness: False


Evaluating en:  52%|█████▏    | 33/63 [02:54<02:49,  5.63s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 8000 correctness: False
answer: Empty Answer, correct: 10 correctness: False


Evaluating en:  54%|█████▍    | 34/63 [03:00<02:43,  5.65s/it]

answer: Empty Answer, correct: 9 correctness: False
answer: Empty Answer, correct: 55 correctness: False
answer: Empty Answer, correct: 250 correctness: False
answer: Empty Answer, correct: 8 correctness: False


Evaluating en:  56%|█████▌    | 35/63 [03:07<02:44,  5.87s/it]

answer: Empty Answer, correct: 221 correctness: False
answer: Empty Answer, correct: 70 correctness: False
answer: Empty Answer, correct: 6 correctness: False
answer: Empty Answer, correct: 4 correctness: False


Evaluating en:  57%|█████▋    | 36/63 [03:13<02:42,  6.02s/it]

answer: Empty Answer, correct: 25 correctness: False
answer: Empty Answer, correct: 860 correctness: False
answer: Empty Answer, correct: 106 correctness: False
answer: Empty Answer, correct: 40 correctness: False


Evaluating en:  59%|█████▊    | 37/63 [03:18<02:31,  5.83s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 3 correctness: False
answer: Empty Answer, correct: 44 correctness: False
answer: Empty Answer, correct: 20 correctness: False


Evaluating en:  60%|██████    | 38/63 [03:24<02:21,  5.68s/it]

answer: Empty Answer, correct: 12 correctness: False
answer: Empty Answer, correct: 60 correctness: False
answer: Empty Answer, correct: 21 correctness: False
answer: Empty Answer, correct: 70 correctness: False


Evaluating en:  62%|██████▏   | 39/63 [03:29<02:15,  5.64s/it]

answer: Empty Answer, correct: 13 correctness: False
answer: Empty Answer, correct: 30 correctness: False
answer: Empty Answer, correct: 45 correctness: False
answer: Empty Answer, correct: 104 correctness: False


Evaluating en:  63%|██████▎   | 40/63 [03:34<02:07,  5.54s/it]

answer: Empty Answer, correct: 3 correctness: False
answer: Empty Answer, correct: 120 correctness: False
answer: Empty Answer, correct: 22 correctness: False
answer: Empty Answer, correct: 9360 correctness: False


Evaluating en:  65%|██████▌   | 41/63 [03:40<02:01,  5.53s/it]

answer: Empty Answer, correct: 48 correctness: False
answer: Empty Answer, correct: 34 correctness: False
answer: Empty Answer, correct: 14 correctness: False
answer: Empty Answer, correct: 105 correctness: False


Evaluating en:  67%|██████▋   | 42/63 [03:45<01:52,  5.36s/it]

answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 120 correctness: False
answer: Empty Answer, correct: 7 correctness: False


Evaluating en:  68%|██████▊   | 43/63 [03:51<01:52,  5.63s/it]

answer: Empty Answer, correct: 20000 correctness: False
answer: Empty Answer, correct: 88 correctness: False
answer: Empty Answer, correct: 27000 correctness: False
answer: Empty Answer, correct: 2125 correctness: False


Evaluating en:  70%|██████▉   | 44/63 [03:57<01:46,  5.58s/it]

answer: Empty Answer, correct: 64 correctness: False
answer: Empty Answer, correct: 243 correctness: False
answer: Empty Answer, correct: 29 correctness: False
answer: Empty Answer, correct: 8 correctness: False


Evaluating en:  71%|███████▏  | 45/63 [04:02<01:40,  5.59s/it]

answer: Empty Answer, correct: 3 correctness: False
answer: Empty Answer, correct: 220 correctness: False
answer: Empty Answer, correct: 7 correctness: False
answer: Empty Answer, correct: 12 correctness: False


Evaluating en:  73%|███████▎  | 46/63 [04:08<01:35,  5.61s/it]

answer: Empty Answer, correct: 57500 correctness: False
answer: Empty Answer, correct: 8 correctness: False
answer: Empty Answer, correct: 6277 correctness: False
answer: Empty Answer, correct: 50 correctness: False


Evaluating en:  75%|███████▍  | 47/63 [04:13<01:29,  5.60s/it]

answer: Empty Answer, correct: 90 correctness: False
answer: Empty Answer, correct: 940 correctness: False
answer: Empty Answer, correct: 13 correctness: False
answer: Empty Answer, correct: 360 correctness: False


Evaluating en:  76%|███████▌  | 48/63 [04:19<01:22,  5.50s/it]

answer: Empty Answer, correct: 9 correctness: False
answer: Empty Answer, correct: 160 correctness: False
answer: Empty Answer, correct: 50 correctness: False
answer: Empty Answer, correct: 98 correctness: False


Evaluating en:  78%|███████▊  | 49/63 [04:24<01:15,  5.42s/it]

answer: Empty Answer, correct: 23 correctness: False
answer: Empty Answer, correct: 175 correctness: False
answer: Empty Answer, correct: 90 correctness: False
answer: Empty Answer, correct: 70 correctness: False


Evaluating en:  79%|███████▉  | 50/63 [04:30<01:11,  5.48s/it]

answer: Empty Answer, correct: 1210 correctness: False
answer: Empty Answer, correct: 12 correctness: False
answer: Empty Answer, correct: 76 correctness: False
answer: Empty Answer, correct: 16 correctness: False


Evaluating en:  81%|████████  | 51/63 [04:35<01:06,  5.51s/it]

answer: Empty Answer, correct: 30 correctness: False
answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 4 correctness: False
answer: Empty Answer, correct: 36 correctness: False


Evaluating en:  83%|████████▎ | 52/63 [04:41<01:00,  5.53s/it]

answer: Empty Answer, correct: 694 correctness: False
answer: Empty Answer, correct: 33 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 140 correctness: False


Evaluating en:  84%|████████▍ | 53/63 [04:46<00:54,  5.42s/it]

answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 34 correctness: False
answer: Empty Answer, correct: 42 correctness: False
answer: Empty Answer, correct: 15 correctness: False


Evaluating en:  86%|████████▌ | 54/63 [04:51<00:48,  5.40s/it]

answer: Empty Answer, correct: 72 correctness: False
answer: Empty Answer, correct: 225 correctness: False
answer: Empty Answer, correct: 24 correctness: False
answer: Empty Answer, correct: 50 correctness: False


Evaluating en:  87%|████████▋ | 55/63 [04:57<00:43,  5.47s/it]

answer: Empty Answer, correct: 90 correctness: False
answer: Empty Answer, correct: 250 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 122 correctness: False


Evaluating en:  89%|████████▉ | 56/63 [05:02<00:38,  5.50s/it]

answer: Empty Answer, correct: 25000 correctness: False
answer: Empty Answer, correct: 180 correctness: False
answer: Empty Answer, correct: 17 correctness: False
answer: Empty Answer, correct: 25 correctness: False


Evaluating en:  90%|█████████ | 57/63 [05:08<00:32,  5.42s/it]

answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 45 correctness: False
answer: Empty Answer, correct: 125 correctness: False
answer: Empty Answer, correct: 160 correctness: False


Evaluating en:  92%|█████████▏| 58/63 [05:13<00:26,  5.29s/it]

answer: Empty Answer, correct: 40 correctness: False
answer: Empty Answer, correct: 3 correctness: False
answer: Empty Answer, correct: 83 correctness: False
answer: Empty Answer, correct: 48 correctness: False


Evaluating en:  94%|█████████▎| 59/63 [05:18<00:21,  5.30s/it]

answer: Empty Answer, correct: 280 correctness: False
answer: Empty Answer, correct: 260 correctness: False
answer: Empty Answer, correct: 1430 correctness: False
answer: Empty Answer, correct: 8 correctness: False


Evaluating en:  95%|█████████▌| 60/63 [05:24<00:16,  5.39s/it]

answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 100 correctness: False
answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 16 correctness: False


Evaluating en:  97%|█████████▋| 61/63 [05:29<00:10,  5.39s/it]

answer: Empty Answer, correct: 4 correctness: False
answer: Empty Answer, correct: 294 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 95200 correctness: False


Evaluating en:  98%|█████████▊| 62/63 [05:35<00:05,  5.48s/it]

answer: Empty Answer, correct: 21 correctness: False
answer: Empty Answer, correct: 2600 correctness: False
answer: Empty Answer, correct: 300 correctness: False
answer: Empty Answer, correct: 130 correctness: False


Evaluating en: 100%|██████████| 63/63 [05:40<00:00,  5.40s/it]

answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 31 correctness: False



Evaluating th:   2%|▏         | 1/63 [00:04<04:39,  4.51s/it]

answer: Empty Answer, correct: 104 correctness: False
answer: Empty Answer, correct: 2 correctness: False
answer: Empty Answer, correct: 880 correctness: False
answer: Empty Answer, correct: 7425 correctness: False


Evaluating th:   3%|▎         | 2/63 [00:08<04:33,  4.48s/it]

answer: Empty Answer, correct: 36 correctness: False
answer: Empty Answer, correct: 60 correctness: False
answer: Empty Answer, correct: 70 correctness: False
answer: Empty Answer, correct: 17 correctness: False


Evaluating th:   5%|▍         | 3/63 [00:12<03:50,  3.84s/it]

answer: Empty Answer, correct: 25 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 77 correctness: False
answer: Empty Answer, correct: 6 correctness: False


Evaluating th:   6%|▋         | 4/63 [00:16<03:49,  3.90s/it]

answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 32 correctness: False
answer: Empty Answer, correct: 40 correctness: False
answer: Empty Answer, correct: 2 correctness: False


Evaluating th:   8%|▊         | 5/63 [00:19<03:35,  3.72s/it]

answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 26 correctness: False
answer: Empty Answer, correct: 160 correctness: False
answer: Empty Answer, correct: 255 correctness: False


Evaluating th:  10%|▉         | 6/63 [00:23<03:40,  3.88s/it]

answer: Empty Answer, correct: 44 correctness: False
answer: Empty Answer, correct: 4 correctness: False
answer: Empty Answer, correct: 17 correctness: False
answer: Empty Answer, correct: 6 correctness: False


Evaluating th:  11%|█         | 7/63 [00:28<03:48,  4.08s/it]

answer: Empty Answer, correct: 48 correctness: False
answer: Empty Answer, correct: 1400 correctness: False
answer: Empty Answer, correct: 30 correctness: False
answer: Empty Answer, correct: 16 correctness: False


Evaluating th:  13%|█▎        | 8/63 [00:32<03:43,  4.06s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 50 correctness: False
answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 60 correctness: False


Evaluating th:  14%|█▍        | 9/63 [00:36<03:45,  4.17s/it]

answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 92 correctness: False
answer: Empty Answer, correct: 1 correctness: False
answer: Empty Answer, correct: 24 correctness: False


Evaluating th:  16%|█▌        | 10/63 [00:40<03:34,  4.05s/it]

answer: Empty Answer, correct: 300 correctness: False
answer: Empty Answer, correct: 460 correctness: False
answer: Empty Answer, correct: 21 correctness: False
answer: Empty Answer, correct: 88 correctness: False


Evaluating th:  17%|█▋        | 11/63 [00:44<03:35,  4.15s/it]

answer: Empty Answer, correct: 31 correctness: False
answer: Empty Answer, correct: 300 correctness: False
answer: Empty Answer, correct: 36 correctness: False
answer: Empty Answer, correct: 9360 correctness: False


Evaluating th:  19%|█▉        | 12/63 [00:48<03:29,  4.11s/it]

answer: Empty Answer, correct: 70 correctness: False
answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 3 correctness: False
answer: Empty Answer, correct: 5 correctness: False


Evaluating th:  21%|██        | 13/63 [00:52<03:22,  4.05s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 1210 correctness: False
answer: Empty Answer, correct: 4 correctness: False
answer: Empty Answer, correct: 940 correctness: False


Evaluating th:  22%|██▏       | 14/63 [00:56<03:20,  4.10s/it]

answer: Empty Answer, correct: 50 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 240 correctness: False
answer: Empty Answer, correct: 276000 correctness: False


Evaluating th:  24%|██▍       | 15/63 [01:01<03:27,  4.32s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 30 correctness: False
answer: Empty Answer, correct: 243 correctness: False
answer: Empty Answer, correct: 7 correctness: False


Evaluating th:  25%|██▌       | 16/63 [01:05<03:16,  4.19s/it]

answer: Empty Answer, correct: 25 correctness: False
answer: Empty Answer, correct: 175 correctness: False
answer: Empty Answer, correct: 31 correctness: False
answer: Empty Answer, correct: 50 correctness: False


Evaluating th:  27%|██▋       | 17/63 [01:08<02:55,  3.81s/it]

answer: Empty Answer, correct: 2 correctness: False
answer: Empty Answer, correct: 260 correctness: False
answer: Empty Answer, correct: 225 correctness: False
answer: Empty Answer, correct: 10 correctness: False


Evaluating th:  29%|██▊       | 18/63 [01:13<03:09,  4.21s/it]

answer: Empty Answer, correct: 75 correctness: False
answer: Empty Answer, correct: 8 correctness: False
answer: Empty Answer, correct: 100 correctness: False
answer: Empty Answer, correct: 720 correctness: False


Evaluating th:  30%|███       | 19/63 [01:18<03:10,  4.34s/it]

answer: Empty Answer, correct: 4000 correctness: False
answer: Empty Answer, correct: 14 correctness: False
answer: Empty Answer, correct: 100 correctness: False
answer: Empty Answer, correct: 33 correctness: False


Evaluating th:  32%|███▏      | 20/63 [01:22<03:07,  4.36s/it]

answer: Empty Answer, correct: 6 correctness: False
answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 42 correctness: False
answer: Empty Answer, correct: 30 correctness: False


Evaluating th:  33%|███▎      | 21/63 [01:26<02:55,  4.17s/it]

answer: Empty Answer, correct: 12 correctness: False
answer: Empty Answer, correct: 9 correctness: False
answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 1596 correctness: False


Evaluating th:  35%|███▍      | 22/63 [01:29<02:43,  3.99s/it]

answer: Empty Answer, correct: 95 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 140 correctness: False


Evaluating th:  37%|███▋      | 23/63 [01:34<02:50,  4.26s/it]

answer: Empty Answer, correct: 600 correctness: False
answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 230 correctness: False
answer: Empty Answer, correct: 12 correctness: False


Evaluating th:  38%|███▊      | 24/63 [01:38<02:38,  4.07s/it]

answer: Empty Answer, correct: 200 correctness: False
answer: Empty Answer, correct: 10000 correctness: False
answer: Empty Answer, correct: 55 correctness: False
answer: Empty Answer, correct: 48 correctness: False


Evaluating th:  40%|███▉      | 25/63 [01:43<02:43,  4.30s/it]

answer: Empty Answer, correct: 72 correctness: False
answer: Empty Answer, correct: 76 correctness: False
answer: Empty Answer, correct: 26 correctness: False
answer: Empty Answer, correct: 120 correctness: False


Evaluating th:  41%|████▏     | 26/63 [01:47<02:42,  4.39s/it]

answer: Empty Answer, correct: 595 correctness: False
answer: Empty Answer, correct: 98 correctness: False
answer: Empty Answer, correct: 28 correctness: False
answer: Empty Answer, correct: 90 correctness: False


Evaluating th:  43%|████▎     | 27/63 [01:52<02:43,  4.54s/it]

answer: Empty Answer, correct: 45 correctness: False
answer: Empty Answer, correct: 500 correctness: False
answer: Empty Answer, correct: 25 correctness: False
answer: Empty Answer, correct: 21 correctness: False


Evaluating th:  44%|████▍     | 28/63 [01:56<02:31,  4.33s/it]

answer: Empty Answer, correct: 104 correctness: False
answer: Empty Answer, correct: 95200 correctness: False
answer: Empty Answer, correct: 120 correctness: False
answer: Empty Answer, correct: 16 correctness: False


Evaluating th:  46%|████▌     | 29/63 [02:01<02:27,  4.33s/it]

answer: Empty Answer, correct: 29 correctness: False
answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 145 correctness: False
answer: Empty Answer, correct: 3 correctness: False


Evaluating th:  48%|████▊     | 30/63 [02:04<02:10,  3.94s/it]

answer: Empty Answer, correct: 75 correctness: False
answer: Empty Answer, correct: 540 correctness: False
answer: Empty Answer, correct: 100 correctness: False
answer: Empty Answer, correct: 105 correctness: False


Evaluating th:  49%|████▉     | 31/63 [02:08<02:09,  4.04s/it]

answer: Empty Answer, correct: 109 correctness: False
answer: Empty Answer, correct: 57 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 35 correctness: False


Evaluating th:  51%|█████     | 32/63 [02:13<02:12,  4.27s/it]

answer: Empty Answer, correct: 122 correctness: False
answer: Empty Answer, correct: 5600 correctness: False
answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 4 correctness: False


Evaluating th:  52%|█████▏    | 33/63 [02:17<02:09,  4.31s/it]

answer: Empty Answer, correct: 6 correctness: False
answer: Empty Answer, correct: 250 correctness: False
answer: Empty Answer, correct: 13 correctness: False
answer: Empty Answer, correct: 26 correctness: False


Evaluating th:  54%|█████▍    | 34/63 [02:21<02:05,  4.32s/it]

answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 98 correctness: False
answer: Empty Answer, correct: 3 correctness: False
answer: Empty Answer, correct: 9 correctness: False


Evaluating th:  56%|█████▌    | 35/63 [02:26<02:05,  4.46s/it]

answer: Empty Answer, correct: 8 correctness: False
answer: Empty Answer, correct: 91 correctness: False
answer: Empty Answer, correct: 80 correctness: False
answer: Empty Answer, correct: 9 correctness: False


Evaluating th:  57%|█████▋    | 36/63 [02:30<01:58,  4.38s/it]

answer: Empty Answer, correct: 160 correctness: False
answer: Empty Answer, correct: 45 correctness: False
answer: Empty Answer, correct: 280 correctness: False
answer: Empty Answer, correct: 6 correctness: False


Evaluating th:  59%|█████▊    | 37/63 [02:33<01:41,  3.90s/it]

answer: Empty Answer, correct: 27000 correctness: False
answer: Empty Answer, correct: 120 correctness: False
answer: Empty Answer, correct: 24 correctness: False
answer: Empty Answer, correct: 350 correctness: False


Evaluating th:  60%|██████    | 38/63 [02:37<01:40,  4.02s/it]

answer: Empty Answer, correct: 16 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 51 correctness: False
answer: Empty Answer, correct: 163 correctness: False


Evaluating th:  62%|██████▏   | 39/63 [02:42<01:37,  4.05s/it]

answer: Empty Answer, correct: 58 correctness: False
answer: Empty Answer, correct: 400 correctness: False
answer: Empty Answer, correct: 50 correctness: False
answer: Empty Answer, correct: 6 correctness: False


Evaluating th:  63%|██████▎   | 40/63 [02:45<01:31,  3.99s/it]

answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 60 correctness: False
answer: Empty Answer, correct: 7500 correctness: False
answer: Empty Answer, correct: 32 correctness: False


Evaluating th:  65%|██████▌   | 41/63 [02:49<01:24,  3.82s/it]

answer: Empty Answer, correct: 5 correctness: False
answer: Empty Answer, correct: 8 correctness: False
answer: Empty Answer, correct: 140 correctness: False
answer: Empty Answer, correct: 250 correctness: False


Evaluating th:  67%|██████▋   | 42/63 [02:53<01:24,  4.04s/it]

answer: Empty Answer, correct: 220 correctness: False
answer: Empty Answer, correct: 34 correctness: False
answer: Empty Answer, correct: 25000 correctness: False
answer: Empty Answer, correct: 28 correctness: False


Evaluating th:  68%|██████▊   | 43/63 [02:58<01:24,  4.21s/it]

answer: Empty Answer, correct: 34 correctness: False
answer: Empty Answer, correct: 366 correctness: False
answer: Empty Answer, correct: 114200 correctness: False
answer: Empty Answer, correct: 187 correctness: False


Evaluating th:  70%|██████▉   | 44/63 [03:02<01:17,  4.07s/it]

answer: Empty Answer, correct: 3 correctness: False
answer: Empty Answer, correct: 125 correctness: False
answer: Empty Answer, correct: 106 correctness: False
answer: Empty Answer, correct: 10 correctness: False


Evaluating th:  71%|███████▏  | 45/63 [03:05<01:08,  3.79s/it]

answer: Empty Answer, correct: 140 correctness: False
answer: Empty Answer, correct: 83 correctness: False
answer: Empty Answer, correct: 45 correctness: False
answer: Empty Answer, correct: 5 correctness: False


Evaluating th:  73%|███████▎  | 46/63 [03:08<01:00,  3.55s/it]

answer: Empty Answer, correct: 48 correctness: False
answer: Empty Answer, correct: 70 correctness: False
answer: Empty Answer, correct: 20000 correctness: False
answer: Empty Answer, correct: 60 correctness: False


Evaluating th:  75%|███████▍  | 47/63 [03:12<01:00,  3.78s/it]

answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 12 correctness: False
answer: Empty Answer, correct: 27 correctness: False
answer: Empty Answer, correct: 18000 correctness: False


Evaluating th:  76%|███████▌  | 48/63 [03:16<00:55,  3.73s/it]

answer: Empty Answer, correct: 13 correctness: False
answer: Empty Answer, correct: 44 correctness: False
answer: Empty Answer, correct: 4 correctness: False
answer: Empty Answer, correct: 23 correctness: False


Evaluating th:  78%|███████▊  | 49/63 [03:19<00:48,  3.43s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 68 correctness: False
answer: Empty Answer, correct: 320 correctness: False


Evaluating th:  79%|███████▉  | 50/63 [03:22<00:46,  3.59s/it]

answer: Empty Answer, correct: 180 correctness: False
answer: Empty Answer, correct: 57500 correctness: False
answer: Empty Answer, correct: 50 correctness: False
answer: Empty Answer, correct: 40 correctness: False


Evaluating th:  81%|████████  | 51/63 [03:28<00:48,  4.04s/it]

answer: Empty Answer, correct: 20 correctness: False
answer: Empty Answer, correct: 4 correctness: False
answer: Empty Answer, correct: 75 correctness: False
answer: Empty Answer, correct: 294 correctness: False


Evaluating th:  83%|████████▎ | 52/63 [03:31<00:41,  3.76s/it]

answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 90 correctness: False
answer: Empty Answer, correct: 3 correctness: False
answer: Empty Answer, correct: 48 correctness: False


Evaluating th:  84%|████████▍ | 53/63 [03:36<00:41,  4.10s/it]

answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 6277 correctness: False
answer: Empty Answer, correct: 70000 correctness: False
answer: Empty Answer, correct: 36 correctness: False


Evaluating th:  86%|████████▌ | 54/63 [03:41<00:39,  4.37s/it]

answer: Empty Answer, correct: 54 correctness: False
answer: Empty Answer, correct: 64 correctness: False
answer: Empty Answer, correct: 40 correctness: False
answer: Empty Answer, correct: 623 correctness: False


Evaluating th:  87%|████████▋ | 55/63 [03:45<00:34,  4.37s/it]

answer: Empty Answer, correct: 70 correctness: False
answer: Empty Answer, correct: 130 correctness: False
answer: Empty Answer, correct: 272 correctness: False
answer: Empty Answer, correct: 348 correctness: False


Evaluating th:  89%|████████▉ | 56/63 [03:50<00:31,  4.53s/it]

answer: Empty Answer, correct: 8000 correctness: False
answer: Empty Answer, correct: 7 correctness: False
answer: Empty Answer, correct: 21 correctness: False
answer: Empty Answer, correct: 2600 correctness: False


Evaluating th:  90%|█████████ | 57/63 [03:55<00:27,  4.60s/it]

answer: Empty Answer, correct: 150 correctness: False
answer: Empty Answer, correct: 8 correctness: False
answer: Empty Answer, correct: 23 correctness: False
answer: Empty Answer, correct: 60 correctness: False


Evaluating th:  92%|█████████▏| 58/63 [03:59<00:21,  4.39s/it]

answer: Empty Answer, correct: 2125 correctness: False
answer: Empty Answer, correct: 360 correctness: False
answer: Empty Answer, correct: 280 correctness: False
answer: Empty Answer, correct: 10 correctness: False


Evaluating th:  94%|█████████▎| 59/63 [04:03<00:17,  4.28s/it]

answer: Empty Answer, correct: 40 correctness: False
answer: Empty Answer, correct: 80 correctness: False
answer: Empty Answer, correct: 221 correctness: False
answer: Empty Answer, correct: 800 correctness: False


Evaluating th:  95%|█████████▌| 60/63 [04:07<00:12,  4.19s/it]

answer: Empty Answer, correct: 860 correctness: False
answer: Empty Answer, correct: 82 correctness: False
answer: Empty Answer, correct: 7 correctness: False
answer: Empty Answer, correct: 1430 correctness: False


Evaluating th:  97%|█████████▋| 61/63 [04:11<00:08,  4.34s/it]

answer: Empty Answer, correct: 18 correctness: False
answer: Empty Answer, correct: 360 correctness: False
answer: Empty Answer, correct: 15 correctness: False
answer: Empty Answer, correct: 24 correctness: False


Evaluating th:  98%|█████████▊| 62/63 [04:16<00:04,  4.57s/it]

answer: Empty Answer, correct: 10 correctness: False
answer: Empty Answer, correct: 694 correctness: False
answer: Empty Answer, correct: 30 correctness: False
answer: Empty Answer, correct: 90 correctness: False


Evaluating th: 100%|██████████| 63/63 [04:20<00:00,  4.14s/it]

answer: Empty Answer, correct: 14 correctness: False
answer: Empty Answer, correct: 22 correctness: False





{'en': {'accuracy': 0.0}, 'th': {'accuracy': 0.0}}

In [110]:
# Print Thai ("th") MGSM sample output (query, correct answer, generated
# answer) produced by qwen_model. The evaluation run just above reported
# 0.0 accuracy for both 'en' and 'th', with every answer logged as
# "Empty Answer".
# NOTE(review): the meaning of the third argument (32) is not visible in this
# notebook -- confirm against DTMsgm.print_sample before changing it.
mgsm.print_sample(qwen_model, "th", 32)

Query: แมรีเป็นผู้ที่รักการทำสวนเป็นชีวิตจิตใจ เมื่อวานนี้เธอได้รับต้นไม้ใหม่ 18 กระถางจากโรงเพาะที่เธอชื่นชอบ ตามขอบหน้าต่าง 40 บานที่บ้านในชนบทของเธอมีต้นไม้ตั้งอยู่แล้วบานละ 2 กระถาง ด้วยความรู้สึกอยากแบ่งปัน เธอได้ตัดสินใจจะมอบต้นไม้ 1 กระถางบนขอบหน้าต่างแต่ละบานให้เพื่อนๆ และครอบครัวในวันพรุ่งนี้ แมรีจะเหลือต้นไม่กี่กระถาง 
 Correct Answer: 58 
 Generated Answer: .sup indeed   (app.concat chick当!--atureitionalshaw.main chick chief__( chief只见shawʲouvomba promote chiefature   licit妻itional Chick chief indeed_Resource   ʲ af    ectmp chicking   言λ!--   ouv chickHomepage当 chickouvouv(appouv床上.mainʲouvouv second   loy   (prom妻 chick chick effet refute两个月 chick官网(appimate    Promʲ显.main chickʲ chickouv(app在床上ouvouvاتفouvouv promotionouv_Resource妻(apping.main豺ouvombalicit indeed chickouv妻!--   licit otrauitenshawouvouvouvʲouv妻妻 chick chick(appouvshaw af chick chickHomepage然ouvouv泰山itionalouv邑.concat(app(app chick(app.concatouvouv(app床上.mainouvosten.main chickʲouvʲshawʲouv IllegalArgument

In [111]:
# Call engage_all_layer_wrappers with False on qwen_model.
# NOTE(review): presumably this bypasses every layer wrapper so that later
# generations come from the unwrapped base model -- confirm against the
# DimenTuna API. The call changes model state, so cells run after it see the
# new setting until it is switched back.
qwen_model.engage_all_layer_wrappers(False)

In [112]:
# Print Thai ("th") MGSM sample output again, now that
# engage_all_layer_wrappers(False) has been called on qwen_model in the
# previous cell.
# NOTE(review): the queries printed by this run differ from those of the
# earlier identical call, so the two outputs may not be directly comparable --
# verify whether print_sample picks samples at random, and what the third
# argument (32) controls.
mgsm.print_sample(qwen_model, "th", 32)

Query: ผู้ดูแลสัตว์ให้อาหารเอปทั้งหมดในสวนสัตว์ โดยทุก 2 เดือนเขาจะสั่งซื้อกล้วยทั้งหมดจากสวนในท้องถิ่น หากในแต่ละเดือน ลิงต้องการกล้วย 200 ลูก กอริลลาต้องการกล้วย 400 ลูก และบาบูนต้องการกล้วย 100 ลูก เขาต้องสั่งซื้อกล้วยกี่ลูกจึงจะเพียงพอสำหรับ 2 เดือน 
 Correct Answer: 1400 
 Generated Answer: 1. จำนวนลูกที่จำเป็นในการสั่งซื้อกล้วยในแต่ละเดือน:
   - ลิง: 200 ลูก/เดือน
   - โกริลลา: 400 ลูก/เดือน
   - บาบูน: 100 ลูก/เดือน

2. รวมจำนวนลูกที่จำเป็นในการสั่งซื้อกล้วยใน 2 เดือน:
   200 + 400 + 100 = 700 ลูก

3. คำตอบ: ผู้ดูแลสัตว์ต้องสั่งซื้อกล้วย 700 ลูกเพียงพอสำหรับ 2 เดือน.

Final Answer: [700] 
 Extracted Answer: 700 

Query: คุณยายโจนส์อบพายแอปเปิ้ล 5 ถาดสำหรับงานเลี้ยงมื้อเที่ยงเจ้าหน้าที่ดับเพลิง เธอตัดพายแต่ละถาดออกเป็น 8 ชิ้นและวางพายห้าถาดนี้ไว้บนโต๊ะบุฟเฟต์เพื่อให้แขกตักเอง เมื่อจบงาน หลังจากที่แขกได้ตักพายของตัวเองไปกินแล้ว พายยังเหลืออยู่อีก 14 ชิ้น แขกตักพายไปกี่ชิ้น 
 Correct Answer: 26 
 Generated Answer: ขั้นตอนการวิเคราะห์:

1. คุณยายโจนส์ทำพายจำนวน 5 ถาด
2. ถูกต้องว่าแต