# 1. General Settings

### 1.1 Import libraries

In [1]:
# Known bugs in transformers 4.28.1; upgrade to 4.31.0 or newer.
from dataclasses import dataclass, field
from typing import Optional
import torch
import re
import numpy as np
import random
from accelerate import Accelerator
from datasets import load_dataset
import pandas as pd
import json
import os
import sys
from typing import Dict, List
import time

from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration

from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel,TaskType

from transformers import T5Tokenizer, T5ForConditionalGeneration



[2023-09-30 23:58:28,989] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


### 1.2 Random Seed

In [2]:
def set_seed(seed: int):
    """
    Seed the random number generators used in this notebook.

    The same value is handed, in order, to `random.seed`, `np.random.seed`,
    `torch.manual_seed` and `torch.cuda.manual_seed_all`.

    Args:
        seed (`int`): The seed to set.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(1000)

### 1.3 Metrics

In [3]:
# Reference: https://www.nltk.org/

import operator
from functools import reduce
from math import fabs
from random import shuffle

try:
    from scipy.stats.stats import betai
except ImportError:
    betai = None

from nltk.util import LazyConcatenation, LazyMap



def accuracy(reference, test):
    """
    Return the fraction of positions at which ``test`` numerically agrees
    with ``reference``.

    Each pair is compared after converting both values with ``float``.
    A test value equal to the string ``'[invalid]'`` is never converted and
    never counts as a match, but it still counts in the denominator, which
    is ``len(test)``.

    :type reference: list
    :param reference: An ordered list of reference values.
    :type test: list
    :param test: A list of values to compare against the corresponding
        reference values.
    :raise ValueError: If ``reference`` and ``test`` do not have the
        same length (``float`` may also raise it for a value that cannot
        be converted).
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")

    matches = 0
    for expected, predicted in zip(reference, test):
        # Unparseable model outputs are skipped here, i.e. scored as wrong.
        if predicted != '[invalid]' and float(expected) == float(predicted):
            matches += 1
    return matches / len(test)


def precision(reference, test):
    """
    Compute the share of ``test`` values that also occur in ``reference``,
    i.e. card(``reference`` intersection ``test``)/card(``test``).

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    :return: The precision, or None when ``test`` is empty.
    :raise TypeError: If either argument has no ``intersection`` attribute.
    """
    both_set_like = hasattr(reference, "intersection") and hasattr(test, "intersection")
    if not both_set_like:
        raise TypeError("reference and test should be sets")

    if len(test) == 0:
        return None

    shared = reference.intersection(test)
    return len(shared) / len(test)



def recall(reference, test):
    """
    Compute the share of ``reference`` values that also occur in ``test``,
    i.e. card(``reference`` intersection ``test``)/card(``reference``).

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    :return: The recall, or None when ``reference`` is empty.
    :raise TypeError: If either argument has no ``intersection`` attribute.
    """
    both_set_like = hasattr(reference, "intersection") and hasattr(test, "intersection")
    if not both_set_like:
        raise TypeError("reference and test should be sets")

    if len(reference) == 0:
        return None

    shared = reference.intersection(test)
    return len(shared) / len(reference)



def f_measure(reference, test, alpha=0.5):
    """
    Compute the f-measure of ``test`` against ``reference``: the harmonic
    mean of ``precision`` and ``recall``, weighted by ``alpha``.

    With precision *p* and recall *r* defined by:

    - *p* = card(``reference`` intersection ``test``)/card(``test``)
    - *r* = card(``reference`` intersection ``test``)/card(``reference``)

    the result is:

    - *1/(alpha/p + (1-alpha)/r)*

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :param alpha: Weight given to the precision term (default 0.5).
    :rtype: float or None
    :return: None when either set is empty, 0 when precision or recall
        is zero, otherwise the weighted harmonic mean.
    """
    scores = (precision(reference, test), recall(reference, test))
    if any(score is None for score in scores):
        return None
    if any(score == 0 for score in scores):
        return 0

    prec, rec = scores
    weighted_inverse = alpha / prec + (1 - alpha) / rec
    return 1.0 / weighted_inverse


### 1.4 Prompt format / examples

In [4]:
# Prompt template with one `str.format` placeholder (presumably for the question).
# In this section it only serves as the default value of
# `build_test_dataset(prompt_examples=...)`.
# NOTE(review): the "Let’ s" spelling (apostrophe followed by a space) is part
# of the runtime string and is left untouched on purpose.
svamp_format = '{} Let’ s think step by step.'

### 1.5 Dataset

In [24]:
def build_test_dataset(
    dataset_name="/root/autodl-tmp/SVAMP",prompt_examples=svamp_format,prompt_format='default'
):
    """
    Build the test dataset used to evaluate model checkpoints.

    Loads the ``test`` split with `load_dataset` and adds two string columns
    to every example:

    - ``full_question``: ``Body`` and ``Question`` joined by a single space.
    - ``query``: ``full_question`` (newlines replaced by spaces) followed by
      the step-by-step instruction sent to the model.

    The original columns are kept; callers read e.g. ``Answer`` from the
    returned dataset.

    Args:
        dataset_name (`str`):
            The name or path of the dataset, passed to `load_dataset`.
        prompt_examples (`str`):
            Currently unused; kept so existing callers keep working.
        prompt_format (`str`):
            The prompt format to apply. Only ``'default'`` is supported.

    Returns:
        The test split with the extra ``query`` and ``full_question`` columns.

    Raises:
        ValueError: If ``prompt_format`` is not a supported format.
    """
    # Fail fast: previously an unknown format only surfaced inside `map`
    # as an unbound-variable error on `query`.
    if prompt_format != 'default':
        raise ValueError(
            "Unsupported prompt_format {!r}; only 'default' is implemented.".format(prompt_format)
        )

    def preprocess_function(examples):
        full_question = examples["Body"] + ' ' + examples["Question"]
        query = '{} Let’ s think step by step, then give the answer after "the answer is".'.format(full_question.replace('\n', ' '))
        return {"query": query, "full_question": full_question}

    ds_test = load_dataset(dataset_name, split="test")
    ds_test = ds_test.map(
        preprocess_function,
        batched=False,
    )

    print(ds_test)
    return ds_test

In [25]:
# Build the test dataset by calling `build_test_dataset`, then pull out the
# columns used during evaluation.
dataset = build_test_dataset(dataset_name="/root/autodl-tmp/msc_ml/llm_thought/Dataset/SVAMP")
prompt_test=dataset["query"]  # full prompts (question + instruction suffix)
questions_test=dataset["full_question"]  # Body + Question, without the instruction suffix
answer_test=dataset["Answer"]  # the dataset's "Answer" column, used as reference

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Dataset({
    features: ['Body', 'Question', 'ID', 'Type', 'Equation', 'Answer', 'query', 'full_question'],
    num_rows: 300
})


In [26]:
prompt_test[1]

'Paul got a box of some crayons for his birthday. During the school year he gave 52 crayons to his friends while he lost 535 crayons. If he only had 492 crayons left How many crayons had been lost or given away? Let’ s think step by step, then give the answer after "the answer is".'

### 1.6 Function to Process Responses

In [43]:
ANS_RE = re.compile(r"Answer: .*?(\$?)(\-?[0-9\.\,]+)")
ANS_RE = re.compile(r"answer:\s*?([\-]?[0-9]+(?:\.[0-9]+)?)(?![0-9])", re.IGNORECASE)
ANS_RE = re.compile(r"answer:\s*?([\-]?[0-9]+(?:\.[0-9]+)?)(?![0-9])", re.IGNORECASE)




INVALID_ANS = "[invalid]"
# extract the numerical answer for arithmetic dataset of the default prompt format
def _extract_answer(completion: str):
    match = ANS_RE.search(completion)
    if match:
        match_str = match.group(1).strip()
        match_str = match_str.replace(",", "")
        return match_str
    else:
        return INVALID_ANS


### 1.7 Tokenizer

In [28]:
# Tokenizer loaded from the local flan-t5-large directory (the path below is
# a FLAN-T5 checkpoint, not a llama one).
# NOTE(review): `model_name` holds the same path but is not what gets passed to
# `from_pretrained`; the literal is repeated, so keep the two in sync.
model_name='/root/autodl-tmp/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained("/root/autodl-tmp/flan-t5-large")


### 1.8 Generate responses to questions

In [29]:
"""
def generate_responses(model,tokenizer,prompt_test=prompt_test,answer_test=answer_test,test_epochs=1,batch_size=2):


    # because of sampling of tokens, we can increase epochs of test for more precise evaluation
    test_epochs=test_epochs

    outputs_text=[]
    # the prompt and answer correspondong to output_text
    prompts_for_outputs=[]
    answers_for_outputs=[]

    batch_size = batch_size  

    for epoch in tqdm(range(test_epochs)):
        for i in tqdm(range(0, len(prompt_test), batch_size)):  # Increment by batch_size

            # Slice the data to create a batch
            batch_questions = prompt_test[i:i+batch_size]
            batch_answers = answer_test[i:i+batch_size]

            batch_inputs = tokenizer(batch_questions, padding=True, return_tensors="pt", truncation=True, max_length=2048).to('cuda')

            batch_outputs = model.generate(**batch_inputs,    
                                     max_new_tokens=256,
                                     top_k=50,
                                     temperature=1.0,
                                     top_p=0.95,
                                     do_sample=True,
                                     repetition_penalty=1.1)
            batch_input_length = batch_inputs.input_ids.shape[1]
            batch_generated_tokens = batch_outputs[:, batch_input_length:]
            batch_output_text = tokenizer.batch_decode(batch_generated_tokens)
            print(batch_output_text)
        
        
            outputs_text.extend(batch_output_text)
            prompts_for_outputs.extend(batch_questions)
            answers_for_outputs.extend(batch_answers)
    return outputs


outputs=generate_responses(model=base_model,tokenizer=tokenizer)
"""

'\ndef generate_responses(model,tokenizer,prompt_test=prompt_test,answer_test=answer_test,test_epochs=1,batch_size=2):\n\n\n    # because of sampling of tokens, we can increase epochs of test for more precise evaluation\n    test_epochs=test_epochs\n\n    outputs_text=[]\n    # the prompt and answer correspondong to output_text\n    prompts_for_outputs=[]\n    answers_for_outputs=[]\n\n    batch_size = batch_size  \n\n    for epoch in tqdm(range(test_epochs)):\n        for i in tqdm(range(0, len(prompt_test), batch_size)):  # Increment by batch_size\n\n            # Slice the data to create a batch\n            batch_questions = prompt_test[i:i+batch_size]\n            batch_answers = answer_test[i:i+batch_size]\n\n            batch_inputs = tokenizer(batch_questions, padding=True, return_tensors="pt", truncation=True, max_length=2048).to(\'cuda\')\n\n            batch_outputs = model.generate(**batch_inputs,    \n                                     max_new_tokens=256,\n              

In [30]:
def generate_responses(generator,prompt_test=prompt_test,answer_test=answer_test,questions_test=questions_test,test_epochs=1,batch_size=10):
    """
    Run `generator` over the test prompts in batches and collect the outputs.

    Args:
        generator: A callable invoked as
            ``generator(list_of_prompts, batch_size=..., return_tensors=False)``.
            Each element of its result must be either a dict with a
            ``'generated_text'`` key or a list whose first item is such a
            dict (both shapes appear in this notebook's outputs).
        prompt_test: The prompts to send to the generator.
        answer_test: Reference answers, aligned with ``prompt_test``.
        questions_test: Bare questions, aligned with ``prompt_test``.
            Defaults to the module-level ``questions_test``; the previous
            default was ``prompt_test``, which returned the prompts twice.
        test_epochs (`int`): Number of full passes over the prompts. More
            than one pass is only useful when the generator samples tokens.
        batch_size (`int`): Number of prompts per generator call.

    Returns:
        A tuple ``(outputs, questions, answers, prompts)`` of equally long
        lists, one entry per generated response.
    """
    outputs=[]
    prompts_for_outputs=[]
    questions_for_outputs=[]
    answers_for_outputs=[]

    for _ in tqdm(range(test_epochs)):
        for start in tqdm(range(0, len(prompt_test), batch_size)):  # Increment by batch_size
            stop = start + batch_size
            batch_prompts = prompt_test[start:stop]

            batch_generations = generator(batch_prompts,batch_size=batch_size,return_tensors=False)
            print(batch_generations)

            for generation in batch_generations:
                # Some pipelines wrap each prompt's result in a list of
                # candidates; keep the first candidate in that case.
                if isinstance(generation, list):
                    generation = generation[0]
                outputs.append(generation['generated_text'])

            prompts_for_outputs.extend(batch_prompts)
            questions_for_outputs.extend(questions_test[start:stop])
            answers_for_outputs.extend(answer_test[start:stop])
    return outputs,questions_for_outputs,answers_for_outputs,prompts_for_outputs

# 2. Base model

### 2.1 Load base model and build pipeline

In [34]:
# load base model
# The checkpoint is loaded with `torch_dtype=torch.float16`; device placement
# is delegated to `from_pretrained` through `device_map='auto'`.
# NOTE(review): this rebinds the global `model_name`, which the fine-tuned-model
# cell in section 3 also passes to `AutoModelForCausalLM.from_pretrained` —
# confirm the intended path there.
model_name="/root/autodl-tmp/flan-t5-large"
base_model = T5ForConditionalGeneration.from_pretrained(model_name,torch_dtype=torch.float16,
                                                  device_map='auto')

In [36]:
# Sampling configuration.
# NOTE(review): `gen_kwargs` is not passed to the pipeline below, and
# `max_new_tokens=521` differs from the 512 used there — possibly a typo; confirm.
gen_kwargs=dict(
    max_new_tokens=521,
    top_k=50,
    temperature=1.0,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.1
)


# Create the pipeline for base model. To make results comparable, we use greedy search to eliminate the influence of sampling
# NOTE(review): with `do_sample=False` the sampling knobs (`top_k`, `temperature`,
# `top_p`) presumably have no effect — confirm against the installed transformers version.
# NOTE(review): `base_model` was already loaded with `device_map='auto'`; whether
# `device_map` here does anything for an instantiated model should be verified.
base_generator= pipeline("text2text-generation", model=base_model, tokenizer=tokenizer,
                         max_new_tokens=512,
                         top_k=50,
                         do_sample=False,
                         temperature=1.0,
                         top_p=0.95,
                         eos_token_id= tokenizer.eos_token_id,
                         device_map='auto'
                        )

### 2.2 Generate responses by base model

In [37]:
# Run the base model over the test prompts; the four returned lists are aligned.
# NOTE(review): `base_generator` is built with `do_sample=False`; if decoding is
# therefore deterministic, the 4 epochs repeat identical generations — confirm
# that `test_epochs=4` is intended.
base_responses,base_response_questions,base_response_answers,base_prompts=generate_responses(base_generator,test_epochs=4,batch_size=10)

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/30 [00:00<?, ?it/s][ABoth `max_new_tokens` (=512) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)

  3%|▎         | 1/30 [01:12<35:02, 72.50s/it][ABoth `max_new_tokens` (=512) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Mary needs to add 6 - 5 = 2 cups of sugar. She needs to add 7 - 7 = 2 cups of salt. Mary needs to add 2 cups of sugar - 2 cups of salt = 2 cups of flour. The answer: 2.'}, {'generated_text': 'He had 492 - 52 = 312 crayons left. He lost 535 - 52 = 312 crayons. So, 312 - 312 = 126 crayons were lost or given away. The answer: 126.'}, {'generated_text': "Robin's hair was 16 - 11 = 10 inches long. He cut off 11 - 2 = 10 inches. So his hair is 10 + 12 = 18 inches long. The answer is 18."}, {'generated_text': 'Josh lost 9 - 23 = 2 marbles. So he lost 2 - 9 = 1 more marble than those he found. The answer: 1.'}, {'generated_text': 'Ed had 29 + 17 = 41 marbles. Doug had 41 - 17 = 43 marbles. Ed had 43 - 43 = 21 more marbles than Doug. The answer is 21.'}, {'generated_text': 'Paco had 25 - 5 = 20 cookies left. He bought 3 + 3 = 6 cookies. Paco ate 20 - 6 = 10 more cookies than those he bought. The answer is 10.'}, {'generated_text': 'Paul sold 71 books so he bought 71 - 38 = 

  3%|▎         | 1/30 [01:35<46:04, 95.33s/it]
  0%|          | 0/4 [01:35<?, ?it/s]


KeyboardInterrupt: 

### 2.3 Process Responses for base_model

In [42]:
# Pull the numeric answer out of every base-model response.
base_extracted_responses = list(map(_extract_answer, base_responses))

# Boolean mask of the responses from which no answer could be extracted.
base_invalid_mask = np.array(base_extracted_responses) == '[invalid]'
print('invalid rate:', sum(base_invalid_mask) / len(base_extracted_responses))
# Cell output: the raw responses that failed extraction.
np.array(base_responses)[base_invalid_mask]

invalid rate: 0.49


array(['Paul had 492 - 52 = 312 crayons left. Paul lost 535 - 52 = 426 crayons. So, 312 - 426 = 126 crayons were lost or given away. Thus, the answer is 126.',
       'Josh lost 9 - 23 = 2 marbles. So he lost 2 - 9 = 1 more marble than those he found. So the answer is 1.',
       'Paco ate 5 cookies so he had 25 - 5 = 20 cookies left. He bought 3 cookies so he had 20 + 3 = 23 cookies. Paco ate 20 cookies so he had 23 - 20 = 9 cookies left. He ate 9 cookies so he had 9 - 3 = 6 more cookies. So, the answer is 6.',
       'The grocery store had 72 - 32 = 28 bottles of regular soda. The grocery store had 78 - 78 = 8 bottles of diet soda. The grocery store had 8 - 32 = 4 bottles of regular soda. The grocery store had 8 - 4 = 4 bottles of diet soda. The grocery store had 8 - 4 = 4 bottles of regular soda. The grocery store had 4 bottles of regular soda and 8 bottles of diet soda. The grocery store had 4 bottles of regular soda and 8 bottles of diet soda. The grocery store had 4 bottles of re

### 2.4 Record the results for Base Model

In [17]:
import pandas as pd

# One row per generated response; the column order below is the order written to the CSV.
df_base_svamp = pd.DataFrame(
    dict(
        Prompt=base_prompts,
        Question=base_response_questions,
        Response=base_responses,
        extracted_response=base_extracted_responses,
        Answer=base_response_answers,
    )
)

# Persist the base-model results, keeping the row index as the first column.
df_base_svamp.to_csv('test_zeroshot_base_svamp.csv', index=True)

### 2.5 calculate metrics for base model

In [18]:
# Fraction of base-model responses whose extracted answer equals the reference;
# responses extracted as '[invalid]' are scored as wrong by `accuracy`.
svamp_base_accuracy=accuracy(reference=base_response_answers, test=base_extracted_responses)

In [19]:
svamp_base_accuracy

0.4533333333333333

# 3. Fine-tuned model

### 3.1 Load fine-tuned model and build pipeline

In [13]:
# load fine-tuned model
torch.cuda.empty_cache()
# Path handed to `PeftModel.from_pretrained` below (the fine-tuned checkpoint).
new_model_name="/root/autodl-tmp/msc_ml/llama_2/ckpts_svamp/llama_2_svamp_8_20"
# NOTE(review): the base weights come from the global `model_name`, which the
# cells above bind to the flan-t5-large path. The checkpoint path suggests a
# Llama-2 base was intended — confirm `model_name` before running the notebook
# top to bottom.
new_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 torch_dtype=torch.float16,
                                                 device_map=0,
                                                )

# Wrap the base model with the fine-tuned checkpoint, then call
# `merge_and_unload()` and keep its result as the model used for generation.
new_model = PeftModel.from_pretrained(new_model, new_model_name)
new_model = new_model.merge_and_unload()

#model.add_adapter(lora_config, adapter_name="adapter_1")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [14]:
# Text-generation pipeline around the fine-tuned model.
# Sampling is disabled (do_sample=False); the alternative settings that were
# tried and left off are top_k=50, top_p=0.95, repetition_penalty=1.1 and
# device=0.
new_generator = pipeline(
    "text-generation",
    model=new_model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=1.0,
    do_sample=False,
)

### 3.2 Generate responses by fine-tuned model

In [15]:
# Run the fine-tuned pipeline through generate_responses with batch_size=10
# and unpack its four results: responses, questions, answers, prompts.
(
    new_responses,
    new_response_questions,
    new_response_answers,
    new_prompts,
) = generate_responses(new_generator, batch_size=10)

  0%|          | 0/1 [00:00<?, ?it/s]

  3%|▎         | 1/30 [00:11<05:40, 11.74s/it][A

[[{'generated_text': "  Sure, I'd be happy to help! Here are my thoughts on the question:\nThoughts:\n1. Let's start by identifying the given information:\n* Mary has already added 5 cups of flour.\n* The recipe calls for 6 cups of flour.\n* The recipe also calls for 8 cups of sugar.\n* The recipe calls for 7 cups of salt.\n2. Now, we need to find out how many more cups of sugar Mary needs to add than cups of salt. Let's do this by subtracting the amount of flour she has already added from the total amount of flour needed:\n6 cups of flour - 5 cups of flour = 1 cup of flour left to add.\n3. Since the recipe calls for 8 cups of sugar, and Mary has already added 5 cups of flour, she needs to add 3 more cups of sugar (8 - 5 = 3).\n4. Now, let's find out how many more cups of salt Mary needs to add than cups of sugar she needs to add:\n7 cups of salt - 3 cups of sugar = 4 cups of salt left to add.\n5. Therefore, Mary needs to add 4 cups of salt.\nAnswer: 4 cups of salt."}], [{'generated_te


  7%|▋         | 2/30 [00:19<04:30,  9.65s/it][A

[[{'generated_text': '\nThoughts:\n\n* Step 1: Identify the given information: Lewis earns $403 every week during 233 weeks of harvest, and he has to pay $49 rent every week.\n* Step 2: Calculate the total amount of money Lewis earns during harvest season: 233 weeks x $403 per week = $93,530.\n* Step 3: Subtract the amount of rent Lewis has to pay each week: $93,530 - $49 per week = $89,440.\n* Step 4: Calculate the total amount of money Lewis earns during harvest season after subtracting the rent: $89,440.\nAnswer: $89,440.'}], [{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n\n* Jessie weighed 92 kilograms initially.\n* She lost 56 kilograms in the first week of jogging.\n* She lost 99 kilograms in the second week of jogging.\n\nNow, let's calculate Jessie's weight after the first week of jogging:\n\n* Initial weight = 92 kg\n* Loss in first week = 56 kg\n* New weight = Initial weight - Loss = 92 kg - 56 kg = 36 kg\n\nSo, Jessie wei


 10%|█         | 3/30 [00:28<04:12,  9.36s/it][A

[[{'generated_text': "\nThoughts:\nLet's break down the information given in the question:\n\n1. Jerry had some action figures on a shelf in his room.\n2. Later, he added 2 more action figures to the shelf.\n3. He removed 7 of the old action figures from the shelf.\n4. Now, there are 10 action figures on the shelf.\n\nSo, if Jerry had initially X action figures on the shelf, and he added 2 more and removed 7, then the total number of action figures he has now is X + 2 - 7 = 10.\nTo find out how many action figures Jerry had initially on the shelf, we can solve for X in the equation:\nX + 2 - 7 = 10\nX = 3\nSo, Jerry had 3 action figures on the shelf initially.\nAnswer: 3"}], [{'generated_text': "\nThoughts:\n\n1. Let's assume the harvesting process is continuous and doesn't stop for any reason.\n2. The question asks how many days it will take to harvest 24 sacks of oranges, given that they harvest 8 sacks per day.\n3. To find out, we need to divide the number of sacks to be harvested (


 13%|█▎        | 4/30 [00:47<05:42, 13.16s/it][A

[[{'generated_text': "\nThoughts:\n\n1. Let's start by identifying the given information:\n\t* Mary has already added 7 cups of flour.\n\t* The recipe calls for 10 cups of flour.\n\t* The recipe also calls for 2 cups of sugar.\n\t* The recipe calls for 80 cups of salt.\n2. To find out how many more cups of flour Mary needs to add, we need to subtract the amount of flour she has already added from the total amount of flour needed.\n3. So, the difference between the total amount of flour needed (10 cups) and the amount of flour Mary has already added (7 cups) is:\n10 - 7 = 3\n4. Therefore, Mary needs to add 3 more cups of flour.\nAnswer: 3"}], [{'generated_text': "\nThoughts:\nLet's break down the problem step by step:\n\n1. The mailman has to give 32 pieces of junk mail to each of the 55 blocks.\n\t* So, the total number of pieces of junk mail the mailman has to give is 32 x 55 = 1760.\n2. If he gives 8 mails to each house in a block, then the number of houses in a block is 1760 / 8 = 2


 17%|█▋        | 5/30 [01:01<05:33, 13.34s/it][A

[[{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n* Danny found 30 bottle caps at the park.\n* Danny threw away 63 old bottle caps.\n* Now Danny has 42 bottle caps in his collection.\nTo find out how many more bottle caps Danny threw away than those he found at the park, we need to subtract the number of bottle caps he found at the park from the number of bottle caps he threw away:\n63 (bottle caps thrown away) - 30 (bottle caps found at the park) = 33\nSo, Danny threw away 33 more bottle caps than those he found at the park.\nAnswer: 33"}], [{'generated_text': "\nThoughts:\nLet's break down the information given in the question:\n\n* There were 15 roses in the vase initially.\n* Jessica threw away 33 roses from the vase.\n* She cut some more new roses from her flower garden to put in the vase.\n* Now, there are 17 roses in the vase.\nFrom the information given, we can conclude that Jessica cut more roses than she threw away. To find 


 20%|██        | 6/30 [01:07<04:23, 10.98s/it][A

[[{'generated_text': "  Sure, I'd be happy to help! Here are my thoughts on the question:\nThoughts:\n1. Bryan has 9 books in each of his 10 bookshelves, so he has a total of 9 x 10 = 90 books.\n2. Bryan also has 46 magazines in each of his 10 bookshelves, so he has a total of 46 x 10 = 460 magazines.\n3. To find the total number of magazines Bryan has, we need to add up the number of magazines in each bookshelf: 460 magazines.\nAnswer: 460"}], [{'generated_text': '\nThoughts:\n\n1. Josh had 19 marbles in his collection to start with.\n2. He lost 11 marbles, so he has 19 - 11 = 8 marbles left.\n3. He found 5 new marbles, so he now has 8 + 5 = 13 marbles in his collection.\nAnswer: 13'}], [{'generated_text': "\nThoughts:\nLet's break this question down step by step:\n\nStep 1: We know that the ring toss game at the carnival made the same amount of money each day.\nStep 2: We are given that in total, they earned $165 in 5 days.\nStep 3: To find out how much they made per day, we can divi


 23%|██▎       | 7/30 [01:14<03:38,  9.52s/it][A

[[{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n* There are 635 girls in the school.\n* There are 510 more boys than girls.\nSo, the total number of boys in the school is:\n635 + 510 =?\nAnswer: 1145"}], [{'generated_text': "\nThoughts:\n\n1. Let's start by identifying the information given in the question:\n\t* The Ferris wheel in Paradise Park has 6 seats.\n\t* Each seat can hold 14 people.\n\t* There are currently 16 people on the wheel.\n2. To find out how many more people can join the wheel, we need to subtract the number of people already on the wheel from the total capacity of the wheel.\n3. The total capacity of the wheel is 6 seats x 14 people per seat = 84 people.\n4. So, the number of more people that can join the wheel is:\n84 - 16 = 68\n\nAnswer: 68"}], [{'generated_text': "\nThoughts:\n\n1. Let's start by identifying the information given in the question:\n\t* The book has 193 pages.\n\t* The book is divided into 15 ch


 27%|██▋       | 8/30 [01:25<03:38,  9.93s/it][A

[[{'generated_text': "\nThoughts:\n\n1. Let's count the number of storks and birds initially: 6 storks + 2 birds = 8.\n2. Now, 3 more birds have come to join them, so the total number of birds is 8 + 3 = 11.\n3. Therefore, there are 6 storks - 3 birds = 3 more storks than birds sitting on the fence.\nAnswer: 3"}], [{'generated_text': "\nThoughts:\n\nLet's start by using the information given in the question:\n\n* Paul had 136 books initially.\n* He sold some books in a garage sale, leaving him with 27 books.\n\nSo, the number of books Paul sold is:\n\n136 - 27 = 109\n\nTherefore, Paul sold 109 books in the garage sale.\n\nAnswer: 109"}], [{'generated_text': "\nThoughts:\n\n1. Let's first identify the total number of seats on the Ferris wheel: 3 small seats + 7 large seats = 10 seats.\n2. Each small seat can hold 16 people, so the total number of people who can ride the Ferris wheel on small seats is 16 x 3 = 48 people.\n3. Now, let's find out how many people can ride the Ferris wheel o


 30%|███       | 9/30 [01:41<04:07, 11.77s/it][A

[[{'generated_text': '\nThoughts:\n\n1. Danny already has 21 bottle caps in his collection.\n2. He found some more bottle caps at the park.\n3. Let\'s call the number of bottle caps he found at the park "x".\n4. We know that the total number of bottle caps Danny has now is 53.\n5. So, we can set up the equation: 21 + x = 53.\n6. Now, let\'s solve for "x".\n7. To do this, we can subtract 21 from both sides of the equation: x = 53 - 21.\n8. Now, simplify the equation: x = 32.\n\nAnswer: 32'}], [{'generated_text': "  Sure, I'd be happy to help! Here are my thoughts on the question:\nThoughts:\nStep 1: Understand the question\nThe question is asking about the number of sacks of oranges that will be available after 51 days of harvest, based on the information that they harvest 74 sacks per day and discard 71 of them.\nStep 2: Identify the relevant information\nThe relevant information in the question is the number of sacks harvested per day (74) and the number of sacks discarded per day (71


 33%|███▎      | 10/30 [01:52<03:53, 11.68s/it][A

[[{'generated_text': "\nThoughts:\n\n* Let's start by identifying the information given in the question:\n\t+ Debby drank 109 water bottles per day.\n\t+ The bottles lasted for 74 days.\n* Now, let's think about how we can use this information to find the number of bottles Debby bought:\n\t+ If Debby drank 109 bottles per day and the bottles lasted for 74 days, then she must have bought a total of 74 x 109 = 7860 bottles.\n* So, the answer is 7860 bottles.\nAnswer: 7860"}], [{'generated_text': "\nThoughts:\nLet's first identify the information given in the question:\n* There are 11 different books in the 'crazy silly school' series.\n* There are 17 different movies in the 'crazy silly school' series.\n* If you read 7 of the books, you have read 7 books.\n* If you watched 21 of the movies, you have watched 21 movies.\nNow, let's calculate the difference between the number of movies you watched and the number of books you read:\nMovies watched - Books read = 21 - 7 = 14\nSo, you have rea


 37%|███▋      | 11/30 [02:02<03:29, 11.04s/it][A

[[{'generated_text': "  Sure, I'd be happy to help! Here are my thoughts on the question:\nThoughts:\nLet's start by looking at the information given in the question:\n* He found out that they harvest 44 sacks of ripe oranges and 25 sacks of unripe oranges per day.\n* So, the total number of oranges harvested per day is 44 + 25 = 69 sacks.\n* Now, we need to find out how many more sacks of ripe oranges are harvested than unripe oranges per day.\n* To do this, we can divide the number of ripe oranges harvested (44) by the number of unripe oranges harvested (25).\n* So, the number of more sacks of ripe oranges than unripe oranges harvested per day is 44 / 25 = 1.76.\n* Since we can't have a fraction of a sack, let's round up to the nearest whole number, which is 2.\n* Therefore, there are 2 more sacks of ripe oranges than unripe oranges harvested per day.\nAnswer: 2"}], [{'generated_text': '\nThoughts:\n\n1. Paul had 65 crayons at the beginning of the school year.\n2. He gave away 213 cr


 40%|████      | 12/30 [02:09<02:56,  9.82s/it][A

[[{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n* 907611 kids from Lawrence county stay home during summer break.\n* 455682 kids go to camp.\nWe can start by dividing the number of kids who go to camp by the number of kids who stay home:\n455682 / 907611 = 0.05\nSo, about 5% of the kids in Lawrence county go to camp during summer break.\nNow, we can use the total number of kids in Lawrence county to find the number of kids who stay home:\n907611 x 0.05 = 45380.5\nSo, there are approximately 45380.5 kids who stay home in Lawrence county during summer break.\nAnswer: 45380.5"}], [{'generated_text': '\nThoughts:\n\n1. Danny already had 12 bottle caps in his collection.\n2. He found 53 more bottle caps at the park.\n3. So, the total number of bottle caps Danny has now is 12 + 53 = 65.\nAnswer: 65'}], [{'generated_text': '\nThoughts:\n\n1. Randy has 36 blocks to build a tower.\n2. If he had 59 blocks, he would have 23 blocks left (59 - 3


 43%|████▎     | 13/30 [02:14<02:24,  8.50s/it][A

[[{'generated_text': '\nThoughts:\n\n1. Jerry had 2 action figures and 10 books on the shelf initially.\n2. Later, he added 4 more action figures to the shelf.\n3. So, the total number of action figures on the shelf now is 2 + 4 = 6.\n4. The total number of books on the shelf is still 10.\n5. Therefore, the number of more books than action figures on the shelf is 10 - 6 = 4.\nAnswer: 4'}], [{'generated_text': '  Sure! Here are my thoughts on the question:\nThoughts:\n1. Mary has already added 10 cups of sugar to the recipe, so she needs to add 1 cup more.\n2. The recipe calls for 11 cups of sugar in total, so Mary needs to add 1 cup more to reach the total amount.\n3. To find out how many cups of sugar Mary needs to add, we can subtract the amount she has already added (10 cups) from the total amount needed (11 cups). This means Mary needs to add 1 cup more of sugar.\nAnswer: 1 cup\n\nSo, Mary needs to add 1 cup of sugar to the recipe to complete it.'}], [{'generated_text': '\nThoughts


 47%|████▋     | 14/30 [02:24<02:23,  8.99s/it][A

[[{'generated_text': "\nThoughts:\n\n1. Rebecca has a total of 11 eggs to split into groups of 2 eggs each.\n2. To find out how many eggs Rebecca has, we need to divide the total number of eggs by the number of eggs in each group.\n3. So, we divide 11 eggs by 2 eggs per group: 11 ÷ 2 = 5.5 groups.\n4. Since we can't have a fraction of a group, we round up to the nearest whole number: 5.5 groups = 6 groups.\n5. Therefore, Rebecca has 6 groups of eggs, which means she has 6 x 2 = 12 eggs in total.\nAnswer: 12"}], [{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n\n* Winter is almost here.\n* Most animals are migrating to warmer countries.\n* 27 bird families flew away for the winter from near a mountain.\n* Initially, there were 41 bird families living near the mountain.\n\nNow, let's think about how we can use this information to find the answer to the question:\n\nIf 27 bird families migrated away for the winter, then the number of bir


 50%|█████     | 15/30 [02:32<02:07,  8.47s/it][A

[[{'generated_text': "\nThoughts:\nLet's see, if there were initially 11 people on the train, and 8 more people got on, then the total number of people on the train now is... hmm... 11 + 8 = 19 people are on the train now.\nAnswer: 19"}], [{'generated_text': "\nThoughts:\n\n1. Let's start by finding the total number of pages in the first chapter. We know that the first chapter is 66 pages long, so the total number of pages in the first chapter is 66.\n2. Next, let's find the total number of pages in the second chapter. We know that the second chapter is 35 pages long, so the total number of pages in the second chapter is 35.\n3. Now, let's find the total number of pages in the third chapter. We know that the third chapter is 24 pages long, so the total number of pages in the third chapter is 24.\n4. Finally, let's add the total number of pages in each chapter to find the total number of pages in the book. So, the total number of pages in the book is 66 + 35 + 24 = 125 pages.\nAnswer: 1


 53%|█████▎    | 16/30 [02:40<01:56,  8.35s/it][A

[[{'generated_text': "\nThoughts:\n\n1. Let's start by identifying the information given in the question: Baker made 155 cakes and his friend bought 140 cakes from him.\n2. We know that the total number of cakes made by Baker is 155, and his friend bought 140 of them.\n3. To find out how many cakes Baker would still have, we need to subtract the number of cakes his friend bought from the total number of cakes made by Baker.\n4. So, the number of cakes Baker would still have is 155 - 140 = 15.\n5. Therefore, Baker would still have 15 cakes.\nAnswer: 15"}], [{'generated_text': '\nThoughts:\n\n1. Bobby initially had 36 pieces of candy.\n2. He ate 17 pieces of candy, so he has 36 - 17 = 19 pieces of candy left.\n3. Then he ate 15 more pieces of candy, so he has 19 - 15 = 4 pieces of candy left.\nAnswer: 4'}], [{'generated_text': "\nThoughts:\nLet's start by using the information given in the question: Jake has 11 fewer peaches than Steven. This means that if Jake has x peaches, Steven has 


 57%|█████▋    | 17/30 [02:49<01:52,  8.62s/it][A

[[{'generated_text': "\nThoughts:\nLet's start by using the information given in the question to find out how many crayons Paul had at the end of the school year. We know that he had 521 crayons initially and that he didn't lose any erasers. So, the number of crayons he had at the end of the school year is:\n521 - 66 = 455\nNow, we need to find out how many crayons Paul had at the beginning of the school year. We are told that he had 154 crayons left at the end of the school year, so the number of crayons he had at the beginning of the school year is:\n455 - 154 = 301\nNow, we can find out how many crayons Paul had given away or lost by subtracting the number of crayons he had at the end of the school year from the number of crayons he had at the beginning of the school year:\n301 - 154 = 147\nSo, Paul had given away or lost 147 crayons during the school year.\nAnswer: 147"}], [{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n* The pic


 60%|██████    | 18/30 [02:56<01:37,  8.16s/it][A

[[{'generated_text': "\nThoughts:\nLet's start by using the information given in the question: David did 44 push-ups in gym class today. David did 9 more push-ups than Zachary.\nSo, Zachary did:\n44 + 9 = 53 push-ups\n\nAnswer: 53"}], [{'generated_text': '\nThoughts:\n\n1. Kelly has 80 Nintendo games initially.\n2. She found 31 more Nintendo games, so now she has 80 + 31 = 111 games.\n3. To find out how many games Kelly needs to give away to have 6 games left, we need to subtract the number of games she has already given away (6) from the total number of games she has (111).\n4. So, Kelly needs to give away 111 - 6 = 105 games.\nAnswer: 105'}], [{'generated_text': "\nThoughts:\n\n1. Let's start by identifying the number of girls in the school: 315\n2. Now, let's find the number of boys in the school: 309\n3. Next, let's count the number of teachers in the school: 772\n4. Now, we need to find the total number of people in the school, which is the sum of the number of girls, boys, and te


 63%|██████▎   | 19/30 [03:05<01:32,  8.37s/it][A

[[{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n* David did 40 more push-ups than Zachary.\n* David did 17 less crunches than Zachary.\n* Zachary did 34 push-ups and 62 crunches.\nFrom these statements, we can conclude that:\n* David did 34 + 40 = 74 push-ups.\n* David did 62 - 17 = 45 crunches.\nTherefore, David did 45 crunches in gym class today.\nAnswer: 45"}], [{'generated_text': "\nThoughts:\n\n1. Let's start by counting the total number of eggs Rebecca has: 9 eggs.\n2. Now, let's count the number of bananas Rebecca has: 99 bananas.\n3. Next, let's count the number of marbles Rebecca has: 27 marbles.\n4. Now, let's divide the total number of eggs by 3 to find out how many groups Rebecca can create: 9 eggs / 3 = 3 groups.\n5. So, Rebecca can create 3 groups of eggs.\nAnswer: 3"}], [{'generated_text': "\nThoughts:\n\n* Let's assume that the number of sacks of unripe oranges remains the same throughout the 73 days of harvest.\n* T


 67%|██████▋   | 20/30 [03:19<01:40, 10.01s/it][A

[[{'generated_text': "  Sure, I'd be happy to help! Here are my thoughts on the question:\nThoughts:\n1. Let's start by identifying the given information:\n* Mary has already added 12 cups of flour.\n* The recipe calls for 6 cups of flour.\n* The recipe also calls for 11 cups of sugar and 9 cups of salt.\n2. Now, let's think about how much sugar Mary needs to add:\n* If Mary has already added 12 cups of flour, that means she has 6 cups of flour left to add (since the recipe calls for 6 cups of flour).\n* The recipe calls for 11 cups of sugar, so Mary needs to add 5 cups of sugar (since she has 6 cups of flour left to add).\n3. Next, let's think about how much salt Mary needs to add:\n* The recipe calls for 9 cups of salt.\n* Since Mary has already added 12 cups of flour, she has 3 cups of flour left to add (since the recipe calls for 6 cups of flour).\n* Therefore, Mary needs to add 3 cups of salt (since she has 3 cups of flour left to add).\n4. Now, let's summarize the information:\n*


 70%|███████   | 21/30 [03:38<01:55, 12.85s/it][A

[[{'generated_text': "  Sure, I'd be happy to help you with that! Here are my thoughts on the question:\nThoughts:\nStep 1: Identify the information given in the question:\n* The Razorback t-shirt shop makes $87 per t-shirt sold.\n* They sold a total of 95 t-shirts during the Arkansas game and the Texas Tech game.\n* They sold 47 t-shirts during the Arkansas game.\nStep 2: Determine the total amount of money made from selling t-shirts during the Arkansas game:\n* If they sold 47 t-shirts during the Arkansas game, and each t-shirt costs $87, then the total amount of money made from selling t-shirts during the Arkansas game is:\n$47 x $87 = $4,099\nStep 3: Determine the amount of money made from selling t-shirts during the Texas Tech game:\n* Since they sold a total of 95 t-shirts during both games, and they sold 47 t-shirts during the Arkansas game, then they sold 48 t-shirts during the Texas Tech game.\n* If they sold 48 t-shirts during the Texas Tech game, and each t-shirt costs $87, 


 73%|███████▎  | 22/30 [03:46<01:30, 11.37s/it][A

[[{'generated_text': "\nThoughts:\nLet's break down the information given in the question:\n\n* Jack received 4 emails in the morning.\n* He received a total of 5 emails in the day.\n* So, the number of emails Jack received in the afternoon is 5 - 4 = 1.\n\nAnswer: 1"}], [{'generated_text': '\nThoughts:\n\n* Faye has 14 rows of pencils.\n* Each row has 11 pencils, so the total number of pencils is 14 x 11 = 154 pencils.\n\nAnswer: 154'}], [{'generated_text': '\nThoughts:\n\n1. Paco had 36 cookies to start with.\n2. He gave 14 cookies to his friend, so now he has 36 - 14 = 22 cookies left.\n3. He ate 10 cookies, so now he has 22 - 10 = 12 cookies left.\nAnswer: 12'}], [{'generated_text': "\nThoughts:\n\n1. Let's count the number of eggs Rebecca has: 12 eggs.\n2. Now, let's count the number of marbles Rebecca has: 5 marbles.\n3. To find out how many more eggs Rebecca has than marbles, we need to subtract the number of marbles from the number of eggs: 12 eggs - 5 marbles = 7 eggs.\nSo, Re


 77%|███████▋  | 23/30 [03:53<01:11, 10.15s/it][A

[[{'generated_text': "\nThoughts:\n\n* Let's first identify the information given in the question:\n\t+ Jack received 4 emails in the morning.\n\t+ Jack received 5 emails in the afternoon.\n\t+ Jack received 8 emails in the evening.\n* Now, let's find out the total number of emails Jack received in the afternoon and evening:\n\t+ The afternoon refers to the time period between 12 PM and 5 PM.\n\t+ Jack received 5 emails in the afternoon, so the total number of emails he received in the afternoon is 5.\n\t+ The evening refers to the time period between 5 PM and 12 AM.\n\t+ Jack received 8 emails in the evening, so the total number of emails he received in the evening is 8.\n* Therefore, the answer to the question is:\n\t+ Jack received 5 emails in the afternoon and 8 emails in the evening.\nAnswer: 13"}], [{'generated_text': '\nThoughts:\n\n1. Sofia asked 310 students for their suggestions on what food to add to the menu.\n2. 185 students suggested adding mashed potatoes.\n3. Therefore,


 80%|████████  | 24/30 [04:03<01:00, 10.14s/it][A

[[{'generated_text': '\nThoughts:\n\n1. Allan brought 5 balloons to the park.\n2. Jake brought 6 balloons to the park.\n3. Jake then bought 3 more balloons at the park.\n4. So, the total number of balloons Jake brought to the park is 6 + 3 = 9 balloons.\nAnswer: 9'}], [{'generated_text': "  Sure, I'd be happy to help! Here are my thoughts on the question:\nThoughts:\n* The person needs a total of 49 g of colors to paint the feathers.\n* The feathers require 288 g of wax to stick the feathers together.\n* The person currently has 260 g of wax.\n* To find out how many grams of wax the person already has, we need to subtract the amount of wax they currently have (260 g) from the total amount of wax needed (288 g).\n* So, the person already has 28 g of wax.\nAnswer: 28 g"}], [{'generated_text': "\nThoughts:\nLet's see, if you have 28 cookies and 86 candies in total, that means you have a total of 114 items. Since each bag has an equal number of cookies, we can divide the total number of it


 83%|████████▎ | 25/30 [04:10<00:45,  9.09s/it][A

[[{'generated_text': "  Sure, I'd be happy to help! Here are my thoughts on the question:\nThoughts:\nLet's start by using the information given in the question to find the total number of sacks of oranges harvested per day:\n5 sacks of ripe oranges + 74 sacks of unripe oranges = 89 sacks of oranges per day\nNow, let's find the difference between the number of sacks of unripe oranges and ripe oranges harvested per day:\n74 sacks of unripe oranges - 5 sacks of ripe oranges = 69 sacks of unripe oranges per day\nSo, there are 69 more sacks of unripe oranges than ripe oranges harvested per day.\nAnswer: 69"}], [{'generated_text': '\nThoughts:\n\n1. Jerry had 8 action figures on the shelf to start with.\n2. Later, he added 4 more action figures to the shelf, so now he has 8 + 4 = 12 action figures on the shelf.\n3. Then, he removed 5 old action figures from the shelf, so now he has 12 - 5 = 7 action figures on the shelf.\nAnswer: 7'}], [{'generated_text': "\nThoughts:\n\n1. Let's start by i


 87%|████████▋ | 26/30 [04:18<00:35,  8.77s/it][A

[[{'generated_text': "\nThoughts:\nLet's first identify the information given in the question:\n\n* There are 8 different movies in the 'crazy silly school' series.\n* There are 21 different books in the series.\n* If you read 7 of the books, then you have read 7/21 = 1/3 of the books.\n* If you watched 4 of the movies, then you have watched 4/8 = 1/2 of the movies.\nNow, we need to find out how many movies you still have to watch. To do this, we need to subtract the number of movies you have watched from the total number of movies in the series:\nNumber of movies still to watch = Total number of movies - Number of movies watched\n= 8 - 4\n= 4\n\nSo, you still have 4 movies left to watch from the 'crazy silly school' series.\nAnswer: 4"}], [{'generated_text': "\nThoughts:\nLet's start by using the information given in the question to find the distance jumped by the frog:\nFrog jumped 33 inches farther than the grasshopper, so the frog jumped a total of 33 + 24 = 57 inches.\nNow, let's 


 90%|█████████ | 27/30 [04:26<00:25,  8.39s/it][A

[[{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n* There are 384 oranges in Philip's collection.\n* There are 192 bananas in Philip's collection.\n* The bananas are organized into 345 groups.\n* The oranges are organized into 16 groups.\nNow, let's think about how we can use this information to find the size of each group of oranges:\n* If there are 16 groups of oranges, and there are 384 oranges in total, then the number of oranges in each group is 384 / 16 = 24.\nSo, each group of oranges contains 24 oranges.\nAnswer: 24"}], [{'generated_text': "\nThoughts:\nLet's break down the problem step by step:\n\n1. Jerry had some action figures on a shelf in his room.\n\t* Let's assume he had x action figures initially on the shelf.\n2. Later he added 7 more action figures to the shelf.\n\t* So, the total number of action figures on the shelf now is x + 7.\n3. If there are a total of 10 action figures on his shelf now, then we can set up th


 93%|█████████▎| 28/30 [04:35<00:17,  8.57s/it][A

[[{'generated_text': "\nThoughts:\n\n1. Let's start by identifying the information given in the question:\n\t* Mom buys 28 packages.\n\t* Each package contains 56 white t-shirts in total.\n2. To find out how many white t-shirts each package has, we need to divide the total number of t-shirts in each package by the number of packages:\n\t* 56 white t-shirts / 28 packages =?\n3. Now, let's simplify the calculation:\n\t* 56 / 28 = 2\n4. So, each package has 2 white t-shirts.\nAnswer: 2"}], [{'generated_text': '\nThoughts:\n\n1. Dan spent $ 7 on the candy bar.\n2. Dan spent $ 6 on the chocolate.\n3. Total amount spent = $ 7 + $ 6 = $ 13.\n\nAnswer: $ 13'}], [{'generated_text': '\nThoughts:\n\n1. Danny had 25 bottle caps in his collection earlier.\n2. He found 32 bottle caps at the park.\n3. To find out how many more bottle caps he found, we need to subtract the number he already had from the number he found.\n4. 32 - 25 = 7\n\nAnswer: 7'}], [{'generated_text': "  Great! Let's work through 


 97%|█████████▋| 29/30 [04:42<00:08,  8.32s/it][A

[[{'generated_text': '\nThoughts:\n\n1. Dan spent $2 to buy a candy bar.\n2. He spent $3 to buy a chocolate.\n3. To find out how much more he spent on the chocolate than the candy bar, we need to subtract the cost of the candy bar from the cost of the chocolate.\n4. So, the difference between the cost of the chocolate and the candy bar is $3 - $2 = $1.\nAnswer: $1'}], [{'generated_text': "\nThoughts:\n\n1. Let's start by identifying the information given in the question:\n\t* Tiffany had 7 bags of cans on Monday.\n\t* The next day, she found 12 more bags worth of cans.\n2. To find out how many more bags Tiffany found on the next day than she had on Monday, we need to subtract the number of bags she had on Monday from the number of bags she found on the next day:\n\t* 12 bags - 7 bags = 5 bags\nSo, Tiffany found 5 more bags of cans on the next day than she had on Monday.\nAnswer: 5"}], [{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n*


100%|██████████| 30/30 [04:57<00:00,  9.90s/it][A
100%|██████████| 1/1 [04:57<00:00, 297.13s/it]

[[{'generated_text': "\nThoughts:\n\n1. Let's start by counting the number of bottles of regular soda: 22.\n2. Now, let's count the number of bottles of diet soda: 61.\n3. To find out how many more bottles of diet soda there are than regular soda, we can subtract the number of regular soda bottles from the number of diet soda bottles: 61 - 22 = 39.\n4. So, the grocery store has 39 more bottles of diet soda than regular soda.\nAnswer: 39"}], [{'generated_text': "\nThoughts:\nLet's start by identifying the information given in the question:\n* Paul got a box of crayons for his birthday.\n* By the end of the school year, he had either lost or given away 551 of them.\n* He only had 177 left.\nNow, let's think about how we can use this information to find the number of crayons Paul got for his birthday.\n1. If Paul had 551 crayons at the end of the school year, and he gave away or lost 551, then he must have had more crayons at the beginning of the school year.\n2. If he had more crayons at




### 3.3 Process Responses for new_model

In [16]:
# Run _extract_answer over every raw response from the fine-tuned model.
# NOTE(review): '[invalid]' is treated below as the marker for "no answer
# could be extracted" -- confirm against _extract_answer.
new_extracted_responses = [_extract_answer(response) for response in new_responses]

# Build the invalid mask once, comparing element by element in Python.
# Comparing np.array(new_extracted_responses) == '[invalid]' depends on the
# dtype NumPy infers: when every answer is numeric, older NumPy versions
# return a scalar False instead of an array, which breaks sum() and indexing.
new_invalid_mask = np.array(
    [extracted == '[invalid]' for extracted in new_extracted_responses],
    dtype=bool,
)

# Report 0.0 for an empty run instead of dividing by zero.
if len(new_invalid_mask):
    new_invalid_rate = new_invalid_mask.sum() / len(new_invalid_mask)
else:
    new_invalid_rate = 0.0
print('invalid rate:', new_invalid_rate)

# Show the raw responses for which no answer could be extracted.
np.array(new_responses)[new_invalid_mask]

invalid rate: 0.006666666666666667


array(["\nThoughts:\nLet's start by using the information given in the question to set up an equation. We know that the grasshopper jumped 39 inches, and the frog jumped x inches. We also know that the mouse jumped y inches, where y is the distance the mouse jumped.\nWe can start by using the fact that the grasshopper jumped 19 inches farther than the frog to write:\n39 = x + 19\nNow, we can use the fact that the mouse jumped 12 inches less than the frog to write:\nx + 12 = y\nNow we have two equations and two unknowns. Can you use these equations to find the value of y, which is the distance the mouse jumped?",
       "\nThoughts:\nLet's start by dividing the total number of skittles and erasers by the number of groups.\nTotal number of skittles: 4502\nTotal number of erasers: 4276\nNumber of groups: 154\nNow, we can find the size of each group by dividing the total number of skittles and erasers by the number of groups.\nSize of each group:\nSkittles: 4502 / 154 = 29\nErasers: 4276 /

### 3.4 Record the results for new_model

In [17]:
import pandas as pd

# Gather the per-example evaluation fields for new_model, one column each.
new_svamp_columns = {
    'Prompt': new_prompts,
    'Question': new_response_questions,
    'Response': new_responses,
    'extracted_response': new_extracted_responses,
    'Answer': new_response_answers,
}
df_new_svamp = pd.DataFrame(new_svamp_columns)

# Save the table; index=True writes the row index as the first CSV column.
df_new_svamp.to_csv('test_zeroshot_new_svamp.csv', index=True)

### 3.5 Calculate metrics for new_model

In [20]:
# Score the extracted new_model answers against the reference answers.
svamp_new_accuracy = accuracy(
    reference=new_response_answers,
    test=new_extracted_responses,
)

In [21]:
# Display the accuracy value computed for new_model.
svamp_new_accuracy

0.53

# Bug fix

In [13]:
import rich
from rich.table import Table

# Two-row, single-column table whose cells hold the literal "[/INST][INST]" text.
# NOTE(review): rich treats square-bracket tags as console markup, so this cell
# presumably reproduces a rendering problem with that literal — confirm.
rich_table = Table("prompt", title='1', show_lines=True)
for row_number in range(2):
    rich_table.add_row("[/INST][INST]")
rich_table

In [44]:
# Load the local SVAMP copy with split="all"; presumably this returns every
# split combined into one dataset — TODO confirm against the datasets docs.
ds_all = load_dataset("/root/autodl-tmp/SVAMP", split="all")

In [45]:
# Load only the "train" split from the local SVAMP directory.
ds_train = load_dataset("/root/autodl-tmp/SVAMP", split="train")

In [48]:
# Count how many 'Body' entries of ds_train equal, position by position,
# the first 700 'Body' entries of ds_all.
train_bodies = np.array(ds_train['Body'])
leading_all_bodies = np.array(ds_all['Body'][:700])
sum(train_bodies == leading_all_bodies)

700

In [61]:
# Display the 'Answer' column of the train split.
ds_train['Answer']

[145.0,
 19.0,
 3.0,
 198.0,
 63.0,
 322.0,
 30.0,
 192.0,
 26.0,
 17.0,
 1.0,
 20.0,
 4.0,
 2.0,
 7.0,
 70.0,
 345.0,
 266.0,
 54.0,
 807.0,
 3.0,
 12.0,
 23.0,
 31.0,
 1201565.0,
 89.0,
 574.0,
 8.0,
 146.0,
 1.0,
 14.0,
 2.0,
 7.0,
 12.0,
 21.0,
 14.0,
 2.0,
 51.0,
 2.0,
 15.0,
 20.0,
 3.0,
 7.0,
 42.0,
 756.0,
 11.0,
 5.0,
 12.0,
 65.0,
 1.0,
 111.0,
 1.0,
 27.0,
 4.0,
 17.0,
 13.0,
 10.0,
 77.0,
 1092.0,
 5.0,
 7.0,
 45.0,
 10.0,
 52.0,
 2.0,
 6.0,
 16.0,
 469.0,
 12518.0,
 1.0,
 2.0,
 131.0,
 92.0,
 640.0,
 42.0,
 7.0,
 25.0,
 1.0,
 2.0,
 11.0,
 15.0,
 20.0,
 8722.0,
 5.0,
 83.0,
 29.0,
 53.0,
 3.0,
 33.0,
 1.0,
 16.0,
 47.0,
 5590.0,
 1.0,
 1.0,
 49.0,
 9.0,
 78.0,
 34.0,
 7.0,
 24.0,
 34.0,
 10.0,
 14.0,
 10.0,
 15.0,
 621.0,
 1.0,
 2.0,
 14.0,
 8.0,
 2.0,
 14.0,
 89.0,
 20.0,
 7.0,
 1.0,
 16.0,
 21.0,
 3.0,
 5.0,
 9.0,
 76.0,
 28.0,
 30.0,
 19.0,
 2.0,
 22.0,
 7.0,
 42.0,
 14.0,
 24.0,
 8.0,
 27.0,
 31.0,
 2.0,
 21.0,
 4.0,
 80.0,
 3.0,
 2.0,
 7.0,
 1.0,
 4.0,
 4.0,
 32.0,
 60

In [23]:
import pandas

In [24]:
# Reload the saved zero-shot SVAMP result tables: df1 from the base-model CSV,
# df2 from the new-model CSV.
base_csv_path = '/root/autodl-tmp/msc_ml/llm_thought/Evaluation/test_zeroshot_base_svamp.csv'
new_csv_path = '/root/autodl-tmp/msc_ml/llm_thought/Evaluation/test_zeroshot_new_svamp.csv'
df1 = pandas.read_csv(base_csv_path)
df2 = pandas.read_csv(new_csv_path)


In [28]:
# Count the rows where both result files contain exactly the same response text.
same_response_mask = df1['Response'] == df2['Response']
sum(same_response_mask)

61

In [27]:
# Print the first response from each result file, separated by a marker line,
# so the two can be compared by eye.
first_df1_response = df1['Response'][0]
first_df2_response = df2['Response'][0]
print(first_df1_response)
print("*********")
print(first_df2_response)

  Sure, I'd be happy to help! Here are my thoughts on the question:
Thoughts:
1. Let's start by identifying the given information:
* Mary has already added 5 cups of flour.
* The recipe calls for 6 cups of flour.
* The recipe also calls for 8 cups of sugar.
* The recipe calls for 7 cups of salt.
2. Now, we need to find out how many more cups of sugar Mary needs to add than cups of salt. Let's do this by subtracting the amount of flour she has already added from the total amount of flour needed:
6 cups of flour - 5 cups of flour = 1 cup of flour remaining
Now, we need to find out how many cups of sugar are left to add after subtracting the amount of flour remaining from the total amount of sugar needed:
8 cups of sugar - 1 cup of flour = 7 cups of sugar remaining
3. Finally, we can see that Mary needs to add 7 cups of sugar more than cups of salt to complete the recipe.
Answer: 7

I hope this helps! Let me know if you have any questions.
*********
  Sure, I'd be happy to help! Here are 

In [29]:
# Mean character length of the responses in df1. Dividing by the actual number
# of rows (instead of a hard-coded 300) keeps the average correct if the CSV
# ever holds a different number of examples.
sum(len(response) for response in df1['Response']) / len(df1['Response'])

469.22

In [30]:
# Mean character length of the responses in df2. Dividing by the actual number
# of rows (instead of a hard-coded 300) keeps the average correct if the CSV
# ever holds a different number of examples.
sum(len(response) for response in df2['Response']) / len(df2['Response'])

460.31333333333333