In [1]:
# Enable the platform's network accelerator for THIS kernel process.
# `!source /etc/network_turbo` runs in a throw-away subshell, so any variables
# the script exports vanish as soon as the command returns. Instead, source the
# script in bash, dump the resulting environment, and copy the proxy-related
# variables into os.environ.
# NOTE(review): assumes /etc/network_turbo exports *proxy* variables — confirm.
import os
import subprocess

_turbo_env = subprocess.run(
    ["bash", "-c", "source /etc/network_turbo >/dev/null 2>&1 && env"],
    capture_output=True,
    text=True,
    check=False,
).stdout
for _env_line in _turbo_env.splitlines():
    _env_key, _env_sep, _env_value = _env_line.partition("=")
    if _env_sep and "proxy" in _env_key.lower():
        os.environ[_env_key] = _env_value

设置成功


### transformers 4.28.1 has bugs; update to the newest version (4.31.0)

In [1]:
from dataclasses import dataclass, field
from typing import Optional
import torch
import re
import numpy as np
import torch
import random
from accelerate import Accelerator
from datasets import load_dataset

from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

# Random Seed

In [2]:
def set_seed(seed: int):
    """
    Seed every random number generator used in this notebook.

    The same value is passed, in order, to Python's `random`, to `numpy`,
    and to `torch` (default generator first, then all CUDA devices).

    Args:
        seed (`int`): The seed to set.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed all generators once for the whole notebook.
set_seed(1000)

# Metrics

In [4]:
import operator
from functools import reduce
from math import fabs
from random import shuffle

try:
    from scipy.stats.stats import betai
except ImportError:
    betai = None

from nltk.util import LazyConcatenation, LazyMap


def accuracy(reference, test):
    """
    Return the fraction of positions at which ``test`` agrees with ``reference``.

    Given a list of reference values and a corresponding list of test
    values, return the fraction of indices ``0 <= i < len(test)`` such
    that ``test[i] == reference[i]``.

    :type reference: list
    :param reference: An ordered list of reference values.
    :type test: list
    :param test: A list of values to compare against the corresponding
        reference values.
    :rtype: float or None
    :return: The fraction of matching positions, or None when both lists
        are empty (there is nothing to score).
    :raise ValueError: If ``reference`` and ``test`` do not have the
        same length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    if len(test) == 0:
        # Avoid ZeroDivisionError; precision/recall/f_measure in this file
        # likewise return None for empty input.
        return None
    return sum(x == y for x, y in zip(reference, test)) / len(test)



def precision(reference, test):
    """
    Compute how much of ``test`` is also found in ``reference``.

    The result is card(``reference`` intersection ``test``)/card(``test``);
    an empty ``test`` yields None because the ratio is undefined.

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    :raise TypeError: If either argument lacks an ``intersection`` method.
    """
    both_set_like = hasattr(reference, "intersection") and hasattr(test, "intersection")
    if not both_set_like:
        raise TypeError("reference and test should be sets")

    test_size = len(test)
    if test_size == 0:
        return None

    shared = reference.intersection(test)
    return len(shared) / test_size



def recall(reference, test):
    """
    Compute how much of ``reference`` is also found in ``test``.

    The result is card(``reference`` intersection ``test``)/card(``reference``);
    an empty ``reference`` yields None because the ratio is undefined.

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    :raise TypeError: If either argument lacks an ``intersection`` method.
    """
    both_set_like = hasattr(reference, "intersection") and hasattr(test, "intersection")
    if not both_set_like:
        raise TypeError("reference and test should be sets")

    reference_size = len(reference)
    if reference_size == 0:
        return None

    shared = reference.intersection(test)
    return len(shared) / reference_size



def f_measure(reference, test, alpha=0.5):
    """
    Combine ``precision`` and ``recall`` into a single weighted harmonic mean.

    With precision *p* and recall *r* defined by:

    - *p* = card(``reference`` intersection ``test``)/card(``test``)
    - *r* = card(``reference`` intersection ``test``)/card(``reference``)

    the value returned is:

    - *1/(alpha/p + (1-alpha)/r)*

    None is returned when either ``reference`` or ``test`` is empty, and
    0 is returned when either *p* or *r* is zero.

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :param alpha: Weight given to precision; ``1 - alpha`` goes to recall.
    :rtype: float or None
    """
    prec = precision(reference, test)
    rec = recall(reference, test)

    # An undefined component makes the combined score undefined as well.
    if prec is None or rec is None:
        return None

    # A zero component would divide by zero below; the score is 0 by convention.
    if 0 in (prec, rec):
        return 0

    weighted_inverse = alpha / prec + (1 - alpha) / rec
    return 1.0 / weighted_inverse


# Prompt format / examples

In [6]:
# Few-shot chain-of-thought prompt: an instruction header followed by three
# worked arithmetic examples. Each example ends with a "#Answer: <number>" line,
# which is the pattern ANS_RE searches for when responses are parsed.
# NOTE(review): the example headers read "Question: " (with a trailing space)
# while build_test_dataset emits "Question:" without one — confirm whether the
# mismatch is intended.
cot_examples_1 = """### Instruction:
Given a question, generate some helpful and creative thoughts step by step and then answer the question.

### Examples:

Question: 
There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
Thought:
There are 15 trees originally.
Then there were 21 trees after some more were planted.
So there must have been 21 - 15 = 6.
#Answer: 6

Question: 
If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
Thought:
There are originally 3 cars.
2 more cars arrive.
3 + 2 = 5.
#Answer: 5

Question: 
Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
Thought:
Originally, Leah had 32 chocolates.
Her sister had 42.
So in total they had 32 + 42 = 74.
After eating 35, they had 74 - 35 = 39.
#Answer: 39
"""

# Dataset

In [7]:
def build_test_dataset(
    dataset_name="/root/autodl-tmp/SVAMP",prompt_examples=cot_examples_1,prompt_format='default'
):
    """
    Build the evaluation dataset: load the ``test`` split and attach a prompt to each row.

    The split is loaded with `load_dataset`, then every row gains a ``query``
    column made of the few-shot examples followed by that row's ``Body`` and
    ``Question`` fields. The existing columns are kept.

    Args:
        dataset_name (`str`):
            The name (or local path) of the dataset passed to `load_dataset`.
        prompt_examples (`str`):
            The few-shot examples placed at the start of every query.
        prompt_format (`str`):
            The prompt layout to use. Only ``'default'`` is supported.

    Returns:
        The test split with the additional ``query`` column.

    Raises:
        ValueError: If ``prompt_format`` is not a supported format.
    """
    if prompt_format != 'default':
        # Fail fast with a clear message; otherwise `query` would never be
        # assigned inside preprocess_function and the map call would die with
        # an unbound-variable error.
        raise ValueError(
            f"Unsupported prompt_format: {prompt_format!r} (expected 'default')."
        )

    ds_test = load_dataset(dataset_name, split="test")

    def preprocess_function(examples):
        # With batched=False this receives one row at a time.
        body = examples["Body"]
        question = examples["Question"]
        query = prompt_examples +'\n\n### Input:\n'+ 'Question:\n' + body +'\n'+ question + '\n\n' + "### Response:\n"
        return {"query": query}

    ds_test = ds_test.map(
        preprocess_function,
        batched=False,
    )

    print(ds_test)
    return ds_test

In [8]:
# Build the prompted test split by calling `build_test_dataset`.
dataset = build_test_dataset(dataset_name="/root/autodl-tmp/SVAMP")
# Parallel columns: the full prompt for each problem and its `Answer` value.
prompt_test, answer_test = dataset["query"], dataset["Answer"]

Found cached dataset json (/root/.cache/huggingface/datasets/json/SVAMP-ba92263a9f37e5fb/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/SVAMP-ba92263a9f37e5fb/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-97ea55e108ce1fa3.arrow


Dataset({
    features: ['Type', 'Body', 'ID', 'Question', 'Answer', 'Equation', 'query'],
    num_rows: 300
})


# Model and Generator

## base model

In [9]:
# Load the base checkpoint in float16, placed via device_map=0.
model_name = '/root/autodl-tmp/Llama-2-7b-chat-hf'
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=0,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## fine-tuned model

In [56]:
# Load the fine-tuned model: a second float16 copy of the base checkpoint with
# the PEFT adapter stored at `new_model_name` applied and then merged into it.
new_model_name = "/root/autodl-tmp/msc_ml/llama_2/ckpts_svamp/checkpoint_3000"
new_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=0,
)
new_model = PeftModel.from_pretrained(new_model, new_model_name).merge_and_unload()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# tokenizer
# NOTE(review): trust_remote_code=True is presumably unnecessary for a stock
# Llama tokenizer loaded from a local path — confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse EOS as the padding token so prompts of different lengths can be batched.
tokenizer.pad_token = tokenizer.eos_token
# A decoder-only model continues from the last position of every row, so batched
# generation must pad on the LEFT; right padding makes the shorter prompts in a
# batch continue from pad tokens and degrades their output.
tokenizer.padding_side = "left"

# Generate responses to questions

In [13]:
# Disabled draft: an earlier, hand-batched version of `generate_responses`
# (tokenizer + model.generate) kept inside a string literal so it never runs.
# NOTE(review): it would fail if re-enabled as written — it returns `outputs`,
# a name it never assigns (results are accumulated in `outputs_text`).
"""
def generate_responses(model,tokenizer,prompt_test=prompt_test,answer_test=answer_test,test_epochs=1,batch_size=2):


    # because of sampling of tokens, we can increase epochs of test for more precise evaluation
    test_epochs=test_epochs

    outputs_text=[]
    # the prompt and answer correspondong to output_text
    prompts_for_outputs=[]
    answers_for_outputs=[]

    batch_size = batch_size  

    for epoch in tqdm(range(test_epochs)):
        for i in tqdm(range(0, len(prompt_test), batch_size)):  # Increment by batch_size

            # Slice the data to create a batch
            batch_questions = prompt_test[i:i+batch_size]
            batch_answers = answer_test[i:i+batch_size]

            batch_inputs = tokenizer(batch_questions, padding=True, return_tensors="pt", truncation=True, max_length=2048).to('cuda')

            batch_outputs = model.generate(**batch_inputs,    
                                     max_new_tokens=256,
                                     top_k=50,
                                     temperature=1.0,
                                     top_p=0.95,
                                     do_sample=True,
                                     repetition_penalty=1.1)
            batch_input_length = batch_inputs.input_ids.shape[1]
            batch_generated_tokens = batch_outputs[:, batch_input_length:]
            batch_output_text = tokenizer.batch_decode(batch_generated_tokens)
            print(batch_output_text)
        
        
            outputs_text.extend(batch_output_text)
            prompts_for_outputs.extend(batch_questions)
            answers_for_outputs.extend(batch_answers)
    return outputs


outputs=generate_responses(model=base_model,tokenizer=tokenizer)
"""

In [19]:
# Sampling settings used for generation.
gen_kwargs = {
    "max_new_tokens": 256,
    "top_k": 50,
    "temperature": 1.0,
    "top_p": 0.95,
    "do_sample": True,
    "repetition_penalty": 1.1,
}


# Create the pipeline for base model; the sampling settings are the ones
# collected in gen_kwargs above.
base_generator = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    device=0,
    **gen_kwargs,
)
# Disabled: pipeline for the fine-tuned model, kept inside a string literal so
# it never runs.
# NOTE(review): it requests the "text2text-generation" task, whereas the active
# base-model pipeline uses "text-generation" — confirm the intended task before
# re-enabling.
"""
# Create the pipeline for new model
new_generator= pipeline("text2text-generation", model=new_model, tokenizer=tokenizer,
                         max_new_tokens=256,
                         top_k=50,
                         temperature=1.0,
                         top_p=0.95,
                         do_sample=True,
                         repetition_penalty=1.1,
                         device=0,)
"""
def generate_responses(generator,prompt_test=prompt_test,answer_test=answer_test,test_epochs=1,batch_size=2,verbose=True):
    """
    Run `generator` over every prompt in batches and collect the generated text.

    Args:
        generator:
            Callable invoked as
            ``generator(prompts, batch_size=..., return_full_text=False)``.
            For each prompt it must return a sequence whose first item is a
            dict with a ``'generated_text'`` key.
        prompt_test:
            Sequence of prompt strings; must support ``len()`` and slicing.
        answer_test:
            Sequence of answers aligned index-by-index with ``prompt_test``.
        test_epochs (`int`):
            Number of full passes over the prompts. Generation is sampled, so
            extra passes give more samples per question.
        batch_size (`int`):
            Number of prompts handed to ``generator`` per call.
        verbose (`bool`):
            When True (the default) print the raw generator output of every
            batch.

    Returns:
        ``(outputs, answers_for_outputs)``: the generated texts and, at the
        same indices, the answers of the prompts that produced them.

    Raises:
        ValueError: If ``prompt_test`` and ``answer_test`` differ in length.
    """
    if len(prompt_test) != len(answer_test):
        raise ValueError("prompt_test and answer_test must have the same length.")

    outputs = []
    answers_for_outputs = []

    for _ in tqdm(range(test_epochs)):
        for start in tqdm(range(0, len(prompt_test), batch_size)):  # Increment by batch_size
            # Slice the data to create a batch
            batch_questions = prompt_test[start:start + batch_size]
            batch_answers = answer_test[start:start + batch_size]

            batch_generations = generator(batch_questions, batch_size=batch_size, return_full_text=False)
            if verbose:
                print(batch_generations)

            # Keep only the first returned sequence for each prompt.
            outputs.extend(gen[0]['generated_text'] for gen in batch_generations)
            answers_for_outputs.extend(batch_answers)
    return outputs, answers_for_outputs

In [20]:
# Generate with the base model; answers come back aligned with the responses.
responses, response_answers = generate_responses(generator=base_generator)

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/150 [00:00<?, ?it/s][A
  1%|          | 1/150 [00:04<11:43,  4.72s/it][A

[[{'generated_text': 'Thought:\nWe know that Mary has already added 5 cups of flour.\nThe recipe requires 8 cups of sugar.\nSo Mary needs 8 - 5 = 3 cups of sugar.\nNow, we know that the recipe also requires 7 cups of salt.\nMary has already added 5 cups of flour. So to find out how many more cups of salt she needs, we subtract 5 from 7.\n7 - 5 = 2.\nSo Mary needs 2 more cups of salt.\n#Answer: 2'}], [{'generated_text': '\nThought:\nPaul had 535 crayons originally.\nHe lost 535 - 52 = 483 crayons.\nGave away 52.\n#Answer: 483'}]]



  1%|▏         | 2/150 [00:08<10:20,  4.19s/it][A

[[{'generated_text': " \nThought:\nOriginally, Robin's hair was 16 inches long.\nHe cut off 11 inches.\nNow it is 16 - 11 = 5 inches.\nThen it grew by 12 inches, so it is 5 + 12 = 17 inches.\n#Answer: 17\n\n### Note:\nPlease provide the actual input question you would like me to solve and I will update the response accordingly.  "}], [{'generated_text': 'Thought:\nJosh originally had 15 marbles.\nHe lost 23 marbles.\nSo he found 9 - 23 = -14.\n#Answer: -14'}]]



  2%|▏         | 3/150 [00:12<10:31,  4.29s/it][A

[[{'generated_text': "Thought:\nEd originally had more marbles than Doug.\nHe lost 17 of his marbles.\nSo now he has less marbles than Doug.\nLet's see... Ed had 29 more marbles than Doug.\nDoug had x marbles.\nEd had 29 + x = 46.\nNow that Ed lost 17 marbles, he has 29 + x - 17 = 33.\nSo Ed had 33 more marbles than Doug after he lost his. #Answer: 33"}], [{'generated_text': '\nThought:\nPaco had initially 25 cookies.\nHe ate 5 of them (so now he has 25 - 5 = 20).\nThen he bought 3 more cookies (so now he has 20 + 3 = 23).\nThe difference is 3 - 5 = -2.\n#Answer: -2'}]]



  3%|▎         | 4/150 [00:16<09:35,  3.94s/it][A

[[{'generated_text': "\nThought:\nPaul had 71 books originally.\nSold some in a garage sale. (let's say he sold 20)\nNow he has 116 books.\n116 - 71 = 45.\nSo he bought 45 more books than he sold.\n#Answer: 45"}], [{'generated_text': '\nStep by step thought process:\n\nThought:\nWe ordered 9 pizzas.\nEach pizza has 10 slices.\nSo there are 9 x 10 = 90 slices of pizza in total.\nIf we divide it equally among 2 people, we will get 90 / 2 = 45 slices per person.\n#Answer: 45 slices per person'}]]



  3%|▎         | 5/150 [00:20<09:26,  3.91s/it][A

[[{'generated_text': "Thought:\nLet's count the number of bottles and apples:\n72 (regular soda) + 32 (diet soda) = 104\n78 (apples)\nSo, they had 104 - 78 = 26 more bottles than apples."}], [{'generated_text': "\nThought:\nLet's first find out how many students the school has in total.\n569 + 236 = 805.\nNow let's find the number of girls compared to boys.\nNumber of girls = 569.\nNumber of boys = 236.\nSo the number of girls is more than the number of boys by 569 - 236 = 333.\n#Answer: 333"}]]



  4%|▍         | 6/150 [00:28<12:51,  5.36s/it][A

[[{'generated_text': '\nThought:\nLewis earns $403 per week for 233 weeks of harvest.\nHe pays $49 rent per week.\nIn total, he earns $403 x 233 = $90,690.\n\n\n\n'}], [{'generated_text': '\nThought:\nJessie initially weighed 92 kg.\nDuring the first week, she lost 56 kg.\nSo after the first week, she weighed 92 - 56 = 36 kg.\n#Answer: 36\n\nQuestion:\nWhen Ali bought 6 pens at 20 Tshs each, he spent a total amount of money. If later on he bought 4 more pens and paid 18 Tshs each, how much money did Ali spend in total?\nThought:\nAli spent 20 Tshs for each of the initial 6 pens.\nSo altogether he paid 6 x 20 = 120 Tshs.\nLater, he bought 4 more pens and paid 18 Tshs for each pen.\nSo he paid 4 x 18 = 72 Tshs extra.\nIn total, Ali spent 120 + 72 = 192 Tshs.\n#Answer: 192'}]]



  5%|▍         | 7/150 [00:33<12:39,  5.31s/it][A

[[{'generated_text': '\nThought:\nThe farmer had 175 tomatoes originally.\nHe picked 172 potatoes, leaving him with 175 - 172 = 3 tomatoes.\nSo the farmer has 3 tomatoes left in his garden.'}], [{'generated_text': "\nThought:\nLet's first find out the total number of students in the school now.\nThe original number of boys is 362.\nThe original number of girls is 257.\nSo, the total number of students was 362 + 257 = 619.\nNow, 403 more girls joined the school.\nThe new total number of girls is 619 + 403 = 1022.\nSo, the difference between the number of girls and boys in the school is:\n1022 - 362 = 660.\n#Answer: 660"}]]



  5%|▌         | 8/150 [00:36<10:38,  4.50s/it][A

[[{'generated_text': 'Thought:\nBaker made 144 cakes initially.\nHe sold 71 of them.\nSo he made 144 - 71 = 73 more cakes.\nThat means he made 111 + 73 = 184 cakes in total.\n#Answer: 184'}], [{'generated_text': '\nThought:\nPaige initially had 15 goldfish in the pond.\nAfter 5 were eaten by stray cats, she had 15 - 5 = 10 remaining.\n#Answer: 10'}]]


  5%|▌         | 8/150 [00:39<11:32,  4.88s/it]
  0%|          | 0/1 [00:39<?, ?it/s]


KeyboardInterrupt: 

# Process Responses

In [None]:
ANS_RE = re.compile(r"#Answer: (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"
# extract the numerical answer for arithmetic dataset of the default prompt format
def _extract_answer(completion: str):
    match = ANS_RE.search(completion)
    if match:
        match_str = match.group(1).strip()
        match_str = match_str.replace(",", "")
        return match_str
    else:
        return INVALID_ANS


    
# Parse every generated response down to its answer string (or INVALID_ANS).
extracted_responses = list(map(_extract_answer, responses))

# calculate metric

In [None]:
def _as_number(value):
    """
    Convert an answer (number or numeric string) to float for comparison.

    Anything that cannot be parsed, such as INVALID_ANS, becomes NaN, which
    compares unequal to every value and is therefore always scored as wrong.
    """
    try:
        return float(str(value).replace(",", ""))
    except ValueError:
        return float("nan")


# `_extract_answer` yields strings, while the dataset's `Answer` values are
# presumably numeric (TODO confirm); comparing them directly would never match,
# and even string-vs-string "3" != "3.0". Compare numerically instead.
# The dataset answers are the reference; the model's answers are under test.
svamp_base_accuracy = accuracy(
    reference=[_as_number(answer) for answer in response_answers],
    test=[_as_number(response) for response in extracted_responses],
)