In [1]:
import re
from decimal import Decimal, InvalidOperation

import torch

torch.cuda.empty_cache()
print(f"GPU: {torch.cuda.get_device_name(0)}")

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer,
)
import transformers

from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from tqdm import tqdm  # Import tqdm

GPU: NVIDIA GeForce RTX 4090


2024-01-08 17:27:46.537990: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-08 17:27:46.556937: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-08 17:27:46.556960: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-08 17:27:46.556975: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-08 17:27:46.561166: I tensorflow/core/platform/cpu_feature_g

In [2]:
# Raise the transformers logger threshold to CRITICAL so warnings and info
# messages are not shown (`logging` here is `transformers.logging`).
logging.set_verbosity(logging.CRITICAL)
# Local directory from which both the model weights and the tokenizer are loaded.
inf_model_path="./zephyr-7b-padding-full"
# NOTE(review): no torch_dtype is passed, so the loader's default precision is
# used -- confirm this fits in GPU memory or pass an explicit dtype.
inf_model=AutoModelForCausalLM.from_pretrained(inf_model_path, device_map="auto")
inf_tokenizer = AutoTokenizer.from_pretrained(inf_model_path)
# Streamer built on the tokenizer; it is handed to the generation pipeline in
# a later cell.
streamer = TextStreamer(inf_tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [10]:
# Load the "main" configuration of GSM8K and print the first test example as a
# quick look at the record layout ('question' / 'answer' fields).
dataset = load_dataset("gsm8k",'main')
print(dataset['test'][0])

{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", 'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}


In [18]:
# Text-generation pipeline wrapping the already-loaded model and tokenizer;
# `streamer` is forwarded so generated text is streamed while it is produced.
# NOTE(review): max_length presumably bounds prompt + generated tokens, so a
# long question leaves less room for the answer -- confirm, and consider
# max_new_tokens if answers get cut off before the "####" line.
pipe = pipeline(task="text-generation", model=inf_model, tokenizer=inf_tokenizer, max_length=300, streamer=streamer)

In [19]:
# Marker that precedes the final answer in GSM8K-style solutions.
_ANSWER_MARKER = "####"
# A (possibly signed) decimal number at the very start of a string.
_LEADING_NUMBER = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)")


def _canonical_answer(raw):
    """Reduce an answer token to a value that can be compared for equality.

    Currency signs and thousands separators are removed; if the remainder
    starts with a number, that number is returned as a ``Decimal`` (so
    ``"18"``, ``"$18"``, ``"18."`` and ``"18.00"`` all compare equal).
    Otherwise the cleaned string itself is returned.
    """
    cleaned = raw.strip().replace(",", "").replace("$", "")
    match = _LEADING_NUMBER.match(cleaned)
    if match is not None:
        try:
            return Decimal(match.group(0))
        except InvalidOperation:
            pass
    return cleaned


def score_zephyr_7b(prompt, response, generator=None):
    """Generate an answer for ``prompt`` and score it against ``response``.

    Args:
        prompt: The question text. It is wrapped as ``<|user|>{prompt}</s>\\n``
            before being passed to the generator.
        response: The reference solution; the text after its last ``####`` is
            taken as the ground-truth answer.
        generator: Optional callable taking the prompt text and returning a
            list whose first item has a ``"generated_text"`` key. Defaults to
            the notebook-level ``pipe``.

    Returns:
        ``1.0`` if the first answer following ``####`` in the generated
        completion equals the ground truth, ``0.0`` if it differs, and
        ``None`` when no answer could be extracted or generation raised.
    """
    try:
        if generator is None:
            generator = pipe

        prompt_text = f"<|user|>{prompt}</s>\n"
        result_list = generator(prompt_text)
        ground_truth_score = response.split(_ANSWER_MARKER)[-1].strip()

        generated_text = result_list[0]["generated_text"]

        # The generated text may echo the prompt; search only the completion
        # so a "####" inside the question is never mistaken for the answer.
        if generated_text.startswith(prompt_text):
            completion = generated_text[len(prompt_text):]
        else:
            completion = generated_text

        # Find the first occurrence of "####"
        first_occurrence = completion.find(_ANSWER_MARKER)
        if first_occurrence == -1:
            # No "####" found, skipping scoring
            print(f"No '####' found in the generated text for prompt: {prompt}")
            return None

        # Take the whitespace-delimited token right after the first "####".
        tail_tokens = completion[first_occurrence + len(_ANSWER_MARKER):].split()
        if not tail_tokens:
            print(f"No answer found after '####' for prompt: {prompt}")
            return None
        generated_score = tail_tokens[0]

        # Compare whole values, not prefixes: truncating the model output to
        # the length of the reference would accept "180" for a reference "18".
        is_match = _canonical_answer(generated_score) == _canonical_answer(ground_truth_score)
        score = 1.0 if is_match else 0.0
        print("ground_truth\n"+ground_truth_score)
        print("model_output\n"+generated_score)
        return score
    except Exception as e:
        # Best-effort evaluation: report the failure and let the caller skip
        # this example instead of aborting the whole run.
        print(f"Skipping scoring for prompt: {prompt}")
        print(f"Error: {e}")
        return None

In [20]:
def evaluate_zephyr_7b(dataset):
    """Score every example in ``dataset`` and return the mean score.

    Each item must provide ``"question"`` and ``"answer"`` entries, which are
    passed to ``score_zephyr_7b``. Examples for which that function returns
    ``None`` are left out of both the numerator and the denominator. The
    accumulated score and the number of counted samples are printed before
    the average is returned (``0.0`` when no sample was counted).
    """
    score_sum = 0.0
    counted = len(dataset)

    # Manual progress bar: one tick per example, scored or skipped.
    with tqdm(total=counted, desc="Evaluating dataset") as progress:
        for row in dataset:
            question = row["question"]
            reference = row["answer"]

            outcome = score_zephyr_7b(question, reference)
            if outcome is None:
                # Skipped example: shrink the denominator.
                counted -= 1
            else:
                score_sum += outcome

            progress.update(1)

    if counted > 0:
        mean_score = score_sum / counted
    else:
        mean_score = 0.0
    print("scores=\n" + str(score_sum))
    print("samples=\n" + str(counted))
    return mean_score



In [21]:
# Smoke run: evaluate only the first 10 examples of the GSM8K test split.
split_dataset=load_dataset("gsm8k",'main', split='test[:10]')
print(split_dataset)
# Last expression of the cell, so the returned average score is displayed.
evaluate_zephyr_7b(split_dataset)

Dataset({
    features: ['question', 'answer'],
    num_rows: 10
})


Evaluating dataset:   0%|          | 0/10 [00:00<?, ?it/s]

<|user|>Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?</s> 
<|assistant|>


Evaluating dataset:   0%|          | 0/10 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [None]:
def evaluate_single_example(model, tokenizer, dataset):
    """Score one hard-coded word problem and print the result.

    The ``model``, ``tokenizer`` and ``dataset`` arguments are accepted but
    not used; generation and scoring are delegated to ``score_zephyr_7b``.
    The question and its reference solution are echoed before scoring.
    """
    question = "Dean's mother gave him $28 to go to the toy store. Dean bought 6 toy cars and 5 teddy bears. Each toy car cost $2 and each teddy bear cost $1. His mother then feels generous and decides to give him an extra $10. How much money does Dean have left?"
    reference = "The cost of the toy cars is 6 cars × $2/car = $<<6*2=12>>12. The cost of the teddy bears is 5 bears × $1/bear = $<<5*1=5>>5. The total cost of the toys is $12 + $5 = $<<12+5=17>>17. Adding the two amounts of money his mother gave him, we find that Dean has $28 + $10 = $<<28+10=38>>38 to spend. Dean has $38 − $17 = $21 left. #### 21"

    # Echo both texts, each followed by a blank line.
    for text in (question, reference):
        print(text + '\n')

    result = score_zephyr_7b(question, reference)
    print(f"Score for the first example: {result}")

# Call the modified function to evaluate a single example
#evaluate_single_example(inf_model, inf_tokenizer, dataset)