In [1]:
!pip install human_eval

Collecting human_eval
  Downloading human_eval-1.0.3-py3-none-any.whl.metadata (153 bytes)
Collecting fire (from human_eval)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Downloading human_eval-1.0.3-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25ldone
[?25h  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114248 sha256=5fbd478ec33c36bd736739db1f4ea0a6b3f2d4b8d6d3379c1c766746b15a835f
  Stored in directory: /root/.cache/pip/wheels/19/39/2f/2d3cadc408a8804103f1c34ddd4b9f6a93497b11fa96fe738e
Successfully built fire
Installing collected packages: fire, human_eval
Successfully installed fire-0.7.0 human_eval-1.0.3


In [2]:
import os
import json
import human_eval
import time
import requests
import json
import itertools 

from human_eval.data import write_jsonl, read_problems, HUMAN_EVAL
from human_eval.evaluation import evaluate_functional_correctness
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from typing import Dict, List, Optional, Tuple

## Benchmark on the difference for inference time and accuracy

We use Qwen models since our work was mostly on them. The outcome could be even more outstandin if we used the 32B model !

In [3]:
main_model_name = "Qwen/Qwen2.5-Coder-3B"
main_model = AutoModelForCausalLM.from_pretrained(main_model_name).cuda()
main_tokenizer = AutoTokenizer.from_pretrained(main_model_name)

assistant_model_name = "Qwen/Qwen2.5-Coder-0.5B"
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_model_name).cuda()
assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_model_name)

assert main_tokenizer.vocab == assistant_tokenizer.vocab, "Tokenizers must share the same vocabulary."

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/139 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/139 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Because of computation time and limited resources we used only `num_problems_to_evaluate` of the HUMAN_EVAL dataset.

In [4]:
def evaluate_model(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_length: int = 1024,
    assistant_model: Optional[AutoModelForCausalLM] = None
) -> Tuple[str, float]:
    
    inputs = tokenizer(prompt, return_tensors="pt")
    start_time = time.time()
    generate_ids = model.generate(
        inputs.input_ids.cuda(),
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=inputs["attention_mask"].cuda(),
        max_length=max_length,
        assistant_model=assistant_model
    )
    end_time = time.time()
    solution = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return solution, end_time - start_time

def compare_inference_time(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    model_name: str,
    assistant_model: AutoModelForCausalLM,
    num_problems: int = 20
) -> dict:
    
    model_name = model_name.replace('/', '_')
    all_problems = read_problems(HUMAN_EVAL)

    # split human eval
    problems = dict(itertools.islice(all_problems.items(), num_problems))

    # write the new dataset to a jsonl file
    problems_path = f"/kaggle/working/problems_{num_problems}.jsonl"
    write_jsonl(problems_path, problems.values())

    regular_time = 0
    regular_solutions = []
    for problem_key, problem in tqdm(problems.items(), desc="Evaluating regular decoding"):
        solution, time_taken = evaluate_model(model, tokenizer, problem['prompt'])
        regular_time += time_taken
        regular_solutions.append({
            'task_id': problem_key,
            'prompt': problem['prompt'],
            'completion': solution
        })

    regular_output_path = f'/kaggle/working/{model_name}_regular_humaneval_solutions.jsonl'
    write_jsonl(regular_output_path, regular_solutions)  # Using the library function
    regular_results = evaluate_functional_correctness(regular_output_path, n_workers=4, timeout=3.0, k=[1], problem_file=problems_path)

    speculative_time = 0
    speculative_solutions = []
    for task_id, problem in tqdm(problems.items(), desc="Evaluating speculative decoding"):
        solution, time_taken = evaluate_model(model, tokenizer, problem['prompt'], assistant_model=assistant_model)
        speculative_time += time_taken
        speculative_solutions.append({
            'task_id': task_id,
            'prompt': problem['prompt'],
            'completion': solution
        })

    speculative_output_path = f'/kaggle/working/{model_name}_speculative_humaneval_solutions.jsonl'
    write_jsonl(speculative_output_path, speculative_solutions)  # Using the library function
    speculative_results = evaluate_functional_correctness(speculative_output_path, n_workers=4, timeout=3.0, k=[1], problem_file=problems_path)

    return {
        'regular_decoding_time': regular_time,
        'speculative_decoding_time': speculative_time,
        'regular_pass@1': regular_results['pass@1'],
        'speculative_pass@1': speculative_results['pass@1'],
        'num_problems': num_problems
    }

In [5]:
num_problems_to_evaluate = 40

results = compare_inference_time(
    model=main_model,
    tokenizer=main_tokenizer,
    model_name=main_model_name,
    assistant_model=assistant_model,
    num_problems=num_problems_to_evaluate
)

print("Benchmark Results:")
print(f"Regular Decoding Time: {results['regular_decoding_time']} seconds")
print(f"Speculative Decoding Time: {results['speculative_decoding_time']} seconds")
print(f"Regular Pass@1: {results['regular_pass@1']}")
print(f"Speculative Pass@1: {results['speculative_pass@1']}")
print(f"Number of Problems Evaluated: {results['num_problems']}")

Evaluating regular decoding:   0%|          | 0/40 [00:00<?, ?it/s]Both `max_new_tokens` (=2048) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Evaluating regular decoding:   2%|▎         | 1/40 [00:04<03:14,  5.00s/it]Both `max_new_tokens` (=2048) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Evaluating regular decoding:   5%|▌         | 2/40 [01:37<35:38, 56.28s/it]Both `max_new_tokens` (=2048) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Evaluating regular decoding

Reading samples...


40it [00:00, 7522.74it/s]


Running test suites...


  0%|          | 0/40 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already

Writing results to /kaggle/working/Qwen_Qwen2.5-Coder-3B_regular_humaneval_solutions.jsonl_results.jsonl...


100%|██████████| 40/40 [00:00<00:00, 6637.34it/s]
Evaluating speculative decoding:   0%|          | 0/40 [00:00<?, ?it/s]Both `max_new_tokens` (=2048) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
Evaluating speculative decoding:   2%|▎         | 1/40 [00:04<02:53,  4.46s/it]Both `max_new_tokens` (=2048) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Evaluating speculative decoding:   5%|

Reading samples...


0it [00:00, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
40it [00:00, 1311.66it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Running test suites...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 0/40 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling p

Writing results to /kaggle/working/Qwen_Qwen2.5-Coder-3B_speculative_humaneval_solutions.jsonl_results.jsonl...


100%|██████████| 40/40 [00:00<00:00, 9787.77it/s]

Benchmark Results:
Regular Decoding Time: 2718.0612103939056 seconds
Speculative Decoding Time: 1734.0049483776093 seconds
Regular Pass@1: 0.2
Speculative Pass@1: 0.2
Number of Problems Evaluated: 40



