In [1]:
import gc
import torch
import numpy as np
import pandas as pd
import functools
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# –ù–∞—à–∏ –Ω–æ–≤—ã–µ –º–æ–¥—É–ª–∏
import src.config as config
from src.train.ptuning_train import train
from src.prompt import mbpp, humaneval
from src.data.loader import load_benchmark
from src.executor import LocalExecutor
from src.metrics import GreedyPass, PassAtk, PercentPassed, MeanEntropy, ExecutionResult

Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.14.0         Please see GitHub issue #2919 for more info


In [2]:
TRAIN = False

In [3]:
if TRAIN:
    print("üî• –†–µ–∂–∏–º –æ–±—É—á–µ–Ω–∏—è: –ó–∞–ø—É—Å–∫–∞–µ–º train()...")
    model, tokenizer = train()

else:
    print(f"‚ùÑÔ∏è –†–µ–∂–∏–º –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞: –ó–∞–≥—Ä—É–∂–∞–µ–º –±–∞–∑—É + P-Tuning –∞–¥–∞–ø—Ç–µ—Ä...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        config.MODEL_PATH,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="sdpa"
    )

    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_PATH, trust_remote_code=True)

    print(f"üîó –ü–æ–¥–∫–ª—é—á–∞–µ–º –∞–¥–∞–ø—Ç–µ—Ä –∏–∑: {config.PTUNING_MODEL_PATH}")
    model = PeftModel.from_pretrained(base_model, config.PTUNING_MODEL_PATH)


model.eval()
print("‚úÖ –ú–æ–¥–µ–ª—å –≥–æ—Ç–æ–≤–∞ –∫ —Ç–µ—Å—Ç–∞–º!")

‚ùÑÔ∏è –†–µ–∂–∏–º –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞: –ó–∞–≥—Ä—É–∂–∞–µ–º –±–∞–∑—É + P-Tuning –∞–¥–∞–ø—Ç–µ—Ä...
üîó –ü–æ–¥–∫–ª—é—á–∞–µ–º –∞–¥–∞–ø—Ç–µ—Ä –∏–∑: ./models/qwen3-0.6b-ptuning
‚úÖ –ú–æ–¥–µ–ª—å –≥–æ—Ç–æ–≤–∞ –∫ —Ç–µ—Å—Ç–∞–º!


In [4]:
# –í—Å–ø–æ–º–æ–≥–∞—Ç–µ–ª—å–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –¥–ª—è —ç–Ω—Ç—Ä–æ–ø–∏–∏ (–Ω—É–∂–Ω–∞ –¥–ª—è evaluate_transformers)
def calculate_entropy_hf(scores):
    """–°—á–∏—Ç–∞–µ—Ç —ç–Ω—Ç—Ä–æ–ø–∏—é –ø–æ –ª–æ–≥–∏—Ç–∞–º HuggingFace (Tuple of tensors)"""
    if not scores: return 0.0

    entropies = []

    for step_logits in scores:
        # step_logits: [batch_size, vocab_size]
        # –ü–µ—Ä–µ–≤–æ–¥–∏–º –≤ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏
        probs = torch.softmax(step_logits, dim=-1)
        log_probs = torch.log_softmax(step_logits, dim=-1)

        # H = - sum(p * log p) –ø–æ –ø–æ—Å–ª–µ–¥–Ω–µ–π —Ä–∞–∑–º–µ—Ä–Ω–æ—Å—Ç–∏ (vocab)
        # –ü–æ–ª—É—á–∞–µ–º [batch_size] —ç–Ω—Ç—Ä–æ–ø–∏–π –¥–ª—è —ç—Ç–æ–≥–æ —à–∞–≥–∞
        step_entropy = -(probs * log_probs).sum(dim=-1)

        entropies.append(step_entropy) # List of tensors

    # –°–æ–±–∏—Ä–∞–µ–º –≤ [steps, batch_size] -> [batch_size, steps]
    if not entropies: return [0.0]

    # Stack -> [seq_len, batch_size]
    entropies_tensor = torch.stack(entropies)

    # –°—Ä–µ–¥–Ω–µ–µ –ø–æ –¥–ª–∏–Ω–µ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏ -> [batch_size]
    mean_entropies = entropies_tensor.mean(dim=0)

    return mean_entropies.tolist()

def evaluate_transformers(model, tokenizer, tasks, metrics_suite, batch_size=4): # <--- –°–ù–ò–ó–ò–õ –ë–ê–¢–ß –î–û 4
    executor = LocalExecutor()
    final_results = {}

    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # –ß–∏—Å—Ç–∏–º –∫–æ–Ω—Ñ–∏–≥
    base_config = config.SAMPLING_SETTINGS.copy()
    for key in ["n", "temperature", "ignore_eos", "detokenize", "logprobs", "stop"]:
        base_config.pop(key, None)
    if "max_tokens" in base_config:
        base_config["max_new_tokens"] = base_config.pop("max_tokens")

    grouped = {}
    for m in metrics_suite:
        cfg_key = str(m.gen_config)
        if cfg_key not in grouped: grouped[cfg_key] = []
        grouped[cfg_key].append(m)

    for cfg_key, metrics in grouped.items():
        metric_specific_config = metrics[0].gen_config
        generation_kwargs = {**base_config, **metric_specific_config}

        if not generation_kwargs.get("do_sample", False):
            for k in ["temperature", "top_p", "top_k"]:
                generation_kwargs.pop(k, None)

        print(f"\nüöÄ Generation for: {[m.name for m in metrics]} | Batch Size: {batch_size}")

        all_exec_results = []

        # tqdm —Å –æ—á–∏—Å—Ç–∫–æ–π
        pbar = tqdm(range(0, len(tasks), batch_size), desc="Eval Batches")

        for i in pbar:
            try:
                batch_tasks = tasks[i : i + batch_size]
                batch_prompts = [t.prompt for t in batch_tasks]

                inputs = tokenizer(
                    batch_prompts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True
                ).to(model.device)

                input_len = inputs.input_ids.shape[1]

                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        pad_token_id=tokenizer.eos_token_id,
                        return_dict_in_generate=True,
                        output_scores=True, # –ù—É–∂–Ω–æ –¥–ª—è —ç–Ω—Ç—Ä–æ–ø–∏–∏, –Ω–æ –µ—Å—Ç –ø–∞–º—è—Ç—å
                        **generation_kwargs
                    )

                # –°—á–∏—Ç–∞–µ–º —ç–Ω—Ç—Ä–æ–ø–∏—é –°–†–ê–ó–£, –ø–æ–∫–∞ —Ç–µ–Ω–∑–æ—Ä—ã –≤ –ø–∞–º—è—Ç–∏, —á—Ç–æ–±—ã –ø–æ—Ç–æ–º —É–¥–∞–ª–∏—Ç—å
                batch_entropies = [0.0] * len(batch_tasks)
                if outputs.scores:
                    # scores - —ç—Ç–æ tuple(len=seq_len) of tensors [batch, vocab]
                    # –ù–∞—à–∞ —Ñ—É–Ω–∫—Ü–∏—è –≤–µ—Ä–Ω–µ—Ç —Å–ø–∏—Å–æ–∫ [ent_sample_0, ent_sample_1, ...]
                    batch_entropies = calculate_entropy_hf(outputs.scores)

                # –í–∞–∂–Ω–æ: –æ—Ç–≤—è–∑—ã–≤–∞–µ–º —ç–Ω—Ç—Ä–æ–ø–∏—é –æ—Ç –≥—Ä–∞—Ñ–∞ (—Ö–æ—Ç—è –º—ã –≤ no_grad, –Ω–æ –Ω–∞ –≤—Å—è–∫–∏–π —Å–ª—É—á–∞–π –ø–µ—Ä–µ–≤–æ–¥–∏–º –≤ float)
                # calculate_entropy_hf —É–∂–µ –≤–æ–∑–≤—Ä–∞—â–∞–µ—Ç list[float]

                n_samples = generation_kwargs.get("num_return_sequences", 1)

                for j, task in enumerate(batch_tasks):
                    task_samples = []
                    start_idx = j * n_samples
                    end_idx = start_idx + n_samples

                    # –≠–Ω—Ç—Ä–æ–ø–∏—è –¥–ª—è —Ç–µ–∫—É—â–µ–π –∑–∞–¥–∞—á–∏ (–¥–ª—è greedy n=1, —ç–Ω—Ç—Ä–æ–ø–∏—è –æ–¥–Ω–∞)
                    # –ï—Å–ª–∏ n > 1, HF –ø–æ–≤—Ç–æ—Ä—è–µ—Ç –ø—Ä–æ–º–ø—Ç—ã, –Ω–æ outputs.scores —Å–ª–æ–∂–Ω–µ–µ –º–∞–ø–ø—è—Ç—Å—è.
                    # –î–ª—è –ø—Ä–æ—Å—Ç–æ—Ç—ã —Å—á–∏—Ç–∞–µ–º, —á—Ç–æ output_scores –≤–æ–∑–≤—Ä–∞—â–∞–µ—Ç [batch_size * n_samples, vocab]
                    # calculate_entropy_hf –≤–µ—Ä–Ω–µ—Ç —Å–ø–∏—Å–æ–∫ –¥–ª–∏–Ω–æ–π batch_size * n_samples

                    for k in range(start_idx, end_idx):
                        seq = outputs.sequences[k]
                        gen_tokens = seq[input_len:]
                        gen_text = tokenizer.decode(gen_tokens, skip_special_tokens=True)

                        # –ë–µ—Ä–µ–º —ç–Ω—Ç—Ä–æ–ø–∏—é –¥–ª—è –∫–æ–Ω–∫—Ä–µ—Ç–Ω–æ–≥–æ —Å—ç–º–ø–ª–∞
                        sample_entropy = batch_entropies[k] if k < len(batch_entropies) else 0.0

                        exec_res = executor.execute(gen_text, task.tests)
                        exec_res.entropy = sample_entropy
                        task_samples.append(exec_res)

                    all_exec_results.append(task_samples)

                # === –ê–ì–†–ï–°–°–ò–í–ù–ê–Ø –û–ß–ò–°–¢–ö–ê ===
                del inputs
                del outputs # –£–¥–∞–ª—è–µ–º 5 –ì–ë –ª–æ–≥–∏—Ç–æ–≤
                del batch_entropies
                gc.collect()
                torch.cuda.empty_cache()
                # ===========================

            except Exception as e:
                print(f"Error in batch {i}: {e}")
                torch.cuda.empty_cache()
                continue

        for m in metrics:
            score = m.calculate(all_exec_results)
            final_results[m.name] = score
            print(f"üìä {m.name}: {score:.4f}")

    return final_results

In [5]:
# 1. –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö (—á–µ—Ä–µ–∑ –Ω–æ–≤—ã–µ –ª–æ–∞–¥–µ—Ä—ã)
print("üì• Loading Datasets...")

# –§—É–Ω–∫—Ü–∏—è-–º–∞–ø–ø–µ—Ä —Å "–∑–∞–ø–µ—á–µ–Ω–Ω—ã–º" —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–æ–º
mbpp_mapper = functools.partial(mbpp.mbpp_to_task, tokenizer=tokenizer)
he_mapper = functools.partial(humaneval.humaneval_to_task, tokenizer=tokenizer)

# –ì—Ä—É–∑–∏–º —Å—ã—Ä—ã–µ –¥–∞–Ω–Ω—ã–µ
ds_mbpp = mbpp.get_dataset()["test"]
ds_mbpp_small = ds_mbpp#.select(range(20))

ds_he = humaneval.get_dataset()["test"]
ds_he_small = ds_he#.select(range(20))

tasks_mbpp = load_benchmark(ds_mbpp_small, mbpp_mapper)
tasks_he = load_benchmark(ds_he_small, he_mapper)

metrics_suite = [
	GreedyPass(),
    PassAtk(k=1, n_samples=1),
    # PassAtk(k=5, n_samples=5), # –æ–Ω–æ –ø—Ä–æ—Å—Ç–æ –Ω–µ –∑–∞–ø—É—Å–∫–∞–µ—Ç—Å—è, –ø–∞–º—è—Ç—å —É–º–∏—Ä–∞–µ—Ç
	PercentPassed(),
    MeanEntropy(),
]

# 3. –ó–∞–ø—É—Å–∫!
print("\n" + "="*40)
print("üêç EVALUATING MBPP (Small Subset)")
print("="*40)
results_mbpp = evaluate_transformers(model, tokenizer, tasks_mbpp, metrics_suite)

print("\n" + "="*40)
print("üß† EVALUATING HumanEval (Small Subset)")
print("="*40)
results_he = evaluate_transformers(model, tokenizer, tasks_he, metrics_suite)

# 4. –ò—Ç–æ–≥–æ–≤—ã–π –≤—ã–≤–æ–¥
print("\n=== FINAL REPORT ===")
df = pd.DataFrame([results_mbpp, results_he], index=["MBPP", "HumanEval"])
display(df)

üì• Loading Datasets...

üêç EVALUATING MBPP (Small Subset)

üöÄ Generation for: ['greedy@1'] | Batch Size: 4


Eval Batches:   0%|          | 0/65 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üìä greedy@1: 0.2568

üöÄ Generation for: ['pass@1 (n=1)', 'mean_%passed', 'mean_entropy'] | Batch Size: 4


Eval Batches:   0%|          | 0/65 [00:00<?, ?it/s]

üìä pass@1 (n=1): 0.2490
üìä mean_%passed: 0.2980
üìä mean_entropy: nan

üß† EVALUATING HumanEval (Small Subset)

üöÄ Generation for: ['greedy@1'] | Batch Size: 4


Eval Batches:   0%|          | 0/41 [00:00<?, ?it/s]

7.5
7.5
7.5
üìä greedy@1: 0.2195

üöÄ Generation for: ['pass@1 (n=1)', 'mean_%passed', 'mean_entropy'] | Batch Size: 4


Eval Batches:   0%|          | 0/41 [00:00<?, ?it/s]

2
3
2
3
2
3
2
3
2
3
2
3
2
3
2
3
üìä pass@1 (n=1): 0.2317
üìä mean_%passed: 0.3919
üìä mean_entropy: nan

=== FINAL REPORT ===


Unnamed: 0,greedy@1,pass@1 (n=1),mean_%passed,mean_entropy
MBPP,0.256809,0.249027,0.29799,
HumanEval,0.219512,0.231707,0.391887,
