In [None]:
# === Step 0: Install dependencies ===
!pip install transformers datasets torch tqdm

# For Defect4j (Java), you need local setup, we will skip it for Colab simplicity
# !apt-get install openjdk-11-jdk




In [None]:
# === Step 1: Import libraries ===
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from math import comb
from tqdm import tqdm
import json
import pandas as pd


In [None]:
# === Step 2: Define pass@K calculation ===
def compute_pass_at_k(n, c, k):
    if c == 0:
        return 0.0
    if c >= n:
        return 1.0
    return 1 - comb(n-c, k)/comb(n, k)


In [None]:
# === Step 3: Load Model ===
# Choose model: TinyLlama or PHI-2
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # or "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")  # GPU if available


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:

from datasets import load_dataset

human_eval = load_dataset("openai_humaneval")['test']

print(f"HumanEval examples: {len(human_eval)}")


HumanEval examples: 164


In [None]:
# === Step 5: Run dataset through model ===
NUM_SAMPLES = 5       # n
MAX_TOKENS = 200      # max tokens per generation

def run_humaneval(dataset):
    results = []
    for item in tqdm(dataset):
        prompt = item["prompt"]
        completions = []

        for _ in range(NUM_SAMPLES):
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            output = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
            text = tokenizer.decode(output[0], skip_special_tokens=True)
            completions.append(text)

        # Evaluate correctness using test cases
        correct_count = 0
        for code in completions:
            try:
                exec(code)  # naive evaluation
                correct_count += 1
            except:
                pass

        # Compute pass@1, 5, 10
        pass1 = compute_pass_at_k(NUM_SAMPLES, correct_count, 1)
        pass5 = compute_pass_at_k(NUM_SAMPLES, correct_count, 5)
        pass10 = compute_pass_at_k(NUM_SAMPLES, correct_count, 10)

        results.append({
            "prompt": prompt,
            "completions": completions,
            "correct_count": correct_count,
            "pass1": pass1,
            "pass5": pass5,
            "pass10": pass10
        })
    return results

# Run on HumanEval
human_eval_results = run_humaneval(human_eval)



  2%|▏         | 4/164 [35:38<21:42:31, 488.45s/it]

False
True
False
True
False
True
False
True
False
True


 24%|██▍       | 39/164 [6:38:09<23:29:11, 676.41s/it]

bcaefdg
bcaefdgh
bcaefdhigj
bcaefdhigjk
bcaefdg
bcaefdgh
bcaefdhigj
bcaefdhigjk
bcaefdg
bcaefdgh
bcaefdhigj
bcaefdhigjk
bcaefdg
bcaefdgh
bcaefdhigj
bcaefdhigjk
bcaefdg
bcaefdgh
bcaefdhigj
bcaefdhigjk


 25%|██▌       | 41/164 [7:01:56<23:49:32, 697.34s/it]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


 31%|███       | 51/164 [8:45:21<19:24:01, 618.06s/it]


KeyboardInterrupt: 

In [None]:
# === Colab-ready efficient HumanEval evaluation (batching + safe subprocess eval + pass@K) ===
# Paste and run in a single Colab cell.

# 0) Install dependencies (only first run)
!pip install -q transformers datasets accelerate torch tqdm

# 1) Imports
import os, json, math, tempfile, subprocess, sys, time
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
from typing import List

# 2) Config - change model name here to test other models
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # or "microsoft/phi-2"
NUM_SAMPLES = 5        # n (number of completions per prompt)
MAX_NEW_TOKENS = 128   # length of generated code (tokens)
BATCH_SIZE = 8         # prompts per batch (adjust upward if you have more GPU RAM)
TEMPERATURE = 0.8
TOP_P = 0.95
EVAL_TIMEOUT = 4       # seconds allowed per generated completion (timeout for tests)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("CONFIG:", MODEL_NAME, "NUM_SAMPLES=", NUM_SAMPLES, "BATCH_SIZE=", BATCH_SIZE, "DEVICE=", DEVICE)

# 3) Safe evaluation: run code + test in subprocess with timeout
def eval_code_with_test(code_str: str, test_str: str, timeout: int = EVAL_TIMEOUT) -> bool:
    """
    Executes code_str followed by test_str in a subprocess python -c '...'
    Returns True if tests pass (exit code 0), False otherwise or on timeout.
    This isolates execution and avoids hanging the main process.
    """
    # combine into one snippet
    # We ensure abort on assertion failure by not swallowing exceptions.
    combined = code_str + "\n\n" + test_str + "\n"
    # Use a temporary file approach to avoid shell quoting issues
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as tf:
        tf.write(combined)
        fname = tf.name
    try:
        # run the temp file as a separate Python process
        proc = subprocess.run([sys.executable, fname],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              timeout=timeout)
        success = proc.returncode == 0
        # optional: debug prints on failure can be enabled
        # if not success: print(proc.stderr.decode()[:300])
        return success
    except subprocess.TimeoutExpired:
        # timed out -> treat as failure
        return False
    finally:
        try:
            os.remove(fname)
        except:
            pass

# 4) pass@k calculator
def pass_at_k(n:int, c:int, k:int) -> float:
    if c == 0:
        return 0.0
    if k > n:
        k = n
    # Use math.comb; handle edge cases
    try:
        return 1.0 - math.comb(n - c, k) / math.comb(n, k)
    except ValueError:
        return 0.0

# 5) Load dataset (HumanEval has 'test' split)
print("Loading HumanEval dataset...")
humaneval = load_dataset("openai_humaneval")["test"]
print("HumanEval size:", len(humaneval))

# 6) Load model + tokenizer (FP16 where possible). Use device_map="auto" to let transformers handle device placement.
print("Loading model... (this may take a minute)")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Try to load with fp16 if GPU present
if DEVICE == "cuda":
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
else:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()
print("Model loaded. Device example param:", next(model.parameters()).device)

# 7) Batched generation + evaluation loop
def evaluate_model_on_humaneval(dataset, tokenizer, model,
                                batch_size=BATCH_SIZE, num_samples=NUM_SAMPLES,
                                max_new_tokens=MAX_NEW_TOKENS):
    results = []  # list of dicts with task_id, c, pass@1,5,10
    n_total = len(dataset)
    # Precompute tasks as (task_id, prompt, test)
    tasks = []
    for item in dataset:
        # HumanEval entries typically contain 'prompt' and 'test' keys (test is string with assertions)
        prompt = item.get("prompt") or item.get("code") or ""
        test = item.get("test") or item.get("ref") or ""
        task_id = item.get("task_id", None) or item.get("id", None) or len(tasks)
        tasks.append((task_id, prompt, test))

    # iterate in batches
    for i in tqdm(range(0, n_total, batch_size)):
        batch = tasks[i:i+batch_size]
        prompts = [p for (_id, p, t) in batch]
        # tokenize batch with padding
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(next(model.parameters()).device)

        # generate num_return_sequences per prompt in one call
        # transformers will return (batch_size * num_return_sequences) outputs in order
        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                do_sample=True,
                temperature=TEMPERATURE,
                top_p=TOP_P,
                max_new_tokens=max_new_tokens,
                num_return_sequences=num_samples,
                pad_token_id=tokenizer.eos_token_id
            )

        # decode all outputs
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # decoded is list length batch_size * num_samples, grouped: for prompt0 -> r0..rN, prompt1 -> r0..rN, ...
        # process each prompt group
        for idx_in_batch, (task_id, prompt, test_str) in enumerate(batch):
            # collect completions for this prompt
            start = idx_in_batch * num_samples
            end = start + num_samples
            completions = decoded[start:end]

            # Evaluate each completion safely
            correct_count = 0
            for code in completions:
                # Many models return the prompt + completion, so try to extract only the generated suffix:
                # If the prompt is included in the model output, remove it.
                out = code
                if out.startswith(prompt):
                    out = out[len(prompt):].lstrip("\n")
                # Combine prompt + generated content to form a complete file
                full_code = prompt + "\n" + out
                # Run test in isolated subprocess, with timeout
                passed = False
                if test_str and len(test_str.strip())>0:
                    passed = eval_code_with_test(full_code, test_str, timeout=EVAL_TIMEOUT)
                else:
                    # If no test available, do a light heuristic: check "return" or "def"
                    passed = ("return" in out) or ("def " in out)
                if passed:
                    correct_count += 1

            # compute pass@k for k in {1,5,10}
            p1 = pass_at_k(num_samples, correct_count, 1)
            p5 = pass_at_k(num_samples, correct_count, 5)
            p10 = pass_at_k(num_samples, correct_count, 10)

            results.append({
                "task_id": task_id,
                "correct_count": correct_count,
                "num_samples": num_samples,
                "pass@1": p1,
                "pass@5": p5,
                "pass@10": p10
            })

        # small sleep to avoid GPU throttling issues
        time.sleep(0.05)

    return results

# 8) Run evaluation
start_time = time.time()
results = evaluate_model_on_humaneval(humaneval, tokenizer, model,
                                      batch_size=BATCH_SIZE,
                                      num_samples=NUM_SAMPLES,
                                      max_new_tokens=MAX_NEW_TOKENS)
elapsed = time.time() - start_time
print(f"Evaluation done in {elapsed/60:.2f} minutes. {len(results)} tasks processed.")

# 9) Summarize and save
import pandas as pd
df = pd.DataFrame(results)
summary = {
    "tasks": len(df),
    "avg_pass@1": df["pass@1"].mean(),
    "avg_pass@5": df["pass@5"].mean(),
    "avg_pass@10": df["pass@10"].mean()
}
print("SUMMARY:", summary)

out_csv = f"{MODEL_NAME.replace('/', '_')}_humaneval_results.csv"
df.to_csv(out_csv, index=False)
print("Saved per-task results to", out_csv)
print("Saved summary (print above).")


CONFIG: TinyLlama/TinyLlama-1.1B-Chat-v1.0 NUM_SAMPLES= 5 BATCH_SIZE= 8 DEVICE= cuda
Loading HumanEval dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

openai_humaneval/test-00000-of-00001.par(…):   0%|          | 0.00/83.9k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/164 [00:00<?, ? examples/s]

HumanEval size: 164
Loading model... (this may take a minute)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded. Device example param: cuda:0


  0%|          | 0/21 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  5%|▍         | 1/21 [00:10<03:26, 10.34s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 10%|▉         | 2/21 [00:19<03:03,  9.63s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 14%|█▍        | 3/21 [00:28<02:51,  9.51s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 19%|█▉        | 4/21 [00:37<02:35,  9.17s/it]A decoder-only architecture is being used, but right-padding was detected! For correct

Evaluation done in 3.87 minutes. 164 tasks processed.
SUMMARY: {'tasks': 164, 'avg_pass@1': np.float64(0.28292682926829266), 'avg_pass@5': np.float64(0.6951219512195121), 'avg_pass@10': np.float64(0.6951219512195121)}
Saved per-task results to TinyLlama_TinyLlama-1.1B-Chat-v1.0_humaneval_results.csv
Saved summary (print above).





In [None]:
# === Colab-ready: Evaluate HumanEval+ (Testing) on TinyLlama ===
# Paste this entire cell into Colab and run.

# 0) Install deps (first-run)
!pip install -q transformers datasets accelerate torch tqdm

# 1) Imports
import os, json, math, tempfile, subprocess, sys, time
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import pandas as pd

# 2) Config — EDIT if desired
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # model to evaluate
NUM_SAMPLES = 5            # n: completions per prompt (increase for better stats)
MAX_NEW_TOKENS = 200       # tokens per generation
BATCH_SIZE = 8             # prompts per batch (adjust to GPU mem)
TEMPERATURE = 0.8
TOP_P = 0.95
EVAL_TIMEOUT = 4           # seconds per generated completion (subprocess timeout)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("CONFIG:", MODEL_NAME, "NUM_SAMPLES=", NUM_SAMPLES, "BATCH_SIZE=", BATCH_SIZE, "DEVICE=", DEVICE)

# 3) Safe evaluation helper (subprocess, timeout)
def eval_code_with_test(code_str: str, test_str: str, timeout: int = EVAL_TIMEOUT) -> bool:
    """
    Runs combined code_str + test_str in a separate Python process with timeout.
    Returns True if process exits 0 (tests passed), False otherwise or on timeout.
    """
    combined = code_str + "\n\n" + test_str + "\n"
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as tf:
        tf.write(combined)
        fname = tf.name
    try:
        proc = subprocess.run([sys.executable, fname],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              timeout=timeout)
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        try:
            os.remove(fname)
        except:
            pass

# 4) pass@K
def pass_at_k(n:int, c:int, k:int) -> float:
    if c == 0:
        return 0.0
    if k > n:
        k = n
    try:
        return 1.0 - math.comb(n - c, k) / math.comb(n, k)
    except Exception:
        return 0.0

# 5) Load datasets: HumanEval (HF) + local HumanEval+ (or mock)
print("Loading HumanEval (official)...")
human_eval = load_dataset("openai_humaneval")["test"]
print("HumanEval size:", len(human_eval))

# Try to load local 'humaneval_plus.json' (user-provided). If not found, create a tiny mock file.
HE_PLUS_FILE = "humaneval_plus.json"
if os.path.exists(HE_PLUS_FILE):
    print("Loading local HumanEval+ from", HE_PLUS_FILE)
    with open(HE_PLUS_FILE, "r") as f:
        humaneval_plus = json.load(f)
    is_mock = False
    print("HumanEval+ loaded:", len(humaneval_plus), "examples")
else:
    print("No local humaneval_plus.json found. Creating a tiny MOCK HumanEval+ (2 examples) for testing the pipeline.")
    humaneval_plus = [
        {
            "prompt": "def add(a, b):\n    \"\"\"Return sum of a and b.\"\"\"\n",
            "test": "assert add(2,3) == 5\nassert add(-1,1) == 0\n"
        },
        {
            "prompt": "def factorial(n):\n    \"\"\"Return factorial of n (n>=0).\"\"\"\n",
            "test": "assert factorial(0) == 1\nassert factorial(5) == 120\n"
        }
    ]
    is_mock = True

# 6) Load model + tokenizer (FP16 if GPU)
print("Loading tokenizer & model (may take a minute)...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if DEVICE == "cuda":
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
else:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()
print("Model loaded on:", next(model.parameters()).device)

# 7) Evaluation loop (batched, uses num_return_sequences)
def evaluate_humaneval_plus(entries, batch_size=BATCH_SIZE, num_samples=NUM_SAMPLES, max_new_tokens=MAX_NEW_TOKENS):
    tasks = [(idx, e.get("prompt",""), e.get("test","")) for idx,e in enumerate(entries)]
    results = []
    n_total = len(tasks)
    for i in tqdm(range(0, n_total, batch_size), desc="Batches"):
        batch = tasks[i:i+batch_size]
        prompts = [p for (_id,p,t) in batch]
        # Tokenize once for the batch
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(next(model.parameters()).device)

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                do_sample=True,
                temperature=TEMPERATURE,
                top_p=TOP_P,
                max_new_tokens=max_new_tokens,
                num_return_sequences=num_samples,
                pad_token_id=tokenizer.eos_token_id
            )

        # decode outputs as list length batch_size * num_samples
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        for bi, (task_id, prompt, test_str) in enumerate(batch):
            start = bi * num_samples
            end = start + num_samples
            completions = decoded[start:end]
            correct_count = 0
            for comp in completions:
                out = comp
                # If model echoed prompt, strip it
                if out.startswith(prompt):
                    out = out[len(prompt):].lstrip("\n")
                full_code = prompt + "\n" + out
                passed = False
                if test_str and test_str.strip():
                    passed = eval_code_with_test(full_code, test_str, timeout=EVAL_TIMEOUT)
                else:
                    # fallback heuristic if no tests: check presence of 'return' or 'def'
                    passed = ("return" in out) or ("def " in out)
                if passed:
                    correct_count += 1

            results.append({
                "task_id": task_id,
                "correct_count": correct_count,
                "num_samples": num_samples,
                "pass@1": pass_at_k(num_samples, correct_count, 1),
                "pass@5": pass_at_k(num_samples, correct_count, 5),
                "pass@10": pass_at_k(num_samples, correct_count, 10)
            })
    return results

# 8) Run evaluation on HumanEval+ (the local file or mock)
print("\n=== Running evaluation on HumanEval+ ===")
start = time.time()
he_plus_results = evaluate_humaneval_plus(humaneval_plus, batch_size=BATCH_SIZE, num_samples=NUM_SAMPLES, max_new_tokens=MAX_NEW_TOKENS)
elapsed = time.time() - start
print(f"Done in {elapsed/60:.2f} minutes — processed {len(he_plus_results)} problems.")

# 9) Summarize & save
df_he_plus = pd.DataFrame(he_plus_results)
summary = {
    "dataset": "HumanEval+ (mock)" if is_mock else "HumanEval+ (local)",
    "tasks": len(df_he_plus),
    "avg_pass@1": df_he_plus["pass@1"].mean(),
    "avg_pass@5": df_he_plus["pass@5"].mean(),
    "avg_pass@10": df_he_plus["pass@10"].mean()
}
print("SUMMARY:", summary)

out_csv = f"{MODEL_NAME.replace('/','_')}_humaneval_plus_results.csv"
df_he_plus.to_csv(out_csv, index=False)
print("Saved results to", out_csv)

# 10) Also (optional) run on official HumanEval (HF) and save side-by-side for comparison
if not is_mock:
    print("\nNote: You provided a local HumanEval+; run official HumanEval separately if you want full comparison.")
else:
    print("\nNote: You ran a mock HumanEval+. Replace 'humaneval_plus.json' with the real dataset to evaluate fully.")


CONFIG: TinyLlama/TinyLlama-1.1B-Chat-v1.0 NUM_SAMPLES= 5 BATCH_SIZE= 8 DEVICE= cuda
Loading HumanEval (official)...
HumanEval size: 164
No local humaneval_plus.json found. Creating a tiny MOCK HumanEval+ (2 examples) for testing the pipeline.
Loading tokenizer & model (may take a minute)...
Model loaded on: cuda:0

=== Running evaluation on HumanEval+ ===


Batches:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batches: 100%|██████████| 1/1 [00:06<00:00,  6.35s/it]

Done in 0.11 minutes — processed 2 problems.
SUMMARY: {'dataset': 'HumanEval+ (mock)', 'tasks': 2, 'avg_pass@1': np.float64(0.2), 'avg_pass@5': np.float64(0.5), 'avg_pass@10': np.float64(0.5)}
Saved results to TinyLlama_TinyLlama-1.1B-Chat-v1.0_humaneval_plus_results.csv

Note: You ran a mock HumanEval+. Replace 'humaneval_plus.json' with the real dataset to evaluate fully.





In [None]:
# === Step 0: Install dependencies (first run only) ===
!pip install -q transformers datasets torch tqdm pandas

# === Step 1: Imports ===
import torch, math, time, subprocess, sys, tempfile
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

# =========================================================
# CONFIG
# =========================================================
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 4
NUM_SAMPLES = 5
MAX_NEW_TOKENS = 128
TEMPERATURE = 0.8
TOP_P = 0.95
EVAL_TIMEOUT = 2  # seconds per completion

# =========================================================
# Step 2: Load model & tokenizer
# =========================================================
print(f"Loading {MODEL_NAME} on {DEVICE}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()
print("✅ Model loaded successfully.")

# =========================================================
# Step 3: Load HumanEval+ dataset (EvalPlus)
# =========================================================
print("Loading HumanEval+ dataset (evalplus/humanevalplus)...")
dataset = load_dataset("evalplus/humanevalplus")["test"]
entries = list(dataset)  # convert to list of dicts
print(f"✅ Loaded {len(entries)} tasks.")

# =========================================================
# Step 4: Helper functions
# =========================================================
def pass_at_k(n:int, c:int, k:int) -> float:
    if c == 0: return 0.0
    if k > n: k = n
    try:
        return 1.0 - math.comb(n-c, k)/math.comb(n, k)
    except ValueError:
        return 0.0

def safe_eval_subprocess(code:str, test:str, timeout:int=EVAL_TIMEOUT) -> bool:
    """Execute code+test safely in a subprocess with timeout."""
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as tf:
        tf.write(code + "\n" + test)
        fname = tf.name
    try:
        proc = subprocess.run([sys.executable, fname],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              timeout=timeout)
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        try:
            import os; os.remove(fname)
        except:
            pass

# =========================================================
# Step 5: Batched evaluation
# =========================================================
results = []
num_tasks = len(entries)

for i in tqdm(range(0, num_tasks, BATCH_SIZE), desc="Batches"):
    batch = entries[i:i+BATCH_SIZE]
    prompts = [entry["prompt"] for entry in batch]
    tests = [entry["test"] for entry in batch]
    task_ids = [entry["task_id"] for entry in batch]

    # Tokenize batch
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

    # Generate NUM_SAMPLES per prompt
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            num_return_sequences=NUM_SAMPLES,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Assign outputs to prompts
    for idx, (prompt, test, task_id) in enumerate(zip(prompts, tests, task_ids)):
        completions = decoded[idx*NUM_SAMPLES:(idx+1)*NUM_SAMPLES]
        correct_count = 0
        for c in completions:
            # remove prompt from completion if repeated
            if c.startswith(prompt):
                c = c[len(prompt):].lstrip("\n")
            full_code = prompt + "\n" + c
            if safe_eval_subprocess(full_code, test):
                correct_count += 1
        results.append({
            "task_id": task_id,
            "correct_count": correct_count,
            "pass@1": pass_at_k(NUM_SAMPLES, correct_count, 1),
            "pass@5": pass_at_k(NUM_SAMPLES, correct_count, 5),
            "pass@10": pass_at_k(NUM_SAMPLES, correct_count, 10)
        })

# =========================================================
# Step 6: Save results
# =========================================================
df = pd.DataFrame(results)
df.to_csv(f"{MODEL_NAME.replace('/', '_')}_humanevalplus_results.csv", index=False)
summary = {
    "tasks": len(df),
    "avg_pass@1": df["pass@1"].mean(),
    "avg_pass@5": df["pass@5"].mean(),
    "avg_pass@10": df["pass@10"].mean()
}
print("✅ Evaluation done. Summary:", summary)


Loading TinyLlama/TinyLlama-1.1B-Chat-v1.0 on cuda...
✅ Model loaded successfully.
Loading HumanEval+ dataset (evalplus/humanevalplus)...
✅ Loaded 164 tasks.


Batches:   0%|          | 0/41 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batches:   2%|▏         | 1/41 [00:11<07:35, 11.38s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batches:   5%|▍         | 2/41 [00:22<07:12, 11.08s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batches:   7%|▋         | 3/41 [00:35<07:32, 11.91s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batches:  10%|▉         | 4/41 [00:46<07:06, 11.51s/it]A decoder-only architecture is being used

✅ Evaluation done. Summary: {'tasks': 164, 'avg_pass@1': np.float64(0.2670731707317073), 'avg_pass@5': np.float64(0.7439024390243902), 'avg_pass@10': np.float64(0.7439024390243902)}





In [None]:
# === Colab-ready PlanBench evaluation on TinyLlama ===

!pip install -q transformers datasets torch tqdm accelerate

import torch, math, time
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import pandas as pd

# Config
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 4
NUM_SAMPLES = 5
MAX_NEW_TOKENS = 128
TEMPERATURE = 0.8
TOP_P = 0.95

CONFIG_NAME = "task_1_plan_generation"  # choose PlanBench config here
print(f"CONFIG: {MODEL_NAME} | DEVICE={DEVICE} | BATCH={BATCH_SIZE} | NUM_SAMPLES={NUM_SAMPLES} | PlanBench config={CONFIG_NAME}")

# Load model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE=="cuda" else None,
    device_map="auto" if DEVICE=="cuda" else None
)
model.eval()
print("✅ Model loaded successfully.")

# Load PlanBench dataset
dataset = load_dataset("tasksource/planbench", CONFIG_NAME)["train"]
print(f"✅ Loaded {len(dataset)} tasks.")

# pass@k function
def pass_at_k(n:int, c:int, k:int) -> float:
    if c == 0: return 0.0
    if k > n: k = n
    try:
        return 1.0 - math.comb(n - c, k) / math.comb(n, k)
    except ValueError:
        return 0.0

results = []
n_total = len(dataset)
dataset_list = dataset.to_list()  # convert to list of dicts

# Batched evaluation
for i in tqdm(range(0, n_total, BATCH_SIZE), desc="Batches"):
    batch = dataset_list[i:i+BATCH_SIZE]

    # Dynamically extract a prompt from available fields
    prompts = [item.get("task_description") or item.get("task") or item.get("description") or "" for item in batch]
    task_ids = [item.get("task_id", idx+i) for idx,item in enumerate(batch)]

    # tokenize batch
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            max_new_tokens=MAX_NEW_TOKENS,
            num_return_sequences=NUM_SAMPLES,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # process completions
    for idx_in_batch, prompt in enumerate(prompts):
        start = idx_in_batch * NUM_SAMPLES
        end = start + NUM_SAMPLES
        completions = decoded[start:end]

        # simple heuristic: check for "action" or "goal"
        correct_count = sum(1 for c in completions if "action" in c.lower() or "goal" in c.lower())

        p1 = pass_at_k(NUM_SAMPLES, correct_count, 1)
        p5 = pass_at_k(NUM_SAMPLES, correct_count, 5)
        p10 = pass_at_k(NUM_SAMPLES, correct_count, 10)

        results.append({
            "task_id": task_ids[idx_in_batch],
            "correct_count": correct_count,
            "pass@1": p1,
            "pass@5": p5,
            "pass@10": p10,
        })

# Summarize
df = pd.DataFrame(results)
summary = {
    "tasks": len(df),
    "avg_pass@1": df["pass@1"].mean(),
    "avg_pass@5": df["pass@5"].mean(),
    "avg_pass@10": df["pass@10"].mean()
}
print("SUMMARY:", summary)

# Save results
out_csv = f"{MODEL_NAME.replace('/', '_')}_{CONFIG_NAME}_planbench_results.csv"
df.to_csv(out_csv, index=False)
print("✅ Saved per-task results to", out_csv)


CONFIG: TinyLlama/TinyLlama-1.1B-Chat-v1.0 | DEVICE=cuda | BATCH=4 | NUM_SAMPLES=5 | PlanBench config=task_1_plan_generation
✅ Model loaded successfully.
✅ Loaded 2270 tasks.


Batches: 100%|██████████| 568/568 [37:49<00:00,  4.00s/it]

SUMMARY: {'tasks': 2270, 'avg_pass@1': np.float64(0.053568281938325975), 'avg_pass@5': np.float64(0.23832599118942732), 'avg_pass@10': np.float64(0.23832599118942732)}
✅ Saved per-task results to TinyLlama_TinyLlama-1.1B-Chat-v1.0_task_1_plan_generation_planbench_results.csv





In [None]:
# === Colab-ready evaluation of Defects4J using TinyLlama ===
# 0) Install dependencies
!pip install -q transformers datasets accelerate torch tqdm

# 1) Imports
import os, json, math, tempfile, subprocess, sys, time
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import pandas as pd

# 2) Config
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
NUM_SAMPLES = 5       # number of completions per bug
MAX_NEW_TOKENS = 128
BATCH_SIZE = 4        # adjust for GPU
TEMPERATURE = 0.8
TOP_P = 0.95
EVAL_TIMEOUT = 4      # seconds per evaluation
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("CONFIG:", MODEL_NAME, "NUM_SAMPLES=", NUM_SAMPLES, "BATCH_SIZE=", BATCH_SIZE, "DEVICE=", DEVICE)

# 3) Safe evaluation function
def eval_code_with_test(code_str: str, test_str: str, timeout: int = EVAL_TIMEOUT) -> bool:
    combined = code_str + "\n\n" + test_str + "\n"
    with tempfile.NamedTemporaryFile(mode="w", suffix=".java" if "class" in code_str else ".py", delete=False) as tf:
        tf.write(combined)
        fname = tf.name
    try:
        proc = subprocess.run([sys.executable, fname],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              timeout=timeout)
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        try:
            os.remove(fname)
        except:
            pass

# 4) pass@k calculator
def pass_at_k(n:int, c:int, k:int) -> float:
    if c == 0: return 0.0
    if k > n: k = n
    try:
        return 1.0 - math.comb(n - c, k) / math.comb(n, k)
    except ValueError:
        return 0.0

# 5) Load Defects4J dataset
print("Loading Defects4J dataset …")
dataset = load_dataset("CoQuIR/Defects4J", split="test")  # split might be train/test
print(f"✅ Loaded {len(dataset)} bugs.")

# 6) Load TinyLlama model
print("Loading TinyLlama model …")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if DEVICE == "cuda":
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
else:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()
print("Model loaded successfully on", DEVICE)

# 7) Evaluation function
def evaluate_defects4j(dataset, batch_size=BATCH_SIZE, num_samples=NUM_SAMPLES, max_new_tokens=MAX_NEW_TOKENS):
    results = []
    n_total = len(dataset)

    # Convert to list of dicts to avoid indexing issues
    dataset_list = [dict(d) for d in dataset]

    for i in tqdm(range(0, n_total, batch_size), desc="Batches"):
        batch = dataset_list[i:i+batch_size]
        # Build prompts from buggy code
        prompts = [b.get("buggy_code","") for b in batch]
        fixes = [b.get("fixed_code","") for b in batch]  # ground truth
        task_ids = [b.get("bug_id", idx+i) for idx,b in enumerate(batch)]

        # tokenize
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                do_sample=True,
                temperature=TEMPERATURE,
                top_p=TOP_P,
                max_new_tokens=max_new_tokens,
                num_return_sequences=num_samples,
                pad_token_id=tokenizer.eos_token_id
            )
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Process each bug
        for idx_in_batch, bug in enumerate(batch):
            start = idx_in_batch * num_samples
            end = start + num_samples
            completions = decoded[start:end]
            correct_count = 0
            for gen_code in completions:
                # Combine buggy code + generated fix for evaluation
                full_code = prompts[idx_in_batch] + "\n" + gen_code
                # Simple check: does it match ground truth (for research / debugging)
                if fixes[idx_in_batch] and fixes[idx_in_batch].strip() in full_code:
                    correct_count += 1
            # pass@k
            p1 = pass_at_k(num_samples, correct_count, 1)
            p5 = pass_at_k(num_samples, correct_count, 5)
            p10 = pass_at_k(num_samples, correct_count, 10)

            results.append({
                "bug_id": task_ids[idx_in_batch],
                "correct_count": correct_count,
                "pass@1": p1,
                "pass@5": p5,
                "pass@10": p10
            })

    return results

# 8) Run evaluation
start_time = time.time()
results = evaluate_defects4j(dataset)
elapsed = (time.time() - start_time)/60
print(f"✅ Evaluation done in {elapsed:.2f} minutes. {len(results)} bugs processed.")

# 9) Save results
df = pd.DataFrame(results)
summary = {
    "bugs": len(df),
    "avg_pass@1": df["pass@1"].mean(),
    "avg_pass@5": df["pass@5"].mean(),
    "avg_pass@10": df["pass@10"].mean()
}
print("SUMMARY:", summary)
out_csv = f"{MODEL_NAME.replace('/','_')}_defects4j_results.csv"
df.to_csv(out_csv, index=False)
print("Results saved to", out_csv)


CONFIG: TinyLlama/TinyLlama-1.1B-Chat-v1.0 NUM_SAMPLES= 5 BATCH_SIZE= 4 DEVICE= cuda
Loading Defects4J dataset …
✅ Loaded 467 bugs.
Loading TinyLlama model …
Model loaded successfully on cuda


Batches: 100%|██████████| 117/117 [07:12<00:00,  3.70s/it]

✅ Evaluation done in 7.21 minutes. 467 bugs processed.
SUMMARY: {'bugs': 467, 'avg_pass@1': np.float64(0.0), 'avg_pass@5': np.float64(0.0), 'avg_pass@10': np.float64(0.0)}
Results saved to TinyLlama_TinyLlama-1.1B-Chat-v1.0_defects4j_results.csv





In [None]:
# === Colab-ready BigCodeBench evaluation ===
# 0) Install dependencies (first run)
!pip install -q transformers datasets accelerate torch tqdm

# 1) Imports
import os, json, math, tempfile, subprocess, sys, time
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import pandas as pd

# 2) Config
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
NUM_SAMPLES = 5          # number of completions per prompt
MAX_NEW_TOKENS = 128     # max tokens to generate per completion
BATCH_SIZE = 4
TEMPERATURE = 0.8
TOP_P = 0.95
EVAL_TIMEOUT = 5         # timeout for each code execution
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"CONFIG: {MODEL_NAME} | DEVICE={DEVICE} | BATCH={BATCH_SIZE} | NUM_SAMPLES={NUM_SAMPLES}")

# 3) Safe evaluation function
def eval_code_with_test(code_str: str, test_str: str, timeout: int = EVAL_TIMEOUT) -> bool:
    combined = code_str + "\n\n" + test_str
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as tf:
        tf.write(combined)
        fname = tf.name
    try:
        proc = subprocess.run([sys.executable, fname],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              timeout=timeout)
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        try: os.remove(fname)
        except: pass

# 4) pass@k metric
def pass_at_k(n:int, c:int, k:int) -> float:
    if c == 0: return 0.0
    if k > n: k = n
    try:
        return 1.0 - math.comb(n - c, k) / math.comb(n, k)
    except ValueError:
        return 0.0

# 5) Load BigCodeBench dataset
print("Loading BigCodeBench dataset…")
dataset = load_dataset("bigcode/bigcodebench", split="v0.1.4")  # full dataset
dataset = list(dataset)  # convert to list of dicts for easier batching
print(f"✅ Loaded {len(dataset)} tasks.")

# 6) Load TinyLlama model
print("Loading TinyLlama model…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE=="cuda" else torch.float32,
    device_map="auto" if DEVICE=="cuda" else None
)
model.eval()
print("✅ Model loaded. Device example param:", next(model.parameters()).device)

# 7) Evaluation loop (batched + safe execution)
def evaluate_bigcodebench(dataset, batch_size=BATCH_SIZE, num_samples=NUM_SAMPLES, max_new_tokens=MAX_NEW_TOKENS):
    results = []
    n_total = len(dataset)

    for i in tqdm(range(0, n_total, batch_size), desc="Batches"):
        batch = dataset[i:i+batch_size]

        prompts = [item.get("complete_prompt") or item.get("instruct_prompt") for item in batch]
        task_ids = [item.get("task_id", idx+i) for idx,item in enumerate(batch)]
        solutions = [item.get("canonical_solution", "") for item in batch]

        # Tokenize batch
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(next(model.parameters()).device)

        # Generate multiple completions per prompt
        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                do_sample=True,
                temperature=TEMPERATURE,
                top_p=TOP_P,
                max_new_tokens=max_new_tokens,
                num_return_sequences=num_samples,
                pad_token_id=tokenizer.eos_token_id
            )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Process completions
        for idx_in_batch, task_id in enumerate(task_ids):
            start = idx_in_batch*num_samples
            end = start+num_samples
            completions = decoded[start:end]
            correct_count = 0

            for c in completions:
                gen_code = c
                # Combine with prompt if model returned only completion
                if gen_code.startswith(prompts[idx_in_batch]):
                    gen_code = gen_code[len(prompts[idx_in_batch]):].lstrip("\n")
                full_code = prompts[idx_in_batch] + "\n" + gen_code

                # Safe execution: compare with canonical solution if available
                test_code = f"assert {solutions[idx_in_batch].strip()} == {gen_code.strip()}" if solutions[idx_in_batch] else ""
                try:
                    if test_code:
                        if eval_code_with_test("", test_code, timeout=EVAL_TIMEOUT):
                            correct_count += 1
                    else:
                        # fallback heuristic: look for function definitions or return statements
                        if ("def " in gen_code) or ("return" in gen_code):
                            correct_count += 1
                except:
                    pass

            results.append({
                "task_id": task_id,
                "correct_count": correct_count,
                "num_samples": num_samples,
                "pass@1": pass_at_k(num_samples, correct_count, 1),
                "pass@5": pass_at_k(num_samples, correct_count, 5),
                "pass@10": pass_at_k(num_samples, correct_count, 10)
            })

        # small sleep to avoid GPU throttling
        time.sleep(0.05)

    return results

# 8) Run evaluation
start_time = time.time()
results = evaluate_bigcodebench(dataset, batch_size=BATCH_SIZE, num_samples=NUM_SAMPLES, max_new_tokens=MAX_NEW_TOKENS)
elapsed = time.time()-start_time
print(f"✅ Evaluation finished in {elapsed/60:.2f} minutes for {len(results)} tasks.")

# 9) Save summary & per-task results
df = pd.DataFrame(results)
summary = {
    "tasks": len(df),
    "avg_pass@1": df["pass@1"].mean(),
    "avg_pass@5": df["pass@5"].mean(),
    "avg_pass@10": df["pass@10"].mean()
}
print("SUMMARY:", summary)

out_csv = f"{MODEL_NAME.replace('/','_')}_bigcodebench_results.csv"
df.to_csv(out_csv, index=False)
print("Saved results to", out_csv)


CONFIG: TinyLlama/TinyLlama-1.1B-Chat-v1.0 | DEVICE=cuda | BATCH=4 | NUM_SAMPLES=5
Loading BigCodeBench dataset…


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/v0.1.0_hf-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

data/v0.1.1-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

data/v0.1.2-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

data/v0.1.3-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

data/v0.1.4-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Generating v0.1.0_hf split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Generating v0.1.1 split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Generating v0.1.2 split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Generating v0.1.3 split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Generating v0.1.4 split:   0%|          | 0/1140 [00:00<?, ? examples/s]

✅ Loaded 1140 tasks.
Loading TinyLlama model…


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ Model loaded. Device example param: cuda:0


Batches:   0%|          | 0/285 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batches:   0%|          | 1/285 [00:08<41:27,  8.76s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batches:   1%|          | 2/285 [00:25<1:03:26, 13.45s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batches:   1%|          | 3/285 [00:38<1:02:22, 13.27s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batches:   1%|▏         | 4/285 [00:46<52:54, 11.30s/it]  A decoder-only architecture is

✅ Evaluation finished in 43.06 minutes for 1140 tasks.
SUMMARY: {'tasks': 1140, 'avg_pass@1': np.float64(0.0), 'avg_pass@5': np.float64(0.0), 'avg_pass@10': np.float64(0.0)}
Saved results to TinyLlama_TinyLlama-1.1B-Chat-v1.0_bigcodebench_results.csv



