In [None]:
!pip install -q  bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install -q sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install -q  unsloth

^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [1]:
# C1 — Robust CSV/Excel loader
import os, glob, pandas as pd

def load_any_table(path_globs):
    paths=[]
    for g in path_globs:
        paths += glob.glob(g, recursive=True) if "**" in g else ([g] if os.path.exists(g) else [])
    assert paths, f"Could not find any of: {path_globs}"
    p = paths[0]
    df = pd.read_csv(p, dtype=str, keep_default_na=False, engine="python")
    df.columns = [c.strip() for c in df.columns]
    return df

TRIAL = load_any_table([
    "trial.csv"
    #"/kaggle/input/**/trial*.csv*",      # Change Here -----------------
])
DEV   = load_any_table([
    "dev_v2.csv"
    #"/kaggle/input/**/*dev_v2*.csv*",    # Change Here -----------
])

for df in (TRIAL, DEV):
    if "id" in df.columns:
        df["id"] = pd.to_numeric(df["id"], errors="coerce").fillna(-1).astype(int)
    for col in ("instruction","response","test_list"):
        if col in df.columns:
            df[col] = df[col].astype(str)


In [2]:
# C2 — parse tests + function name extraction
import re, json, ast

ASSERT_NAME_RE = re.compile(r"""assert\s+([A-Za-z_]\w*)\s*\(""", re.VERBOSE)

def _try_json_or_eval(s):
    try: return json.loads(s)
    except Exception:
        try: return ast.literal_eval(s)
        except Exception: return s

def parse_tests_cell(val):
    s = (val or "").strip()
    if not s or s.lower() in {"none","null","nan"}:
        return {"kind":"none","tests":[],"fn":None}

    # normalize quotes
    s = (s.replace("’","'").replace("‘","'")
           .replace("“","\"").replace("”","\""))

    obj = _try_json_or_eval(s)
    if isinstance(obj, str) and obj.strip().startswith("[") and obj.strip().endswith("]"):
        obj = _try_json_or_eval(obj)

    tests = []
    if isinstance(obj, list):
        for x in obj:
            xs = str(x).strip().strip('"').strip("'")
            if not xs.startswith("assert") and "==" in xs:
                xs = "assert " + xs
            if xs.startswith("assert"):
                tests.append(xs)
    else:
        # fallback: scrape raw string
        tests = [m.group(0).strip() for m in re.finditer(r"assert\s+.+", s)]

    # function name (first assert wins)
    fn = None
    for t in tests:
        m = ASSERT_NAME_RE.search(t)
        if m:
            fn = m.group(1)
            break

    return {"kind": "asserts" if tests else "none", "tests": tests, "fn": fn}


In [3]:
# S1 — Load base model and attach chat template
import os, torch

from unsloth import FastModel
from unsloth.chat_templates import get_chat_template

BASE_MODEL = "md-nishat-008/TigerLLM-1B-it"  # your pick

model, tokenizer = FastModel.from_pretrained(
    model_name      = BASE_MODEL,
    max_seq_length  = 1024,
    load_in_4bit    = False,
    load_in_8bit    = True,
    full_finetuning = False,
)

# Gemma-3 style chat template
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# Quick smoke generation
def quick_gen(prompt):
    chat = tokenizer.apply_chat_template(
        [{"role":"user","content":prompt}],
        add_generation_prompt=True, tokenize=False
    )
    toks = tokenizer(chat, return_tensors="pt").to(getattr(model,"device","cuda"))
    out = model.generate(**toks, max_new_tokens=128, do_sample=False, use_cache=False)
    print(tokenizer.decode(out[0], skip_special_tokens=True))

quick_gen("প্রদত্ত পূর্ণসংখ্যাটি একটি মৌলিক সংখ্যা কিনা তা পরীক্ষা করার জন্য একটি ফাংশন লিখুন। Example: prime_num(n)")


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


ImportError: Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo`

In [None]:
# S2 — attach LoRA adapters
from unsloth import FastModel

TARGETS = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,

    r            = 64,
    lora_alpha   = 128,
    lora_dropout = 0.05,
    bias         = "none",
    target_modules = TARGETS,
    random_state = 3407,
)
print("PEFT (LoRA) attached.")


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `model.base_model.model.model` require gradients
PEFT (LoRA) attached.


In [None]:
# G0 — robust code extractor + generators (place right after S1)
import re, types, builtins, torch

def _norm(s: str) -> str:
    return str(s or "").replace("\r\n","\n")

_FENCE_ANY = re.compile(r"```([\w+\-]*)\s*\n([\s\S]*?)\n```", re.I)

def extract_first_code_block(text: str) -> str | None:
    s = _norm(text)

    # 1) fenced blocks (prefer python)
    blocks = _FENCE_ANY.findall(s)
    if blocks:
        for lang, body in blocks:
            if lang.strip().lower() in ("python","py"):
                b = body.strip()
                if b: return b
        for _, body in blocks:
            b = body.strip()
            if b: return b

    # 2) Gemma-style section fallback
    if "\nmodel\n" in s:
        tail = s.split("\nmodel\n", 1)[1].strip()
        if ("def " in tail) or ("import " in tail) or ("return " in tail):
            return tail

    # 3) first top-level def
    lines = s.splitlines()
    start = None
    for i, L in enumerate(lines):
        if re.match(r"^\s*def\s+[A-Za-z_]\w*\s*\(", L):
            start = i; break
    if start is not None:
        buf=[lines[start]]
        for j in range(start+1,len(lines)):
            if re.match(r"^\s*(def|class)\s+[A-Za-z_]\w*\s*\(", lines[j]): break
            buf.append(lines[j])
        code="\n".join(buf).strip()
        if code: return code

    return None

def generate_code(model, tokenizer, prompt, *, do_sample=False, temperature=0.2, top_p=0.95, max_new_tokens=512):
    """Single-prompt with a strict system rule to emit ONE fenced python block."""
    messages = [
        {"role":"system",
         "content":"Reply with ONLY one ```python``` fenced block implementing the required function. No extra text."},
        {"role":"user","content":prompt},
    ]
    chat = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    toks = tokenizer(chat, return_tensors="pt").to(getattr(model, "device", "cuda"))
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    was_training = model.training
    model.eval()
    with torch.no_grad():
        out = model.generate(
            **toks,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            use_cache=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=pad_id,
        )
    if was_training: model.train()
    return tokenizer.decode(out[0], skip_special_tokens=True)

def generate_code_with_sys(model, tokenizer, system_text, user_text, *,
                           do_sample=False, temperature=0.2, top_p=0.95, max_new_tokens=512):
    """Two-message (system+user) generator matching SFT/RSFT serialization."""
    messages = [
        {"role":"system","content":system_text},
        {"role":"user","content":user_text},
    ]
    chat = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    toks = tokenizer(chat, return_tensors="pt").to(getattr(model, "device", "cuda"))
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    was_training = model.training
    model.eval()
    with torch.no_grad():
        out = model.generate(
            **toks,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            use_cache=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=pad_id,
        )
    if was_training: model.train()
    return tokenizer.decode(out[0], skip_special_tokens=True)


In [None]:
# E1 — permissive assert runner (place after G0 and before any eval/callback)
import re, types, builtins

def _expected_fn_from_asserts(asserts):
    for a in asserts:
        m = re.search(r"assert\s+([A-Za-z_]\w*)\s*\(", str(a))
        if m: return m.group(1)
    return None

def _sanitize_asserts(asserts):
    good=[]
    for raw in asserts:
        s=str(raw).strip()
        if not s.startswith("assert "):
            s = "assert " + s if not s.startswith("assert") else s
        try: compile(s, "<assert>", "exec"); good.append(s)
        except SyntaxError: continue
    return good

_COMMON_PREAMBLE = """
import math, itertools, functools, operator, bisect, heapq, statistics, collections, re, string, random, sys, datetime
from collections import Counter, defaultdict, deque, OrderedDict
from math import gcd, sqrt, factorial, comb, perm, ceil, floor
from bisect import bisect_left, bisect_right
"""

def _make_mod_permissive(code: str, expected_fn: str | None):
    try:
        m = types.ModuleType("student")
        g = {"__builtins__": builtins.__dict__}
        exec(_COMMON_PREAMBLE, g, g)
        exec(code, g, g)
        if expected_fn and expected_fn not in g:
            cands=[k for k,v in g.items() if callable(v) and not k.startswith("_")]
            if len(cands)==1: g[expected_fn]=g[cands[0]]
        m.__dict__.update(g); return m
    except Exception:
        return None

def run_asserts_module_permissive(generation_text: str, asserts: list[str]) -> tuple[int,int]:
    code = extract_first_code_block(generation_text)
    if not code: return (0,0)
    expected = _expected_fn_from_asserts(asserts)
    mod = _make_mod_permissive(code, expected)
    if mod is None: return (0,0)
    tests = _sanitize_asserts(asserts)
    ok=0
    for s in tests:
        try: exec(s, mod.__dict__, mod.__dict__); ok+=1
        except Exception: pass
    return ok, len(tests)


In [None]:
# R1 — helpers to mirror your SFT format (updated)
def build_system_prompt(fn: str) -> str:
    return (
        "তুমি একটি কোড-জেনারেশন সহকারী।\n"
        f"শুধু একটি ```python``` fenced ব্লকে **শুধুমাত্র** ফাংশন `{fn}` ইমপ্লিমেন্ট করবে।\n"
        "ইনপুট/আউটপুট বা print লিখবে না। প্রয়োজনীয় সব imports **ব্লকের ভিতরেই** দেবে। "
        "বাইরের টেক্সট, ব্যাখ্যা, বা একাধিক ব্লক দেবে না।"
    )

def build_user_prompt(instr: str, tests: list[str], fn: str) -> str:
    tests_comment = "\n".join(f"# {t}" for t in tests[:8])
    return (
        f"{instr.strip()}\n\n"
        "উদাহরণ টেস্টসমূহ (কমেন্ট আকারে):\n"
        f"{tests_comment}\n\n"
        f"ফাংশনের নাম অবশ্যই `{fn}` হবে এবং একটিমাত্র ```python``` fenced ব্লকে কোড দেবে।"
    )

def serialize_chat_for_sft(tokenizer, system_text: str, user_text: str, code_py: str) -> str:
    model_text = f"```python\n{code_py.strip()}\n```"
    messages = [
        {"role":"system","content": system_text},
        {"role":"user",  "content": user_text},
        {"role":"model", "content": model_text},
    ]
    txt = tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)
    return txt.replace("<bos>","")


In [None]:
# R0 — build DEV_HAS_TESTS, RSFT source, and a clean DEV_VAL (place after R1)
from sklearn.model_selection import train_test_split

DEV_parsed    = DEV.assign(_p = DEV.get("test_list","").apply(parse_tests_cell))
DEV_HAS_TESTS = DEV_parsed[DEV_parsed["_p"].apply(lambda d: len(d.get("tests",[]))>0)].reset_index(drop=True)

# If there are no rows with tests in DEV, fall back to TRIAL
if len(DEV_HAS_TESTS) == 0:
    print("⚠️ DEV has no usable asserts. Falling back to TRIAL for RSFT/Eval.")
    TRIAL_parsed    = TRIAL.assign(_p = TRIAL.get("test_list","").apply(parse_tests_cell))
    DEV_HAS_TESTS   = TRIAL_parsed[TRIAL_parsed["_p"].apply(lambda d: len(d.get("tests",[]))>0)].reset_index(drop=True)

DEV_RSFT, DEV_VAL = train_test_split(DEV_HAS_TESTS, test_size=0.2, random_state=42, shuffle=True)

print(f"DEV total={len(DEV)} | with tests={len(DEV_HAS_TESTS)} | RSFT-source={len(DEV_RSFT)} | DEV-val={len(DEV_VAL)}")

# Helper: always return a dataframe with tests
def pick_eval_df():
    for name in ["DEV_VAL","DEV_HAS_TESTS","TRIAL"]:
        if name in globals():
            df = globals()[name]
            parsed = df.assign(_p=df.get("test_list","").apply(parse_tests_cell))
            have = parsed[parsed["_p"].apply(lambda d: len(d.get("tests",[]))>0)]
            if len(have) > 0:
                print(f"[Eval] Using {name} with {len(have)}/{len(df)} rows that have tests.")
                return have.reset_index(drop=True)
    raise RuntimeError("No dataframe with usable asserts found.")


DEV total=400 | with tests=400 | RSFT-source=320 | DEV-val=80


In [None]:
# R2 — generate K samples per prompt, keep only those that pass all asserts (fixed: no nonlocal)
from datasets import Dataset
import torch

# ------- knobs -------
RSFT_MAX_ROWS        = min(20, len(DEV_RSFT))   # how many prompts to try
RSFT_K               = 15                      # samples per prompt
RSFT_TEMPERATURE     = 0.7
RSFT_TOP_P           = 0.95
RSFT_MAX_NEW_TOKENS  = 512
RSFT_PER_PROMPT_MAX  = 2                       # keep up to N winners per prompt (diversity)

# generator that uses the same chat shape as SFT/Callback (system + user)
def _gen_once_system_user(model, tokenizer, system_text, user_text,
                          *, do_sample, temperature, top_p, max_new_tokens):
    messages = [
        {"role":"system","content": system_text},
        {"role":"user",  "content": user_text},
    ]
    chat = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    toks = tokenizer(chat, return_tensors="pt").to(getattr(model, "device", "cuda"))
    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
    was_training = model.training
    model.eval()
    with torch.no_grad():
        out = model.generate(
            **toks,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            use_cache=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=pad_id,
        )
    if was_training: model.train()
    return tokenizer.decode(out[0], skip_special_tokens=True)

# pick the RSFT source set prepared in R0
source_df = DEV_RSFT if 'DEV_RSFT' in globals() else DEV_has_tests
picked = source_df.sample(min(RSFT_MAX_ROWS, len(source_df)), random_state=123).reset_index(drop=True)

winners_texts = []
attempted = 0
found_prompts_with_at_least_one = 0

for i, row in picked.iterrows():
    info  = parse_tests_cell(row.get("test_list", ""))
    tests = info["tests"]
    if not tests:
        continue
    fn = info["fn"] or "solve"
    system_text = build_system_prompt(fn)                   # from R1
    user_text   = build_user_prompt(row["instruction"], tests, fn)  # from R1

    codes_seen = set()
    winners_for_this_prompt = 0

    for _ in range(RSFT_K):
        gen = _gen_once_system_user(
            model, tokenizer, system_text, user_text,
            do_sample=True, temperature=RSFT_TEMPERATURE, top_p=RSFT_TOP_P,
            max_new_tokens=RSFT_MAX_NEW_TOKENS,
        )
        ok, tot = run_asserts_module_permissive(gen, tests)
        if tot > 0 and ok == tot:
            code = extract_first_code_block(gen)
            if code and ("def " in code) and (code not in codes_seen):
                codes_seen.add(code)
                winners_texts.append({
                    "text": serialize_chat_for_sft(tokenizer, system_text, user_text, code)  # from R1
                })
                winners_for_this_prompt += 1
                if winners_for_this_prompt >= RSFT_PER_PROMPT_MAX:
                    break

    attempted += 1
    found_prompts_with_at_least_one += int(winners_for_this_prompt > 0)
    print(f"[RSFT] prompt {i+1}/{len(picked)}: winners={winners_for_this_prompt} | "
          f"cumulative found={found_prompts_with_at_least_one}")

print(f"[RSFT] prompts attempted={attempted}, "
      f"prompts with ≥1 winner={found_prompts_with_at_least_one}, "
      f"total winners kept={len(winners_texts)}, "
      f"win-rate={found_prompts_with_at_least_one/max(1,attempted):.3f}")

rsft_ds = Dataset.from_list(winners_texts) if winners_texts else None
print(rsft_ds)


[RSFT] prompt 1/20: winners=2 | cumulative found=1
[RSFT] prompt 2/20: winners=0 | cumulative found=1
[RSFT] prompt 3/20: winners=0 | cumulative found=1
[RSFT] prompt 4/20: winners=0 | cumulative found=1
[RSFT] prompt 5/20: winners=2 | cumulative found=2
[RSFT] prompt 6/20: winners=2 | cumulative found=3
[RSFT] prompt 7/20: winners=0 | cumulative found=3
[RSFT] prompt 8/20: winners=2 | cumulative found=4
[RSFT] prompt 9/20: winners=0 | cumulative found=4
[RSFT] prompt 10/20: winners=0 | cumulative found=4
[RSFT] prompt 11/20: winners=0 | cumulative found=4
[RSFT] prompt 12/20: winners=1 | cumulative found=5
[RSFT] prompt 13/20: winners=0 | cumulative found=5
[RSFT] prompt 14/20: winners=0 | cumulative found=5
[RSFT] prompt 15/20: winners=0 | cumulative found=5
[RSFT] prompt 16/20: winners=2 | cumulative found=6
[RSFT] prompt 17/20: winners=0 | cumulative found=6
[RSFT] prompt 18/20: winners=2 | cumulative found=7
[RSFT] prompt 19/20: winners=2 | cumulative found=8
[RSFT] prompt 20/20: 

In [None]:

# --- Compatibility fallbacks in case R1 wasn't executed yet ---
try:
    build_system_prompt
except NameError:
    def build_system_prompt(fn: str) -> str:
        return (
            "তুমি একটি কোড-জেনারেশন সহকারী।\n"
            f"শুধু একটি ```python``` fenced ব্লকে **শুধুমাত্র** ফাংশন `{fn}` ইমপ্লিমেন্ট করবে।\n"
            "ইনপুট/আউটপুট বা print লিখবে না। প্রয়োজনীয় সব imports **ব্লকের ভিতরেই** দেবে। "
            "বাইরের টেক্সট, ব্যাখ্যা, বা একাধিক ব্লক দেবে না।"
        )
try:
    build_user_prompt
except NameError:
    def build_user_prompt(instr: str, tests: list[str], fn: str) -> str:
        tests_comment = "\n".join(f"# {t}" for t in tests[:8])
        return (
            f"{instr.strip()}\n\n"
            "উদাহরণ টেস্টসমূহ (কমেন্ট আকারে):\n"
            f"{tests_comment}\n\n"
            f"ফাংশনের নাম অবশ্যই `{fn}` হবে এবং একটিমাত্র ```python``` fenced ব্লকে কোড দেবে।"
        )
try:
    serialize_chat_for_sft
except NameError:
    def serialize_chat_for_sft(tokenizer, system_text: str, user_text: str, code_py: str) -> str:
        model_text = f"```python\n{code_py.strip()}\n```"
        messages = [
            {"role":"system","content": system_text},
            {"role":"user",  "content": user_text},
            {"role":"model", "content": model_text},
        ]
        txt = tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)
        return txt.replace("<bos>","")
# --- end fallbacks ---

# R3 — Augment RSFT dataset with TRIAL gold that pass public asserts
from datasets import Dataset

def _check_gold_row(row):
    p = parse_tests_cell(row.get("test_list"))
    tests = p.get("tests", []) if isinstance(p, dict) else []
    if not tests: return None
    code = str(row.get("response") or "").strip()
    if not code: return None
    # Build module with permissive defaults and test
    expected = _expected_fn_from_asserts(tests)
    mod = _make_mod_permissive(code, expected)
    if mod is None: return None
    sanitized = _sanitize_asserts(tests)
    ok = 0
    for s in sanitized:
        try:
            exec(s, mod.__dict__, mod.__dict__)
            ok += 1
        except Exception:
            pass
    return code if (ok == len(sanitized) and ok > 0) else None

trial_texts = []
if "TRIAL" in globals():
    for _, r in TRIAL.iterrows():
        code = _check_gold_row(r)
        if not code:
            continue
        p = parse_tests_cell(r.get("test_list"))
        tests = p.get("tests", []) if isinstance(p, dict) else []
        fn = (p.get("fn") if isinstance(p, dict) else None) or _expected_fn_from_asserts(tests)
        system_text = build_system_prompt(fn)
        user_text   = build_user_prompt(r.get("instruction",""), tests, fn)

        trial_texts.append({"text": serialize_chat_for_sft(tokenizer, system_text, user_text, code)})

trial_ok_ds = Dataset.from_list(trial_texts) if trial_texts else None

# Merge with RSFT-mined winners
if "rsft_ds" in globals() and rsft_ds and trial_ok_ds:
    from datasets import concatenate_datasets
    train_ds = concatenate_datasets([rsft_ds, trial_ok_ds]).shuffle(seed=42)
elif "rsft_ds" in globals() and rsft_ds:
    train_ds = rsft_ds
else:
    train_ds = trial_ok_ds

# Ensure downstream trainer uses the merged dataset
rsft_ds = train_ds
print("✅ R3: train dataset prepared:", rsft_ds)


✅ R3: train dataset prepared: Dataset({
    features: ['text'],
    num_rows: 81
})




In [None]:
import os, pandas as pd, json, torch
from transformers import TrainingArguments, TrainerCallback
from trl import SFTTrainer

class SaveBestCodeEval(TrainerCallback):
    def __init__(self, trainer, model, tokenizer, dev_df, tests_col="test_list",
                 every_n_steps=10, max_new_tokens=640, save_dir="/content/best-rsft"):
        self.trainer=trainer; self.model=model; self.tok=tokenizer
        parsed = dev_df.assign(_p = dev_df.get(tests_col,"").apply(parse_tests_cell))
        self.dev = parsed[parsed["_p"].apply(lambda d: len(d.get("tests",[]))>0)].reset_index(drop=True)
        self.tests_col=tests_col; self.every=max(1,every_n_steps); self.max_new=max_new_tokens
        self.best=-1.0; self.save_dir=save_dir
        os.makedirs(save_dir, exist_ok=True)

    def _one_eval(self):
        total=passed=0
        for _,r in self.dev.iterrows():
            info=parse_tests_cell(r.get(self.tests_col,"")); tests=info["tests"]
            if not tests: continue
            fn=info["fn"] or "solve"
            sys_t = build_system_prompt(fn)
            usr_t = build_user_prompt(str(r["instruction"]), tests, fn)
            gen = generate_code_with_sys(self.model, self.tok, sys_t, usr_t, do_sample=False, max_new_tokens=self.max_new)
            ok, tot = run_asserts_module_permissive(gen, tests)
            if tot>0:
                total+=1; passed+=int(ok==tot)
        return (passed/total if total else 0.0), passed, total

    def on_step_end(self, args, state, control, **kwargs):
        step=int(state.global_step or 0)
        if step>0 and step%self.every==0:
            rate,p,t = self._one_eval()
            print(f"[CodeEval] step={step} pass@1={rate:.3f} ({p}/{t})")
            try: self.trainer.log({"code/pass_at_1": rate, "code/total": t})
            except: pass
            if t>0 and (rate > self.best + 1e-6):
                self.best = rate
                print(f"[CodeEval] New best {rate:.3f} — saving to {self.save_dir}")
                self.trainer.save_model(self.save_dir)
        return control

# Build/verify RSFT winners first (your R2 must create rsft_ds)
# NOTE: If you haven't run R2 in this session, run it before this cell!

if (rsft_ds is None) or (len(rsft_ds) == 0):
    print("No RSFT winners found — skip RSFT.")
else:
    rsft_args = TrainingArguments(
        output_dir                  = "/content/blp-rsft-out",
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 4,
        learning_rate               = 5e-5,
        max_steps                   = 20,
        lr_scheduler_type           = "cosine",
        warmup_ratio                = 0.03,
        logging_steps               = 1,
        save_strategy               = "no",
        eval_strategy               = "no",
        bf16                        = False,
        fp16                        = False,
        remove_unused_columns       = False,
        report_to                   = [],
    )

    rsft_trainer = SFTTrainer(
        model              = model,
        tokenizer          = tokenizer,
        args               = rsft_args,
        train_dataset      = rsft_ds,
        dataset_text_field = "text",
        max_seq_length     = 1024,
    )

    # avoid duplicates
    rsft_trainer.callback_handler.callbacks = [
        cb for cb in rsft_trainer.callback_handler.callbacks
        if cb.__class__.__name__ not in ("CodeEvalCallback","SaveBestCodeEval")
    ]

    eval_df = pick_eval_df()
    rsft_trainer.add_callback(SaveBestCodeEval(
        trainer   = rsft_trainer,
        model     = model,
        tokenizer = tokenizer,
        dev_df    = eval_df,
        tests_col = "test_list",
        every_n_steps = 10,
        max_new_tokens = 640,
        save_dir  = "/content/best-rsft",
    ))

    print("Starting RSFT…")
    model.config.use_cache = False
    rsft_train_result = rsft_trainer.train()
    print(rsft_train_result)


Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/81 [00:00<?, ? examples/s]

[Eval] Using DEV_VAL with 80/80 rows that have tests.
Starting RSFT…


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 81 | Num Epochs = 7 | Total steps = 20
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 52,183,040 of 1,052,068,992 (4.96% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.707
2,2.7486
3,2.4286
4,2.1627
5,1.8268
6,1.812
7,1.5248
8,1.4433
9,1.3425
11,1.1586


[CodeEval] step=10 pass@1=0.233 (17/73)
[CodeEval] New best 0.233 — saving to /content/best-rsft
[CodeEval] step=20 pass@1=0.257 (18/70)
[CodeEval] New best 0.257 — saving to /content/best-rsft
TrainOutput(global_step=20, training_loss=1.4300328224897385, metrics={'train_runtime': 1490.5541, 'train_samples_per_second': 0.429, 'train_steps_per_second': 0.013, 'total_flos': 1134920689731072.0, 'train_loss': 1.4300328224897385, 'epoch': 6.7272727272727275})


In [None]:
# R4 — save refined model
if rsft_ds is not None and len(rsft_ds) > 0:
    SAVE_DIR_RSFT = "/content/blp-sft-dpo/rsft-lora"
    rsft_trainer.save_model(SAVE_DIR_RSFT)
    tokenizer.save_pretrained(SAVE_DIR_RSFT)
    print("✅ RSFT complete and saved to:", SAVE_DIR_RSFT)


✅ RSFT complete and saved to: /content/blp-sft-dpo/rsft-lora


In [None]:
# S-load — pick adapters (prefer RSFT) and attach to model
import os
from peft import PeftModel

ADAPTER_RSFT = "/content/blp-sft-dpo/rsft-lora"
#ADAPTER_SFT  = "/kaggle/working/blp-sft-dpo/sft-lora"
BEST_ADAPTER_DIR = ADAPTER_RSFT
print("Using adapters from:", BEST_ADAPTER_DIR)

try:
    # If base model (no PEFT yet), wrap with PeftModel.from_pretrained
    if not hasattr(model, "peft_config"):
        model = PeftModel.from_pretrained(model, BEST_ADAPTER_DIR)
    else:
        # If already PEFT, try loading weights into current PEFT container
        try:
            model.load_adapter(BEST_ADAPTER_DIR, adapter_name="inference")
            model.set_adapter("inference")
        except Exception:
            # Some PEFT variants don’t support load_adapter; fall back to re-wrap
            model = PeftModel.from_pretrained(model, BEST_ADAPTER_DIR)
    model.eval()
    print("✅ Adapters attached.")
except Exception as e:
    print("⚠️ Could not attach adapters, proceeding with current in-memory model:", repr(e))


Using adapters from: /content/blp-sft-dpo/rsft-lora
✅ Adapters attached.


In [None]:
# S-load — pick adapters (prefer RSFT) and attach to model
import os
from peft import PeftModel

ADAPTER_RSFT = "/content/blp-sft-dpo/rsft-lora"
#ADAPTER_SFT  = "/kaggle/working/blp-sft-dpo/sft-lora"
BEST_ADAPTER_DIR = ADAPTER_RSFT
print("Using adapters from:", BEST_ADAPTER_DIR)

try:
    # If base model (no PEFT yet), wrap with PeftModel.from_pretrained
    if not hasattr(model, "peft_config"):
        model = PeftModel.from_pretrained(model, BEST_ADAPTER_DIR)
    else:
        # If already PEFT, try loading weights into current PEFT container
        try:
            model.load_adapter(BEST_ADAPTER_DIR, adapter_name="inference")
            model.set_adapter("inference")
        except Exception:
            # Some PEFT variants don’t support load_adapter; fall back to re-wrap
            model = PeftModel.from_pretrained(model, BEST_ADAPTER_DIR)
    model.eval()
    print("✅ Adapters attached.")
except Exception as e:
    print("⚠️ Could not attach adapters, proceeding with current in-memory model:", repr(e))


Using adapters from: /content/blp-sft-dpo/rsft-lora
✅ Adapters attached.


In [None]:
# F-infer — helpers for inference messaging + payload extraction (place above S-run)
import re

def _extract_fn_from_instruction(instr: str) -> str:
    """Best-effort function-name extraction from the instruction text."""
    pats = [
        r'Exammple.*?\n\s*([A-Za-z_]\w*)\s*\(',
        r'Example.*?\n\s*([A-Za-z_]\w*)\s*\(',
        r'Examples?.*?\n\s*([A-Za-z_]\w*)\s*\(',
        r'উদাহরণ.*?\n\s*([A-Za-z_]\w*)\s*\(',
    ]
    for pat in pats:
        m = re.search(pat, instr, flags=re.IGNORECASE | re.MULTILINE)
        if m:
            return m.group(1)
    m_all = re.findall(r'([A-Za-z_]\w*)\s*\(', instr)
    return m_all[-1] if m_all else "solve"

# Fallbacks if helper cells weren't executed
try:
    build_system_prompt
except NameError:
    def build_system_prompt(fn: str) -> str:
        return (
            "তুমি একটি কোড-জেনারেশন সহকারী।\n"
            f"শুধু একটি ```python``` fenced ব্লকে **শুধুমাত্র** ফাংশন `{fn}` ইমপ্লিমেন্ট করবে।\n"
            "ইনপুট/আউটপুট বা print লিখবে না। প্রয়োজনীয় সব imports **ব্লকের ভিতরেই** দেবে। "
            "বাইরের টেক্সট, ব্যাখ্যা, বা একাধিক ব্লক দেবে না।"
        )

def build_messages_for_inference(instr: str):
    """
    Create (system,user) messages for inference, mirroring training constraints.
    """
    fn = _extract_fn_from_instruction(instr)
    user_text = (
        instr.strip() + "\n\n"
        f"ফাংশনের নাম অবশ্যই `{fn}` হবে এবং একটিমাত্র ```python``` fenced ব্লকে কোড দেবে।"
    )
    return [
        {"role": "system", "content": build_system_prompt(fn)},
        {"role": "user",   "content": user_text},
    ]

# Robust code payload extractor
try:
    extract_first_code_block
except NameError:
    extract_first_code_block = None

def extract_model_payload(decoded: str) -> str:
    """
    Return ONLY the python code block from the model's decoded text.
    Falls back gracefully if no fenced block is present.
    """
    s = str(decoded or "")
    if extract_first_code_block:
        code = extract_first_code_block(s)
        if code:
            return code.strip()
    m = re.search(r"```(?:python|py)?\s*\n([\s\S]*?)\n```", s, flags=re.IGNORECASE)
    if m:
        return m.group(1).strip()
    return s.strip()


In [20]:
# S-run — load data, generate, save submission.json
import pandas as pd, torch
from tqdm.auto import tqdm
from unsloth.chat_templates import get_chat_template

# Ensure template & padding match training
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.eval()
torch.manual_seed(0)  # deterministic greedy

# Load the eval CSV (change path if needed)
DEV_PATH = "/content/dev_v2.csv"   # <-- set to the file you must submit for
df = pd.read_csv(DEV_PATH, dtype=str, keep_default_na=False, engine="python")

# Normalize columns
cols_lower = {c.lower(): c for c in df.columns}
assert "instruction" in {c.lower() for c in df.columns}, f"Missing 'instruction' column. Found: {list(df.columns)}"
inst_col = cols_lower["instruction"]
id_col   = cols_lower.get("id", None)
if id_col is None:
    df["id"] = range(len(df))
    id_col = "id"
df[id_col] = pd.to_numeric(df[id_col], downcast="integer", errors="coerce").fillna(-1).astype(int)

# Optional: append helpful extra columns into the prompt
for extra in ["Exammple", "examples", "hint", "note"]:
    if extra in df.columns:
        df[inst_col] = (df[inst_col].astype(str).fillna("") + "\n" +
                        df[extra].astype(str).replace({"nan": ""}).fillna("")).str.strip()

# Generate
responses = []
for prompt in tqdm(df[inst_col].astype(str), desc="Generating"):
    messages = build_messages_for_inference(prompt)
    chat_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            do_sample=False,          # greedy for pass@1 determinism
            max_new_tokens=1024,
            use_cache=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    responses.append(extract_model_payload(decoded))

# Save
out_df = pd.DataFrame({"id": df[id_col].astype(int), "response": responses})
out_df.to_json("submission.json", orient="records", force_ascii=False, indent=2)
print(f"✅ Wrote submission.json with {len(out_df)} rows (id, response).")


Generating:   0%|          | 0/400 [00:00<?, ?it/s]

✅ Wrote submission.json with 400 rows (id, response).


In [22]:
# # repair_submission.py
# import json, re, ast, os
# import pandas as pd

# CSV_PATH = "/content/dev_v2.csv"     # path to your reference CSV
# SUB_PATH = "submission.json"    # path to your model outputs
# OUT_PATH = "submission_fixed.json"

# # --- helpers -------------------------------------------------------------

# def parse_test_list_cell(cell):
#     # CSV stores a Python string that itself is a Python list of assert strings.
#     # Example: '"""["assert foo(1)==2", "assert foo(2)==3"]"""'
#     # 1st literal_eval removes the outer quotes, 2nd gives list[str]
#     inner = ast.literal_eval(cell)
#     return ast.literal_eval(inner)

# def get_expected_fn_name(tests):
#     # Grab the function name from the first assert line that looks like "assert fn_name("
#     for t in tests:
#         m = re.search(r"assert\s+([A-Za-z_]\w*)\s*\(", t)
#         if m:
#             return m.group(1)
#     return None

# def strip_code_fence(s):
#     if not isinstance(s, str):
#         return ""
#     s = s.strip()
#     if s.startswith("```"):
#         # remove opening ```
#         s = s.split("```", 1)[1]
#         # drop optional language tag on first line
#         s = re.sub(r"^\s*\w+\n", "", s, count=1)
#         # remove trailing ```
#         if "```" in s:
#             s = s.rsplit("```", 1)[0]
#     return s.strip()

# def refence(code):
#     code = code.strip()
#     return f"```python\n{code}\n```"

# def first_def_name(code):
#     m = re.search(r"^\s*def\s+([A-Za-z_]\w*)\s*\(", code, flags=re.M)
#     return m.group(1) if m else None

# def ensure_def_colon(code):
#     # Add missing colon at end of def line if absent
#     def repl(m):
#         line = m.group(0)
#         return line if line.rstrip().endswith(":") else line + ":"
#     return re.sub(r"^\s*def\s+[A-Za-z_]\w*\s*\([^)]*\)\s*$", repl, code, flags=re.M)

# def keep_first_block_with_def(code):
#     # If multiple codeblocks/noise, keep the shortest slice that contains the first def
#     lines = code.splitlines()
#     # find first 'def'
#     try:
#         i = next(i for i,l in enumerate(lines) if re.match(r"^\s*def\s+[A-Za-z_]\w*\s*\(", l))
#     except StopIteration:
#         return code  # no def found
#     # from i to end
#     return "\n".join(lines[i:])

# def try_compile(code):
#     try:
#         ast.parse(code)
#         return True
#     except Exception:
#         return False

# def rename_first_def(code, new_name):
#     # rename ONLY the first def's name to new_name
#     return re.sub(
#         r"(^\s*def\s+)([A-Za-z_]\w*)(\s*\()",
#         r"\1" + new_name + r"\3",
#         code,
#         count=1,
#         flags=re.M
#     )

# # --- load data -----------------------------------------------------------
# df = pd.read_csv(CSV_PATH, dtype=str, keep_default_na=False)
# df["id"] = df["id"].astype(int)
# with open(SUB_PATH, "r", encoding="utf-8") as f:
#     sub = json.load(f)

# sub_by_id = {row["id"]: row["response"] for row in sub}

# fixed = []
# fix_stats = {"total":0, "missing_in_sub":0, "renamed":0, "colon_fixed":0, "refenced":0, "compiled":0}

# for _, row in df.iterrows():
#     rid = int(row["id"])
#     tests = parse_test_list_cell(row["test_list"])
#     want = get_expected_fn_name(tests) or "solution"  # fallback

#     raw = sub_by_id.get(rid, "")
#     fix_stats["total"] += 1
#     if raw is None:
#         raw = ""
#     code = strip_code_fence(raw)

#     # Remove obvious garbage around; keep from first def if present
#     code = keep_first_block_with_def(code)

#     # If there is no def at all, nothing to rename; we’ll just fence it.
#     have_name = first_def_name(code)

#     # Fix missing colon on def lines (common issue)
#     new_code = ensure_def_colon(code)
#     if new_code != code:
#         fix_stats["colon_fixed"] += 1
#         code = new_code

#     # If the first def name != expected, rename it to expected
#     have_name = first_def_name(code)
#     if have_name and have_name != want:
#         code = rename_first_def(code, want)
#         fix_stats["renamed"] += 1

#     # If still no def with expected name but there IS some def, try again to rename
#     if not re.search(rf"^\s*def\s+{re.escape(want)}\s*\(", code, flags=re.M):
#         have_name = first_def_name(code)
#         if have_name and have_name != want:
#             code = rename_first_def(code, want)
#             fix_stats["renamed"] += 1

#     # Final: if still no def statements at all, just fence what we have (grader may still fail)
#     ok = try_compile(code)

#     fixed.append({"id": rid, "response": refence(code)})
#     fix_stats["refenced"] += 1
#     fix_stats["compiled"] += int(ok)

# with open(OUT_PATH, "w", encoding="utf-8") as f:
#     json.dump(fixed, f, ensure_ascii=False, indent=2)

# print("Repair summary:", fix_stats)
# print(f"✅ Wrote {OUT_PATH}")


Repair summary: {'total': 400, 'missing_in_sub': 0, 'renamed': 6, 'colon_fixed': 0, 'refenced': 400, 'compiled': 390}
✅ Wrote submission_fixed.json


In [24]:
import json
import os
import ast
import pandas as pd
import time
import signal

reference_dir = "/content/" #replace with the path to the reference data (dev_v2.csv)
prediction_dir = "/content/" #replace with the path to the prediction data (submission.json)


#Do not modify anything below this part

# Timeout handler
def handler(signum, frame):
    raise TimeoutError("Execution timed out")


def evaluate_combined_data(res_data, ref_data):
    # Convert to DataFrames for easy merging
    res_df = pd.DataFrame(res_data)[['id', 'response']]
    ref_df = pd.DataFrame(ref_data)
    # Drop the response column from ref_df if it exists
    if 'response' in ref_df.columns:
        ref_df = ref_df.drop(columns=['response'])

    # Merge the data on 'id'
    combined_df = ref_df.merge(res_df, on='id', how='left')

    # Convert back to list of dictionaries
    combined_data = combined_df.to_dict('records')

    global_correct = 0
    global_total = len(combined_data)

    for entry in combined_data:
        entry_id = entry['id']
        response_code = entry.get('response', '')  # Use empty string if response missing
        test_list_raw = entry['test_list']
        if response_code is not None:
            response_code = response_code.strip('` \n').replace('python\n', '').strip()


        print(f"Executing Sample ID: {entry_id}")

        # 🚫 Skip code if it contains time.sleep (case-insensitive)
        if "time.sleep" in response_code.lower():
            print(f"⏭️ Skipping Code Execution: contains time.sleep()")
            continue

        correct = 0


        # Parse the test cases safely
        try:
            inner_str = ast.literal_eval(test_list_raw)
            test_cases = ast.literal_eval(inner_str)
        except Exception as e:
            print(f"❌❌❌❌ Failed to parse test_list: {e} ❌❌❌")
            continue

        # Create a shared namespace for exec
        namespace = {}

        try:
            # Set timeout for function definition
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(30)
            exec(response_code, namespace)
            signal.alarm(0)  # cancel timer if finished early
        except TimeoutError:
            print(f"⏱️ Timeout in function definition. Skipping test case execution for this ID.\n")
            continue
        except Exception as e:
            print(f"❌ Error in function definition: {e}. Skipping test case execution for this ID.\n")
            continue

        passed = True
        # Run each assert statement
        for i, assert_stmt in enumerate(test_cases):
            try:
                signal.alarm(30)  # 30 seconds per test case
                exec(assert_stmt, namespace)
                signal.alarm(0)
                correct += 1
            except TimeoutError:
                print(f"⏱️ Test case {i + 1} timed out. Skipping all remaining test cases for this ID.")
                passed = False
                break  # Exit loop on timeout
            except AssertionError:
                print(f"❌ Test case {i + 1} failed: assertion error. Skipping all remaining test cases for this ID.")
                passed = False
                break  # Exit loop on timeout
            except Exception as e:
                print(f"⚠️ Test case {i + 1} exception: {e}. Skipping all remaining test cases for this ID.")
                passed = False
                break  # Exit loop on timeout
            finally:
                signal.alarm(0)
        if passed:
            print(f"✅ ID {entry_id} Passed all test cases.\n")
        else:
            print(f"❌ ID {entry_id} Failed some test cases.\n")

        total = len(test_cases)
        if correct == total:
            global_correct += 1

    return global_correct, global_total




# Read both files
with open(os.path.join(prediction_dir,'submission.json'), 'r', encoding='utf-8') as f:
    res_data = json.load(f)



ref_df = pd.read_csv(
    os.path.join(reference_dir, 'dev_v2.csv'),
    dtype=str,                # keep everything as string to avoid NaN
    keep_default_na=False     # empty cells stay '', not NaN
)
# Ensure 'id' is numeric to merge cleanly (adjust to int if your JSON ids are ints)
ref_df['id'] = ref_df['id'].astype(int)
ref_data = ref_df.to_dict('records')

# Evaluate the combined data
correct, all = evaluate_combined_data(res_data, ref_data)

# Write the accuracy to scores.json
scores = {
    "accuracy": correct / all if all > 0 else 0.0
}

print(f"\nPass@1: {correct}/{all} = {scores['accuracy']:.2f}")




Executing Sample ID: 1
⚠️ Test case 1 exception: name 'Pair' is not defined. Skipping all remaining test cases for this ID.
❌ ID 1 Failed some test cases.

Executing Sample ID: 2
❌ Test case 2 failed: assertion error. Skipping all remaining test cases for this ID.
❌ ID 2 Failed some test cases.

Executing Sample ID: 3
❌ Test case 1 failed: assertion error. Skipping all remaining test cases for this ID.
❌ ID 3 Failed some test cases.

Executing Sample ID: 4
✅ ID 4 Passed all test cases.

Executing Sample ID: 5
✅ ID 5 Passed all test cases.

Executing Sample ID: 6
❌ Test case 1 failed: assertion error. Skipping all remaining test cases for this ID.
❌ ID 6 Failed some test cases.

Executing Sample ID: 7
❌ Test case 1 failed: assertion error. Skipping all remaining test cases for this ID.
❌ ID 7 Failed some test cases.

Executing Sample ID: 8
❌ Test case 2 failed: assertion error. Skipping all remaining test cases for this ID.
❌ ID 8 Failed some test cases.

Executing Sample ID: 9
⚠️ Test 