In [None]:
pip install -q transformers datasets peft accelerate bitsandbytes pandas tqdm

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

BASE_DIR = "/workspace"
CACHE_DIR = f"{BASE_DIR}/hf_cache"
MODEL_DIR = f"{BASE_DIR}/qwen2.5_coder_14b"   # optional local save for later reuse

# Create folders
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

print("CWD before:", os.getcwd())
os.chdir(BASE_DIR)
print("CWD now   :", os.getcwd())
print("CACHE_DIR :", CACHE_DIR)
print("MODEL_DIR :", MODEL_DIR)

CWD before: /workspace
CWD now   : /workspace
CACHE_DIR : /workspace/hf_cache
MODEL_DIR : /workspace/qwen2.5_coder_14b


In [3]:
import os

os.environ["HF_HOME"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR

print("HF_HOME              =", os.environ["HF_HOME"])
print("TRANSFORMERS_CACHE   =", os.environ["TRANSFORMERS_CACHE"])
print("HF_DATASETS_CACHE    =", os.environ["HF_DATASETS_CACHE"])


HF_HOME              = /workspace/hf_cache
TRANSFORMERS_CACHE   = /workspace/hf_cache
HF_DATASETS_CACHE    = /workspace/hf_cache


In [4]:
import shutil, os

paths = [
    os.path.expanduser("~/.cache/huggingface"),
    "/root/.cache/huggingface",
    "/root/.local/share/huggingface",
    os.path.expanduser("~/.cache/pip"),
    os.path.expanduser("~/.local/share/pip"),
]

for p in paths:
    try:
        if os.path.exists(p):
            shutil.rmtree(p, ignore_errors=True)
            print("Removed:", p)
        else:
            print("Not found:", p)
    except Exception as e:
        print("Skip (no permission?):", p, "->", e)


Not found: /root/.cache/huggingface
Not found: /root/.cache/huggingface
Not found: /root/.local/share/huggingface
Removed: /root/.cache/pip
Not found: /root/.local/share/pip


In [None]:
import sys
!{sys.executable} -m pip install -q --upgrade transformers accelerate bitsandbytes

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import os

# ‚úÖ Cache directory (so models don‚Äôt fill up the 30GB root overlay)
CACHE_DIR = "/workspace/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# ‚úÖ Model name
model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"

# ‚úÖ BitsAndBytes 4-bit quantization config (QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16  # use fp16 (bf16 not needed)
)

# ‚úÖ Tokenizer (Qwen2.5 still may need trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=CACHE_DIR,
    trust_remote_code=True
)

# Set pad_token safely
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ‚úÖ Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=CACHE_DIR,
    quantization_config=bnb_config,
    device_map="auto",        # automatically places layers across GPUs if multi-GPU
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

print("‚úÖ Model & tokenizer ready.")
print("GPU:", torch.cuda.get_device_name(0))




tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

‚úÖ Model & tokenizer ready.
GPU: NVIDIA A100 80GB PCIe


In [None]:
guideline = (
    "You are an ULTIMATE Python Coding Expert. Follow the rules STRICTLY.\n"
    "Input: A programming instruction in English with function name and parameters.\n"
    "Output: Only the correct Python code inside a fenced code block.\n\n"
    "Format:\n"
    "```python\n"
    "<code here>\n"
    "```\n\n"
    "Rules:\n"
    "- Reason internally; output only the final code block.\n"
    "- Handle edge cases; the code MUST pass typical unit tests.\n"
    "- Preserve the exact function name and parameters.\n"
    "- Always return the output (no print() / input()).\n"
    "- Do not define classes; use functions and variables only.\n"
    "- Import required libraries if needed (no unused imports).\n"
    "- Do not include any text, explanations, comments, or docstrings.\n"
    "\n"
)


In [None]:
# STEP 5 ‚Äî Apply Qwen chat template (system, user, assistant) ‚Äî RunPod version
import os, json
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

# Paths for RunPod
BASE_DIR   = "/workspace"
CACHE_DIR  = os.getenv("HF_HOME", f"{BASE_DIR}/hf_cache")
QWEN_MODEL_ID = "Qwen/Qwen2.5-Coder-14B-Instruct"

INPUT_PATH  = f"trial_mbpp_cleaned_data.csv"         # your cleaned CSV
OUTPUT_PATH = f"{BASE_DIR}/train_chat_templated.jsonl"       # where to write JSONL
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# Load tokenizer (Qwen2.5 typically needs trust_remote_code)
tokenizer = AutoTokenizer.from_pretrained(
    QWEN_MODEL_ID,
    use_fast=True,
    trust_remote_code=True,
    cache_dir=CACHE_DIR
)

# Load dataset (expects columns: instruction | response)
df = pd.read_csv(INPUT_PATH)

###
# df = df.sample(50)


def render_row(inst: str, resp: str) -> str:
    messages = [
        {"role": "system",   "content": guideline},
        {"role": "user",     "content": (inst or "").strip()},
        {"role": "assistant","content": (resp or "").strip()},
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False  # include assistant content in rendered text
    )

# Stream to jsonl
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for row in tqdm(df.itertuples(index=False), total=len(df), desc="Templating"):
        text = render_row(getattr(row, "instruction"), getattr(row, "response"))
        f.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")

print("‚úÖ Chat template applied ‚Üí", OUTPUT_PATH)


Templating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 3722.45it/s]

‚úÖ Chat template applied ‚Üí /workspace/train_chat_templated.jsonl





In [13]:
pd.set_option("display.max_colwidth", None)

In [None]:
# STEP 6 ‚Äî Tokenize (and optionally pack) ‚Äî RunPod version
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

# Paths for RunPod
BASE_DIR   = "/workspace"
CACHE_DIR  = os.getenv("HF_HOME", f"{BASE_DIR}/hf_cache")
QWEN_MODEL_ID = "Qwen/Qwen2.5-Coder-14B-Instruct"   # keep consistent with Step 5
INPUT_PATH = f"{BASE_DIR}/train_chat_templated.jsonl"
OUTPUT_PATH = f"{BASE_DIR}/ds_tokenized"
MAX_SEQ_LEN = 1024
PACK = False  # set True later if you want packed LM training

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    QWEN_MODEL_ID,
    use_fast=True,
    trust_remote_code=True,
    cache_dir=CACHE_DIR
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load chat-formatted jsonl -> HF dataset
ds = load_dataset("json", data_files=INPUT_PATH, split="train")

# Tokenization function
def tok_fn(batch):
    out = tokenizer(
        batch["text"],
        max_length=MAX_SEQ_LEN,
        truncation=True,
        padding=False,               # dynamic padding via collator later
        return_attention_mask=True,
    )
    # For causal LM, labels = input_ids
    out["labels"] = out["input_ids"].copy()
    return out

# Apply tokenizer
ds_tok = ds.map(tok_fn, batched=True, remove_columns=["text"])
print(ds_tok)

# (Optional) Packing ‚Äî disabled by default
if PACK:
    block_size = MAX_SEQ_LEN
    def group_texts(examples):
        concatenated = {k: sum(examples[k], []) for k in ["input_ids", "attention_mask", "labels"]}
        total_length = len(concatenated["input_ids"])
        total_length = (total_length // block_size) * block_size
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated.items()
        }
        return result
    ds_tok = ds_tok.map(group_texts, batched=True)
    print("‚úÖ Packed dataset:", ds_tok)

# Save tokenized dataset for reuse
ds_tok.save_to_disk(OUTPUT_PATH)
print("‚úÖ Saved tokenized dataset ‚Üí", OUTPUT_PATH)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})


Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]

‚úÖ Saved tokenized dataset ‚Üí /workspace/ds_tokenized


In [15]:
# STEP 7 ‚Äî Use the whole dataset for training (no val split)
# (Nothing to do beyond naming it explicitly)

train_dataset = ds_tok
print("‚úÖ Training samples:", len(train_dataset))

‚úÖ Training samples: 50


In [None]:
# STEP 8 ‚Äî Load model in 4-bit (QLoRA-ready) ‚Äî RunPod version
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import os

# Paths and model
BASE_DIR   = "/workspace"
CACHE_DIR  = os.getenv("HF_HOME", f"{BASE_DIR}/hf_cache")
QWEN_MODEL_ID = "Qwen/Qwen2.5-Coder-14B-Instruct"

# Compute dtype: use bf16 if supported, else fp16 (T4 will fall back to fp16)
compute_dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) else torch.float16

# 4-bit quantization config for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    QWEN_MODEL_ID,
    use_fast=True,
    trust_remote_code=True,
    cache_dir=CACHE_DIR
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model in 4-bit mode
model = AutoModelForCausalLM.from_pretrained(
    QWEN_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    cache_dir=CACHE_DIR,
    low_cpu_mem_usage=True
)

# Ensure pad id is set on model
model.config.pad_token_id = tokenizer.pad_token_id

print("‚úÖ Model loaded in 4-bit with compute dtype:", compute_dtype)
print("GPU:", torch.cuda.get_device_name(0))

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

‚úÖ Model loaded in 4-bit with compute dtype: torch.bfloat16
GPU: NVIDIA A100 80GB PCIe


In [17]:
# STEP 9 ‚Äî Enable gradient checkpointing + training flags for QLoRA

# ‚úÖ Reduce memory by recomputing activations
model.gradient_checkpointing_enable()

# ‚úÖ Disable caching (must be False when training with gradient checkpointing)
model.config.use_cache = False

# ‚úÖ Ensure inputs require gradients (needed for QLoRA / PEFT on 4-bit models)
model.enable_input_require_grads()

print("‚úÖ Gradient checkpointing ON; use_cache=False; inputs require grads.")

‚úÖ Gradient checkpointing ON; use_cache=False; inputs require grads.


In [None]:
# STEP 10 ‚Äî Define LoRA configuration for QLoRA fine-tuning
from peft import LoraConfig, TaskType

# üîß LoRA hyperparameters
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.05

# üîë Target modules (common for Qwen/transformer-based LLMs)
TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj"
]

# ‚úÖ Define LoRA config
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=TARGET_MODULES,
)

print("‚úÖ LoRA config ready with r =", LORA_R, "alpha =", LORA_ALPHA)


‚úÖ LoRA config ready with r = 8 alpha = 16


In [19]:
# STEP 11 ‚Äî Attach LoRA adapters with PEFT
from peft import get_peft_model

# Wrap the model with LoRA adapters
model = get_peft_model(model, lora_config)

# (Optional) sanity check: trainable vs total parameters
def print_trainable_parameters(m):
    trainable = 0
    total = 0
    for _, p in m.named_parameters():
        num = p.numel()
        total += num
        if p.requires_grad:
            trainable += num
    print(f"Trainable params: {trainable:,} | Total params: {total:,} | "
          f"Trainable%: {100 * trainable/total:.4f}%")

print_trainable_parameters(model)
print("‚úÖ PEFT adapters attached and model is ready for training.")


Trainable params: 4,399,104 | Total params: 319,518,592 | Trainable%: 1.3768%
‚úÖ PEFT adapters attached and model is ready for training.


In [None]:
# STEP 12 ‚Äî TrainingArguments for QLoRA fine-tuning on RunPod
from transformers import TrainingArguments
import os

# ‚úÖ Output directory in /workspace (not /kaggle)
OUTPUT_DIR = "/workspace/qwen2.5-14b-qlora"

train_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    per_device_train_batch_size=4,   # keep small, accumulate instead
    gradient_accumulation_steps=4,  # effective batch size = 16
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.0,
    fp16=True,                       # use mixed precision
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",

    # ‚úÖ Logging & checkpointing
    logging_strategy="epoch",        # print one line per epoch
    save_strategy="epoch",           # save at each epoch
    save_total_limit=2,              # keep only last 2 checkpoints (saves disk)
    logging_steps=50,                # only used if logging_strategy="steps"

    # ‚úÖ Performance
    dataloader_num_workers=4,

    # ‚úÖ Disable external loggers (use WandB if needed)
    report_to="none",
)

print("‚úÖ TrainingArguments ready ‚Üí", OUTPUT_DIR)

‚úÖ TrainingArguments ready ‚Üí /workspace/qwen2.5-14b-qlora


In [21]:
# STEP 13 ‚Äî Initialize Trainer (dynamic padding; labels created by collator)
from datasets import load_from_disk
from transformers import DataCollatorForLanguageModeling, Trainer

# ‚úÖ Reload tokenized dataset saved in Step 6 (RunPod path)
train_dataset = load_from_disk("/workspace/ds_tokenized")

# ‚úÖ Drop precomputed labels to avoid mismatch when using dynamic padding
if "labels" in train_dataset.column_names:
    train_dataset = train_dataset.remove_columns("labels")

# ‚úÖ Ensure right-padding for causal LM
tokenizer.padding_side = "right"

# ‚úÖ Collator: pads dynamically and sets labels = input_ids (mlm=False)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,   # speeds up training on tensor cores (A100)
)

# ‚úÖ Trainer: links model, args, dataset, tokenizer, collator
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,   # full dataset (no validation split)
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("‚úÖ Trainer initialized (dynamic padding + labels).")


‚úÖ Trainer initialized (dynamic padding + labels).


  trainer = Trainer(


In [24]:
# STEP 14 ‚Äî Train
train_result = trainer.train()

# Save PEFT adapters and tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("‚úÖ Training finished.")
print(train_result)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
4,3.0388


‚úÖ Training finished.
TrainOutput(global_step=4, training_loss=3.0387957096099854, metrics={'train_runtime': 21.7074, 'train_samples_per_second': 2.303, 'train_steps_per_second': 0.184, 'total_flos': 25859326089216.0, 'train_loss': 3.0387957096099854, 'epoch': 1.0})


In [None]:
# STEP ‚Äî Reload fine-tuned model (base + LoRA adapters) and run inference
import os, re, json, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# ‚úÖ Model & paths
BASE_DIR     = "/workspace"
CACHE_DIR    = os.getenv("HF_HOME", f"{BASE_DIR}/hf_cache")
QWEN_MODEL_ID = "Qwen/Qwen2.5-Coder-14B-Instruct"   # use same as training
OUTPUT_DIR    = f"{BASE_DIR}/qwen2.5-14b-qlora"    # LoRA adapter save dir from training

# ‚úÖ Compute dtype
compute_dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) else torch.float16

# ‚úÖ Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)

# ‚úÖ Reload tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    QWEN_MODEL_ID,
    use_fast=True,
    trust_remote_code=True,
    cache_dir=CACHE_DIR
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ‚úÖ Reload base model in 4-bit
base_model = AutoModelForCausalLM.from_pretrained(
    QWEN_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    cache_dir=CACHE_DIR
)

# ‚úÖ Attach trained LoRA adapters
model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
model.eval()
print("‚úÖ Fine-tuned model (base + LoRA) ready for inference.")

# ===== Prompt rendering helpers =====
def render_prompt(instruction: str) -> str:
    msgs = [
        {"role": "system", "content": guideline},  # replace with your guideline
        {"role": "user", "content": instruction}
    ]
    return tokenizer.apply_chat_template(
        msgs,
        tokenize=False,
        add_generation_prompt=True
    )

# Extract code block from model output
CODE_BLOCK_RE = re.compile(r"```python\s*([\s\S]*?)```", re.IGNORECASE)
def extract_code(text: str) -> str:
    m = CODE_BLOCK_RE.search(text)
    return m.group(1).strip() if m else text.strip()

# ===== Inference function =====
@torch.inference_mode()
def generate_code(instruction: str,
                  max_new_tokens=512,
                  temperature=0.2,
                  top_p=0.9,
                  rep_penalty=1.05):
    text = render_prompt(instruction)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    out_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=rep_penalty,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    gen = tokenizer.decode(out_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return extract_code(gen)

# ===== Quick sanity check =====
print(generate_code("Write a function sum_series(n) that returns the sum of first n natural numbers."))


‚úÖ Fine-tuned model (base + LoRA) ready for inference.
def sum_series(n):
    # Initialize the sum variable
    total_sum = 0
    
    # Loop through the range from 1 to n
    for i in range(1, n + 1):
        # Add the current number to the total sum
        total_sum += i
    
    return total_sum


In [26]:
from peft import PeftModel

# Sanity check: model type should be a PEFT wrapper (not raw Qwen model)
print("Model class:", type(model))

# LoRA config should be visible in the model
print("PEFT config:", getattr(model, "peft_config", None))


Model class: <class 'peft.peft_model.PeftModelForCausalLM'>
PEFT config: {'default': LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='Qwen/Qwen2.5-Coder-0.5B-Instruct', revision=None, inference_mode=True, r=8, target_modules={'o_proj', 'k_proj', 'q_proj', 'up_proj', 'v_proj', 'gate_proj', 'down_proj'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None)}


In [None]:
# === DEV INFERENCE: use fine-tuned Qwen (base + LoRA) ===
import os, re, json, torch
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# ---- Config ----
BASE_DIR      = "/workspace"
CACHE_DIR     = os.getenv("HF_HOME", f"{BASE_DIR}/hf_cache")
QWEN_MODEL_ID = "Qwen/Qwen2.5-Coder-14B-Instruct"   # must match training
OUTPUT_DIR    = f"{BASE_DIR}/qwen2.5-14b-qlora"    # LoRA adapter dir (from TrainingArguments)
DEV_IN_PATH   = f"gpt_translated_test_data.csv"  # must contain: id,instruction
PRED_JSON     = f"{BASE_DIR}/dev_predictions.json"
SUB_JSON      = f"{BASE_DIR}/submission.json"

# ---- Load tokenizer & base model ----
compute_dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) else torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
)

tokenizer = AutoTokenizer.from_pretrained(
    QWEN_MODEL_ID,
    use_fast=True,
    trust_remote_code=True,
    cache_dir=CACHE_DIR
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

_base = AutoModelForCausalLM.from_pretrained(
    QWEN_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    cache_dir=CACHE_DIR
)

# ---- Attach trained LoRA adapters ----
model = PeftModel.from_pretrained(_base, OUTPUT_DIR)
model.eval()
print("‚úÖ Fine-tuned model ready for dev inference.")

# ---- Prompt rendering ----
def render_prompt(instruction: str) -> str:
    msgs = [
        {"role": "system", "content": guideline},  # replace with your guideline if defined
        {"role": "user",  "content": instruction.strip()}
    ]
    return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

CODE_BLOCK_RE = re.compile(r"```python\s*([\s\S]*?)```", re.IGNORECASE)
def extract_code(text: str) -> str:
    m = CODE_BLOCK_RE.search(text)
    return m.group(1).strip() if m else text.strip()

# ---- Inference function ----
@torch.inference_mode()
def generate_code(instruction: str,
                  max_new_tokens=512,
                  rep_penalty=1.05):
    text = render_prompt(instruction)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    out_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,        # deterministic (greedy decoding)
        temperature=0.0,
        top_p=1.0,
        repetition_penalty=rep_penalty,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    gen = tokenizer.decode(out_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return extract_code(gen)

# ---- Run inference over dev set ----
df = pd.read_csv(DEV_IN_PATH)

###############
# df = df.sample(20)

assert {"id", "instruction"}.issubset(df.columns), "dev_translated.csv must have id,instruction"

pred_rows = []
for rid, instr in tqdm(zip(df["id"], df["instruction"]), total=len(df), desc="üîÅ Generating"):
    code = generate_code(instr)
    fenced = f"```python\n{code}\n```"
    pred_rows.append({"id": int(rid), "instruction": instr, "response": fenced})

# ---- Save predictions ----
with open(PRED_JSON, "w", encoding="utf-8") as f:
    json.dump(pred_rows, f, ensure_ascii=False, indent=2)

sub_rows = [{"id": r["id"], "response": r["response"]} for r in pred_rows]
with open(SUB_JSON, "w", encoding="utf-8") as f:
    json.dump(sub_rows, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Wrote {PRED_JSON}, {SUB_JSON}")

‚úÖ Fine-tuned model ready for dev inference.


üîÅ Generating:   0%|          | 0/20 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [None]:
import json, os, re, zipfile

SUB_PATH = "/workspace/submission.json"

def file_format_check(path: str) -> bool:
    # name + extension
    if os.path.basename(path) != "submission.json":
        print("Error: File name must be exactly 'submission.json'")
        return False
    if not path.lower().endswith(".json"):
        print("Error: File must have .json extension")
        return False

    # must be valid JSON (not JSONL) and root must be a list
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {e}")
        print("Note: The file must be in proper JSON format (not JSONL)")
        return False

    if not isinstance(data, list):
        print("Error: The root element should be a list of objects")
        return False

    # each item: dict with ONLY keys {'id','response'}; id=int; response=str
    for idx, item in enumerate(data):
        if not isinstance(item, dict):
            print(f"Error: Item at index {idx} is not a dictionary")
            return False
        keys = set(item.keys())
        if keys != {"id", "response"}:
            print(f"Error: Item at index {idx} must contain only keys 'id' and 'response', found: {keys}")
            return False
        if not isinstance(item["id"], int):
            print(f"Error: 'id' field at index {idx} must be an integer")
            return False
        if not isinstance(item["response"], str):
            print(f"Error: 'response' field at index {idx} must be a string")
            return False

    print("Format check passed successfully!")
    return True

# ---------- Load, compute per-item validity, blank invalids, save, zip ----------
# Load JSON list
with open(SUB_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

n = len(data)
fence_pat = re.compile(r"^```python[\s\S]*```$", re.MULTILINE)

valid_format = []
valid_fence  = []
valid_both   = []

# Per-item validation mirrors file checker semantics
def item_format_ok(item):
    return (
        isinstance(item, dict)
        and set(item.keys()) == {"id", "response"}
        and isinstance(item["id"], int)
        and isinstance(item["response"], str)
    )

for item in data:
    vfmt = item_format_ok(item)
    vf   = bool(fence_pat.match(item["response"])) if vfmt else False
    valid_format.append(vfmt)
    valid_fence.append(vf)
    valid_both.append(vfmt and vf)

# After computing valid_format, valid_fence, valid_both
for i, item in enumerate(data):
    if not valid_format[i]:
        print(f"‚ùå Format Error at index {i}: {item}")
    elif not valid_fence[i]:
        print(f"‚ùå Fencing Error at index {i} (id={item.get('id')}):")
        print(item["response"])
        print("-" * 50)



# Report stats
nf = sum(valid_fence)
nm = sum(valid_format)
nb = sum(valid_both)
den = max(n, 1)
print(f"Fencing valid: {nf}/{n} ({nf*100.0/den:.1f}%)")
print(f"Format valid:  {nm}/{n} ({nm*100.0/den:.1f}%)")
print(f"Both valid:    {nb}/{n} ({nb*100.0/den:.1f}%)")

# Strict policy: blank responses that fail ANY check
for i, ok in enumerate(valid_both):
    if not ok and isinstance(data[i], dict) and "response" in data[i]:
        data[i]["response"] = ""

# Overwrite submission.json (id+response only)
with open(SUB_PATH, "w", encoding="utf-8") as f:
    json.dump(
        [{"id": item["id"], "response": item["response"]} for item in data],
        f, ensure_ascii=False, indent=2
    )
print("‚úÖ Updated submission.json after checks (invalid responses blanked).")

# Final file-level check (should pass)
_ = file_format_check(SUB_PATH)

# Zip as submission.zip (Jupyter-friendly, no shell commands)
with zipfile.ZipFile("submission.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write(SUB_PATH)
print("üì¶ Created submission.zip containing submission.json.")
