In [1]:
# Use the %pip magic so packages are installed into the environment of the
# running kernel, rather than whichever `pip` the shell finds first on PATH.
%pip install -q --upgrade pip
%pip install -q datasets transformers accelerate evaluate sacrebleu rouge_score
# NOTE: this installs PEFT from the repository's default branch, so the version
# that gets installed can differ from one run to the next.
%pip install -q git+https://github.com/huggingface/peft.git

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m

[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.8 MB[0m [31m9.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m27.8 MB/s[0m eta [36m0:00:01[0m

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25h

  Installing build dependencies ... [?25l[?25hdone


  Getting requirements to build wheel ... [?25l[?25hdone


  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


  Building wheel for rouge_score (pyproject.toml) ... [?25l[?25hdone


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
bigframes 2.12.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.31.0, but you have google-cloud-bigquery 3.25.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.1.0 which is incompatible.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
gradio 5.38.1 requires pydantic<2.12,>=2.0, but you have pydantic 2.12.0a1 which is incompatible.
cud

  Installing build dependencies ... [?25l[?25hdone


  Getting requirements to build wheel ... [?25l[?25hdone


  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone


In [2]:
# ----------------------
# 1) Imports & environment config
# ----------------------
import os
os.environ["WANDB_DISABLED"] = "true"  # disable wandb login prompt (recommended for Colab)

import math
import time
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    logging as hf_logging,
)
from peft import LoraConfig, get_peft_model  # for LoRA
import evaluate
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Set the transformers logger to INFO; this is what makes the config dumps and
# "loading file ..." messages appear in the cell output.
hf_logging.set_verbosity_info()

# Device: "cuda" when torch reports a usable GPU, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ----------------------
# 2) Dataset loading (CodeXGLUE Java example; fallback to codeparrot)
# ----------------------
# Primary corpus and configuration passed to load_dataset below.
DATASET_ID = "google/code_x_glue_ct_code_to_text"  # CodeXGLUE; subset "java"
DATASET_CONFIG = "java"  # for code_x_glue_ct_code_to_text

try:
    print("Loading dataset:", DATASET_ID, DATASET_CONFIG)
    dataset = load_dataset(DATASET_ID, DATASET_CONFIG)
except Exception as e:
    # Best-effort fallback: ANY exception from the primary load lands here,
    # not only network failures.
    # NOTE(review): the fallback's columns and splits are not visible here; later
    # code indexes dataset["train"] and a "validation"/"test" split — confirm the
    # fallback actually provides them.
    print("Primary dataset load failed:", e)
    print("Falling back to codeparrot/codeparrot-py (small demo).")
    dataset = load_dataset("codeparrot/codeparrot-py")  # python examples

# Show the loaded splits and their columns.
print(dataset)

# For quick testing on Colab reduce dataset size (optional)
# dataset = dataset.map(lambda x: x, batched=True).shard(num_shards=10, index=0)  # example

# ----------------------
# 3) Tokenizer & Preprocessing
# ----------------------
# Choose model/tokenizer - GPT-2 base example; change to code-aware tokenizer if available
MODEL_NAME = "gpt2"  # small model for Colab demo; replace with code models if available
print("Loading tokenizer and model:", MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# GPT-2 has no pad token by default; register a dedicated one (required for batch padding).
# tokenize_function pads every example with this token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

# Load model (causal LM)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Grow the embedding matrix to the tokenizer's current size so the id of any
# newly added token (the pad token above) has an embedding row.
model.resize_token_embeddings(len(tokenizer))  # in case tokenizer added tokens
model.to(device)

# Print the train split's columns; detect_fields (below) picks the code/text columns from them.
print("Dataset column names (train):", dataset["train"].column_names)
# Common CodeXGLUE fields: 'code' and 'docstring' or similar. We'll try common names:
def detect_fields(ds):
    """Choose the code column and the natural-language column of a dataset split.

    Args:
        ds: object exposing ``column_names`` (a sequence of column name strings).

    Returns:
        ``(code_key, text_key)``. Each is the first well-known candidate name that
        is present in the split. When no code candidate matches, the first column
        is used. When no text candidate matches, the second column is used if
        there is one, otherwise ``text_key`` is ``None``.
    """
    cols = ds.column_names

    def first_present(candidates):
        # Candidates are tried in priority order; the first hit wins.
        for name in candidates:
            if name in cols:
                return name
        return None

    code_key = first_present(("code", "function", "snippet", "original_string", "task"))
    text_key = first_present(("docstring", "summary", "text", "nl", "doc"))

    # Positional fallbacks when none of the well-known names exist.
    if code_key is None:
        code_key = cols[0]
    if text_key is None and len(cols) > 1:
        text_key = cols[1]
    return code_key, text_key

# Resolve the column names once from the train split. format_example_for_lm
# reads code_key / text_key as module-level globals.
code_key, text_key = detect_fields(dataset["train"])
print("Using code field:", code_key, "and text field:", text_key)

# Build input format: prompt -> code generation.
# For code-to-text tasks might be reversed; adjust based on your task.
# We'll assume inputs are text (spec) and outputs are code (generation) for a code generation assistant:
def format_example_for_lm(example):
    """Render one dataset row as a single prompt-plus-code training string.

    Reads the module-level ``text_key`` and ``code_key`` to find the columns.
    A missing or falsy value (``None``, ``""``) is rendered as an empty string.

    Args:
        example: mapping for one dataset row (must support ``.get``).

    Returns:
        ``{"text": "### Prompt:\\n<prompt>\\n### Code:\\n<code>"}``.
    """
    prompt = example.get(text_key, "")
    if not prompt:
        prompt = ""
    code = example.get(code_key, "")
    if not code:
        code = ""
    # The "### Code:" line is the separator the evaluation step later splits on.
    return {"text": f"### Prompt:\n{prompt}\n### Code:\n{code}"}

# Add a "text" column (prompt + code in one string) unless the train split already
# has a column with that name; in that case the existing column is used as-is.
if "text" not in dataset["train"].column_names:
    dataset = dataset.map(lambda ex: format_example_for_lm(ex), batched=False)

# Tokenize
# Every example is truncated/padded to exactly this many tokens (see tokenize_function).
MAX_LENGTH = 512
def tokenize_function(examples):
    """Tokenize a batch of ``"text"`` strings for causal-LM training.

    Uses the module-level ``tokenizer`` and ``MAX_LENGTH``. Every sequence is
    truncated and padded to exactly ``MAX_LENGTH`` tokens.

    Args:
        examples: batch mapping with a ``"text"`` entry.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry that is a shallow
        copy of ``"input_ids"`` (causal-LM targets are the inputs themselves).
    """
    encoded = tokenizer(
        examples["text"],
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize every split in batches and drop all original columns (the train split's
# column list is used for removal), leaving only what tokenize_function returns.
tokenized = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

# Quick peek
print(tokenized)
print("Sample tokenized keys:", list(tokenized["train"].features.keys()))

# ----------------------
# 4) Data collator and Trainer setup
# ----------------------
# Causal-LM collator (mlm=False): no token masking is applied.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

OUTPUT_DIR = "./codegen_model_demo"
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="epoch",  # evaluate once per epoch (formerly `evaluation_strategy`)
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    learning_rate=5e-5,
    weight_decay=0.01,
    push_to_hub=False,
    remove_unused_columns=False,
    # Explicitly disable all logging integrations. The WANDB_DISABLED environment
    # variable is deprecated in favour of this argument.
    report_to="none",
    num_train_epochs=1,  # single epoch keeps the demo short
)

# Use a smaller subset of the training data for faster training. The size is
# clamped so a split with fewer rows than requested does not raise in select().
TRAIN_SUBSET_SIZE = 10000
small_train_dataset = tokenized["train"].select(
    range(min(TRAIN_SUBSET_SIZE, len(tokenized["train"])))
)

# ----------------------
# 5) Optional: LoRA (PEFT) — set USE_LORA=True to enable
# ----------------------
# Toggle for parameter-efficient fine-tuning. When False, the full model is trained.
USE_LORA = False  # set True to enable LoRA (recommended for low-resource setups)
if USE_LORA:
    print("Applying LoRA...")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["c_attn", "c_proj"],  # these target modules may need adjusting per model
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    # Rebinds `model` to the PEFT-wrapped model; everything below (Trainer,
    # generation, saving) then operates on the wrapped model.
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

# ----------------------
# 6) Trainer and training
# ----------------------
# Train on the reduced train subset. Evaluation uses the whole "validation" split
# when it exists, otherwise the whole "test" split (no sub-sampling here).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset, # Use the smaller dataset
    eval_dataset=tokenized["validation"] if "validation" in tokenized else tokenized["test"],
    data_collator=data_collator,
)

print("Starting training... (this may take a while on larger datasets)")
trainer.train()

# Save model and tokenizer side by side so OUTPUT_DIR can be reloaded on its own.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved model & tokenizer to", OUTPUT_DIR)

# ----------------------
# 7) Evaluation: generate predictions, compute BLEU & ROUGE
# ----------------------
# Load metrics
bleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")

# Prepare a small evaluation set (tune this): the first EVAL_SAMPLES rows of the
# "validation" split, or of "test" when there is no validation split.
EVAL_SAMPLES = 50
eval_ds = tokenized["validation"] if "validation" in tokenized else tokenized["test"]
eval_ds = eval_ds.select(range(min(len(eval_ds), EVAL_SAMPLES)))

# Switch the model to eval mode before generation.
model.eval()
# Parallel lists filled by the generation loop: preds[i] is scored against refs[i].
preds = []
refs = []
# Helper to decode and clean text
def decode_text(ids):
    """Turn a list of token ids back into text using the module-level ``tokenizer``.

    Special tokens are dropped, tokenization spaces are cleaned up, and leading
    and trailing whitespace is stripped from the result.
    """
    decoded = tokenizer.decode(
        ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded.strip()

# If dataset still contains 'text' combining prompt & code, we need to split prompt vs code in ref.
# For simplicity, we assume the reference code starts after "### Code:\n" token in the original text.
def extract_ref_code_from_text(raw_text):
    """Return the reference code that follows the first "### Code:" marker.

    Args:
        raw_text: decoded example text, or ``None``.

    Returns:
        ``""`` for ``None``. The stripped text after the first marker when the
        marker is present. Otherwise ``raw_text`` unchanged (not stripped).
    """
    if raw_text is None:
        return ""
    _, found, tail = raw_text.partition("### Code:")
    return tail.strip() if found else raw_text

# We'll iterate and generate
# Separator written by format_example_for_lm between the prompt and the code.
code_marker = "### Code:"

for ex in tqdm(eval_ds):
    # Decode the stored example. Special tokens (including the right-padding added
    # at tokenization time) are dropped by decode_text.
    raw_text = decode_text(ex["input_ids"])

    # Condition the model on the prompt ONLY. The stored input_ids also contain the
    # reference code, so feeding them in full would leak the answer into the context.
    if code_marker in raw_text:
        prompt_text = raw_text.split(code_marker, 1)[0] + code_marker + "\n"
    else:
        # Marker missing (e.g. cut off by truncation): fall back to the whole text.
        prompt_text = raw_text

    # Re-encode without padding; the encoding carries its own attention_mask.
    enc = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    prompt_len = enc["input_ids"].shape[1]

    with torch.no_grad():
        gen_ids = model.generate(
            **enc,
            max_new_tokens=128,
            num_beams=3,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Score only the newly generated tokens, not the echoed prompt.
    pred_text = decode_text(gen_ids[0, prompt_len:].cpu().tolist())
    ref_code = extract_ref_code_from_text(raw_text)
    preds.append(pred_text)
    refs.append(ref_code)

# Compute BLEU (sacrebleu via evaluate expects references as list of reference lists)
bleu_res = bleu_metric.compute(predictions=preds, references=[[r] for r in refs])
rouge_res = rouge_metric.compute(predictions=preds, references=refs)
print(f"BLEU score: {bleu_res['score']:.2f}")
if "rougeL" in rouge_res:
    print(f"ROUGE-L F1: {rouge_res['rougeL']:.2f}")
else:
    print("ROUGE output:", rouge_res)

# ----------------------
# 8) Execution-based testing (VERY DANGEROUS — read warnings below)
# ----------------------
# WARNING: Executing generated code is potentially harmful. This toy harness is NOT secure.
# Use proper sandboxing for untrusted or unknown code (Docker with resource/time limits).
def safe_exec_test(code_str, test_fn_name=None, input_args=None, timeout_seconds=2):
    """
    Run a Python snippet in a stripped-down namespace and report whether it succeeded.

    Toy harness for *trusted* snippets only: emptying ``__builtins__`` is NOT a
    sandbox, and exec() on model output remains dangerous.

    - code_str: Python source passed to exec().
    - test_fn_name: optional name of a function defined by code_str; when given,
      it is called after the snippet has been executed.
    - input_args: optional positional arguments for that call (default: none).
    - timeout_seconds: accepted for interface compatibility; NOT enforced here.

    Returns (passed: bool, message: str). ``passed`` is True when the snippet
    executes without raising and, if ``test_fn_name`` is given, the function exists
    and its call does not raise. Any ``Exception`` yields ``(False, str(error))``.
    """
    # A single dict serves as both globals and locals, so functions defined by the
    # snippet can resolve each other (and themselves, for recursion) at call time.
    # Builtins are emptied: snippets using range/len/print/etc. fail with NameError.
    namespace = {"__builtins__": {}}
    try:
        exec(code_str, namespace)
        if not test_fn_name:
            # No function requested: a clean exec counts as a pass.
            return True, "Executed without error."
        if test_fn_name not in namespace:
            return False, f"Function {test_fn_name} not found."
        out = namespace[test_fn_name](*(input_args or []))
        return True, f"Ran {test_fn_name} -> output: {out}"
    except Exception as e:
        return False, str(e)

# Example usage on preds (ONLY if snippets are Python and you trust them)
# Each prediction is exec'd as Python source with no test function requested;
# `ok` is whatever safe_exec_test reports and `msg` is not used.
# NOTE(review): DATASET_CONFIG is "java" while exec() only accepts Python source —
# confirm this step is meaningful for the chosen dataset.
exec_results = []
for p in preds[:20]:  # run only first 20 for speed & safety
    ok, msg = safe_exec_test(p)
    exec_results.append(ok)

# Fraction of executed snippets that passed; max(1, ...) avoids dividing by zero
# when there are no predictions.
exec_pass_rate = sum(1 for x in exec_results if x) / max(1, len(exec_results))
print(f"Toy execution pass rate (untrusted code WARNING): {exec_pass_rate:.2%}")

# ----------------------
# 9) Collect & plot metrics per run (single epoch demo)
# ----------------------
# One-row table (epoch 1). ROUGE-L falls back to 0.0 when the key is missing or
# rouge_res is not a dict.
# NOTE(review): BLEU and ROUGE-L may be on different scales — confirm before
# comparing them on a shared axis.
metrics_df = pd.DataFrame({
    "epoch": [1],
    "BLEU": [bleu_res["score"]],
    "ROUGE-L": [rouge_res.get("rougeL", 0.0) if isinstance(rouge_res, dict) else 0.0],
    "exec_pass_rate": [exec_pass_rate],
})
print(metrics_df)

# plotting function (simple)
def plot_metrics(df, out_dir="/mnt/data/codegen_metrics_examples"):
    """Save two line plots of the metrics table and return their file paths.

    Args:
        df: DataFrame with columns "epoch", "BLEU", "ROUGE-L" and "exec_pass_rate".
        out_dir: directory for the PNG files; created if it does not exist.

    Returns:
        ``(bleu_path, exec_path)``: paths of the BLEU/ROUGE-L plot and of the
        execution-pass-rate plot.
    """
    os.makedirs(out_dir, exist_ok=True)
    epochs = df["epoch"].tolist()

    # Figure 1: BLEU and ROUGE-L on shared axes (no loss curve in this demo).
    score_fig, score_ax = plt.subplots(figsize=(8, 4))
    for column in ("BLEU", "ROUGE-L"):
        score_ax.plot(epochs, df[column], marker='o')
    score_ax.set_xlabel("Epoch")
    score_ax.set_ylabel("Score")
    score_ax.set_title("BLEU & ROUGE-L")
    score_ax.legend(["BLEU", "ROUGE-L"])
    score_ax.grid(True)
    score_fig.tight_layout()
    bleu_path = os.path.join(out_dir, "bleu_rouge_plot.png")
    score_fig.savefig(bleu_path)
    plt.close(score_fig)

    # Figure 2: execution pass rate, y-axis fixed to the [0, 1.05] range.
    rate_fig, rate_ax = plt.subplots(figsize=(8, 4))
    rate_ax.plot(epochs, df["exec_pass_rate"], marker='o')
    rate_ax.set_xlabel("Epoch")
    rate_ax.set_ylabel("Execution Pass Rate")
    rate_ax.set_title("Execution Pass Rate")
    rate_ax.set_ylim(0, 1.05)
    rate_ax.grid(True)
    rate_fig.tight_layout()
    exec_path = os.path.join(out_dir, "exec_pass_plot.png")
    rate_fig.savefig(exec_path)
    plt.close(rate_fig)

    print("Saved plots to", out_dir)
    return bleu_path, exec_path

# Keep the returned paths so the summary reports where the plots actually went,
# instead of repeating plot_metrics's default directory as a literal.
bleu_plot_path, exec_plot_path = plot_metrics(metrics_df)

# ----------------------
# End - summary
# ----------------------
print("Done. Summary metrics:")
print(metrics_df.to_string(index=False))
print("Model & tokenizer saved to:", OUTPUT_DIR)
print("Plots saved to", os.path.dirname(bleu_plot_path))

2025-11-03 21:41:17.631824: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762206077.925969      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762206078.010163      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device: cpu
Loading dataset: google/code_x_glue_ct_code_to_text java


README.md: 0.00B [00:00, ?B/s]

java/train-00000-of-00001.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

java/validation-00000-of-00001.parquet:   0%|          | 0.00/4.25M [00:00<?, ?B/s]

java/test-00000-of-00001.parquet:   0%|          | 0.00/9.38M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/164923 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5183 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10955 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 164923
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 5183
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 10955
    })
})
Loading tokenizer and model: gpt2


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json


Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.53.3",
  "use_cache": true,
  "vocab_size": 50257
}



vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/vocab.json


loading file merges.txt from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/merges.txt


loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer.json


loading file added_tokens.json from cache at None


loading file special_tokens_map.json from cache at None


loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer_config.json


loading file chat_template.jinja from cache at None


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json


Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.53.3",
  "use_cache": true,
  "vocab_size": 50257
}



loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json


Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.53.3",
  "use_cache": true,
  "vocab_size": 50257
}





model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/model.safetensors


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}



All model checkpoint weights were used when initializing GPT2LMHeadModel.



All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/generation_config.json


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}



You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50258. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Dataset column names (train): ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url']
Using code field: code and text field: docstring


Map:   0%|          | 0/164923 [00:00<?, ? examples/s]

Map:   0%|          | 0/5183 [00:00<?, ? examples/s]

Map:   0%|          | 0/10955 [00:00<?, ? examples/s]

Map:   0%|          | 0/164923 [00:00<?, ? examples/s]

Map:   0%|          | 0/5183 [00:00<?, ? examples/s]

Map:   0%|          | 0/10955 [00:00<?, ? examples/s]

PyTorch: setting up devices


The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 164923
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5183
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10955
    })
})
Sample tokenized keys: ['input_ids', 'attention_mask', 'labels']


***** Running training *****


  Num examples = 10,000


  Num Epochs = 1


  Instantaneous batch size per device = 4


  Total train batch size (w. parallel, distributed & accumulation) = 4


  Gradient Accumulation steps = 1


  Total optimization steps = 2,500


  Number of trainable parameters = 124,440,576


Starting training... (this may take a while on larger datasets)


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss


In [None]:
# ----------------------
# 7) Evaluation: generate predictions, compute BLEU & ROUGE
# ----------------------
# NOTE: this cell repeats sections 7-9 of the previous cell. It relies on state
# created there (tokenized, model, tokenizer, OUTPUT_DIR and the imports).
# Load metrics
bleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")

# Prepare a small evaluation set (tune this): the first EVAL_SAMPLES rows of the
# "validation" split, or of "test" when there is no validation split.
EVAL_SAMPLES = 50
eval_ds = tokenized["validation"] if "validation" in tokenized else tokenized["test"]
eval_ds = eval_ds.select(range(min(len(eval_ds), EVAL_SAMPLES)))

# Switch the model to eval mode before generation.
model.eval()
# Parallel lists filled by the generation loop: preds[i] is scored against refs[i].
preds = []
refs = []

# Helper to decode and clean text
def decode_text(ids):
    """Turn a list of token ids back into text using the module-level ``tokenizer``.

    Special tokens are dropped, tokenization spaces are cleaned up, and leading
    and trailing whitespace is stripped from the result.
    """
    decoded = tokenizer.decode(
        ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded.strip()

# If dataset still contains 'text' combining prompt & code, we need to split prompt vs code in ref.
# For simplicity, we assume the reference code starts after "### Code:\n" token in the original text.
def extract_ref_code_from_text(raw_text):
    """Return the reference code that follows the first "### Code:" marker.

    Args:
        raw_text: decoded example text, or ``None``.

    Returns:
        ``""`` for ``None``. The stripped text after the first marker when the
        marker is present. Otherwise ``raw_text`` unchanged (not stripped).
    """
    if raw_text is None:
        return ""
    _, found, tail = raw_text.partition("### Code:")
    return tail.strip() if found else raw_text

# We'll iterate and generate
# Separator written by format_example_for_lm between the prompt and the code.
code_marker = "### Code:"

for ex in tqdm(eval_ds):
    # Decode the stored example. Special tokens (including the right-padding added
    # at tokenization time) are dropped by decode_text.
    raw_text = decode_text(ex["input_ids"])

    # Condition the model on the prompt ONLY. The stored input_ids also contain the
    # reference code, so feeding them in full would leak the answer into the context.
    if code_marker in raw_text:
        prompt_text = raw_text.split(code_marker, 1)[0] + code_marker + "\n"
    else:
        # Marker missing (e.g. cut off by truncation): fall back to the whole text.
        prompt_text = raw_text

    # Re-encode without padding; the encoding carries its own attention_mask.
    enc = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    prompt_len = enc["input_ids"].shape[1]

    with torch.no_grad():
        gen_ids = model.generate(
            **enc,
            max_new_tokens=128,
            num_beams=3,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Score only the newly generated tokens, not the echoed prompt.
    pred_text = decode_text(gen_ids[0, prompt_len:].cpu().tolist())
    ref_code = extract_ref_code_from_text(raw_text)
    preds.append(pred_text)
    refs.append(ref_code)

# Compute BLEU (sacrebleu via evaluate expects references as list of reference lists)
bleu_res = bleu_metric.compute(predictions=preds, references=[[r] for r in refs])
rouge_res = rouge_metric.compute(predictions=preds, references=refs)
print(f"BLEU score: {bleu_res['score']:.2f}")
if "rougeL" in rouge_res:
    print(f"ROUGE-L F1: {rouge_res['rougeL']:.2f}")
else:
    print("ROUGE output:", rouge_res)

# ----------------------
# 8) Execution-based testing (VERY DANGEROUS — read warnings below)
# ----------------------
# WARNING: Executing generated code is potentially harmful. This toy harness is NOT secure.
# Use proper sandboxing for untrusted or unknown code (Docker with resource/time limits).
def safe_exec_test(code_str, test_fn_name=None, input_args=None, timeout_seconds=2):
    """
    Run a Python snippet in a stripped-down namespace and report whether it succeeded.

    Toy harness for *trusted* snippets only: emptying ``__builtins__`` is NOT a
    sandbox, and exec() on model output remains dangerous.

    - code_str: Python source passed to exec().
    - test_fn_name: optional name of a function defined by code_str; when given,
      it is called after the snippet has been executed.
    - input_args: optional positional arguments for that call (default: none).
    - timeout_seconds: accepted for interface compatibility; NOT enforced here.

    Returns (passed: bool, message: str). ``passed`` is True when the snippet
    executes without raising and, if ``test_fn_name`` is given, the function exists
    and its call does not raise. Any ``Exception`` yields ``(False, str(error))``.
    """
    # A single dict serves as both globals and locals, so functions defined by the
    # snippet can resolve each other (and themselves, for recursion) at call time.
    # Builtins are emptied: snippets using range/len/print/etc. fail with NameError.
    namespace = {"__builtins__": {}}
    try:
        exec(code_str, namespace)
        if not test_fn_name:
            # No function requested: a clean exec counts as a pass.
            return True, "Executed without error."
        if test_fn_name not in namespace:
            return False, f"Function {test_fn_name} not found."
        out = namespace[test_fn_name](*(input_args or []))
        return True, f"Ran {test_fn_name} -> output: {out}"
    except Exception as e:
        return False, str(e)

# Example usage on preds (ONLY if snippets are Python and you trust them)
# Each prediction is exec'd as Python source with no test function requested;
# `ok` is whatever safe_exec_test reports and `msg` is not used.
exec_results = []
for p in preds[:20]:  # run only first 20 for speed & safety
    ok, msg = safe_exec_test(p)
    exec_results.append(ok)

# Fraction of executed snippets that passed; max(1, ...) avoids dividing by zero
# when there are no predictions.
exec_pass_rate = sum(1 for x in exec_results if x) / max(1, len(exec_results))
print(f"Toy execution pass rate (untrusted code WARNING): {exec_pass_rate:.2%}")

# ----------------------
# 9) Collect & plot metrics per run (single epoch demo)
# ----------------------
# One-row table (epoch 1). ROUGE-L falls back to 0.0 when the key is missing or
# rouge_res is not a dict.
# NOTE(review): BLEU and ROUGE-L may be on different scales — confirm before
# comparing them on a shared axis.
metrics_df = pd.DataFrame({
    "epoch": [1],
    "BLEU": [bleu_res["score"]],
    "ROUGE-L": [rouge_res.get("rougeL", 0.0) if isinstance(rouge_res, dict) else 0.0],
    "exec_pass_rate": [exec_pass_rate],
})
print(metrics_df)

# plotting function (simple)
def plot_metrics(df, out_dir="/mnt/data/codegen_metrics_examples"):
    """Save two line plots of the metrics table and return their file paths.

    Args:
        df: DataFrame with columns "epoch", "BLEU", "ROUGE-L" and "exec_pass_rate".
        out_dir: directory for the PNG files; created if it does not exist.

    Returns:
        ``(bleu_path, exec_path)``: paths of the BLEU/ROUGE-L plot and of the
        execution-pass-rate plot.
    """
    os.makedirs(out_dir, exist_ok=True)
    epochs = df["epoch"].tolist()

    # Figure 1: BLEU and ROUGE-L on shared axes (no loss curve in this demo).
    score_fig, score_ax = plt.subplots(figsize=(8, 4))
    for column in ("BLEU", "ROUGE-L"):
        score_ax.plot(epochs, df[column], marker='o')
    score_ax.set_xlabel("Epoch")
    score_ax.set_ylabel("Score")
    score_ax.set_title("BLEU & ROUGE-L")
    score_ax.legend(["BLEU", "ROUGE-L"])
    score_ax.grid(True)
    score_fig.tight_layout()
    bleu_path = os.path.join(out_dir, "bleu_rouge_plot.png")
    score_fig.savefig(bleu_path)
    plt.close(score_fig)

    # Figure 2: execution pass rate, y-axis fixed to the [0, 1.05] range.
    rate_fig, rate_ax = plt.subplots(figsize=(8, 4))
    rate_ax.plot(epochs, df["exec_pass_rate"], marker='o')
    rate_ax.set_xlabel("Epoch")
    rate_ax.set_ylabel("Execution Pass Rate")
    rate_ax.set_title("Execution Pass Rate")
    rate_ax.set_ylim(0, 1.05)
    rate_ax.grid(True)
    rate_fig.tight_layout()
    exec_path = os.path.join(out_dir, "exec_pass_plot.png")
    rate_fig.savefig(exec_path)
    plt.close(rate_fig)

    print("Saved plots to", out_dir)
    return bleu_path, exec_path

# Keep the returned paths so the summary reports where the plots actually went,
# instead of repeating plot_metrics's default directory as a literal.
bleu_plot_path, exec_plot_path = plot_metrics(metrics_df)

# ----------------------
# End - summary
# ----------------------
print("Done. Summary metrics:")
print(metrics_df.to_string(index=False))
print("Model & tokenizer saved to:", OUTPUT_DIR)
print("Plots saved to", os.path.dirname(bleu_plot_path))