In [1]:
import os
import logging
from functools import partial

# Restrict this process to GPU 0. This assignment is placed ahead of the
# transformers / model_training imports below so it is already in effect
# when they run.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Add the project root to sys.path so the model_training package can be imported
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # assuming notebook is inside jupyterNotebooks/
# Guard the append so re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import pipeline helper functions and config
from transformers import set_seed
from model_training.config import (
    csv_path, output_dir, train_batch_size,
    gradient_accumulation, num_epochs, learning_rate, model_path,truncation_side,padding_side
)
from model_training.dataset_utils import load_dataset, preprocess_dataset,format_prompt
from model_training.tokenizer_utils import load_tokenizer, tokenize_function
from model_training.model_utils import load_model
from model_training.logging_utils import setup_main_logger, start_hardware_logging

print("Imports done")


  import pynvml  # type: ignore[import]


Imports done


In [2]:
# Keep the seed in one place so the value passed to set_seed and the value
# reported in the message cannot drift apart.
SEED = 42
set_seed(SEED)
print(f"Seed set to {SEED}")


Seed set to 42


In [3]:
# Create the configured output directory up front (it is handed to the hardware
# logger in a later cell); exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# For the notebook we keep it simple — log to the console instead of a file.
logger = logging.getLogger("train_pipeline")
logger.setLevel(logging.INFO)
# Check this logger's own handler list rather than logger.hasHandlers():
# hasHandlers() also reports handlers attached to ancestor loggers (e.g. the
# root logger), which would silently skip the console handler and its format.
# The check still prevents stacking a new handler each time the cell is re-run.
if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    # This logger now emits on its own; stop forwarding records to ancestor
    # handlers so a record is not printed twice when the root logger has one.
    logger.propagate = False

logger.info("Starting training pipeline")


2025-09-16 16:15:03,357 - INFO - Starting training pipeline


In [4]:
# Start the project's hardware logger, giving it the output directory and
# interval=5 (presumably a polling period in seconds — confirm in logging_utils).
# The returned (stop_event, log_thread) pair is kept in the notebook namespace.
# NOTE(review): no shutdown of this logger is visible in this part of the
# notebook, and re-running the cell presumably starts a second logger while
# overwriting the handles to the first — confirm and stop it before re-running.
stop_event, log_thread = start_hardware_logging(output_dir, interval=5)
logger.info("Started hardware logging thread")


2025-09-16 16:15:10,277 - INFO - Started hardware logging thread


In [10]:
# Load the dataset through the project's load_dataset helper using the
# configured csv_path, then preview the first record.
# NOTE(review): load_dataset and csv_path are brought in by the imports cell at
# the top of the notebook; the NameError recorded under this cell suggests it was
# run in a kernel session where that cell had not been executed — re-run from
# the top before using this cell.
dataset = load_dataset(csv_path)
logger.info("Loaded dataset")
print(f"Dataset sample:\n{dataset[0]}")

NameError: name 'load_dataset' is not defined

In [None]:
# Run the project's preprocess_dataset helper on the loaded dataset, then report
# the resulting keys, the size of each split, and one training record.
split_dataset = preprocess_dataset(dataset)
logger.info("Split dataset into train/test")

print("Split dataset keys:", split_dataset.keys())
train_split = split_dataset["train"]
print(f"Train size: {len(train_split)}")
test_split = split_dataset["test"]
print(f"Test size: {len(test_split)}")
print("\nSample train record:")
print(train_split[0])


In [None]:
from datasets import Dataset
from model_training.tokenizer_utils import load_tokenizer, tokenize_function
from functools import partial
# Hand-written fixture: three rows whose text grows from small to large, so the
# tokenization cell below can be tried on inputs of different lengths without
# loading the real CSV. Each key is a column; every list must have one entry per row.
dummy_data = {
    "input": [
        "Add two numbers.",  # Small
        "Add two numbers. Ensure the function handles both positive and negative integers.",  # Medium
        "Add two numbers. Ensure the function handles both positive and negative integers. The function should also support very large numbers, be optimized for performance, handle edge cases such as integer overflows, and provide meaningful error messages in case of invalid input. Additionally, it should be covered by unit tests and integration tests to ensure correctness."  # Large
    ],
    "output": [
        "Return their sum.",  # Small
        "Return the sum of the numbers, accounting for positive and negative values.",  # Medium
        "Return the sum of the numbers, ensuring correctness across all edge cases, including large integer values, and throw an appropriate exception in case of invalid inputs like None or non-integer types."  # Large
    ],
    "requirement_description": [
        "Write a function that takes two integers and returns their sum.",
        "Write a function that takes two integers and returns their sum. Ensure that the implementation is safe for all valid integer inputs.",
        "Write a function that takes two integers and returns their sum. The function must pass rigorous testing, including randomized tests, performance benchmarks, and edge case scenarios such as maximum and minimum integers."
    ],
    "test_steps": [
        "Input: 2, 3 -> Output: 5",
        "Input: -1, 4 -> Output: 3; Input: 0, 0 -> Output: 0",
        "Input: 2147483647, 1 -> Output: Error; Input: -2147483648, -1 -> Output: Error"
    ]
}

# Convert to HuggingFace dataset
# (column names are the dict keys above; they are what the tokenization cell's
# remove_columns list refers to)
dummy_dataset = Dataset.from_dict(dummy_data)

In [None]:
tokenizer = load_tokenizer()

# Tokenize the dummy rows in batches. tokenize_function receives the tokenizer
# through functools.partial because Dataset.map only passes the batch itself.
tokenized_dataset = dummy_dataset.map(
    partial(tokenize_function, tokenizer=tokenizer),
    # Drop every original column. Taking the names from the dataset keeps this
    # correct if the fixture's columns change, instead of repeating the schema
    # in a hard-coded list that has to be kept in sync by hand.
    remove_columns=dummy_dataset.column_names,
    batched=True,
    batch_size=5,
)


In [6]:
from model_training.model_utils import load_model
def debug_trainable_params(model, verbose=True):
    """Print which parameters of ``model`` are trainable, plus a summary line.

    Iterates ``model.named_parameters()`` and sums ``param.numel()`` over all
    parameters and over those whose ``requires_grad`` is true.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``,
            ``requires_grad`` and ``shape``.
        verbose: When True (the default, matching the previous behaviour),
            print one line per parameter, marked ``[✓]`` if trainable and
            ``[ ]`` otherwise. Pass False to print only the header and summary.

    Returns:
        None. Everything is written to stdout.

    Note:
        Counts are based on ``numel()`` of the tensors as stored. The output
        recorded in this notebook shows 4-bit base weights with shapes such as
        ``[8388608, 1]``, so the totals presumably count packed storage
        elements rather than logical weights for those layers — verify before
        quoting the percentage.
    """
    total = 0
    trainable = 0
    print("\n[Trainable Parameters Check]")
    for name, param in model.named_parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count
        if verbose:
            marker = "✓" if param.requires_grad else " "
            print(f"[{marker}] {name} | shape: {param.shape}")
    # A model with no parameters would otherwise raise ZeroDivisionError here.
    percent = 100 * trainable / total if total else 0.0
    print(f"🔢 Total: {total:,} | Trainable: {trainable:,} ({percent:.2f}%)")

# After loading model
# Load the model through the project's load_model helper (called with no
# arguments), then list its parameters and which of them require gradients.
model = load_model()
debug_trainable_params(model)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


[Trainable Parameters Check]
[ ] base_model.model.model.embed_tokens.weight | shape: torch.Size([32000, 4096])
[ ] base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight | shape: torch.Size([8388608, 1])
[✓] base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight | shape: torch.Size([8, 4096])
[✓] base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight | shape: torch.Size([4096, 8])
[ ] base_model.model.model.layers.0.self_attn.k_proj.weight | shape: torch.Size([2097152, 1])
[ ] base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight | shape: torch.Size([2097152, 1])
[✓] base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight | shape: torch.Size([8, 4096])
[✓] base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight | shape: torch.Size([1024, 8])
[ ] base_model.model.model.layers.0.self_attn.o_proj.weight | shape: torch.Size([8388608, 1])
[ ] base_model.model.model.layers.0.mlp.gate_proj.weight | shape: 

In [7]:
# Print the module tree to inspect how the layers are wrapped; the recorded
# output shows which projections carry LoRA adapter modules.
print(model)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj)