In [1]:
import os
import logging
from functools import partial

# Make sure CUDA_VISIBLE_DEVICES is set to use only GPU 0.
# This is done before the transformers / model_training imports below so the
# setting is already in the environment when those libraries are loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Add project root to path so the model_training package can be imported
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # assuming notebook is inside jupyterNotebooks/
# Guard the append so re-running this cell does not pile up duplicate entries
# on sys.path (the original appended unconditionally on every execution).
if project_root not in sys.path:
    sys.path.append(project_root)

# Import pipeline helper functions and config
from transformers import set_seed
from model_training.config import (
    csv_path, output_dir, train_batch_size,
    gradient_accumulation, num_epochs, learning_rate, model_path,truncation_side,padding_side
)
from model_training.dataset_utils import load_dataset, preprocess_dataset,format_prompt
from model_training.tokenizer_utils import load_tokenizer, tokenize_function
from model_training.model_utils import load_model
from model_training.logging_utils import setup_main_logger, start_hardware_logging

print("Imports done")


  import pynvml  # type: ignore[import]


Imports done


In [2]:
# Single source of truth for the run's random seed: the value passed to
# set_seed and the value reported in the message can no longer drift apart.
SEED = 42
set_seed(SEED)
print(f"Seed set to {SEED}")


Seed set to 42


In [3]:
# Create the output directory up front; exist_ok=True keeps re-runs of this
# cell from failing when the directory is already there.
os.makedirs(output_dir, exist_ok=True)

# For notebook, we'll keep it simple — log to console instead of file
logger = logging.getLogger("train_pipeline")
logger.setLevel(logging.INFO)
# Inspect this logger's own handler list instead of hasHandlers():
# hasHandlers() also reports handlers attached to ancestor loggers, so a
# handler on the root logger (installed by any other library or an earlier
# basicConfig call) would skip this setup and the formatter below would never
# be applied. Checking logger.handlers still keeps the cell idempotent.
if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
# The console handler above fully handles each record; do not also hand it to
# ancestor loggers, otherwise a configured root logger prints it a second time.
logger.propagate = False

logger.info("Starting training pipeline")


2025-09-16 00:29:29,583 - INFO - Starting training pipeline


In [4]:
# Start hardware logging with a 5-unit interval, writing under output_dir.
# The helper hands back two objects which are kept as stop_event / log_thread.
# NOTE(review): re-running this cell calls start_hardware_logging again without
# using the previous stop_event — confirm whether that leaves an earlier
# logger running in the background.
hardware_logging_handles = start_hardware_logging(output_dir, interval=5)
stop_event, log_thread = hardware_logging_handles
logger.info("Started hardware logging thread")


2025-09-16 00:29:29,589 - INFO - Started hardware logging thread


In [5]:
# Load the dataset from the configured CSV path.
dataset = load_dataset(csv_path)
logger.info("Loaded dataset")

# Show the first record as a quick sanity check of the loaded columns.
first_record = dataset[0]
print(f"Dataset sample:\n{first_record}")

Map:   0%|          | 0/8570 [00:00<?, ? examples/s]

2025-09-16 00:29:29,967 - INFO - Loaded dataset


Dataset sample:
{'requirement_description': 'detection_degradation_requirements, Adcam_Low system shall degrade Traffic Sign Recognition feature for required Calibration faults and set a primary DTC.', 'test_steps': 'Step 1 : Set ECU Calibration TSR_VS_Critical_FunctionalityFiMXcp_b to 0 - Activated\r\nStep 2 : Set ECU Calibration TSR_VS_Low_FunctionalityFiMXcp_b to 0 - Activated\r\nStep 3 : Set ECU Calibration TSR_VS_DetFeature_DEGReason_Internal_Error_NonRecoverableFiMXcp_b to 1 - DeActivated\r\nStep 4 : Set ECU Calibration TSR_VS_DetFeature_DEGReason_Internal_Error_RecoverableFiMXcp_b to 1 - DeActivated\r\nStep 5 : Set ECU Calibration TSR_VS_DetFeature_DEGReason_External_FaultFiMXcp_b to 1 - DeActivated\r\nStep 6 : Set ECU Calibration TSR_VS_DetFeature_DEGReason_InputNotAvailable_Calib_Blockage_MsgsFiMXcp_b to 1 - DeActivated\r\nStep 7 : Set ECU Calibration TSR_VS_DEGReason_Sensor_BlockedFiMXcp_b to 1 - DeActivated\r\nStep 8 : Set ECU Calibration TSR_VS_DetFeature_DEGReason_Sensor_M

In [6]:
# Run the project's preprocessing step; the result is indexed below by the
# "train" and "test" keys.
split_dataset = preprocess_dataset(dataset)
logger.info("Split dataset into train/test")

# Report which splits exist and how many records each one holds.
print("Split dataset keys:", split_dataset.keys())
train_split = split_dataset["train"]
print(f"Train size: {len(train_split)}")
test_split = split_dataset["test"]
print(f"Test size: {len(test_split)}")

# Peek at one training record.
print("\nSample train record:")
print(train_split[0])


2025-09-16 00:29:29,980 - INFO - Split dataset into train/test


Split dataset keys: dict_keys(['train', 'test'])
Train size: 8484
Test size: 86

Sample train record:


In [7]:
from datasets import Dataset
from model_training.tokenizer_utils import load_tokenizer, tokenize_function
from functools import partial
# Three synthetic examples of increasing length, written one record at a time
# so the fields that belong together can be read together.
dummy_records = [
    {  # Small
        "input": "Add two numbers.",
        "output": "Return their sum.",
        "requirement_description": "Write a function that takes two integers and returns their sum.",
        "test_steps": "Input: 2, 3 -> Output: 5",
    },
    {  # Medium
        "input": "Add two numbers. Ensure the function handles both positive and negative integers.",
        "output": "Return the sum of the numbers, accounting for positive and negative values.",
        "requirement_description": "Write a function that takes two integers and returns their sum. Ensure that the implementation is safe for all valid integer inputs.",
        "test_steps": "Input: -1, 4 -> Output: 3; Input: 0, 0 -> Output: 0",
    },
    {  # Large
        "input": "Add two numbers. Ensure the function handles both positive and negative integers. The function should also support very large numbers, be optimized for performance, handle edge cases such as integer overflows, and provide meaningful error messages in case of invalid input. Additionally, it should be covered by unit tests and integration tests to ensure correctness.",
        "output": "Return the sum of the numbers, ensuring correctness across all edge cases, including large integer values, and throw an appropriate exception in case of invalid inputs like None or non-integer types.",
        "requirement_description": "Write a function that takes two integers and returns their sum. The function must pass rigorous testing, including randomized tests, performance benchmarks, and edge case scenarios such as maximum and minimum integers.",
        "test_steps": "Input: 2147483647, 1 -> Output: Error; Input: -2147483648, -1 -> Output: Error",
    },
]

# Pivot the records into a column-oriented dict: one list per column name,
# with the columns in the order listed here.
dummy_data = {
    column: [record[column] for record in dummy_records]
    for column in ("input", "output", "requirement_description", "test_steps")
}

# Convert to HuggingFace dataset. Dataset.from_dict receives the
# column-oriented dummy_data dict (one equal-length list per column name).
dummy_dataset = Dataset.from_dict(dummy_data)

In [8]:
tokenizer = load_tokenizer()

# Bind the tokenizer once so .map() only has to supply the batch argument.
tokenize_batch = partial(tokenize_function, tokenizer=tokenizer)

# Source columns to drop from the mapped dataset.
raw_columns = ["input", "output", "requirement_description", "test_steps"]

# batch_size=5 is larger than the three dummy rows defined for this check.
tokenized_dataset = dummy_dataset.map(
    tokenize_batch,
    batched=True,
    batch_size=5,
    remove_columns=raw_columns,
)


[Tokenizer] pad_token not found. Using eos_token (</s>) as pad_token.
[Tokenizer] Loaded tokenizer from /home/navpc24/Desktop/llm-finetuning/Mistral-3B-Instruct-v0.2-init
[Tokenizer] Truncation side: right
[Tokenizer] Padding side: right
[Tokenizer] Max sequence length: 2048


Map:   0%|          | 0/3 [00:00<?, ? examples/s]


--- Tokenization Debug Info ---

[Example 1]
Prompt string: '<s>[INST] Add two numbers. [/INST]Return their sum.</s>'...
input_ids: [1, 1, 733, 16289, 28793, 3301, 989, 5551, 28723, 733, 28748, 16289, 28793, 6168, 652, 2648, 28723, 2, 2, 2] ...
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,