In [1]:
from datasets import Dataset
import pandas as pd

# Load data
df = pd.read_csv("C://Users//koushik//Desktop//project//data//flat_dataset//final_dataset.csv")
dataset = Dataset.from_pandas(df[['requirement_description', 'test_steps']])

# Format prompt
def format_prompt(example):
    return {
        "input": f"Write a test case for the following requirement:\n{example['requirement_description']}",
        "output": example["test_steps"]
    }

dataset = dataset.map(format_prompt)

# Split the dataset (1% for testing)
split_dataset = dataset.train_test_split(test_size=0.01, seed=42)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']



  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|█████████████████████████████████████████████████████████████| 16350/16350 [00:00<00:00, 22215.16 examples/s]


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_path = r"C:\Users\koushik\Desktop\project\Mistral-3B-Instruct-v0.2-init"  # raw string for Windows path

tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_path,load_in_4bit=True, local_files_only=True, device_map="cpu")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The 8-bit optimizer is not available on your device, only available on CUDA for now.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 3/3 [02:32<00:00, 50.91s/it]


In [3]:
print(dataset.column_names)


['requirement_description', 'test_steps', 'input', 'output']


In [22]:
def tokenize_function(example):
    input_text = example["input"] if example["input"] is not None else ""
    output_text = example["output"] if example["output"] is not None else ""

    combined_text = input_text + "\n" + output_text

    tokenized = tokenizer(
        combined_text,
        max_length=2048,
        truncation=True,
        padding="max_length",  # force padding
        return_attention_mask=True,  # 👈 ensure attention_mask is returned
        return_tensors=None
    )

    labels = tokenized["input_ids"].copy()
    labels = [label if label != tokenizer.pad_token_id else -100 for label in labels]

    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],  # make sure it's here
        "labels": labels,
    }


tokenized_train_dataset = train_dataset.map(tokenize_function, remove_columns=["input", "output", "requirement_description", "test_steps"])
tokenized_test_dataset = test_dataset.map(tokenize_function, remove_columns=["input", "output", "requirement_description", "test_steps"])

Map: 100%|████████████████████████████████████████████████████████████████| 16186/16186 [03:00<00:00, 89.59 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 164/164 [00:01<00:00, 94.13 examples/s]


In [23]:
# ✅ Add this right after tokenization:
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [24]:
print(tokenized_train_dataset[0].keys())
print(tokenized_test_dataset[0].keys())
# Check how many samples are missing attention_mask
missing_mask_train = [i for i, x in enumerate(tokenized_train_dataset) if "attention_mask" not in x]
missing_mask_eval = [i for i, x in enumerate(tokenized_test_dataset) if "attention_mask" not in x]

print(f"Train samples missing attention_mask: {len(missing_mask_train)}")
print(f"Eval samples missing attention_mask: {len(missing_mask_eval)}")


dict_keys(['input_ids', 'attention_mask', 'labels'])
dict_keys(['input_ids', 'attention_mask', 'labels'])
Train samples missing attention_mask: 0
Eval samples missing attention_mask: 0


In [26]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Step 1: Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Step 2: Define LoRA config
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # adjust for Mistral architecture
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Step 3: Attach LoRA to the quantized model
model = get_peft_model(model, peft_config)

# (optional) Print trainable parameters
model.print_trainable_parameters()


trainable params: 3,407,872 || all params: 2,815,954,944 || trainable%: 0.1210


In [27]:
print(type(tokenized_train_dataset))
print(type(tokenized_test_dataset))


<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>


In [28]:
from torch.utils.data import DataLoader

test_loader = DataLoader(tokenized_train_dataset, batch_size=2, collate_fn=data_collator)

for batch in test_loader:
    print("Batch keys:", batch.keys())
    break


Sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
import torch
from trl import SFTTrainer, SFTConfig

# Your config with dataset_text_field=None because dataset is pre-tokenized
sft_config = SFTConfig(
    output_dir="./mistral3b-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=10,
    learning_rate=2e-5,
    bf16=False,
    fp16=False,
    optim="adamw_torch",
    save_steps=100,
    save_total_limit=1,
    report_to="none",
    dataset_text_field=None
)

# Custom simple collator for tokenized dataset with labels
def data_collator(features):
    print("Sample keys:", features[0].keys())  # Debug print
    batch = {
    "input_ids": torch.stack([f["input_ids"] for f in features]),
    "attention_mask": torch.stack([f["attention_mask"] for f in features]),
    "labels": torch.stack([f["labels"] for f in features]),
    }

    return batch


trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=None,
)

trainer.tokenizer = tokenizer  # optional, but recommended

trainer.train()


Truncating train dataset: 100%|███████████████████████████████████████| 16186/16186 [00:00<00:00, 116379.42 examples/s]
Truncating eval dataset: 100%|█████████████████████████████████████████████| 164/164 [00:00<00:00, 20500.87 examples/s]
Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss


In [14]:
trainer.model.save_pretrained("mistral3b-finetuned")
tokenizer.save_pretrained("mistral3b-finetuned")


NameError: name 'trainer' is not defined

In [None]:
prompt = "Write a test case for the following requirement:\nVerify login with valid credentials."

inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=300)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
