In [1]:
# Cell 1 — Imports & Config

import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)

BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
OUT_DIR = "./mistral7b_qlora_out"
DATA_PATH = "spark_llm_dataset_50k.jsonl"

os.makedirs(OUT_DIR, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

supports_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
print("BF16 supported:", supports_bf16)


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda
BF16 supported: True


In [2]:
# Cell 2 — Load tokenizer

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.padding_side = "right"

print("Pad token:", tokenizer.pad_token)


Pad token: </s>


In [3]:
# Cell 3 — Load Mistral 7B in 4-bit

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if supports_bf16 else torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pad_token_id = tokenizer.pad_token_id

# prepare norms for QLoRA
model = prepare_model_for_kbit_training(model)

print("Model loaded.")


Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.98s/it]


Model loaded.


In [4]:
# Cell 4 — LoRA config

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj",
        "o_proj", "gate_proj", "up_proj", "down_proj",
    ],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758


In [5]:
# Cell 5 — Load JSONL dataset

raw_ds = load_dataset("json", data_files=DATA_PATH, split="train")
print("Total samples:", len(raw_ds))

split = raw_ds.train_test_split(test_size=0.05, seed=42)
train_raw, val_raw = split["train"], split["test"]


Total samples: 6259


In [6]:
# Debug: restrict to a smaller subset for faster, safer training
MAX_TRAIN = 1000
MAX_VAL = 200

train_raw = train_raw.select(range(min(MAX_TRAIN, len(train_raw))))
val_raw   = val_raw.select(range(min(MAX_VAL, len(val_raw))))

print("Using subset - train:", len(train_raw), "val:", len(val_raw))


Using subset - train: 1000 val: 200


In [7]:
# Cell 6 — Format each sample properly

def format_example(ex):
    return (
        "### Instruction:\n"
        f"{ex['instruction']}\n\n"
        "### Input:\n"
        f"{ex['input']}\n\n"
        "### Response:\n"
        f"{ex['output']}"
    )

train_texts = [format_example(ex) for ex in train_raw]
val_texts   = [format_example(ex) for ex in val_raw]

train_data = Dataset.from_dict({"text": train_texts})
val_data   = Dataset.from_dict({"text": val_texts})

print("Example:\n", train_data[0]["text"])


Example:
 ### Instruction:
Given this Spark log message, identify its template ID and template text.

### Input:
Created local directory at /opt/hdfs/nodemanager/usercache/curi/appcache/application_1448006111297_0137/blockmgr-3469d055-a40e-48a4-8eed-6e85e6061e2e

### Response:
EventId: E185
EventTemplate: Created local directory at <*>


In [8]:
# Cell 7 (FIXED): Tokenization + safe supervised masking

MAX_LENGTH = 256  # Set smaller for stability on your GPU

def tokenize_and_mask(batch):
    # Step 1: Tokenize with padding+truncation
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )

    # Step 2: Deep copy labels from input_ids
    labels = []
    for row in tokenized["input_ids"]:
        labels.append(list(row))  # independent copy with fixed length

    # Step 3: Mask everything BEFORE "### Response:"
    for i, text in enumerate(batch["text"]):
        resp_index = text.find("### Response:")
        if resp_index == -1:
            # No response → don't train on this row
            labels[i] = [-100] * MAX_LENGTH
            continue

        prefix = text[:resp_index]
        prefix_ids = tokenizer(prefix, add_special_tokens=False)["input_ids"]

        # Cap the mask to MAX_LENGTH to prevent resizing errors
        prefix_len = min(len(prefix_ids), MAX_LENGTH)

        for j in range(prefix_len):
            labels[i][j] = -100

    tokenized["labels"] = labels
    return tokenized


# APPLY TOKENIZATION
train_tok = train_data.map(tokenize_and_mask, batched=True, remove_columns=["text"])
val_tok   = val_data.map(tokenize_and_mask,   batched=True, remove_columns=["text"])

train_tok.set_format("torch")
val_tok.set_format("torch")

print("Tokenization + masking DONE")
print("train_tok sample keys:", train_tok[0].keys())
print("input_ids length:", len(train_tok[0]["input_ids"]))
print("labels length:", len(train_tok[0]["labels"]))


Map: 100%|██████████| 1000/1000 [00:00<00:00, 9220.40 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 9452.17 examples/s]

Tokenization + masking DONE
train_tok sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids length: 256
labels length: 256





In [9]:
# Cell 8 — Correct collator for supervised causal LM

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt",
)


In [10]:
# Cell 9 — Training arguments

training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=2,                     # recommended minimum
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,          # effective batch = 16
    learning_rate=2e-4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    bf16=supports_bf16,
    fp16=(torch.cuda.is_available() and not supports_bf16),
    report_to="none",
)


In [11]:
# Cell 10 — Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
)


In [12]:
# Sanity check shapes for first few samples
for i in range(5):
    ex = train_tok[i]
    print(
        i,
        "input_ids len =", len(ex["input_ids"]),
        "labels len =", len(ex["labels"]),
    )


0 input_ids len = 256 labels len = 256
1 input_ids len = 256 labels len = 256
2 input_ids len = 256 labels len = 256
3 input_ids len = 256 labels len = 256
4 input_ids len = 256 labels len = 256


In [13]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

train_result = trainer.train()
print("Training finished!")


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  return fn(*args, **kwargs)


Step,Training Loss
50,0.8627
100,0.0282
150,0.023
200,0.0147
250,0.0056
300,0.0027
350,0.0012
400,0.0002
450,0.0005
500,0.0007


  return fn(*args, **kwargs)


Training finished!


In [14]:
# Cell 12 — Save LoRA & tokenizer

trainer.save_model(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

print("Saved to:", OUT_DIR)


Saved to: ./mistral7b_qlora_out
