In [None]:
!pip -q install --upgrade \
  "transformers>=4.44,<4.47" \
  "peft>=0.12.0" \
  "trl>=0.9.6" \
  "accelerate>=0.34.2" \
  "bitsandbytes>=0.44.0" \
  "datasets>=2.20.0"

import os, sys, time
print(" Deps installed. Restarting runtime to load them cleanly...")
time.sleep(1)
os.kill(os.getpid(), 9)  # Colab auto-restart


In [None]:
import torch, warnings
from pathlib import Path
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig

# Install a catch-all "ignore" filter so no warnings are shown from here on,
# then report which torch build got imported.
warnings.filterwarnings(action="ignore")
torch_version = torch.__version__
print(" Imports OK. torch:", torch_version)


In [None]:

from datasets import load_dataset

# Map each split name to its CSV file on the Colab filesystem.
split_files = {
    "train": "/content/train.csv",
    "validation": "/content/val.csv",
    "test": "/content/test.csv",
}
dataset = load_dataset("csv", data_files=split_files)

# Show the split summary and one raw training row as a sanity check.
print(dataset)
first_row = dataset["train"][0]
print("Sample row:", first_row)


In [None]:
# Build a single prompt string per row: Instruction + Input + Response
INSTR_CANON = "Explain the following log and propose a fix. Return exactly:\nCause:\n<2-3 lines>\nFix:\n<1 line>"


def _clean_field(value):
    """Return `value` as a whitespace-stripped string; None and NaN become ""."""
    if value is None:
        return ""
    # NaN is the only float that is not equal to itself.
    if isinstance(value, float) and value != value:
        return ""
    return str(value).strip()


def format_row(ex):
    """Render one dataset row as a single training prompt.

    Args:
        ex: Mapping that may contain "instruction", "input" and "output".
            Missing keys and None values are treated as empty; non-string
            values (e.g. numeric CSV cells) are converted with ``str``.

    Returns:
        ``{"text": prompt}`` where the prompt has the sections
        ``### Instruction:``, ``### Input:`` and ``### Response:``.
        INSTR_CANON is used whenever the row's instruction is missing or
        blank after stripping.
    """
    # Strip first, then fall back: a whitespace-only instruction must not
    # produce an empty Instruction section.
    instr = _clean_field(ex.get("instruction")) or INSTR_CANON
    inp = _clean_field(ex.get("input"))
    out = _clean_field(ex.get("output"))
    return {
        "text": f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n{out}"
    }

# Replace every original column with the single "text" field built by format_row.
cols = dataset["train"].column_names
dataset_fmt = dataset.map(function=format_row, remove_columns=cols)

print(dataset_fmt)
# Show only the first 400 characters of the first formatted training example.
preview = dataset_fmt["train"][0]["text"][:400]
print("\nPreview:\n", preview, "...\n")


In [None]:
# Load a good, ungated base model in 4-bit for QLoRA
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-7B-Instruct"

# 4-bit NF4 quantization with double quantization.
# bnb_4bit_compute_dtype defaults to float32 when omitted, so the 4-bit
# matmuls would run in fp32 even though the model is loaded as bfloat16.
# It is set explicitly to match torch_dtype below.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_cfg,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# Fall back to EOS as the padding token only if the tokenizer ships without one.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Optional memory saver for Colab
model.gradient_checkpointing_enable()

print("✅ Model :", model_id)
print("Tokenizer :", tok)



In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Step 1: get the quantized model ready for k-bit training.
# The KV cache is turned off and gradient checkpointing is switched on before
# the PEFT preparation helper runs.
model.config.use_cache = False
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Step 2: describe the LoRA adapters and the module names they attach to.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "up_proj", "down_proj", "gate_proj",
]
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
    task_type="CAUSAL_LM",
)

# Step 3: wrap the model, switch to training mode, and report trainable parameters.
model = get_peft_model(model, lora_cfg)
model.train()
model.print_trainable_parameters()


In [None]:
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer


def tokenize(batch):
    """Tokenize a batch of formatted prompts for causal-LM fine-tuning.

    Args:
        batch: Mapping with a "text" key holding a list of prompt strings
            (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        Whatever the tokenizer `tok` returns for the batch, truncated to at
        most 1024 tokens per example and left unpadded.
    """
    # Append the EOS token so the model is trained to stop after the response;
    # the tokenizer call below is not relied on to add it.
    eos = tok.eos_token or ""
    texts = [text + eos for text in batch["text"]]
    # No padding here: padding every example to 1024 tokens wastes compute.
    # Padding is left to the data collator, which pads each batch to its
    # longest member.
    return tok(texts, truncation=True, max_length=1024)

# Tokenize every split and drop the raw "text" column so only model inputs remain.
dataset_tok = dataset_fmt.map(tokenize, batched=True, remove_columns=["text"])

# Training arguments.
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# keyword in the transformers range this notebook installs.
args = TrainingArguments(
    output_dir="./qlora-logs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch of 8 sequences per device
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    fp16=False,
    bf16=True,
    report_to="none",
)

# Causal-LM collator (mlm=False): labels are derived from the input ids.
collator = DataCollatorForLanguageModeling(
    tokenizer=tok,
    mlm=False
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_tok["train"],
    eval_dataset=dataset_tok["validation"],
    args=args,
    data_collator=collator,
    # The datasets are already tokenized, so no text field or packing option
    # is passed here (packing expects raw text).
)

# Train
trainer.train()

In [None]:
# Save only the LoRA adapters (small) and tokenizer
save_dir = "/content/qlora-log-explainer"
model.save_pretrained(save_dir)
tok.save_pretrained(save_dir)

print("Saved to:", save_dir)
!ls -lh $save_dir


In [None]:
import torch

def test_model(log_line: str):
    model.eval()   # put model in evaluation mode

    # Create the same style of prompt used during training
    prompt = (
    "### Instruction:\n"
    "You are an assistant. Respond STRICTLY in this format:\n"
    "Cause:\n<2-3 lines only>\n"
    "Fix:\n<1 line only>\n"
    "Do not add extra explanations, repeats, or multiple Cause/Fix blocks.\n\n"
    "### Input:\n"
    f"{log_line}\n\n"
    "### Response:\n"
)


    # Convert text → tokens → tensors
    inputs = tok(prompt, return_tensors="pt").to(model.device)

    # Generate output
    with torch.inference_mode():
        out = model.generate(
    **inputs,
    max_new_tokens=150,
    do_sample=False,
    temperature=0.2,
    top_p=0.9,
    repetition_penalty=1.15,
    no_repeat_ngram_size=4,
    pad_token_id=tok.eos_token_id,
    eos_token_id=tok.eos_token_id,
)



    # Decode tokens → text
    text = tok.decode(out[0], skip_special_tokens=True)

    # Just return the model’s answer (after "### Response:")
    return text.split("### Response:")[-1].strip()


In [None]:
# Smoke test: run one example log line through the fine-tuned model.
sample_log = "kafka.common.MessageSizeTooLargeException: Message exceeds the maximum size allowed"
explanation = test_model(sample_log)
print(explanation)
