In [None]:
# 1. Install necessary libraries
!pip install -q git+https://github.com/huggingface/transformers.git \
                git+https://github.com/huggingface/peft.git \
                datasets accelerate bitsandbytes
!pip install -U transformers

In [None]:
!pip install -q psutil gputil
import threading, psutil, time
import GPUtil

def monitor_ram(interval=5):
    """Report system RAM usage on stdout in an endless loop.

    Args:
        interval: Seconds to sleep between two consecutive reports.

    This function never returns; it is meant to run in a background thread.
    """
    bytes_per_gib = 1024**3
    while True:
        snapshot = psutil.virtual_memory()
        used_gib = snapshot.used / bytes_per_gib
        available_gib = snapshot.available / bytes_per_gib
        report = (
            f"[RAM] Used: {used_gib:.2f} GB | "
            f"Available: {available_gib:.2f} GB | "
            f"Usage: {snapshot.percent:.1f}%"
        )
        print(report)
        time.sleep(interval)

def monitor_gpu(interval=5):
    """Report memory usage of every GPU listed by GPUtil, in an endless loop.

    Args:
        interval: Seconds to sleep between two consecutive polling rounds.

    This function never returns; it is meant to run in a background thread.
    """
    while True:
        # One line per device returned by GPUtil on this polling round.
        for device in GPUtil.getGPUs():
            used_pct = device.memoryUtil * 100
            print(f"[GPU] {device.name} | {device.memoryUsed:.0f} MB / {device.memoryTotal:.0f} MB "
                  f"({used_pct:.1f}%)")
        time.sleep(interval)

# Start background threads.
# The monitors loop forever, so re-running this cell used to stack a second
# (third, ...) pair of threads and multiply the output. Naming the threads lets
# us detect ones that are already alive and start each monitor at most once.
_alive_thread_names = {t.name for t in threading.enumerate()}
for _thread_name, _target in (("ram-monitor", monitor_ram), ("gpu-monitor", monitor_gpu)):
    if _thread_name not in _alive_thread_names:
        # daemon=True: the thread must not keep the interpreter alive on shutdown.
        threading.Thread(target=_target, name=_thread_name, daemon=True).start()

In [None]:
from huggingface_hub import login
from getpass import getpass

# Read the token without echoing it to the notebook output, then authenticate.
token = getpass(prompt="🔑 Enter your Hugging Face token: ")
login(token=token)


In [None]:

# 2. Load tokenizer and model (Mistral 7B, 4-bit quantized)
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, PrefixTuningConfig, TaskType
import torch

model_name = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token for padding so the tokenizer can pad to a fixed length.
tokenizer.pad_token = tokenizer.eos_token

# The cell header promises a 4-bit quantized model and BitsAndBytesConfig is
# imported, but the model was previously loaded unquantized in fp16. Pass an
# explicit 4-bit config so the base weights are actually quantized on load.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,  # dtype for the modules that are not quantized
)


In [None]:
# 3. Load dataset from file
from datasets import Dataset
import json

file_path = "prompt_snort_rule.jsonl"  # Make sure the path is correct

# JSON Lines: one JSON object per line.
# - encoding is explicit so the file is decoded the same way on every platform;
# - blank lines (e.g. a trailing newline at the end of the file) are skipped
#   instead of crashing json.loads.
with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(data)

# Shuffle it (fixed seed keeps the split reproducible)
dataset = dataset.shuffle(seed=42)

# 4. Split into 80/10/10
n_examples = len(dataset)
train_size = int(0.8 * n_examples)
val_size = int(0.1 * n_examples)
val_end = train_size + val_size

# Contiguous index ranges over the shuffled data; the test split takes
# whatever remains after train and validation.
train_dataset = dataset.select(range(0, train_size))
val_dataset = dataset.select(range(train_size, val_end))
test_dataset = dataset.select(range(val_end, n_examples))



# 5. Save test set to TXT for manual testing
# Explicit UTF-8: without it the platform default encoding is used, which can
# raise UnicodeEncodeError on non-ASCII prompts/responses under some locales.
with open("test_set.txt", "w", encoding="utf-8") as f:
    for item in test_dataset:
        f.write(f"Prompt: {item['prompt']}\nResponse: {item['response']}\n\n")



In [None]:
# 6. Tokenize datasets
def preprocess(example):
    """Turn one prompt/response record into fixed-length causal-LM features.

    Args:
        example: Mapping with the keys 'prompt' and 'response'.

    Returns:
        The tokenizer output (padded/truncated to 256 tokens) with an added
        'labels' entry: a copy of 'input_ids' in which every padded position
        is replaced by -100 so that it is ignored by the loss.
    """
    text = f"### Question: {example['prompt']}\n### Answer: {example['response']}"
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=256)
    # Previously labels were a plain copy of input_ids, so the loss was also
    # computed on padding. The pad token is the EOS token, so padding cannot be
    # recognised by token id — use the attention mask (0 = padding) instead.
    encoding["labels"] = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(encoding["input_ids"], encoding["attention_mask"])
    ]
    return encoding

# Apply the tokenization to the train and validation splits, in that order.
train_dataset, val_dataset = (
    split.map(preprocess) for split in (train_dataset, val_dataset)
)


# 7. Prefix Tuning config
prefix_settings = {
    "task_type": TaskType.CAUSAL_LM,
    "num_virtual_tokens": 20,
    # Taken from the loaded base model's config.
    "encoder_hidden_size": model.config.hidden_size,
}
peft_config = PrefixTuningConfig(**prefix_settings)

# Wrap the base model with the prefix-tuning adapter described above.
prefix_model = get_peft_model(model, peft_config)

# 8. Training arguments
from transformers import TrainingArguments, Trainer

# Probe bf16 support once; exactly one of fp16 / bf16 is enabled below.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./mistral_prefix_tuned",    # Directory for checkpoints
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,           # Mirror the train batch size
    gradient_accumulation_steps=4,          # 2 x 4 = 8 examples per optimizer step per device
    max_steps=1000,                         # Total number of optimizer steps
    logging_steps=10,
    # An eval_dataset is handed to the Trainer below, but the eval strategy
    # defaults to "no", so the validation split was never evaluated. Evaluate
    # periodically so the validation loss is actually reported.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="epoch",
    learning_rate=1e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    report_to="none",                    # Disable W&B/loggers
)

trainer = Trainer(
    model=prefix_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)



In [None]:
# 9. Train
# Runs the training loop of the Trainer configured in the previous cell; as the
# last expression of the cell, the value returned by train() is displayed.
trainer.train()


In [None]:

# 10. Save model
# Written to the same directory that TrainingArguments uses as output_dir.
# NOTE(review): the tokenizer is not saved here — confirm whether inference
# code expects to find it next to the model files.
prefix_model.save_pretrained("./mistral_prefix_tuned")

In [None]:
# zip for download
# Recursively archives the whole output directory, i.e. everything stored
# under mistral_prefix_tuned/ (saved model files and any Trainer checkpoints).
!zip -r mistral_prefix_tuned.zip mistral_prefix_tuned