## Library

In [1]:
import pandas as pd
from pathlib import Path
from datasets import Dataset, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load Model

In [2]:
import yaml
import torch
from unsloth import FastLanguageModel
from typing import Dict, Any

def load_config(config_path: str) -> Dict[str, Any]:
    """Read a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path to the YAML file.

    Returns:
        The parsed document. An empty file, which ``yaml.safe_load`` parses
        to ``None``, yields an empty dict so callers can use ``.get()``
        unconditionally.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(config_path, 'r', encoding='utf-8') as f:
        loaded = yaml.safe_load(f)
    return {} if loaded is None else loaded

# All settings below are read from the YAML config, each with an in-code fallback.
config = load_config('./config/config-0505-qwen3.yml')

# Base model and tokenizer
base_model_kwargs = dict(
    model_name=config.get('model_name', 'unsloth/Llama-3.2-3B-Instruct-bnb-4bit'),
    max_seq_length=config.get('max_seq_length', 30000),
    dtype=config.get('dtype', None),
    load_in_4bit=config.get('load_in_4bit', True),
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# LoRA adapter configuration; the fallback targets the attention and MLP projections.
default_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_kwargs = dict(
    r=config.get('lora_r', 32),
    target_modules=config.get('target_modules', default_target_modules),
    lora_alpha=config.get('lora_alpha', 32),
    lora_dropout=config.get('lora_dropout', 0),
    bias=config.get('bias', 'none'),
    use_gradient_checkpointing=config.get('use_gradient_checkpointing', 'unsloth'),
    random_state=config.get('random_state', 3407),
    use_rslora=config.get('use_rslora', False),
    loftq_config=config.get('loftq_config', None),
)
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.8: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 24.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.4.8 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


## Load Dataset

In [3]:
def count_tokens_in_conversation(conversation, tokenizer):
    """
    Return the token length of one conversation after the chat template is applied.

    The template is rendered without a generation prompt, truncation or padding,
    which is intended to mirror how the examples are prepared for training.
    """
    template_options = {
        "tokenize": True,
        "add_generation_prompt": False,  # Match training
        "truncation": False,
        "padding": False,
        "return_tensors": "pt",
    }
    token_ids = tokenizer.apply_chat_template(conversation, **template_options)
    # Presumably shaped (1, seq_len); the second dimension is reported as the count.
    return token_ids.shape[1]

def analyze_dataset_tokens(dataset, tokenizer, max_seq_length):
    """
    Compute token-usage statistics for every conversation in a dataset.

    Args:
        dataset: Iterable of items exposing a ``"conversations"`` entry; must
            support ``len()`` when non-empty (used for the progress message).
        tokenizer: Tokenizer forwarded to ``count_tokens_in_conversation``.
        max_seq_length: Token limit used to flag over-long conversations.

    Returns:
        Dict with ``total_conversations``, ``total_tokens``, ``average_tokens``,
        ``max_tokens``, ``min_tokens``, ``conversations_over_limit`` and
        ``percent_over_limit``. For an empty dataset all values are zero
        instead of raising on the division and on ``max``/``min``.
    """
    token_counts = []

    for idx, item in enumerate(dataset):
        # Progress message every 1000 conversations.
        if idx % 1000 == 0:
            print(f"Processing conversation {idx}/{len(dataset)}")

        token_counts.append(count_tokens_in_conversation(item["conversations"], tokenizer))

    n_conversations = len(token_counts)
    total_tokens = sum(token_counts)
    # Counted once and reused for both the absolute and the percentage figure.
    over_limit = sum(1 for c in token_counts if c > max_seq_length)
    has_data = n_conversations > 0

    return {
        "total_conversations": n_conversations,
        "total_tokens": total_tokens,
        "average_tokens": total_tokens / n_conversations if has_data else 0.0,
        "max_tokens": max(token_counts, default=0),
        "min_tokens": min(token_counts, default=0),
        "conversations_over_limit": over_limit,
        "percent_over_limit": (over_limit / n_conversations) * 100 if has_data else 0.0,
    }


In [4]:
# Fetch the ChatML dataset configuration and the sequence limit used for the check.
dataset = load_dataset("jordinia/netpro-finetune", name="chatml_thought_7k")
max_seq_length = config.get('max_seq_length', 30000)

# Token statistics per split; each report is printed right after its split is scanned.
train_split = dataset["train"]
train_stats = analyze_dataset_tokens(train_split, tokenizer, max_seq_length)
print("Train split stats:", train_stats)

validation_split = dataset["validation"]
val_stats = analyze_dataset_tokens(validation_split, tokenizer, max_seq_length)
print("Validation split stats:", val_stats)

Processing conversation 0/7245
Processing conversation 1000/7245
Processing conversation 2000/7245
Processing conversation 3000/7245
Processing conversation 4000/7245
Processing conversation 5000/7245
Processing conversation 6000/7245
Processing conversation 7000/7245
Train split stats: {'total_conversations': 7245, 'total_tokens': 36559657, 'average_tokens': 5046.191442374051, 'max_tokens': 10186, 'min_tokens': 3495, 'conversations_over_limit': 0, 'percent_over_limit': 0.0}
Processing conversation 0/60
Validation split stats: {'total_conversations': 60, 'total_tokens': 303146, 'average_tokens': 5052.433333333333, 'max_tokens': 7312, 'min_tokens': 3712, 'conversations_over_limit': 0, 'percent_over_limit': 0.0}


In [5]:
def convert_to_text(example):
    """Render one example's conversation with the chat template and return it under ``"text"``."""
    rendered = tokenizer.apply_chat_template(
        example["conversations"],
        add_generation_prompt=False,  # match training setup
        tokenize=False,
    )
    return {"text": rendered}

# Add the rendered `text` column to both splits (4 worker processes for train, 1 for validation).
for split_name, worker_count in (("train", 4), ("validation", 1)):
    dataset[split_name] = dataset[split_name].map(convert_to_text, num_proc=worker_count)


In [6]:
# Display the DatasetDict to check the features and row count of each split.
dataset

DatasetDict({
    train: Dataset({
        features: ['conversations', 'text'],
        num_rows: 7245
    })
    validation: Dataset({
        features: ['conversations', 'text'],
        num_rows: 60
    })
})

## Training

In [7]:
# @title wandb init
import wandb
# Set wandb options as environment variables for this kernel.
%env WANDB_WATCH=all
%env WANDB_SILENT=true
# relogin=True asks for credentials again; the recorded output shows an interactive API-key prompt.
# NOTE(review): the following cell requires WANDB_API_KEY in the environment/.env — confirm
# whether this interactive login is still needed.
wandb.login(relogin=True)

env: WANDB_WATCH=all
env: WANDB_SILENT=true


[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

True

In [8]:
import os
from transformers.utils import logging
import wandb
from dotenv import load_dotenv

# 1. Load variables from a local .env file into the process environment.
load_dotenv()

# 2. Fail fast when the API key is missing. (Re-assigning the key onto
#    os.environ afterwards would be a no-op: os.getenv already reads os.environ.)
if not os.getenv("WANDB_API_KEY"):
    raise ValueError("WANDB_API_KEY not found in .env file")

# 3. Configure logging and the run identity.
logging.set_verbosity_info()
project_name = "netpro-finetune"
entity_name = "jordinia-netpro"
run_name = "unsloth/Qwen3-4B-unsloth-sft-2025-05-11-TESTING" # Set your desired run name

# 4. Initialize WANDB. `run` is bound before the attempt so that it is always
#    defined (None on failure) and later cells can test it instead of hitting
#    a NameError.
run = None
try:
    run = wandb.init(
        entity=entity_name,
        project=project_name,
        name=run_name,
        # id="j4vh49mi",
        # resume="allow" # Uncomment to resume a previous run
    )
    print("Successfully connected to WANDB!")
except Exception as e:
    # Deliberately best-effort: report the failure and keep the notebook running.
    print(f"Failed to initialize WANDB: {e}")
    # Consider exiting if WANDB is critical
    # sys.exit(1)

Successfully connected to WANDB!


In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported


# Values that are needed in more than one place are resolved once.
seq_len_limit = config.get('max_seq_length', 30000)
collator_cfg = config.get('data_collator', {})
use_bf16 = is_bfloat16_supported()

# Training hyperparameters: read from the YAML config with in-code fallbacks.
args = TrainingArguments(
    output_dir=config.get('output_dir', 'outputs'),
    per_device_train_batch_size=config.get('per_device_train_batch_size', 2),
    per_device_eval_batch_size=config.get('per_device_eval_batch_size', 1),
    gradient_accumulation_steps=config.get('gradient_accumulation_steps', 4),
    warmup_ratio=config.get('warmup_ratio', 0.05),
    max_steps=config.get('max_steps', 4125),
    # NOTE(review): the only hyperparameter not read from config — confirm this is intentional.
    learning_rate=2e-5,
    # Mixed precision: bf16 when supported, fp16 otherwise.
    bf16=use_bf16,
    fp16=not use_bf16,
    optim=config.get('optim', 'adamw_8bit'),
    weight_decay=config.get('weight_decay', 0.1),
    lr_scheduler_type=config.get('lr_scheduler_type', 'cosine'),
    # Evaluation / checkpoint cadence
    eval_strategy=config.get('eval_strategy', 'steps'),
    eval_steps=config.get('eval_steps', 50),
    save_strategy=config.get('save_strategy', 'steps'),
    save_steps=config.get('save_steps', 100),
    save_total_limit=config.get('save_total_limit', 3),
    logging_steps=config.get('logging_steps', 10),
    seed=config.get('seed', 3407),
    report_to=config.get('report_to', 'wandb'),
    load_best_model_at_end=config.get('load_best_model_at_end', True),
    metric_for_best_model=config.get('metric_for_best_model', 'eval_loss'),
    # max_grad_norm=config.get('max_grad_norm', 0.3),
)

# Collator that pads each batch; settings come from the optional `data_collator` config section.
padding_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=collator_cfg.get('padding', True),
    pad_to_multiple_of=collator_cfg.get('pad_to_multiple_of', 8),
    max_length=seq_len_limit,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field=config.get('dataset_text_field', 'text'),
    max_seq_length=seq_len_limit,
    data_collator=padding_collator,
    dataset_num_proc=config.get('dataset_num_proc', 2),
    packing=config.get('packing', False),
    args=args,
)

PyTorch: setting up devices
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


### Train on Completions Only

In [10]:
# def train_on_responses_only_custom(trainer):
#     """
#     Custom version for website classification that:
#     1. Preserves system prompt context
#     2. Only trains on assistant responses (JSON + reasoning)
#     3. Handles length consistency for batching
#     """
#     tokenizer = trainer.tokenizer
    
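#     # Manually define token sequences for your template.
#     # NOTE: the markers below are Llama-3 style headers; the Qwen3 template used in this
#     # notebook renders `<|im_start|>` markers, so update them before re-enabling this code.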
#     SYSTEM_TOKENS = tokenizer.encode(
#         "<|start_header_id|>system<|end_header_id|>\n\n", 
#         add_special_tokens=False
#     )
#     USER_TOKENS = tokenizer.encode(
#         "<|start_header_id|>user<|end_header_id|>\n\n", 
#         add_special_tokens=False
#     )
#     ASSISTANT_TOKENS = tokenizer.encode(
#         "<|start_header_id|>assistant<|end_header_id|>\n\n", 
#         add_special_tokens=False
#     )

#     def custom_masking(examples):
#         input_ids = examples["input_ids"]
        
#         # Create labels if they don't exist
#         labels = examples.get("labels", [ids.copy() for ids in input_ids])
#         new_labels = []
        
#         for seq_id, seq_labels in zip(input_ids, labels):
#             n = len(seq_id)
#             mask = [-100] * n
#             i = 0
            
#             while i < n:
#                 # Check for system prompt
#                 if seq_id[i:i+len(SYSTEM_TOKENS)] == SYSTEM_TOKENS:
#                     i += len(SYSTEM_TOKENS)
#                     continue
                    
#                 # Check for user prompt
#                 if seq_id[i:i+len(USER_TOKENS)] == USER_TOKENS:
#                     i += len(USER_TOKENS)
#                     continue
                    
#                 # Check for assistant prompt
#                 if seq_id[i:i+len(ASSISTANT_TOKENS)] == ASSISTANT_TOKENS:
#                     start = i
#                     i += len(ASSISTANT_TOKENS)
                    
#                     # Find end of assistant response
#                     while i < n:
#                         if seq_id[i] == tokenizer.eos_token_id:
#                             end = i
#                             break
#                         i += 1
#                     else:
#                         end = n
                    
#                     # Unmask assistant response
#                     mask[start:end] = seq_labels[start:end]
#                     break
                    
#                 i += 1
            
#             # Enforce length matching (critical fix)
#             if len(mask) != len(seq_id):
#                 mask = mask[:len(seq_id)] + [-100] * (len(seq_id) - len(mask))
            
#             new_labels.append(mask)
        
#         return {"labels": new_labels}

#     # Apply to datasets with length verification
#     def apply_masking(dataset):
#         dataset = dataset.map(
#             custom_masking,
#             batched=True,
#             batch_size=1000,
#             num_proc=4,
#         )
#         # Verify lengths
#         for i in range(min(3, len(dataset))):
#             assert len(dataset[i]["input_ids"]) == len(dataset[i]["labels"]), \
#                 f"Length mismatch in sample {i}"
#         return dataset
    
#     trainer.train_dataset = apply_masking(trainer.train_dataset)
    
#     if trainer.eval_dataset is not None:
#         trainer.eval_dataset = apply_masking(trainer.eval_dataset)
        
#     return trainer

In [11]:
# trainer = train_on_responses_only_custom(trainer)

In [12]:
# # Final verification (optional)
# print("\n=== Final Pre-Train Check ===")
# sample = trainer.train_dataset[0]
# print("First sample labels preview:")
# print("Input IDs length:", len(sample["input_ids"])) 
# print("Labels length:", len(sample["labels"]))
# print("Last 10 labels:", sample["labels"][-10:])  # Should show unmasked assistant tokens

In [13]:
# # Get a sample from the training set
# sample_index = 5  # Try different indices
# sample = trainer.train_dataset[sample_index]

# # Decode the full input context
# print("==== Full Input Context ====")
# print(tokenizer.decode(sample["input_ids"]))
# print("\n")

# # Decode the labels with masking visualization
# print("==== Training Targets (Masked) ====")
# masked_labels = []
# for token_id, label_id in zip(sample["input_ids"], sample["labels"]):
#     if label_id == -100:
#         # Show masked tokens as blank spaces
#         masked_labels.append(" ")
#     else:
#         # Show actual token
#         masked_labels.append(tokenizer.decode([token_id]))

# print("".join(masked_labels))

In [14]:
# batch = trainer.train_dataset.select(range(2)).with_format("torch")[0]
# print(batch["input_ids"])
# print(batch["labels"])


### Memory Stats

In [15]:
# Display the training set as held by the trainer to check its columns and row count.
trainer.train_dataset

Dataset({
    features: ['conversations', 'text', 'input_ids', 'attention_mask'],
    num_rows: 7245
})

In [16]:
# dataset["train"][0]["text"]

In [17]:
# @title Show current memory stats
# Baseline GPU figures in GB (bytes / 1024**3), taken before training starts.
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
for report_line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(report_line)

GPU = NVIDIA GeForce RTX 3090. Max memory = 24.0 GB.
4.207 GB of memory reserved.


In [18]:
# Run training. The return value is kept because a later cell reads `trainer_stats.metrics`.
trainer_stats = trainer.train()

# Alternative (disabled): resume from a saved checkpoint when it exists.
# checkpoint_path = "./outputs/checkpoint-700"
# if os.path.exists(checkpoint_path):
#     trainer.train(resume_from_checkpoint=checkpoint_path)
# else:
#     print(f"Warning: Checkpoint {checkpoint_path} not found. Starting from scratch.")

The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: conversations, text. If conversations, text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,245 | Num Epochs = 2 | Total steps = 906
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 132,120,576/4,000,000,000 (3.30% trained)
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,1.8959,1.875898
20,1.7579,1.708042
30,1.5764,1.539896
40,1.4784,1.40415


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: conversations, text. If conversations, text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 60
  Batch size = 1
Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: conversations, text. If conversations, text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 60
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `PeftModelF

KeyboardInterrupt: 

In [None]:
# Run a manual evaluation pass over the trainer's evaluation split.
# NOTE(review): the original note said "before the automatic evaluation at step 500", but the
# trainer cell takes eval_steps from config (fallback 50) — confirm which step was meant.
trainer.evaluate(eval_dataset=trainer.eval_dataset)

In [None]:
# Close the active W&B run. The module-level call is used instead of `run.finish()`
# so this cell does not raise NameError when the best-effort init cell failed and
# never bound `run`.
wandb.finish()

In [None]:
# Peak reserved GPU memory in GB, and the share of it added since the pre-training baseline.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Wall-clock training time as reported by the trainer.
train_seconds = trainer_stats.metrics['train_runtime']

report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for report_line in report_lines:
    print(report_line)

## Save Model

In [None]:
# Write the model and the tokenizer to the same directory (config key `lora_model`, fallback 'outputs').
lora_output_dir = config.get('lora_model', 'outputs')
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

In [None]:
from huggingface_hub import HfApi, HfFolder
import os

# Constants
model_id = "jordinia/test-finetune"
local_model_path = "/home/fishmon/AJ/LLM-Finetuning/Malicious-Web/model/qwen3-0505-lora_model"

# Authenticate: read the token from the environment rather than embedding it in the
# notebook or persisting it to disk. When HF_TOKEN is unset this is None and
# huggingface_hub falls back to the locally saved login.
hf_token = os.getenv("HF_TOKEN")
api = HfApi(token=hf_token)

# Upload the folder with a commit message
api.upload_folder(
    folder_path=local_model_path,
    repo_id=model_id,
    repo_type="model",
    commit_message="Initial commit of Qwen3-0505-LORA model",  # Add your commit message here
)

100%|██████████| 2/2 [00:06<00:00,  3.46s/it]


CommitInfo(commit_url='https://huggingface.co/jordinia/test-finetune/commit/2b75560795409c528fa06c24185ba3ef1c45a59a', commit_message='Initial commit of Qwen3-0505-LORA model', commit_description='', oid='2b75560795409c528fa06c24185ba3ef1c45a59a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jordinia/test-finetune', endpoint='https://huggingface.co', repo_type='model', repo_id='jordinia/test-finetune'), pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import HfApi

# Upload a local model folder to the Hub, authenticating with the HF_TOKEN environment variable.
# NOTE(review): folder_path looks like a placeholder — replace it with the real directory before running.
api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path="/path/to/local/model",
    repo_id="jordinia/netpro-0505-lora",
    repo_type="model",
)


In [2]:
# Save the model together with the tokenizer into "qwen3-0505-fp16-vllm" using the
# "merged_16bit" save method (the recorded output reports merging 4bit and LoRA weights to 16bit).
model.save_pretrained_merged("qwen3-0505-fp16-vllm", tokenizer, save_method = "merged_16bit",)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 12.5 out of 31.26 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 36/36 [00:00<00:00, 407.94it/s]

Unsloth: Saving tokenizer...




 Done.
Done.


In [4]:
# Convert the saved "qwen3-0505-fp16-vllm" directory to a GGUF file with f16 output type.
# NOTE(review): the converter script is referenced by an absolute, machine-specific path.
!python /home/fishmon/AJ/llama.cpp/convert_hf_to_gguf.py qwen3-0505-fp16-vllm --outfile qwen3-0505-fp16-vllm.gguf --outtype f16

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:hf-to-gguf:Loading model: qwen3-0505-fp16-vllm
INFO:hf-to-gguf:Model architecture: Qwen3ForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00003.safetensors'
INFO:hf-to-gguf:token_embd.weight,         torch.bfloat16 --> F16, shape = {2560, 151936}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.bfloat16 --> F32, shape = {2560}
INFO:hf-to-gguf:blk.0.ffn_down.weight,     torch.bfloat16 --> F16, shape = {9728, 2560}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,     torch.bfloat16 --> F16, shape = {2560, 9728}
INFO:hf-to-gguf:blk.0.ffn_up.weight,       torch.bfloat16 --> F16, shape = {2560, 9728}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,     torch.bfloat16 --> F32, shape = {2560}
INFO:hf-to-gguf:blk.0.attn_k_norm.weight,  torch.bfloat16 --> F32, shape = {128}
INFO:hf-to-gguf:blk.0.attn_k.weig

In [9]:
# Temporarily skip merging for conversion
# NOTE(review): this calls `save_pretrained` (not `save_pretrained_merged`, as the 16-bit cell
# does) while passing `tokenizer` positionally and a `save_method` keyword — confirm that
# `save_pretrained` accepts these arguments the way this call intends.
model.save_pretrained("qwen3-0505-q4-vllm", tokenizer, save_method="4bit")

In [12]:
# Convert the "qwen3-0505-q4-vllm" directory to GGUF, letting the converter pick the output type
# (the recorded log shows it choosing bf16 from the first tensor).
# NOTE(review): the converter script is referenced by an absolute, machine-specific path.
!python /home/fishmon/AJ/llama.cpp/convert_hf_to_gguf.py qwen3-0505-q4-vllm --outfile qwen3-0505-q4-vllm.gguf --outtype auto

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:hf-to-gguf:Loading model: qwen3-0505-q4-vllm
INFO:hf-to-gguf:Model architecture: Qwen3ForCausalLM
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:choosing --outtype bf16 from first tensor type (torch.bfloat16)
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,         torch.bfloat16 --> BF16, shape = {2560, 151936}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.bfloat16 --> F32, shape = {2560}
INFO:hf-to-gguf:blk.0.ffn_down.weight,     torch.bfloat16 --> BF16, shape = {9728, 2560}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,     torch.bfloat16 --> BF16, shape = {2560, 9728}
INFO:hf-to-gguf:blk.0.ffn_up.weight,       torch.bfloat16 --> BF16, shape = {2560, 9728}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,     torch.bfloat16 --> F32, shape = {2560}
INFO:hf-to-gguf:blk.0.attn_k_norm.weight,  torch.bfloat16 --> F32, sh

In [None]:
# Convert the "qwen3-0505-fp16-vllm" directory to GGUF with f16 output type.
# NOTE(review): unlike the other conversion cells this uses a relative script path, and the
# output file name contains a double hyphen ("0505--q4fp16") — confirm both are intended.
!python convert_hf_to_gguf.py qwen3-0505-fp16-vllm --outfile qwen3-0505--q4fp16-vllm.gguf --outtype f16

## Inference

In [1]:
from unsloth import FastLanguageModel
from peft import PeftModel
import torch
import re
import json

# 1. Load the 4-bit base checkpoint; these settings are meant to mirror the training run.
base_model_options = {
    "model_name": "unsloth/Qwen3-4B-unsloth-bnb-4bit",  # Must match training
    "max_seq_length": 30000,  # Same as training
    "load_in_4bit": True,     # Must match training
    "device_map": "auto",
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.8: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 24.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# 2. Load your fine-tuned adapter
# Wraps the already-loaded base `model` with the saved adapter, registered as "web_classifier".
# NOTE(review): model_id is an absolute, machine-specific path; another cell in this notebook
# loads the adapter from the relative path "model/netpro-qwen3-0505-lora" — confirm which
# location is current.
model = PeftModel.from_pretrained(
    model,
    model_id = "/home/jordinia/Projects/LLM-Finetuning/Malicious-Web/model/netpro-qwen3-0505-lora",
    adapter_name = "web_classifier",
    is_trainable = False,  # Crucial for inference
)

                

In [None]:
from unsloth import FastLanguageModel
from peft import PeftModel
import torch
import re
import json

# 1. Load the 4-bit base checkpoint; these settings are meant to mirror the training run.
base_model_options = {
    "model_name": "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "max_seq_length": 30000,
    "load_in_4bit": True,
    "device_map": "auto",
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# 2. Attach the fine-tuned adapter in non-trainable (inference) mode.
model = PeftModel.from_pretrained(
    model,
    model_id="model/netpro-qwen3-0505-lora",
    adapter_name="web_classifier",
    is_trainable=False,
)

# 3. Merge the adapter into the base weights, then switch to Unsloth's inference mode.
model = model.merge_and_unload()
model = FastLanguageModel.for_inference(model)

# 4. Load the system prompt and the sample page to classify.
with open('./prompt/labelling_promptv4.txt', 'r') as prompt_file:
    system_prompt = prompt_file.read()
with open('./prompt/class_0_sample1.txt', 'r') as sample_file:
    sample_text = sample_file.read()

# 5. Build the chat messages and tokenize them with the generation prompt appended.
user_instruction = (
    "Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), "
    "or 3 (harmful). Output MUST be JSON.\n "
)
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_instruction + sample_text},
]
template_options = {
    "tokenize": True,
    "add_generation_prompt": True,
    "return_tensors": "pt",
    "max_length": 30000,  # Prevent OOM
    "truncation": True,
    "enable_thinking": True,
}
inputs = tokenizer.apply_chat_template(messages, **template_options).to("cuda")

# 6. Generate, streaming only the newly produced text (the prompt is skipped).
#    A non-streaming `generate` call with batch decoding appears in a later cell.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
sampling_options = {
    "max_new_tokens": 2048,
    "use_cache": True,
    "temperature": 0.7,
    "min_p": 0.1,
    "top_p": 0.9,
}
_ = model.generate(input_ids=inputs, streamer=text_streamer, **sampling_options)

In [None]:
# 3. Merge and optimize (reduces VRAM by ~30%)
# NOTE(review): this cell expects `model` to already be the adapter-wrapped model from the
# separate load cells; the VRAM figure above is the author's claim and is not verified here.
model = model.merge_and_unload()
model = FastLanguageModel.for_inference(model)


# 4. Load prompts: the system prompt and one sample page to classify
with open('./prompt/labelling_promptv4.txt', 'r') as f:
    system_prompt = f.read()
with open('./prompt/class_0_sample1.txt', 'r') as f:
    sample_text = f.read()

# 5. Format input: system + user messages, tokenized with the generation prompt appended
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n {sample_text}"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
    max_length = 30000,  # Prevent OOM
    truncation = True,
    enable_thinking = True, 
).to("cuda")

# 6. Generate, streaming only the newly produced text (skip_prompt=True)
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 2048,
                   use_cache = True, temperature = 0.7, min_p = 0.1, top_p = 0.9,)
# Disabled non-streaming alternative:
# outputs = model.generate(
#     inputs,
#     max_new_tokens = 2048,  
#     temperature = 0.7,  
#     top_p = 0.9,
#     repetition_penalty = 1.2,
#     eos_token_id        = tokenizer.eos_token_id,
#     pad_token_id        = tokenizer.pad_token_id,
#     use_cache = True,
# )
# tokenizer.batch_decode(outputs)
# # print(tokenizer.decode(outputs[0]))    

In [2]:
# Load the system prompt and the class-3 sample page.
with open('./prompt/labelling_promptv4.txt', 'r') as prompt_file:
    system_prompt = prompt_file.read()
with open('./prompt/class_3_sample1.txt', 'r') as sample_file:
    sample_text = sample_file.read()

# Build the chat messages and tokenize them with the generation prompt appended.
user_instruction = (
    "Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), "
    "or 3 (harmful). Output MUST be JSON.\n "
)
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_instruction + sample_text},
]
template_options = {
    "tokenize": True,
    "add_generation_prompt": True,
    "return_tensors": "pt",
    "max_length": 30000,  # Prevent OOM
    "truncation": True,
}
inputs = tokenizer.apply_chat_template(messages, **template_options).to("cuda")

# Generate without streaming, then decode the whole batch as the cell's displayed result.
generation_options = {
    "max_new_tokens": 2048,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.2,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    "use_cache": True,
}
outputs = model.generate(inputs, **generation_options)
tokenizer.batch_decode(outputs)

['<|im_start|>system\nYou are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **1 - Gambling:** Promotes/facilitates betting, casino, poker, lottery, wagering. Includes online betting platfo