## Library

In [None]:
import pandas as pd
from pathlib import Path
from datasets import Dataset, load_dataset

## Load Model

In [None]:
import yaml
import torch
from unsloth import FastLanguageModel
from typing import Dict, Any

def load_config(config_path: str) -> Dict[str, Any]:
    """Load a YAML configuration file into a dictionary.

    Args:
        config_path: Path to the YAML file.

    Returns:
        The parsed top-level content of the file. An empty file (which YAML
        parses as ``None``) yields an empty dict so callers can always use
        ``.get`` on the result.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        loaded = yaml.safe_load(f)
    return {} if loaded is None else loaded

config = load_config('./config/config-0505-qwen3.yml')

# Base model: every option comes from the YAML config, with a fallback default.
base_model_options = {
    "model_name": config.get('model_name', 'unsloth/Llama-3.2-3B-Instruct-bnb-4bit'),
    "max_seq_length": config.get('max_seq_length', 30000),
    "dtype": config.get('dtype', None),
    "load_in_4bit": config.get('load_in_4bit', True),
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# LoRA adapter: wrap the loaded model with PEFT using config-driven settings.
default_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_options = {
    "r": config.get('lora_r', 32),
    "target_modules": config.get('target_modules', default_target_modules),
    "lora_alpha": config.get('lora_alpha', 32),
    "lora_dropout": config.get('lora_dropout', 0),
    "bias": config.get('bias', 'none'),
    "use_gradient_checkpointing": config.get('use_gradient_checkpointing', 'unsloth'),
    "random_state": config.get('random_state', 3407),
    "use_rslora": config.get('use_rslora', False),
    "loftq_config": config.get('loftq_config', None),
}
model = FastLanguageModel.get_peft_model(model, **lora_options)

## Load Dataset

In [None]:
def count_tokens_in_conversation(conversation, tokenizer):
    """Return the number of tokens the chat template yields for a conversation.

    The conversation is rendered and tokenized through
    ``tokenizer.apply_chat_template`` without a generation prompt, truncation
    or padding, and the second dimension of the returned tensor is reported.
    """
    template_options = {
        "tokenize": True,
        "add_generation_prompt": False,  # same setting used to build the training text
        "truncation": False,
        "padding": False,
        "return_tensors": "pt",
    }
    encoded = tokenizer.apply_chat_template(conversation, **template_options)
    # The result has a `.shape` whose index 1 is the sequence length.
    return encoded.shape[1]

def analyze_dataset_tokens(dataset, tokenizer, max_seq_length):
    """Compute token-usage statistics over a dataset of conversations.

    Args:
        dataset: Iterable of items, each exposing a ``"conversations"`` entry.
        tokenizer: Tokenizer passed through to ``count_tokens_in_conversation``.
        max_seq_length: Token limit used to count over-length conversations.

    Returns:
        A dict with the keys ``total_conversations``, ``total_tokens``,
        ``average_tokens``, ``max_tokens``, ``min_tokens``,
        ``conversations_over_limit`` and ``percent_over_limit``. For an empty
        dataset every value is zero.
    """
    token_counts = []

    for idx, item in enumerate(dataset):
        # Progress report every 1000 items.
        if idx % 1000 == 0:
            print(f"Processing conversation {idx}/{len(dataset)}")

        token_counts.append(
            count_tokens_in_conversation(item["conversations"], tokenizer)
        )

    total_conversations = len(token_counts)
    if total_conversations == 0:
        # Avoid ZeroDivisionError / max() on an empty sequence for empty splits.
        return {
            "total_conversations": 0,
            "total_tokens": 0,
            "average_tokens": 0.0,
            "max_tokens": 0,
            "min_tokens": 0,
            "conversations_over_limit": 0,
            "percent_over_limit": 0.0,
        }

    # Compute each aggregate once instead of re-scanning the list per key.
    total_tokens = sum(token_counts)
    over_limit = sum(1 for count in token_counts if count > max_seq_length)

    return {
        "total_conversations": total_conversations,
        "total_tokens": total_tokens,
        "average_tokens": total_tokens / total_conversations,
        "max_tokens": max(token_counts),
        "min_tokens": min(token_counts),
        "conversations_over_limit": over_limit,
        "percent_over_limit": (over_limit / total_conversations) * 100,
    }


In [None]:
# Token limit used when counting over-length conversations.
max_seq_length = config.get('max_seq_length', 30000)
dataset = load_dataset("jordinia/netpro-finetune", name="chatml_thought_7k")

# Report token statistics for the training split, then the validation split.
train_stats = analyze_dataset_tokens(dataset["train"], tokenizer, max_seq_length)
print(f"Train split stats: {train_stats}")

val_stats = analyze_dataset_tokens(dataset["validation"], tokenizer, max_seq_length)
print(f"Validation split stats: {val_stats}")

In [None]:
def convert_to_text(example):
    """Render one example's conversation into a single ``text`` string.

    Uses the module-level ``tokenizer`` chat template without tokenizing and
    without appending a generation prompt.
    """
    rendered = tokenizer.apply_chat_template(
        example["conversations"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Add the rendered `text` column to each split; the training split uses four
# worker processes, the validation split one.
for split_name, worker_count in (("train", 4), ("validation", 1)):
    dataset[split_name] = dataset[split_name].map(convert_to_text, num_proc=worker_count)


In [None]:
# Display the loaded dataset object to inspect its splits and columns.
dataset

## Training

In [None]:
# @title wandb init
import wandb
# Notebook magics: set Weights & Biases environment variables for this kernel.
%env WANDB_WATCH=all
%env WANDB_SILENT=true
# Log in to W&B; `relogin=True` presumably forces a new credential prompt even
# when a login is already cached — confirm against the wandb docs.
wandb.login(relogin=True)

In [None]:
import os
from transformers.utils import logging
import wandb
from dotenv import load_dotenv

# 1. Load environment variables (including WANDB_API_KEY) from a local .env file.
load_dotenv()

# 2. Fail fast when no API key is available in the environment.
#    (The former `os.environ["WANDB_API_KEY"] = os.getenv("WANDB_API_KEY")`
#    re-assigned the variable to its own value and has been dropped.)
if not os.getenv("WANDB_API_KEY"):
    raise ValueError("WANDB_API_KEY not found in .env file")

logging.set_verbosity_info()
project_name = "netpro-finetune"
entity_name = "jordinia-netpro"
run_name = "unsloth/Qwen3-4B-unsloth-sft-2025-05-11-TESTING"  # Set your desired run name

# 3. Start the W&B run. This is best-effort: a failure is reported but does not
#    stop the notebook. `run` is pre-bound to None so that it is always defined
#    afterwards, even when initialization fails.
run = None
try:
    run = wandb.init(
        entity=entity_name,
        project=project_name,
        name=run_name,
        # id="j4vh49mi",
        # resume="allow" # Uncomment to resume a previous run
    )
except Exception as e:
    print(f"Failed to initialize WANDB: {e}")
    # Consider exiting if WANDB is critical
    # sys.exit(1)
else:
    print("Successfully connected to WANDB!")

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported


# Training hyper-parameters. Every value is read from the YAML config with a
# fallback default, so a run can be reconfigured without editing this cell.
args = TrainingArguments(
    output_dir                  =config.get('output_dir', 'outputs'),
    per_device_train_batch_size =config.get('per_device_train_batch_size', 2),
    per_device_eval_batch_size  =config.get('per_device_eval_batch_size', 1),
    gradient_accumulation_steps =config.get('gradient_accumulation_steps', 4),
    warmup_ratio                =config.get('warmup_ratio', 0.05),
    max_steps                   =config.get('max_steps', 4125),
    # Previously hard-coded to 2e-5 while all other settings came from config.
    # float() is required because PyYAML loads exponent literals without a
    # decimal point (e.g. `2e-5`) as strings.
    # NOTE(review): if the config already defines `learning_rate`, that value
    # now takes effect instead of 2e-5 — confirm it matches the intended rate.
    learning_rate               =float(config.get('learning_rate', 2e-5)),
    # Use bf16 when the hardware supports it, otherwise fall back to fp16.
    fp16                        =not is_bfloat16_supported(),
    bf16                        =is_bfloat16_supported(),
    optim                       =config.get('optim', 'adamw_8bit'),
    weight_decay                =config.get('weight_decay', 0.1),
    lr_scheduler_type           =config.get('lr_scheduler_type', 'cosine'),
    eval_strategy               =config.get('eval_strategy', 'steps'),
    eval_steps                  =config.get('eval_steps', 50),
    save_strategy               =config.get('save_strategy', 'steps'),
    save_steps                  =config.get('save_steps', 100),
    save_total_limit            =config.get('save_total_limit', 3),
    logging_steps               =config.get('logging_steps', 10),
    seed                        =config.get('seed', 3407),
    report_to                   =config.get('report_to', 'wandb'),
    load_best_model_at_end      =config.get('load_best_model_at_end', True),
    metric_for_best_model       =config.get('metric_for_best_model', 'eval_loss'),
    # max_grad_norm               =config.get('max_grad_norm', 0.3),
)

# Values shared between the trainer and its collator.
sequence_limit = config.get('max_seq_length', 30000)
collator_options = config.get('data_collator', {})

# Collator that pads each batch, with padding settings taken from the config.
batch_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=collator_options.get('padding', True),
    pad_to_multiple_of=collator_options.get('pad_to_multiple_of', 8),
    max_length=sequence_limit,
)

# Supervised fine-tuning trainer over the rendered text column of both splits.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field=config.get('dataset_text_field', 'text'),
    max_seq_length=sequence_limit,
    data_collator=batch_collator,
    dataset_num_proc=config.get('dataset_num_proc', 2),
    packing=config.get('packing', False),
    args=args,
)

### Train on Completions Only

In [None]:
# def train_on_responses_only_custom(trainer):
#     """
#     Custom version for website classification that:
#     1. Preserves system prompt context
#     2. Only trains on assistant responses (JSON + reasoning)
#     3. Handles length consistency for batching
#     """
#     tokenizer = trainer.tokenizer
    
#     # Manually define token sequences for your template
#     SYSTEM_TOKENS = tokenizer.encode(
#         "<|start_header_id|>system<|end_header_id|>\n\n", 
#         add_special_tokens=False
#     )
#     USER_TOKENS = tokenizer.encode(
#         "<|start_header_id|>user<|end_header_id|>\n\n", 
#         add_special_tokens=False
#     )
#     ASSISTANT_TOKENS = tokenizer.encode(
#         "<|start_header_id|>assistant<|end_header_id|>\n\n", 
#         add_special_tokens=False
#     )

#     def custom_masking(examples):
#         input_ids = examples["input_ids"]
        
#         # Create labels if they don't exist
#         labels = examples.get("labels", [ids.copy() for ids in input_ids])
#         new_labels = []
        
#         for seq_id, seq_labels in zip(input_ids, labels):
#             n = len(seq_id)
#             mask = [-100] * n
#             i = 0
            
#             while i < n:
#                 # Check for system prompt
#                 if seq_id[i:i+len(SYSTEM_TOKENS)] == SYSTEM_TOKENS:
#                     i += len(SYSTEM_TOKENS)
#                     continue
                    
#                 # Check for user prompt
#                 if seq_id[i:i+len(USER_TOKENS)] == USER_TOKENS:
#                     i += len(USER_TOKENS)
#                     continue
                    
#                 # Check for assistant prompt
#                 if seq_id[i:i+len(ASSISTANT_TOKENS)] == ASSISTANT_TOKENS:
#                     start = i
#                     i += len(ASSISTANT_TOKENS)
                    
#                     # Find end of assistant response
#                     while i < n:
#                         if seq_id[i] == tokenizer.eos_token_id:
#                             end = i
#                             break
#                         i += 1
#                     else:
#                         end = n
                    
#                     # Unmask assistant response
#                     mask[start:end] = seq_labels[start:end]
#                     break
                    
#                 i += 1
            
#             # Enforce length matching (critical fix)
#             if len(mask) != len(seq_id):
#                 mask = mask[:len(seq_id)] + [-100] * (len(seq_id) - len(mask))
            
#             new_labels.append(mask)
        
#         return {"labels": new_labels}

#     # Apply to datasets with length verification
#     def apply_masking(dataset):
#         dataset = dataset.map(
#             custom_masking,
#             batched=True,
#             batch_size=1000,
#             num_proc=4,
#         )
#         # Verify lengths
#         for i in range(min(3, len(dataset))):
#             assert len(dataset[i]["input_ids"]) == len(dataset[i]["labels"]), \
#                 f"Length mismatch in sample {i}"
#         return dataset
    
#     trainer.train_dataset = apply_masking(trainer.train_dataset)
    
#     if trainer.eval_dataset is not None:
#         trainer.eval_dataset = apply_masking(trainer.eval_dataset)
        
#     return trainer

In [None]:
# trainer = train_on_responses_only_custom(trainer)

In [None]:
# # Final verification (optional)
# print("\n=== Final Pre-Train Check ===")
# sample = trainer.train_dataset[0]
# print("First sample labels preview:")
# print("Input IDs length:", len(sample["input_ids"])) 
# print("Labels length:", len(sample["labels"]))
# print("Last 10 labels:", sample["labels"][-10:])  # Should show unmasked assistant tokens

In [None]:
# # Get a sample from the training set
# sample_index = 5  # Try different indices
# sample = trainer.train_dataset[sample_index]

# # Decode the full input context
# print("==== Full Input Context ====")
# print(tokenizer.decode(sample["input_ids"]))
# print("\n")

# # Decode the labels with masking visualization
# print("==== Training Targets (Masked) ====")
# masked_labels = []
# for token_id, label_id in zip(sample["input_ids"], sample["labels"]):
#     if label_id == -100:
#         # Show masked tokens as blank spaces
#         masked_labels.append(" ")
#     else:
#         # Show actual token
#         masked_labels.append(tokenizer.decode([token_id]))

# print("".join(masked_labels))

In [None]:
# batch = trainer.train_dataset.select(range(2)).with_format("torch")[0]
# print(batch["input_ids"])
# print(batch["labels"])


### Memory Stats

In [None]:
# Display the training dataset as held by the trainer.
trainer.train_dataset

In [None]:
# dataset["train"][0]["text"]

In [None]:
# @title Show current memory stats
# Snapshot GPU memory before training; `start_gpu_memory` and `max_memory`
# are reused by the post-training report.
bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run training; the returned object is kept because its `metrics` are read by
# the memory/runtime report further down.
trainer_stats = trainer.train()

# checkpoint_path = "./outputs/checkpoint-700"
# if os.path.exists(checkpoint_path):
#     trainer.train(resume_from_checkpoint=checkpoint_path)
# else:
#     print(f"Warning: Checkpoint {checkpoint_path} not found. Starting from scratch.")

In [None]:
# Evaluate on the trainer's eval dataset on demand. Periodic evaluation during
# training is controlled separately by `eval_steps` in TrainingArguments.
trainer.evaluate(eval_dataset=trainer.eval_dataset)

In [None]:
# Close the active W&B run. The module-level call is used instead of
# `run.finish()` because `run` is never bound when `wandb.init` failed in the
# setup cell (that exception is caught and only printed).
wandb.finish()

In [None]:
# Post-training report: peak reserved GPU memory (absolute and relative to the
# pre-training snapshot) and the training runtime.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round((used_memory / max_memory) * 100, 3)
lora_percentage = round((used_memory_for_lora / max_memory) * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Save Model

In [None]:
# Write the model and the tokenizer to the same configured output directory.
lora_output_dir = config.get('lora_model', 'outputs')
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

In [None]:
from huggingface_hub import HfApi, HfFolder
import os

# Constants
model_id = "jordinia/test-finetune"
local_model_path = "/home/fishmon/AJ/LLM-Finetuning/Malicious-Web/model/qwen3-0505-lora_model"

# Read the access token from the environment instead of embedding it in the
# notebook. When HF_TOKEN is unset, None is passed and the client falls back to
# whatever credential is already cached locally.
hf_token = os.environ.get("HF_TOKEN") or None

# Authenticate per client rather than via HfFolder.save_token(): saving the
# (previously empty) token would overwrite any cached login on disk.
api = HfApi(token=hf_token)

# Upload the folder with a commit message
api.upload_folder(
    folder_path=local_model_path,
    repo_id=model_id,
    repo_type="model",
    commit_message="Initial commit of Qwen3-0505-LORA model",  # Add your commit message here
)

In [None]:
# Push the model to the Hub under the given repository name using the
# "merged_16bit" save method.
merged_repo_name = "NetPro-Qwen-3-4B-0505"
model.push_to_hub_merged(
    merged_repo_name,
    tokenizer,
    save_method="merged_16bit",
    token="<YOUR_HUGGINGFACE_TOKEN>",
)

In [None]:
from unsloth import FastLanguageModel
# Reload the model and tokenizer from the local adapter directory in 4-bit.
adapter_dir = "model/netpro-qwen3-0505-lora"  # the model used for training
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=adapter_dir,
    max_seq_length=30000,
    load_in_4bit=True,
)

In [None]:
# Save the model and tokenizer to a local directory using the "merged_16bit"
# save method.
merged_output_dir = "model/netpro-qwen3-0505-fp16-hf"
model.save_pretrained_merged(
    merged_output_dir,
    tokenizer,
    save_method="merged_16bit",
)

In [None]:
# Shell command: run llama.cpp's convert_hf_to_gguf.py on the merged fp16
# directory and write an f16 GGUF file. The paths are absolute Windows paths
# specific to one machine and must be adapted before running elsewhere.
!python "C:\Users\rizky\Documents\Projects\llama.cpp\convert_hf_to_gguf.py" model\netpro-qwen3-0505-fp16-hf --outfile "C:\Users\rizky\Documents\Projects\LLM-Finetuning\Malicious-Web\model\netpro-qwen3-0505-fp16-gguf\netpro-qwen3-0505-fp16.gguf" --outtype f16

## Inference

In [None]:
# Shell command: clone the adapter repository from the Hugging Face Hub.
# NOTE(review): with no target directory given, git clones into
# ./netpro-qwen3-0505-lora, while the next cell loads from
# "model/netpro-qwen3-0505-lora" — confirm the working directory / path.
!git clone https://huggingface.co/jordinia/netpro-qwen3-0505-lora

In [None]:
from unsloth import FastLanguageModel
# 1. Load the fine-tuned adapter in 4-bit.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "model/netpro-qwen3-0505-lora",
    max_seq_length = 30000,
    load_in_4bit = True,
)

# 2. Switch the model to inference mode. The call is made for its in-place
#    effect; the result is deliberately not assigned back to `model`, so the
#    cell does not depend on what `for_inference` returns.
FastLanguageModel.for_inference(model)


# 3. Load prompts (explicit UTF-8 so reading does not depend on the OS locale)
with open('./prompt/labelling_promptv4.txt', 'r', encoding='utf-8') as f:
    system_prompt = f.read()
with open('./prompt/class_3_sample1.txt', 'r', encoding='utf-8') as f:
    sample_text = f.read()

# 4. Format input
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n {sample_text}"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
    max_length = 30000,  # Prevent OOM
    truncation = True,
    enable_thinking = True,
).to("cuda")

# 5. Generate, streaming decoded tokens to stdout as they are produced.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids = inputs,
    streamer = text_streamer,
    max_new_tokens = 2048,
    use_cache = True,
    # temperature / top_p / min_p only apply when sampling is enabled, so
    # request it explicitly instead of relying on the model's generation config.
    do_sample = True,
    temperature = 0.7,
    min_p = 0.1,
    top_p = 0.9,
)

In [None]:
# Load the system prompt and the sample page (explicit UTF-8 so reading does
# not depend on the OS locale).
with open('./prompt/labelling_promptv4.txt', 'r', encoding='utf-8') as f:
    system_prompt = f.read()
with open('./prompt/class_3_sample1.txt', 'r', encoding='utf-8') as f:
    sample_text = f.read()

# Build the chat and tokenize it with the generation prompt appended.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n {sample_text}"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
    max_length = 30000,
    truncation = True,
    enable_thinking = True,
).to("cuda")

# Generate, streaming decoded tokens to stdout as they are produced.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids = inputs,
    streamer = text_streamer,
    max_new_tokens = 2048,
    use_cache = True,
    # temperature / top_p / min_p only apply when sampling is enabled, so
    # request it explicitly instead of relying on the model's generation config.
    do_sample = True,
    temperature = 0.7,
    min_p = 0.1,
    top_p = 0.9,
)

In [None]:
# Load the system prompt and the sample page (explicit UTF-8 so reading does
# not depend on the OS locale).
with open('./prompt/labelling_promptv4.txt', 'r', encoding='utf-8') as f:
    system_prompt = f.read()
with open('./prompt/class_3_sample1.txt', 'r', encoding='utf-8') as f:
    sample_text = f.read()

# Build the chat and tokenize it with the generation prompt appended.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n {sample_text}"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
    max_length = 30000,
    truncation = True,
).to("cuda")

# Non-streaming generation; the full output is decoded after it completes.
outputs = model.generate(
    inputs,
    max_new_tokens = 2048,
    # temperature / top_p only apply when sampling is enabled, so request it
    # explicitly instead of relying on the model's generation config.
    do_sample = True,
    temperature = 0.7,
    top_p = 0.9,
    repetition_penalty = 1.2,
    eos_token_id        = tokenizer.eos_token_id,
    pad_token_id        = tokenizer.pad_token_id,
    use_cache = True,
)
# Decode every returned sequence; as the last expression, the result is
# displayed by the notebook.
tokenizer.batch_decode(outputs)