## Library

In [1]:
import json
import pandas as pd
from pathlib import Path
from jsonschema import validate, ValidationError
from datasets import Dataset, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

In [2]:
# dataset = load_dataset("json", data_files="./dataset/netpro_chatml_thought.jsonl")

from datasets import load_dataset
dataset = load_dataset("jordinia/netpro-finetune", split = "train")

In [3]:
dataset

Dataset({
    features: ['conversations'],
    num_rows: 33262
})

In [4]:
# dataset = dataset["train"]  # Access the 'train' split

In [4]:
print("\nOutput Type:", type(dataset))
print("Dataset Info:")
print(dataset)


Output Type: <class 'datasets.arrow_dataset.Dataset'>
Dataset Info:
Dataset({
    features: ['conversations'],
    num_rows: 33262
})


## Load Model

In [5]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 30000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    model_name= "unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.10 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 1. Max memory: 23.551 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [7]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

In [8]:
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [None]:
dataset[5]["conversations"]

In [None]:
dataset[5]["text"]

In [11]:
dataset_dict = dataset.train_test_split(test_size=0.004)

In [12]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['conversations', 'text'],
        num_rows: 33128
    })
    test: Dataset({
        features: ['conversations', 'text'],
        num_rows: 134
    })
})

In [13]:
# @title wandb init
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mjordinia[0m ([33mjordinia-netpro[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [14]:
%env WANDB_WATCH=all
%env WANDB_SILENT=true

env: WANDB_WATCH=all
env: WANDB_SILENT=true


## Training

In [15]:
wandb.login(relogin=True)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/fishmon/.netrc


True

In [16]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
import os
from transformers.utils import logging
import wandb
from dotenv import load_dotenv

load_dotenv()  # Loads from .env file automatically

# 2. Verify token loading
if not os.getenv("WANDB_API_KEY"):
    raise ValueError("WANDB_API_KEY not found in .env file")

# 3. Initialize and upload
os.environ["WANDB_API_KEY"]=os.getenv("WANDB_API_KEY")

logging.set_verbosity_info()
project_name = "netpro-finetune" 
entity_name = "jordinia-netpro"
run_name = "llama-3.2-3b-unsloth-sft-2025-04-29"  # Set your desired run name

# Initialize WANDB (FIXED ENTITY/PROJECT)
try:
    run = wandb.init(
        entity=entity_name,
        project=project_name,
        name=run_name,
        id="j4vh49mi",
        resume="allow"
    )
    print("Successfully connected to WANDB!")
except Exception as e:
    print(f"Failed to initialize WANDB: {str(e)}")
    # Consider exiting if WANDB is critical
    # sys.exit(1)

Successfully connected to WANDB!


In [20]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['conversations', 'text'],
        num_rows: 33128
    })
    test: Dataset({
        features: ['conversations', 'text'],
        num_rows: 134
    })
})

In [17]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Calculate training length (33k samples)
total_samples = 33000  # Your balanced dataset size
batch_size = 2 * 4  # batch_size * gradient_accum
steps_per_epoch = total_samples // batch_size  # ~4125 steps

args = TrainingArguments(
    output_dir               = "outputs",
    per_device_train_batch_size    = 2,
    per_device_eval_batch_size     = 1,
    gradient_accumulation_steps    = 4,
    warmup_ratio             = 0.05,           # ~10% of steps
    max_steps                = steps_per_epoch * 1,
    learning_rate            = 5e-6,
    fp16                     = not is_bfloat16_supported(),
    bf16                     = is_bfloat16_supported(),
    optim                    = "adamw_8bit",
    weight_decay             = 0.1,
    lr_scheduler_type        = "cosine",      # linear decay
    eval_strategy            = "steps",
    eval_steps               = 50,           # every ~12% of the epoch
    save_strategy            = "steps",
    save_steps               = 100,
    save_total_limit         = 3,
    logging_steps            = 10,
    seed                     = 3407,
    report_to                = "wandb",
    load_best_model_at_end   = True,
    metric_for_best_model    = "eval_loss",
)

trainer = SFTTrainer(
    model               = model,
    tokenizer           = tokenizer,
    train_dataset       = dataset_dict["train"],
    eval_dataset        = dataset_dict["test"],
    dataset_text_field  = "text",
    max_seq_length      = max_seq_length,     # ← your pre-configured variable
    data_collator       = DataCollatorForSeq2Seq(
                             tokenizer = tokenizer,
                             padding = True,
                             pad_to_multiple_of = 8,
                             max_length          = max_seq_length,
                         ),
    dataset_num_proc    = 2,
    packing             = False,
    args                = args,
)

PyTorch: setting up devices
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


### Train on Completions Only

In [25]:
def train_on_responses_only_custom(trainer):
    """
    Custom version for website classification that:
    1. Preserves system prompt context
    2. Only trains on assistant responses (JSON + reasoning)
    3. Handles length consistency for batching
    """
    tokenizer = trainer.tokenizer
    
    # Manually define token sequences for your template
    SYSTEM_TOKENS = tokenizer.encode(
        "<|start_header_id|>system<|end_header_id|>\n\n", 
        add_special_tokens=False
    )
    USER_TOKENS = tokenizer.encode(
        "<|start_header_id|>user<|end_header_id|>\n\n", 
        add_special_tokens=False
    )
    ASSISTANT_TOKENS = tokenizer.encode(
        "<|start_header_id|>assistant<|end_header_id|>\n\n", 
        add_special_tokens=False
    )

    def custom_masking(examples):
        input_ids = examples["input_ids"]
        
        # Create labels if they don't exist
        labels = examples.get("labels", [ids.copy() for ids in input_ids])
        new_labels = []
        
        for seq_id, seq_labels in zip(input_ids, labels):
            n = len(seq_id)
            mask = [-100] * n
            i = 0
            
            while i < n:
                # Check for system prompt
                if seq_id[i:i+len(SYSTEM_TOKENS)] == SYSTEM_TOKENS:
                    i += len(SYSTEM_TOKENS)
                    continue
                    
                # Check for user prompt
                if seq_id[i:i+len(USER_TOKENS)] == USER_TOKENS:
                    i += len(USER_TOKENS)
                    continue
                    
                # Check for assistant prompt
                if seq_id[i:i+len(ASSISTANT_TOKENS)] == ASSISTANT_TOKENS:
                    start = i
                    i += len(ASSISTANT_TOKENS)
                    
                    # Find end of assistant response
                    while i < n:
                        if seq_id[i] == tokenizer.eos_token_id:
                            end = i
                            break
                        i += 1
                    else:
                        end = n
                    
                    # Unmask assistant response
                    mask[start:end] = seq_labels[start:end]
                    break
                    
                i += 1
            
            # Enforce length matching (critical fix)
            if len(mask) != len(seq_id):
                mask = mask[:len(seq_id)] + [-100] * (len(seq_id) - len(mask))
            
            new_labels.append(mask)
        
        return {"labels": new_labels}

    # Apply to datasets with length verification
    def apply_masking(dataset):
        dataset = dataset.map(
            custom_masking,
            batched=True,
            batch_size=1000,
            num_proc=4,
        )
        # Verify lengths
        for i in range(min(3, len(dataset))):
            assert len(dataset[i]["input_ids"]) == len(dataset[i]["labels"]), \
                f"Length mismatch in sample {i}"
        return dataset
    
    trainer.train_dataset = apply_masking(trainer.train_dataset)
    
    if trainer.eval_dataset is not None:
        trainer.eval_dataset = apply_masking(trainer.eval_dataset)
        
    return trainer

In [26]:
trainer = train_on_responses_only_custom(trainer)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Map (num_proc=4): 100%|██████████| 776/776 [00:01<00:00, 408.20 examples/s]


In [27]:
# Final verification (optional)
print("\n=== Final Pre-Train Check ===")
sample = trainer.train_dataset[0]
print("First sample labels preview:")
print("Input IDs length:", len(sample["input_ids"])) 
print("Labels length:", len(sample["labels"]))
print("Last 10 labels:", sample["labels"][-10:])  # Should show unmasked assistant tokens


=== Final Pre-Train Check ===
First sample labels preview:
Input IDs length: 6714
Labels length: 6714
Last 10 labels: [220, 330, 83029, 794, 220, 5313, 198, 534, 74694, 128009]


In [28]:
# Get a sample from the training set
sample_index = 5  # Try different indices
sample = trainer.train_dataset[sample_index]

# Decode the full input context
print("==== Full Input Context ====")
print(tokenizer.decode(sample["input_ids"]))
print("\n")

# Decode the labels with masking visualization
print("==== Training Targets (Masked) ====")
masked_labels = []
for token_id, label_id in zip(sample["input_ids"], sample["labels"]):
    if label_id == -100:
        # Show masked tokens as blank spaces
        masked_labels.append(" ")
    else:
        # Show actual token
        masked_labels.append(tokenizer.decode([token_id]))

print("".join(masked_labels))

==== Full Input Context ====
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

You are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.

### **Categories & Definitions:**

-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.
    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, med

In [29]:
batch = trainer.train_dataset.select(range(2)).with_format("torch")[0]
print(batch["input_ids"])
print(batch["labels"])


tensor([128000, 128000, 128006,  ...,    534,  74694, 128009])
tensor([  -100,   -100,   -100,  ...,    534,  74694, 128009])


In [30]:
trainer.train_dataset

Dataset({
    features: ['conversations', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 776
})

### Memory Stats

In [18]:
trainer.train_dataset

Dataset({
    features: ['conversations', 'text', 'input_ids', 'attention_mask'],
    num_rows: 33128
})

In [23]:
dataset_dict['test'][1]

{'conversations': [{'content': 'You are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **1 - Gambling:** Promotes/facilitates betting, casino, poker, lottery, wagering. Includes online bett

In [19]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090 Ti. Max memory = 23.551 GB.
3.441 GB of memory reserved.


In [21]:
# trainer_stats = trainer.train()

checkpoint_path = "./outputs/checkpoint-700"
if os.path.exists(checkpoint_path):
    trainer.train(resume_from_checkpoint=checkpoint_path)
else:
    print(f"Warning: Checkpoint {checkpoint_path} not found. Starting from scratch.")

Loading model from ./outputs/checkpoint-700.
The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, conversations. If text, conversations are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 33,128 | Num Epochs = 1 | Total steps = 4,125
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 48,627,712/3,000,000,000 (1.62% trained)
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 700
  Will skip the first 0 epochs then the first 2800 batches in the first epoch.
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "

Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
750,0.13,0.019036
800,0.0217,0.018548


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, conversations. If text, conversations are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 134
  Batch size = 1
Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, conversations. If text, conversations are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 134
  Batch size = 1
Saving model checkpoint to outputs/checkpoint-800
loading configuration file config.json fro

KeyboardInterrupt: 

In [None]:
# Run this before the automatic evaluation at step 500
trainer.evaluate(eval_dataset=trainer.eval_dataset)

In [None]:
run.finish()

In [22]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

Error in callback <bound method _WandbInit._pre_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x79aca5ceba50>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 79aca5cd4110, raw_cell="used_memory = round(torch.cuda.max_memory_reserved.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2B100.77.86.156/home/fishmon/AJ/LLM-Finetuning/Malicious-Web/unsloth_netpro.ipynb#X52sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

NameError: name 'trainer_stats' is not defined

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x79aca5ceba50>> (for post_run_cell), with arguments args (<ExecutionResult object at 79ac9612b150, execution_count=22 error_before_exec=None error_in_exec=name 'trainer_stats' is not defined info=<ExecutionInfo object at 79aca5cd4110, raw_cell="used_memory = round(torch.cuda.max_memory_reserved.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2B100.77.86.156/home/fishmon/AJ/LLM-Finetuning/Malicious-Web/unsloth_netpro.ipynb#X52sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

## Inference

In [2]:
from unsloth import FastLanguageModel
import torch

# 1. Load checkpoint
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./outputs/checkpoint-800",
    max_seq_length = 30000,  # Match training config
    dtype = None,
    load_in_4bit = True,
)

from unsloth.chat_templates import get_chat_template

# 2. Prepare for inference
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",  # Must match training template
)
model = FastLanguageModel.for_inference(model)


# 3. Load prompts
with open('./prompt/labelling_promptv4.txt', 'r') as f:
    system_prompt = f.read()
with open('./prompt/class_3_sample1.txt', 'r') as f:
    sample_text = f.read()

# 4. Format input
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify: {sample_text}"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
    max_length = 30000,  # Prevent OOM
    truncation = True,
).to("cuda")

# 5. Generate
outputs = model.generate(
    inputs,
    max_new_tokens = 512,  # Reduce from 2048 for safety
    temperature = 0.3,  # More deterministic for classification
    top_p = 0.9,
    repetition_penalty = 1.2,
    eos_token_id        = tokenizer.eos_token_id,
    pad_token_id        = tokenizer.pad_token_id,
    use_cache = True,
)
tokenizer.batch_decode(outputs)
# print(tokenizer.decode(outputs[0]))

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 1. Max memory: 23.551 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **

In [3]:
tokenizer.batch_decode(outputs)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **

In [6]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Load the system prompt from the file
with open('./prompt/labelling_promptv4.txt', 'r') as system_file:
    system_prompt = system_file.read()

# Load the label from the file
with open('./prompt/class_3_sample2.txt', 'r') as label_file:
    label = label_file.read()

# Define the messages
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n{label}"}
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 2048, use_cache = True,
                         temperature = 0.7, min_p = 0.1)
tokenizer.batch_decode(outputs)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **

In [3]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Why is the sky is blue"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...



...






In [9]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Why is the sky is blue"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

The sky is blue because of a phenomenon called Rayleigh scattering, named after Lord Rayleigh, who first described it. 

Rayleigh scattering is a type of scattering that occurs in the atmosphere. When sunlight enters the Earth's atmosphere, it encounters tiny molecules of gas such as nitrogen and oxygen. These molecules scatter the light in all directions, but they scatter shorter wavelengths, like violet and blue light, more than longer wavelengths, like red and orange light.

Since violet and blue light are scattered more than red and orange light, we primarily see blue light.<|eot_id|>


In [7]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Why is the sky is blue"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

The The assistant is an assistant, 

or is an assistant, or is an assistant.

The The assistant is an assistant, or is an assistant, or is an assistant.<|end_of_text|>


In [5]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Load the system prompt from the file
with open('./prompt/labelling_promptv4.txt', 'r') as system_file:
    system_prompt = system_file.read()

# Load the label from the file
with open('./prompt/class_3_sample2.txt', 'r') as label_file:
    label = label_file.read()

# Define the messages
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n{label}"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 4096,
                   use_cache = True, temperature = 0.7, min_p = 0.5)

### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### 

KeyboardInterrupt: 

In [16]:
print(messages)

[{'role': 'system', 'content': 'You are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **1 - Gambling:** Promotes/facilitates betting, casino, poker, lottery, wagering. Includes online bett