## Library

In [1]:
import pandas as pd
from pathlib import Path
from jsonschema import validate, ValidationError
from datasets import Dataset, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

In [2]:
# dataset = load_dataset("json", data_files="./dataset/netpro_chatml_thought.jsonl")

dataset = load_dataset("jordinia/netpro-finetune", split = "train")

In [3]:
# dataset = dataset["train"]  # Access the 'train' split

In [4]:
print("\nOutput Type:", type(dataset))
print("Dataset Info:")
print(dataset)


Output Type: <class 'datasets.arrow_dataset.Dataset'>
Dataset Info:
Dataset({
    features: ['conversations'],
    num_rows: 33262
})


## Load Model

In [5]:
import yaml
import torch
from unsloth import FastLanguageModel
from typing import Dict, Any

def load_config(config_path: str) -> Dict[str, Any]:
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)

config = load_config('./config/config-0305.yml')

# Model and Training Config
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=config.get('model_name', 'unsloth/Llama-3.2-3B-Instruct-bnb-4bit'),
    max_seq_length=config.get('max_seq_length', 30000),
    dtype=config.get('dtype', None),
    load_in_4bit=config.get('load_in_4bit', True)
)

model = FastLanguageModel.get_peft_model(
    model,
    r=config.get('lora_r', 32),
    target_modules=config.get('target_modules', ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]),
    lora_alpha=config.get('lora_alpha', 32),
    lora_dropout=config.get('lora_dropout', 0),
    bias=config.get('bias', 'none'),
    use_gradient_checkpointing=config.get('use_gradient_checkpointing', 'unsloth'),
    random_state=config.get('random_state', 3407),
    use_rslora=config.get('use_rslora', False),
    loftq_config=config.get('loftq_config', None),
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.10 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 1. Max memory: 23.551 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.3.19 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [6]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

In [7]:
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [8]:
dataset_dict = dataset.train_test_split(test_size=0.004)

In [9]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['conversations', 'text'],
        num_rows: 33128
    })
    test: Dataset({
        features: ['conversations', 'text'],
        num_rows: 134
    })
})

## Training

In [10]:
# @title wandb init
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mjordinia[0m ([33mjordinia-netpro[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
%env WANDB_WATCH=all
%env WANDB_SILENT=true
wandb.login(relogin=True)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

env: WANDB_WATCH=all
env: WANDB_SILENT=true


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/fishmon/.netrc


True

In [12]:
import os
from transformers.utils import logging
import wandb
from dotenv import load_dotenv

load_dotenv()  # Loads from .env file automatically

# 2. Verify token loading
if not os.getenv("WANDB_API_KEY"):
    raise ValueError("WANDB_API_KEY not found in .env file")

# 3. Initialize and upload
os.environ["WANDB_API_KEY"]=os.getenv("WANDB_API_KEY")

logging.set_verbosity_info()
project_name = "netpro-finetune" 
entity_name = "jordinia-netpro"
run_name = "llama-3.2-3b-instruct-unsloth-sft-2025-05-03"  # Set your desired run name

# Initialize WANDB (FIXED ENTITY/PROJECT)
try:
    run = wandb.init(
        entity=entity_name,
        project=project_name,
        name=run_name,
        # id="j4vh49mi",
        # resume="allow" # Uncomment to resume a previous run
    )
    print("Successfully connected to WANDB!")
except Exception as e:
    print(f"Failed to initialize WANDB: {str(e)}")
    # Consider exiting if WANDB is critical
    # sys.exit(1)

Successfully connected to WANDB!


In [13]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported


args = TrainingArguments(
    output_dir                  =config.get('output_dir', 'outputs'),
    per_device_train_batch_size =config.get('per_device_train_batch_size', 2),
    per_device_eval_batch_size  =config.get('per_device_eval_batch_size', 1),
    gradient_accumulation_steps =config.get('gradient_accumulation_steps', 4),
    warmup_ratio                =config.get('warmup_ratio', 0.05),
    max_steps                   =config.get('max_steps', 4125),
    learning_rate               =2e-5,
    fp16                        =not is_bfloat16_supported(),
    bf16                        =is_bfloat16_supported(),
    optim                       =config.get('optim', 'adamw_8bit'),
    weight_decay                =config.get('weight_decay', 0.1),
    lr_scheduler_type           =config.get('lr_scheduler_type', 'cosine'),
    eval_strategy               =config.get('eval_strategy', 'steps'),
    eval_steps                  =config.get('eval_steps', 50),
    save_strategy               =config.get('save_strategy', 'steps'),
    save_steps                  =config.get('save_steps', 100),
    save_total_limit            =config.get('save_total_limit', 3),
    logging_steps               =config.get('logging_steps', 10),
    seed                        =config.get('seed', 3407),
    report_to                   =config.get('report_to', 'wandb'),
    load_best_model_at_end      =config.get('load_best_model_at_end', True),
    metric_for_best_model       =config.get('metric_for_best_model', 'eval_loss'),
)

trainer = SFTTrainer(
    model               =model,
    tokenizer           =tokenizer,
    train_dataset       =dataset_dict["train"],
    eval_dataset        =dataset_dict["test"],
    dataset_text_field  =config.get('dataset_text_field', 'text'),
    max_seq_length      =config.get('max_seq_length', 30000),
    data_collator       =DataCollatorForSeq2Seq(
        tokenizer               =tokenizer,
        padding                 =config.get('data_collator', {}).get('padding', True),
        pad_to_multiple_of      =config.get('data_collator', {}).get('pad_to_multiple_of', 8),
        max_length              =config.get('max_seq_length', 30000),
    ),
    dataset_num_proc    =config.get('dataset_num_proc', 2),
    packing             =config.get('packing', False),
    args                =args,
)

# # Calculate training length (33k samples)
# total_samples = 33000  # Your balanced dataset size
# batch_size = 2 * 4  # batch_size * gradient_accum
# steps_per_epoch = total_samples // batch_size  # ~4125 steps

PyTorch: setting up devices
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


### Train on Completions Only

In [14]:
# def train_on_responses_only_custom(trainer):
#     """
#     Custom version for website classification that:
#     1. Preserves system prompt context
#     2. Only trains on assistant responses (JSON + reasoning)
#     3. Handles length consistency for batching
#     """
#     tokenizer = trainer.tokenizer
    
#     # Manually define token sequences for your template
#     SYSTEM_TOKENS = tokenizer.encode(
#         "<|start_header_id|>system<|end_header_id|>\n\n", 
#         add_special_tokens=False
#     )
#     USER_TOKENS = tokenizer.encode(
#         "<|start_header_id|>user<|end_header_id|>\n\n", 
#         add_special_tokens=False
#     )
#     ASSISTANT_TOKENS = tokenizer.encode(
#         "<|start_header_id|>assistant<|end_header_id|>\n\n", 
#         add_special_tokens=False
#     )

#     def custom_masking(examples):
#         input_ids = examples["input_ids"]
        
#         # Create labels if they don't exist
#         labels = examples.get("labels", [ids.copy() for ids in input_ids])
#         new_labels = []
        
#         for seq_id, seq_labels in zip(input_ids, labels):
#             n = len(seq_id)
#             mask = [-100] * n
#             i = 0
            
#             while i < n:
#                 # Check for system prompt
#                 if seq_id[i:i+len(SYSTEM_TOKENS)] == SYSTEM_TOKENS:
#                     i += len(SYSTEM_TOKENS)
#                     continue
                    
#                 # Check for user prompt
#                 if seq_id[i:i+len(USER_TOKENS)] == USER_TOKENS:
#                     i += len(USER_TOKENS)
#                     continue
                    
#                 # Check for assistant prompt
#                 if seq_id[i:i+len(ASSISTANT_TOKENS)] == ASSISTANT_TOKENS:
#                     start = i
#                     i += len(ASSISTANT_TOKENS)
                    
#                     # Find end of assistant response
#                     while i < n:
#                         if seq_id[i] == tokenizer.eos_token_id:
#                             end = i
#                             break
#                         i += 1
#                     else:
#                         end = n
                    
#                     # Unmask assistant response
#                     mask[start:end] = seq_labels[start:end]
#                     break
                    
#                 i += 1
            
#             # Enforce length matching (critical fix)
#             if len(mask) != len(seq_id):
#                 mask = mask[:len(seq_id)] + [-100] * (len(seq_id) - len(mask))
            
#             new_labels.append(mask)
        
#         return {"labels": new_labels}

#     # Apply to datasets with length verification
#     def apply_masking(dataset):
#         dataset = dataset.map(
#             custom_masking,
#             batched=True,
#             batch_size=1000,
#             num_proc=4,
#         )
#         # Verify lengths
#         for i in range(min(3, len(dataset))):
#             assert len(dataset[i]["input_ids"]) == len(dataset[i]["labels"]), \
#                 f"Length mismatch in sample {i}"
#         return dataset
    
#     trainer.train_dataset = apply_masking(trainer.train_dataset)
    
#     if trainer.eval_dataset is not None:
#         trainer.eval_dataset = apply_masking(trainer.eval_dataset)
        
#     return trainer

In [15]:
# trainer = train_on_responses_only_custom(trainer)

In [16]:
# # Final verification (optional)
# print("\n=== Final Pre-Train Check ===")
# sample = trainer.train_dataset[0]
# print("First sample labels preview:")
# print("Input IDs length:", len(sample["input_ids"])) 
# print("Labels length:", len(sample["labels"]))
# print("Last 10 labels:", sample["labels"][-10:])  # Should show unmasked assistant tokens

In [17]:
# # Get a sample from the training set
# sample_index = 5  # Try different indices
# sample = trainer.train_dataset[sample_index]

# # Decode the full input context
# print("==== Full Input Context ====")
# print(tokenizer.decode(sample["input_ids"]))
# print("\n")

# # Decode the labels with masking visualization
# print("==== Training Targets (Masked) ====")
# masked_labels = []
# for token_id, label_id in zip(sample["input_ids"], sample["labels"]):
#     if label_id == -100:
#         # Show masked tokens as blank spaces
#         masked_labels.append(" ")
#     else:
#         # Show actual token
#         masked_labels.append(tokenizer.decode([token_id]))

# print("".join(masked_labels))

In [18]:
# batch = trainer.train_dataset.select(range(2)).with_format("torch")[0]
# print(batch["input_ids"])
# print(batch["labels"])


### Memory Stats

In [19]:
trainer.train_dataset

Dataset({
    features: ['conversations', 'text', 'input_ids', 'attention_mask'],
    num_rows: 33128
})

In [20]:
dataset_dict['test'][1]

{'conversations': [{'content': 'You are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **1 - Gambling:** Promotes/facilitates betting, casino, poker, lottery, wagering. Includes online bett

In [21]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090 Ti. Max memory = 23.551 GB.
3.441 GB of memory reserved.


In [22]:
trainer_stats = trainer.train()

# checkpoint_path = "./outputs/checkpoint-700"
# if os.path.exists(checkpoint_path):
#     trainer.train(resume_from_checkpoint=checkpoint_path)
# else:
#     print(f"Warning: Checkpoint {checkpoint_path} not found. Starting from scratch.")

The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: conversations, text. If conversations, text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 33,128 | Num Epochs = 2 | Total steps = 2,100
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 97,255,424/3,000,000,000 (3.24% trained)
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,7.3237,5.390213
20,6.7075,4.619384
30,5.3576,2.803724
40,2.7189,0.92432
50,0.4548,0.048286
60,0.0461,0.025213
70,0.0422,0.018148
80,0.0192,0.011971
90,0.3498,0.011343
100,0.1523,0.010476


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: conversations, text. If conversations, text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 134
  Batch size = 1
Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: conversations, text. If conversations, text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 134
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `PeftMode

KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x746ae6dafe90>> (for post_run_cell), with arguments args (<ExecutionResult object at 746ae6df8490, execution_count=22 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 746ae6df86d0, raw_cell="trainer_stats = trainer.train()

# checkpoint_path.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2B100.77.86.156/home/fishmon/AJ/LLM-Finetuning/Malicious-Web/unsloth_netpro.ipynb#X46sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
# Run this before the automatic evaluation at step 500
trainer.evaluate(eval_dataset=trainer.eval_dataset)

In [None]:
run.finish()

In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Inference

In [1]:
from unsloth import FastLanguageModel
import torch

# 1. Load checkpoint
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./outputs/checkpoint-100",
    max_seq_length = 30000,  # Match training config
    dtype = None,
    load_in_4bit = True,
)

from unsloth.chat_templates import get_chat_template

# 2. Prepare for inference
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",  # Must match training template
)
model = FastLanguageModel.for_inference(model)


# 3. Load prompts
with open('./prompt/labelling_promptv4.txt', 'r') as f:
    system_prompt = f.read()
with open('./prompt/class_3_sample1.txt', 'r') as f:
    sample_text = f.read()

# 4. Format input
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify: {sample_text}"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
    max_length = 30000,  # Prevent OOM
    truncation = True,
).to("cuda")

# 5. Generate
outputs = model.generate(
    inputs,
    max_new_tokens = 512,  # Reduce from 2048 for safety
    temperature = 0.3,  # More deterministic for classification
    top_p = 0.9,
    repetition_penalty = 1.2,
    eos_token_id        = tokenizer.eos_token_id,
    pad_token_id        = tokenizer.pad_token_id,
    use_cache = True,
)
tokenizer.batch_decode(outputs)
# print(tokenizer.decode(outputs[0]))

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm
    PyTorch 2.5.1 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.10 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 1. Max memory: 23.551 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.19 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **

In [5]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Load the system prompt from the file
with open('./prompt/labelling_promptv4.txt', 'r') as system_file:
    system_prompt = system_file.read()

# Load the label from the file
with open('./prompt/class_3_sample2.txt', 'r') as label_file:
    label = label_file.read()

# Define the messages
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n{label}"}
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 2048, use_cache = True,
                         temperature = 0.7, min_p = 0.1)
tokenizer.batch_decode(outputs)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Why is the sky is blue"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Why is the sky is blue"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Why is the sky is blue"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Load the system prompt from the file
with open('./prompt/labelling_promptv4.txt', 'r') as system_file:
    system_prompt = system_file.read()

# Load the label from the file
with open('./prompt/class_3_sample2.txt', 'r') as label_file:
    label = label_file.read()

# Define the messages
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n{label}"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 4096,
                   use_cache = True, temperature = 0.7, min_p = 0.5)

In [None]:
print(messages)