In [None]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm import tqdm
from unsloth import FastLanguageModel
from peft import PeftModel
import torch

# --- Evaluation settings (keep in sync with the training run) ---

# Model under test: the base checkpoint and the fine-tuned adapter directory
base_model_name = "unsloth/Qwen3-0.6B-Base"
checkpoint_path = "./model/NetPro-Qwen3-0.6B-ClfDC"
NUM_CLASSES = 4  # class count used during training (per the original note)

# Options forwarded to FastLanguageModel.from_pretrained
max_seq_length_at_load_time = 24000
dtype_at_load_time = None  # presumably lets the loader pick the dtype -- confirm in Unsloth docs
load_in_4bit_at_load_time = False

# Evaluation input and the two result files
val_filename = "dataset/test_balanced.csv"
output_csv_filename = "./classifications_ClfDC/output_0,6B_ClfDC.csv"
output_report_filename = "./classifications_ClfDC/report_0,6B_ClfDC.txt"

# --- 1. Load the unmodified base model and its tokenizer ---
print(f"Loading base model: {base_model_name}...")
base_model_load_kwargs = {
    "model_name": base_model_name,
    "max_seq_length": max_seq_length_at_load_time,
    "dtype": dtype_at_load_time,
    "load_in_4bit": load_in_4bit_at_load_time,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_load_kwargs)
print("Base model loaded.")

# --- 2. Shrink lm_head to the digit tokens (mirrors the training-time change) ---
print("Modifying lm_head to match training setup...")

# First token id of each digit string "0" .. str(NUM_CLASSES). The range is
# inclusive, so NUM_CLASSES + 1 ids are collected.
number_token_ids = [
    tokenizer.encode(str(i), add_special_tokens=False)[0]
    for i in range(NUM_CLASSES + 1)
]

# Remember the full-vocabulary shape before the weight is swapped out;
# old_size is needed later to rebuild a full-size head.
old_shape = model.lm_head.weight.shape
old_size = old_shape[0]

# Keep only the rows of lm_head that correspond to the digit tokens
par = torch.nn.Parameter(model.lm_head.weight[number_token_ids, :])
print(par.shape)
print(old_shape)

model.lm_head.weight = par

# Original tokenizer id -> row index in the trimmed lm_head
reverse_map = dict(zip(number_token_ids, range(len(number_token_ids))))

# --- 3. Attach the fine-tuned LoRA adapter ---
# The head was shrunk above so that its shape agrees with what the checkpoint
# stores (per the original author's note, this avoids a size mismatch on load).
print(f"Loading LoRA adapter from: {checkpoint_path}")
model = PeftModel.from_pretrained(model, checkpoint_path, is_trainable=False)
print("LoRA adapter loaded successfully.")

# --- Expand lm_head back to the full vocabulary ---
# Snapshot the trimmed head; fall back to an all-zero bias when the head has none.
trimmed_lm_head = model.lm_head.weight.data.clone()
existing_bias = getattr(model.lm_head, "bias", None)
if existing_bias is None:
    trimmed_lm_head_bias = torch.zeros(len(number_token_ids), device=trimmed_lm_head.device)
else:
    trimmed_lm_head_bias = existing_bias.data.clone()

# Full-size buffers: zero weights and a -1000.0 bias everywhere, so every
# non-digit token ends up with a constant logit of -1000.
hidden_dim = trimmed_lm_head.shape[1]
new_lm_head = torch.zeros(
    (old_size, hidden_dim),
    dtype=trimmed_lm_head.dtype,
    device=trimmed_lm_head.device,
)
new_lm_head_bias = torch.full(
    (old_size,),
    -1000.0,
    dtype=trimmed_lm_head_bias.dtype,
    device=trimmed_lm_head_bias.device,
)

# Drop each trimmed row (and its bias) back at its original token id
new_lm_head[number_token_ids] = trimmed_lm_head
new_lm_head_bias[number_token_ids] = trimmed_lm_head_bias

# Swap a fresh Linear carrying these values in as the "default" saved module
with torch.no_grad():
    new_lm_head_module = torch.nn.Linear(hidden_dim, old_size, bias=True, device=model.device)
    new_lm_head_module.weight.data.copy_(new_lm_head)
    new_lm_head_module.bias.data.copy_(new_lm_head_bias)
    model.lm_head.modules_to_save["default"] = new_lm_head_module

print(f"Remade lm_head: shape = {model.lm_head.weight.shape}. Allowed tokens: {number_token_ids}")

# --- 4. Switch the model to Unsloth's inference mode ---
FastLanguageModel.for_inference(model)
print("Model prepared for inference.")

# --- 5. Prompt template ---
# The two "{}" placeholders are filled positionally by the prediction loop
# below: first the domain, then the page content. The text stops right after
# "class " so that the single token generated next is the predicted class digit.
# NOTE(review): presumably this must stay identical to the training prompt,
# including trailing spaces -- confirm against the training script before editing.

prompt_template = """You are an expert Website Classifier.

Domain: "{}" 
Website Content: "{}" 

Classify the website based on its content into one of the following categories:
- 0: Benign (general info, news, safe entertainment, educational, marketplace, social media, etc.)
- 1: Gambling (betting, casino, lottery, real money games, judi, slot)
- 2: Pornography (explicit sexual content, adult themes, nudity, sexual, bokep)
- 3: Harmful (malware, cybercrime, illegal activities, firearms, extremism, drugs, narcotics, phishing, scams, counterfeit, hacking tools, stolen data markets, carding)

SOLUTION
The correct answer is: class """

# --- 6. Run the classifier over the evaluation set ---
# Reads the CSV named by val_filename; the loop uses its 'Domain' and
# 'Content' columns to fill the prompt.
val_df = pd.read_csv(val_filename, encoding="utf-8")

# One integer per row of val_df, in row order; -1 marks an unparsable output
predicted_labels = []

for _, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Predicting"):
    domain = row['Domain']
    content = row['Content']
    full_prompt_for_inference = prompt_template.format(domain, content)
    inputs = tokenizer(full_prompt_for_inference, return_tensors="pt").to(model.device)

    # Generate exactly one token. do_sample=False makes the decoding explicitly
    # greedy, so repeated evaluation runs yield the same predictions no matter
    # what sampling defaults the checkpoint's generation config carries.
    outputs = model.generate(
        **inputs,
        max_new_tokens=1,
        do_sample=False,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt; keep only what was appended
    input_length = inputs.input_ids.shape[1]
    newly_generated_tokens = outputs[0][input_length:]
    predicted_class_token = tokenizer.decode(newly_generated_tokens, skip_special_tokens=True)

    # int() raises ValueError on empty or non-numeric text; anything else is a
    # real error and should surface instead of being recorded as class -1.
    try:
        predicted_class_int = int(predicted_class_token.strip())
    except ValueError:
        predicted_class_int = -1
    predicted_labels.append(predicted_class_int)

# --- 7. Persist predictions, then score them ---
from pathlib import Path  # imported here so this section brings its own dependency

# Make sure both output directories exist; otherwise the writes below would
# fail only after the whole (slow) inference pass has already finished.
for _output_path in (output_csv_filename, output_report_filename):
    Path(_output_path).parent.mkdir(parents=True, exist_ok=True)

# Write the per-row predictions first: they are the expensive artifact and
# should survive even if metric computation or report writing fails.
val_df['predicted_label'] = predicted_labels
val_df.to_csv(output_csv_filename, index=False)

# Ground truth comes from the 'Label' column of the evaluation CSV
true_labels = val_df['Label']

# Support-weighted averages; zero_division=0 scores undefined cases as 0
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)
# Same zero_division policy as the scalar metrics above, so labels that occur
# only in the predictions (e.g. the -1 fallback) do not trigger warnings.
report = classification_report(true_labels, predicted_labels, zero_division=0)

# Headline metrics are formatted once and reused for console and file output
summary_lines = [
    "Evaluation Metrics:",
    f"Accuracy : {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall   : {recall:.4f}",
    f"F1 Score : {f1:.4f}",
]

print("\n".join(summary_lines))
print("\nDetailed classification report:\n")
print(report)

# Save the same metrics to the text report
with open(output_report_filename, "w", encoding="utf-8") as f:
    f.write("\n".join(summary_lines) + "\n\n")
    f.write("Detailed classification report:\n")
    f.write(report)

print(f"\nClassification complete. Output saved to '{output_csv_filename}' and '{output_report_filename}'")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Loading base model: unsloth/Qwen3-0.6B-Base...


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.4.8: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 24.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Base model loaded.
Modifying lm_head to match training setup...
torch.Size([5, 1024])
torch.Size([151936, 1024])
Loading LoRA adapter from: ./model/NetPro-Qwen3-0.6B-ClfDC
LoRA adapter loaded successfully.
Remade lm_head: shape = torch.Size([151936, 1024]). Allowed tokens: [15, 16, 17, 18, 19]
Model prepared for inference.


Predicting: 100%|██████████| 992/992 [01:17<00:00, 12.73it/s]


Evaluation Metrics:
Accuracy : 0.9224
Precision: 0.9249
Recall   : 0.9224
F1 Score : 0.9207

Detailed classification report:

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       248
           1       0.97      0.98      0.97       248
           2       0.89      0.98      0.94       248
           3       0.95      0.78      0.86       248

    accuracy                           0.92       992
   macro avg       0.92      0.92      0.92       992
weighted avg       0.92      0.92      0.92       992


Classification complete. Output saved to './classifications_ClfDC/output_0,6B_ClfDC3.csv' and './classifications_ClfDC/report_0,6B_ClfDC3.txt'


In [1]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm import tqdm
from unsloth import FastLanguageModel
from peft import PeftModel
import torch

# --- Evaluation settings (keep in sync with the training run) ---

# Model under test: the base checkpoint and the fine-tuned adapter directory
base_model_name = "unsloth/Qwen3-1.7B-Base"
checkpoint_path = "./model/NetPro-Qwen3-1.7B-ClfDC"
NUM_CLASSES = 4  # class count used during training (per the original note)

# Options forwarded to FastLanguageModel.from_pretrained
max_seq_length_at_load_time = 24000
dtype_at_load_time = None  # presumably lets the loader pick the dtype -- confirm in Unsloth docs
load_in_4bit_at_load_time = False

# Evaluation input and the two result files
val_filename = "dataset/test_balanced.csv"
output_csv_filename = "./classifications_ClfDC/output_1,7B_ClfDC.csv"
output_report_filename = "./classifications_ClfDC/report_1,7B_ClfDC.txt"

# --- 1. Load the unmodified base model and its tokenizer ---
print(f"Loading base model: {base_model_name}...")
base_model_load_kwargs = {
    "model_name": base_model_name,
    "max_seq_length": max_seq_length_at_load_time,
    "dtype": dtype_at_load_time,
    "load_in_4bit": load_in_4bit_at_load_time,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_load_kwargs)
print("Base model loaded.")

# --- 2. Shrink lm_head to the digit tokens (mirrors the training-time change) ---
print("Modifying lm_head to match training setup...")

# First token id of each digit string "0" .. str(NUM_CLASSES). The range is
# inclusive, so NUM_CLASSES + 1 ids are collected.
number_token_ids = [
    tokenizer.encode(str(i), add_special_tokens=False)[0]
    for i in range(NUM_CLASSES + 1)
]

# Remember the full-vocabulary shape before the weight is swapped out;
# old_size is needed later to rebuild a full-size head.
old_shape = model.lm_head.weight.shape
old_size = old_shape[0]

# Keep only the rows of lm_head that correspond to the digit tokens
par = torch.nn.Parameter(model.lm_head.weight[number_token_ids, :])
print(par.shape)
print(old_shape)

model.lm_head.weight = par

# Original tokenizer id -> row index in the trimmed lm_head
reverse_map = dict(zip(number_token_ids, range(len(number_token_ids))))

# --- 3. Attach the fine-tuned LoRA adapter ---
# The head was shrunk above so that its shape agrees with what the checkpoint
# stores (per the original author's note, this avoids a size mismatch on load).
print(f"Loading LoRA adapter from: {checkpoint_path}")
model = PeftModel.from_pretrained(model, checkpoint_path, is_trainable=False)
print("LoRA adapter loaded successfully.")

# --- Expand lm_head back to the full vocabulary ---
# Snapshot the trimmed head; fall back to an all-zero bias when the head has none.
trimmed_lm_head = model.lm_head.weight.data.clone()
existing_bias = getattr(model.lm_head, "bias", None)
if existing_bias is None:
    trimmed_lm_head_bias = torch.zeros(len(number_token_ids), device=trimmed_lm_head.device)
else:
    trimmed_lm_head_bias = existing_bias.data.clone()

# Full-size buffers: zero weights and a -1000.0 bias everywhere, so every
# non-digit token ends up with a constant logit of -1000.
hidden_dim = trimmed_lm_head.shape[1]
new_lm_head = torch.zeros(
    (old_size, hidden_dim),
    dtype=trimmed_lm_head.dtype,
    device=trimmed_lm_head.device,
)
new_lm_head_bias = torch.full(
    (old_size,),
    -1000.0,
    dtype=trimmed_lm_head_bias.dtype,
    device=trimmed_lm_head_bias.device,
)

# Drop each trimmed row (and its bias) back at its original token id
new_lm_head[number_token_ids] = trimmed_lm_head
new_lm_head_bias[number_token_ids] = trimmed_lm_head_bias

# Swap a fresh Linear carrying these values in as the "default" saved module
with torch.no_grad():
    new_lm_head_module = torch.nn.Linear(hidden_dim, old_size, bias=True, device=model.device)
    new_lm_head_module.weight.data.copy_(new_lm_head)
    new_lm_head_module.bias.data.copy_(new_lm_head_bias)
    model.lm_head.modules_to_save["default"] = new_lm_head_module

print(f"Remade lm_head: shape = {model.lm_head.weight.shape}. Allowed tokens: {number_token_ids}")

# --- 4. Switch the model to Unsloth's inference mode ---
FastLanguageModel.for_inference(model)
print("Model prepared for inference.")

# --- 5. Prompt template ---
# The two "{}" placeholders are filled positionally by the prediction loop
# below: first the domain, then the page content. The text stops right after
# "class " so that the single token generated next is the predicted class digit.
# NOTE(review): presumably this must stay identical to the training prompt,
# including trailing spaces -- confirm against the training script before editing.

prompt_template = """You are an expert Website Classifier.

Domain: "{}" 
Website Content: "{}" 

Classify the website based on its content into one of the following categories:
- 0: Benign (general info, news, safe entertainment, educational, marketplace, social media, etc.)
- 1: Gambling (betting, casino, lottery, real money games, judi, slot)
- 2: Pornography (explicit sexual content, adult themes, nudity, sexual, bokep)
- 3: Harmful (malware, cybercrime, illegal activities, firearms, extremism, drugs, narcotics, phishing, scams, counterfeit, hacking tools, stolen data markets, carding)

SOLUTION
The correct answer is: class """

# --- 6. Run the classifier over the evaluation set ---
# Reads the CSV named by val_filename; the loop uses its 'Domain' and
# 'Content' columns to fill the prompt.
val_df = pd.read_csv(val_filename, encoding="utf-8")

# One integer per row of val_df, in row order; -1 marks an unparsable output
predicted_labels = []

for _, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Predicting"):
    domain = row['Domain']
    content = row['Content']
    full_prompt_for_inference = prompt_template.format(domain, content)
    inputs = tokenizer(full_prompt_for_inference, return_tensors="pt").to(model.device)

    # Generate exactly one token. do_sample=False makes the decoding explicitly
    # greedy, so repeated evaluation runs yield the same predictions no matter
    # what sampling defaults the checkpoint's generation config carries.
    outputs = model.generate(
        **inputs,
        max_new_tokens=1,
        do_sample=False,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt; keep only what was appended
    input_length = inputs.input_ids.shape[1]
    newly_generated_tokens = outputs[0][input_length:]
    predicted_class_token = tokenizer.decode(newly_generated_tokens, skip_special_tokens=True)

    # int() raises ValueError on empty or non-numeric text; anything else is a
    # real error and should surface instead of being recorded as class -1.
    try:
        predicted_class_int = int(predicted_class_token.strip())
    except ValueError:
        predicted_class_int = -1
    predicted_labels.append(predicted_class_int)

# --- 7. Persist predictions, then score them ---
from pathlib import Path  # imported here so this section brings its own dependency

# Make sure both output directories exist; otherwise the writes below would
# fail only after the whole (slow) inference pass has already finished.
for _output_path in (output_csv_filename, output_report_filename):
    Path(_output_path).parent.mkdir(parents=True, exist_ok=True)

# Write the per-row predictions first: they are the expensive artifact and
# should survive even if metric computation or report writing fails.
val_df['predicted_label'] = predicted_labels
val_df.to_csv(output_csv_filename, index=False)

# Ground truth comes from the 'Label' column of the evaluation CSV
true_labels = val_df['Label']

# Support-weighted averages; zero_division=0 scores undefined cases as 0
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)
# Same zero_division policy as the scalar metrics above, so labels that occur
# only in the predictions (e.g. the -1 fallback) do not trigger warnings.
report = classification_report(true_labels, predicted_labels, zero_division=0)

# Headline metrics are formatted once and reused for console and file output
summary_lines = [
    "Evaluation Metrics:",
    f"Accuracy : {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall   : {recall:.4f}",
    f"F1 Score : {f1:.4f}",
]

print("\n".join(summary_lines))
print("\nDetailed classification report:\n")
print(report)

# Save the same metrics to the text report
with open(output_report_filename, "w", encoding="utf-8") as f:
    f.write("\n".join(summary_lines) + "\n\n")
    f.write("Detailed classification report:\n")
    f.write(report)

print(f"\nClassification complete. Output saved to '{output_csv_filename}' and '{output_report_filename}'")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Loading base model: unsloth/Qwen3-1.7B-Base...


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.4.8: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 24.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Base model loaded.
Modifying lm_head to match training setup...
torch.Size([5, 2048])
torch.Size([151936, 2048])
Loading LoRA adapter from: ./model/NetPro-Qwen3-1.7B-ClfDC
LoRA adapter loaded successfully.
Remade lm_head: shape = torch.Size([151936, 2048]). Allowed tokens: [15, 16, 17, 18, 19]
Model prepared for inference.


Predicting: 100%|██████████| 992/992 [02:04<00:00,  7.98it/s]

Evaluation Metrics:
Accuracy : 0.9294
Precision: 0.9312
Recall   : 0.9294
F1 Score : 0.9280

Detailed classification report:

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       248
           1       0.95      0.98      0.96       248
           2       0.92      0.99      0.96       248
           3       0.96      0.80      0.87       248

    accuracy                           0.93       992
   macro avg       0.93      0.93      0.93       992
weighted avg       0.93      0.93      0.93       992


Classification complete. Output saved to './classifications_ClfDC/output_1,7B_ClfDC.csv' and './classifications_ClfDC/report_1,7B_ClfDC.txt'



