## Library

In [1]:
import json
import pandas as pd
from pathlib import Path
from jsonschema import validate, ValidationError
from datasets import Dataset, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Data Preparation

In [2]:
# Define the schema for the classification dictionary (outside the function is cleaner)
classification_schema = {
    "type": "object",
    "properties": {
        "answer": {"type": "integer", "minimum": 0, "maximum": 3},
        "classification": {"type": "string"},
        "reason": {"type": "string"},
        "confidence": {"type": "integer", "minimum": 0, "maximum": 100}
    },
    "required": ["answer", "classification", "reason", "confidence"]
}

def to_sharegpt_with_thought(system, input_suffix, dataset) -> Dataset:
    """
    Convert website classification dataset to ShareGPT format including reasoning ('thought'),
    with JSON validation and enhanced error handling, specifically checking for unhashable types.
    Returns a Hugging Face Dataset object.
    
    Args:
        system (str): System prompt
        input_suffix (str): Suffix to append to the human message (can be empty if not needed)
        dataset (pd.DataFrame): Input DataFrame with columns:
            ['Domain', 'Content', 'Label', 'classification', 'reason', 'confidence', 'thought']
            
    Returns:
        datasets.Dataset: Hugging Face Dataset with a 'conversations' column, 
                          where each row contains a list representing one conversation.
                          Returns an empty Dataset if input dataset is empty or all rows fail.
    """
    if not isinstance(dataset, pd.DataFrame) or dataset.empty:
        print("Input is not a valid or non-empty DataFrame. Returning empty Dataset.")
        # Return an empty Dataset with the expected structure
        return Dataset.from_dict({"conversations": []}) 
        
    # This list will temporarily hold the conversation lists
    conversation_data_list = [] 
    error_count = 0
    processed_count = 0

    human_template = f"{input_suffix}\nDomain: {{domain}}, Content: \"{{content}}\""
    if not input_suffix:
         human_template = f"Domain: {{domain}}, Content: \"{{content}}\""

    print(f"Starting conversion for {len(dataset)} rows...")

    for idx, row in dataset.iterrows():
        try:
            # --- Data Extraction and Basic Type Check ---
            domain = row.get('Domain', 'N/A') 
            content = str(row['Content']) if pd.notna(row['Content']) else "" 
            thought_text = str(row['thought']).strip() if pd.notna(row['thought']) else "No thought provided."
            
            label = row['Label']
            classification = row['classification']
            reason = row['reason']
            confidence = row['confidence']

            if pd.isna(label) or pd.isna(classification) or pd.isna(reason) or pd.isna(confidence):
                 raise ValueError("One or more required classification fields are NaN")

            if isinstance(label, (list, dict)) or \
               isinstance(classification, (list, dict)) or \
               isinstance(reason, (list, dict)) or \
               isinstance(confidence, (list, dict)):
                 raise TypeError("One or more classification fields contain unhashable list/dict types")

            human_value = human_template.format(domain=domain, content=content)
            
            # --- Prepare and Validate Classification Dictionary ---
            classification_dict = {
                "answer": int(label), 
                "classification": str(classification),
                "reason": str(reason), 
                "confidence": int(confidence)
            }
            validate(instance=classification_dict, schema=classification_schema) 
            
            # --- Serialize and Format Output ---
            final_json_str = json.dumps(classification_dict, ensure_ascii=False, indent=2) 
            gpt_value = f"<think>\n{thought_text}\n</think>\n```json\n{final_json_str}\n```"
            
            # This is the list for a single conversation
            conversation = [
                {"from": "system", "value": system},
                {"from": "human", "value": human_value},
                {"from": "gpt", "value": gpt_value} 
            ]
            
            # Append the conversation list to our temporary list
            conversation_data_list.append(conversation) 
            processed_count += 1

        # --- Error Handling ---
        except (ValidationError, ValueError, TypeError) as e: 
            error_count += 1
            # Avoid printing excessive errors if many occur
            if error_count < 20 or error_count % 100 == 0: 
                 print(f"Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")
            continue 
        except Exception as e: 
             error_count += 1
             if error_count < 20 or error_count % 100 == 0:
                 print(f"UNEXPECTED Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")
             continue

    print(f"\nConversion finished.")
    print(f"Successfully processed: {processed_count} rows")
    print(f"Errors encountered: {error_count} rows")

    # --- Convert the list of conversation lists to a Hugging Face Dataset ---
    if conversation_data_list:
        # Create the dictionary format expected by from_dict
        hf_dataset_dict = {"conversations": conversation_data_list} 
        # Create and return the Dataset object
        return Dataset.from_dict(hf_dataset_dict)
    else:
        print("No valid data processed. Returning empty Dataset.")
        # Return an empty Dataset with the expected structure
        return Dataset.from_dict({"conversations": []})

In [3]:
df = pd.read_csv('./dataset/harmful.csv')
with open('./prompt/labelling_promptv4.txt', 'r', encoding='utf-8') as f:
    system_prompt = f.read()

# Convert to ShareGPT format with Unicode preservation
dataset = to_sharegpt_with_thought(
    system=system_prompt,
    input_suffix="Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n",
    dataset=df
)

# Now you can check the type and print the Dataset info
print("\nOutput Type:", type(dataset))
print("Dataset Info:")
print(dataset)

Starting conversion for 776 rows...

Conversion finished.
Successfully processed: 776 rows
Errors encountered: 0 rows

Output Type: <class 'datasets.arrow_dataset.Dataset'>
Dataset Info:
Dataset({
    features: ['conversations'],
    num_rows: 776
})


### Save ShareGPT JSONL

In [4]:
print(f"Saving {len(dataset)} conversations to JSONL...")

try:
    with open('./dataset/harmful_sharegpt_thought3.jsonl', 'w', encoding='utf-8') as f:
        for item in dataset:
            json_line = {"conversations": item['conversations']}
            f.write(json.dumps(json_line, ensure_ascii=False) + '\n')
    print("Successfully saved data to harmful_sharegpt_thought3.jsonl")
except Exception as e:
    print(f"Error saving JSONL file: {e}")

Saving 776 conversations to JSONL...
Successfully saved data to harmful_sharegpt_thought3.jsonl


In [5]:
import json

with open("./dataset/harmful_sharegpt_thought3.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error in line {i}: {e}")

### Standardize ShareGPT

In [6]:
from datasets import load_dataset

# Load the dataset from JSONL
dataset = load_dataset("json", data_files="./dataset/harmful_sharegpt_thought3.jsonl")
dataset = dataset["train"]  # Access the 'train' split

Generating train split: 776 examples [00:00, 31602.26 examples/s]


In [7]:
print("\nOutput Type:", type(dataset))
print("Dataset Info:")
print(dataset)


Output Type: <class 'datasets.arrow_dataset.Dataset'>
Dataset Info:
Dataset({
    features: ['conversations'],
    num_rows: 776
})


In [8]:
from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(dataset)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.10 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!


Unsloth: Standardizing formats (num_proc=16): 100%|██████████| 776/776 [00:00<00:00, 5226.35 examples/s]


### Save ChatML JSONL

In [9]:
print(f"Saving {len(dataset)} standardized conversations to JSONL...")
# Save standardized dataset in JSONL format
try:
    with open('./dataset/harmful_chatml_thought.jsonl', 'w', encoding='utf-8') as f:
        for item in dataset:
            # Wrap each conversation in a dictionary with the key "conversations"
            json_line = {"conversations": item['conversations']}
            f.write(json.dumps(json_line, ensure_ascii=False) + '\n')
    print("Successfully saved standardized data to standardized_sharegpt.jsonl")
except Exception as e:
    print(f"Error saving JSONL file: {e}")

Saving 776 standardized conversations to JSONL...
Successfully saved standardized data to standardized_sharegpt.jsonl


## Load Model

In [10]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 30000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 1. Max memory: 23.551 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [11]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [12]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

In [13]:
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map: 100%|██████████| 776/776 [00:00<00:00, 6669.06 examples/s]


In [14]:
dataset[5]["conversations"]

[{'content': 'You are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **1 - Gambling:** Promotes/facilitates betting, casino, poker, lottery, wagering. Includes online betting platforms, cas

In [15]:
dataset[5]["text"]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **1

## Training

In [24]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(
        tokenizer = tokenizer,
        padding = True,                # ← Critical change
        pad_to_multiple_of = 8,        # ← Aligns with GPU memory
        max_length = max_seq_length,
    ),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 776/776 [00:03<00:00, 242.91 examples/s]


### Train on Completions Only

In [25]:
def train_on_responses_only_custom(trainer):
    """
    Custom version for website classification that:
    1. Preserves system prompt context
    2. Only trains on assistant responses (JSON + reasoning)
    3. Handles length consistency for batching
    """
    tokenizer = trainer.tokenizer
    
    # Manually define token sequences for your template
    SYSTEM_TOKENS = tokenizer.encode(
        "<|start_header_id|>system<|end_header_id|>\n\n", 
        add_special_tokens=False
    )
    USER_TOKENS = tokenizer.encode(
        "<|start_header_id|>user<|end_header_id|>\n\n", 
        add_special_tokens=False
    )
    ASSISTANT_TOKENS = tokenizer.encode(
        "<|start_header_id|>assistant<|end_header_id|>\n\n", 
        add_special_tokens=False
    )

    def custom_masking(examples):
        input_ids = examples["input_ids"]
        
        # Create labels if they don't exist
        labels = examples.get("labels", [ids.copy() for ids in input_ids])
        new_labels = []
        
        for seq_id, seq_labels in zip(input_ids, labels):
            n = len(seq_id)
            mask = [-100] * n
            i = 0
            
            while i < n:
                # Check for system prompt
                if seq_id[i:i+len(SYSTEM_TOKENS)] == SYSTEM_TOKENS:
                    i += len(SYSTEM_TOKENS)
                    continue
                    
                # Check for user prompt
                if seq_id[i:i+len(USER_TOKENS)] == USER_TOKENS:
                    i += len(USER_TOKENS)
                    continue
                    
                # Check for assistant prompt
                if seq_id[i:i+len(ASSISTANT_TOKENS)] == ASSISTANT_TOKENS:
                    start = i
                    i += len(ASSISTANT_TOKENS)
                    
                    # Find end of assistant response
                    while i < n:
                        if seq_id[i] == tokenizer.eos_token_id:
                            end = i
                            break
                        i += 1
                    else:
                        end = n
                    
                    # Unmask assistant response
                    mask[start:end] = seq_labels[start:end]
                    break
                    
                i += 1
            
            # Enforce length matching (critical fix)
            if len(mask) != len(seq_id):
                mask = mask[:len(seq_id)] + [-100] * (len(seq_id) - len(mask))
            
            new_labels.append(mask)
        
        return {"labels": new_labels}

    # Apply to datasets with length verification
    def apply_masking(dataset):
        dataset = dataset.map(
            custom_masking,
            batched=True,
            batch_size=1000,
            num_proc=4,
        )
        # Verify lengths
        for i in range(min(3, len(dataset))):
            assert len(dataset[i]["input_ids"]) == len(dataset[i]["labels"]), \
                f"Length mismatch in sample {i}"
        return dataset
    
    trainer.train_dataset = apply_masking(trainer.train_dataset)
    
    if trainer.eval_dataset is not None:
        trainer.eval_dataset = apply_masking(trainer.eval_dataset)
        
    return trainer

In [26]:
trainer = train_on_responses_only_custom(trainer)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Map (num_proc=4): 100%|██████████| 776/776 [00:01<00:00, 408.20 examples/s]


In [27]:
# Final verification (optional)
print("\n=== Final Pre-Train Check ===")
sample = trainer.train_dataset[0]
print("First sample labels preview:")
print("Input IDs length:", len(sample["input_ids"])) 
print("Labels length:", len(sample["labels"]))
print("Last 10 labels:", sample["labels"][-10:])  # Should show unmasked assistant tokens


=== Final Pre-Train Check ===
First sample labels preview:
Input IDs length: 6714
Labels length: 6714
Last 10 labels: [220, 330, 83029, 794, 220, 5313, 198, 534, 74694, 128009]


In [28]:
# Get a sample from the training set
sample_index = 5  # Try different indices
sample = trainer.train_dataset[sample_index]

# Decode the full input context
print("==== Full Input Context ====")
print(tokenizer.decode(sample["input_ids"]))
print("\n")

# Decode the labels with masking visualization
print("==== Training Targets (Masked) ====")
masked_labels = []
for token_id, label_id in zip(sample["input_ids"], sample["labels"]):
    if label_id == -100:
        # Show masked tokens as blank spaces
        masked_labels.append(" ")
    else:
        # Show actual token
        masked_labels.append(tokenizer.decode([token_id]))

print("".join(masked_labels))

==== Full Input Context ====
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

You are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.

### **Categories & Definitions:**

-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.
    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, med

In [29]:
batch = trainer.train_dataset.select(range(2)).with_format("torch")[0]
print(batch["input_ids"])
print(batch["labels"])


tensor([128000, 128000, 128006,  ...,    534,  74694, 128009])
tensor([  -100,   -100,   -100,  ...,    534,  74694, 128009])


In [30]:
trainer.train_dataset

Dataset({
    features: ['conversations', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 776
})

### Memory Stats

In [31]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090 Ti. Max memory = 23.551 GB.
3.441 GB of memory reserved.


In [32]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 776 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856/3,000,000,000 (0.81% trained)


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [27]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1099.2794 seconds used for training.
18.32 minutes used for training.
Peak reserved memory = 5.771 GB.
Peak reserved memory for training = 2.33 GB.
Peak reserved memory % of max memory = 24.504 %.
Peak reserved memory for training % of max memory = 9.893 %.


In [36]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Load the system prompt from the file
with open('./prompt/labelling_promptv4.txt', 'r') as system_file:
    system_prompt = system_file.read()

# Load the label from the file
with open('./prompt/class_3_sample1.txt', 'r') as label_file:
    label = label_file.read()

# Define the messages
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n{label}"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 800,
                   use_cache = True, temperature = 0.4, min_p = 0.5)

Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.
\nDomain: jitutogel.net, Content: \"Whatsapp Telegram Livechat DAFTAR LOGIN BERANDA SLOT GAMES LIVE CASINO POKER ONLINE E-SPORTS ARCADE LOTTERY PROMOTION Jackpot Gaming Playstar TTG Slots Spadegaming RedTiger GMW CQ9 Gaming Live Gaming Evolution Gaming Sexy Gaming SAgaming HOgaming Gameplay OpusGaming Sports Gaming CMD368 SBOBET UBOBET TFGaming Ultraplay SabaEsports JituTogel: Serunya Game Online dengan Hadiah Uang Nyata! Penyedia Games Metode Pembayaran Bank Cimb Niaga BCA Danamon Permata Mandiri BNI BRI Panin Pulsa XL Axiata Tri Telkomsel Axis E-Money QRIS Dana OVO LinkAja Gopay ©2024 jitutogel. All rights reserved | 18+ LIVECHAT\""urls"urlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurlsurl

In [32]:
print(messages)

[{'role': 'system', 'content': 'You are an Expert Website Classifier tasked with categorizing websites (using provided Domain & Content) into distinct categories: **0 - Benign**, **1 - Gambling**, **2 - Pornography**, or **3 - Harmful**. Output strict JSON including classification, reason, and a point-based confidence score (0-100). This aids a sophisticated website prediction system for digital safety.\n\n### **Categories & Definitions:**\n\n-   **0 - Benign:** General info, news, entertainment, services, e-commerce, educational sites, blogs, informational pages, and general entertainment sites. No gambling/porn themes.\n    -   **Includes:** Educational discussion of sensitive topics (drugs, gambling, adult, illegal), suggestive content (dating, lingerie) *without* explicit material, drugs for educational or news purposes (e.g., addiction recovery, medical cannabis research).\n\n-   **1 - Gambling:** Promotes/facilitates betting, casino, poker, lottery, wagering. Includes online bett