In [3]:
import json
import pandas as pd

In [2]:
def to_sharegpt(system, input_suffix, dataset):
    """
    Build ShareGPT-style conversations from a website classification DataFrame.

    Each row becomes one three-turn conversation (system, human, gpt). The gpt
    turn is a JSON string serialized with ``ensure_ascii=False``, so non-ASCII
    characters are written as-is instead of as ASCII escape sequences.

    Args:
        system (str): System prompt used as the first turn of every conversation.
        input_suffix (str): Instruction text placed at the start of the human
            turn, followed by a newline and the row's domain and content.
        dataset (pd.DataFrame): Frame providing the columns
            'Domain', 'Content', 'Label', 'classification', 'reason', 'confidence'.

    Returns:
        list: One conversation per row, in row order. Each conversation is a
        list of ``{"from": ..., "value": ...}`` dicts.
    """

    def _row_to_conversation(record):
        # Human turn: instruction line, then the domain and the quoted content.
        prompt = '{}\nDomain: {}, Content: "{}"'.format(
            input_suffix, record['Domain'], record['Content']
        )
        # Label and confidence are cast to plain ints before serialization.
        answer = {
            "answer": int(record['Label']),
            "classification": record['classification'],
            "reason": record['reason'],
            "confidence": int(record['confidence']),
        }
        return [
            {"from": "system", "value": system},
            {"from": "human", "value": prompt},
            {"from": "gpt", "value": json.dumps(answer, ensure_ascii=False)},
        ]

    return [_row_to_conversation(record) for _, record in dataset.iterrows()]

In [None]:
# Load the labelled samples and the system prompt text from disk.
df = pd.read_csv('./dataset/harmful.csv')
with open('./prompt/labelling_promptv4.txt', 'r', encoding='utf-8') as f:
    system_prompt = f.read()

# Convert to ShareGPT format with Unicode preservation.
# Pass the prompt text read above (the variable), not the literal string
# "system_prompt", so every conversation carries the real system prompt.
formatted_data = to_sharegpt(
    system=system_prompt,
    input_suffix="Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n",
    dataset=df
)

# Save with UTF-8 encoding and Unicode preservation
with open('harmful_sharegpt.json', 'w', encoding='utf-8') as f:
    json.dump(formatted_data, f, ensure_ascii=False, indent=2)

In [9]:
from unsloth.chat_templates import standardize_sharegpt

Unsloth: Patching Xformers to fix some performance issues.


RuntimeError: Failed to find C compiler. Please specify via CC environment variable.

In [10]:
from unsloth import FastLanguageModel

RuntimeError: Failed to find C compiler. Please specify via CC environment variable.