In [2]:
import json
import pandas as pd
from pathlib import Path
from jsonschema import validate, ValidationError
from datasets import Dataset, load_dataset
import os

def mergeCsv(output_file, *input_files):
    """
    Concatenate several CSV files row-wise and write the result to one CSV.

    Input paths that do not exist are reported on stdout and skipped. When
    none of the inputs exist, nothing is written.

    Parameters:
        output_file (str): Destination path of the merged CSV.
        *input_files (str): Paths of the CSV files to merge, in order.

    Returns:
        None
    """
    frames = []
    for path in input_files:
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        frames.append(pd.read_csv(path))

    # Guard clause: without at least one readable input there is nothing to write.
    if not frames:
        print("No valid files to merge.")
        return

    # ignore_index gives the merged frame a fresh 0..n-1 index; the index is not saved.
    pd.concat(frames, ignore_index=True).to_csv(output_file, index=False)
    print(f"Merged CSV saved as: {output_file}")


def countRow(input_file):
    """
    Counts the number of data rows in a CSV file and prints the result.

    Parameters:
        input_file (str): Path to the input CSV file.

    Returns:
        int | None: Number of rows in the CSV file (header excluded), or
        None when the file does not exist.
    """
    if not os.path.exists(input_file):
        print(f"File not found: {input_file}")
        return None

    row_count = len(pd.read_csv(input_file))
    print(f"Number of rows in {input_file}: {row_count}")
    # Return the count as documented so callers can use it, not only read it from stdout.
    return row_count

def checkDuplicate(file_path):
    """
    Report how many rows of a CSV file share their 'Domain' value with another row.

    Every member of a duplicated group is counted (keep=False), not only the
    repeats after the first occurrence.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        None
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    frame = pd.read_csv(file_path)

    # Boolean mask that is True for every row whose Domain occurs more than once.
    repeated_mask = frame.duplicated(subset='Domain', keep=False)
    repeated_rows = frame.loc[repeated_mask]

    if repeated_rows.empty:
        print("No duplicates found based on the 'Domain' column.")
    else:
        print(f"Found {len(repeated_rows)} duplicate rows based on the 'Domain' column:")

def removeDuplicate(file_path, output_file):
    """
    Write a copy of a CSV file in which each 'Domain' value appears only once.

    For every repeated Domain the first row is kept and later rows are dropped.
    If the input file does not exist, a message is printed and nothing is written.

    Parameters:
        file_path (str): Path to the input CSV file.
        output_file (str): Path to save the deduplicated CSV file.

    Returns:
        None
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    unique_rows = pd.read_csv(file_path).drop_duplicates(subset='Domain', keep='first')

    # The index is not written, so the output has the same columns as the input.
    unique_rows.to_csv(output_file, index=False)
    print(f"Duplicates removed. Deduplicated file saved as: {output_file}")


def load_and_print_all_columns(file_path):
    """
    Print the column names of a CSV file as a Python list.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        None
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    column_names = pd.read_csv(file_path).columns.tolist()
    print(column_names)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run the three sanity reports (row count, Domain duplicates, column names) on the merged CSV.
for report in (countRow, checkDuplicate, load_and_print_all_columns):
    report("./dataset/merged_combined_dedup_final.csv")

Number of rows in ./dataset/merged_combined_dedup_final.csv: 96828
Found 4480 duplicate rows based on the 'Domain' column:
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [3]:
import pandas as pd

# CSV whose class distribution is inspected
file_path = "./dataset/merged_combined_dedup_final.csv"

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    df = pd.read_csv(file_path)

    # value_counts() orders the labels from most to least frequent
    label_counts = df['Label'].value_counts()

    for label, count in label_counts.items():
        print(f"Label {label}: {count} occurrences")

Label 0: 79283 occurrences
Label 2: 9230 occurrences
Label 1: 7076 occurrences
Label 3: 1239 occurrences


In [5]:
import pandas as pd

# Sampling configuration, named so the balancing choices are visible and tunable in one place.
BENIGN_SAMPLE_SIZE = 12000       # target number of Label 0 (benign) rows to keep
HARMFUL_OVERSAMPLE_FACTOR = 4    # number of copies of every Label 3 (harmful) row
RANDOM_SEED = 42                 # fixed seed so sampling and shuffling are reproducible

# Load data
df = pd.read_csv("./dataset/merged_combined_dedup_final.csv")

# Undersample Benign. The sample size is capped at the number of available rows,
# because DataFrame.sample raises when n exceeds the population without replacement.
benign_all = df[df["Label"] == 0]
benign = benign_all.sample(
    n=min(BENIGN_SAMPLE_SIZE, len(benign_all)),
    random_state=RANDOM_SEED,
)

# Use all Gambling and Pornography
gambling = df[df["Label"] == 1]
porn = df[df["Label"] == 2]

# Oversample Harmful via plain duplication (this creates exact duplicate rows,
# which is why the Domain duplicate count rises in the balanced file).
harmful = df[df["Label"] == 3]
harmful_oversampled = pd.concat([harmful] * HARMFUL_OVERSAMPLE_FACTOR, ignore_index=True)

# Combine and shuffle
balanced_df = pd.concat([benign, gambling, porn, harmful_oversampled])
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Save
balanced_df.to_csv("./dataset/balanced_dataset.csv", index=False)

In [8]:
# Run the three sanity reports (row count, Domain duplicates, column names) on the balanced CSV.
for report in (countRow, checkDuplicate, load_and_print_all_columns):
    report("./dataset/balanced_dataset.csv")

Number of rows in ./dataset/balanced_dataset.csv: 33262
Found 6624 duplicate rows based on the 'Domain' column:
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [12]:
# CSV whose class distribution is inspected
file_path = "./dataset/balanced_dataset.csv"

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    df = pd.read_csv(file_path)

    # value_counts() orders the labels from most to least frequent
    label_counts = df['Label'].value_counts()

    for label, count in label_counts.items():
        print(f"Label {label}: {count} occurrences")

Label 0: 12000 occurrences
Label 2: 9230 occurrences
Label 1: 7076 occurrences
Label 3: 4956 occurrences


## Data Preparation

In [3]:
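# NOTE: Legacy draft kept for reference only; it is superseded by the active definitions below.
# (This draft reads the lowercase 'classification'/'reason'/'confidence'/'thought' columns.)
# # Define the schema for the classification dictionary (outside the function is cleaner)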
# classification_schema = {
#     "type": "object",
#     "properties": {
#         "answer": {"type": "integer", "minimum": 0, "maximum": 3},
#         "classification": {"type": "string"},
#         "reason": {"type": "string"},
#         "confidence": {"type": "integer", "minimum": 0, "maximum": 100}
#     },
#     "required": ["answer", "classification", "reason", "confidence"]
# }

# def to_sharegpt_with_thought(system, input_suffix, dataset) -> Dataset:
#     """
#     Convert website classification dataset to ShareGPT format including reasoning ('thought'),
#     with JSON validation and enhanced error handling, specifically checking for unhashable types.
#     Returns a Hugging Face Dataset object.
    
#     Args:
#         system (str): System prompt
#         input_suffix (str): Suffix to append to the human message (can be empty if not needed)
#         dataset (pd.DataFrame): Input DataFrame with columns:
#             ['Domain', 'Content', 'Label', 'classification', 'reason', 'confidence', 'thought']
            
#     Returns:
#         datasets.Dataset: Hugging Face Dataset with a 'conversations' column, 
#                           where each row contains a list representing one conversation.
#                           Returns an empty Dataset if input dataset is empty or all rows fail.
#     """
#     if not isinstance(dataset, pd.DataFrame) or dataset.empty:
#         print("Input is not a valid or non-empty DataFrame. Returning empty Dataset.")
#         # Return an empty Dataset with the expected structure
#         return Dataset.from_dict({"conversations": []}) 
        
#     # This list will temporarily hold the conversation lists
#     conversation_data_list = [] 
#     error_count = 0
#     processed_count = 0

#     human_template = f"{input_suffix}\nDomain: {{domain}}, Content: \"{{content}}\""
#     if not input_suffix:
#          human_template = f"Domain: {{domain}}, Content: \"{{content}}\""

#     print(f"Starting conversion for {len(dataset)} rows...")

#     for idx, row in dataset.iterrows():
#         try:
#             # --- Data Extraction and Basic Type Check ---
#             domain = row.get('Domain', 'N/A') 
#             content = str(row['Content']) if pd.notna(row['Content']) else "" 
#             thought_text = str(row['thought']).strip() if pd.notna(row['thought']) else "No thought provided."
            
#             label = row['Label']
#             classification = row['classification']
#             reason = row['reason']
#             confidence = row['confidence']

#             if pd.isna(label) or pd.isna(classification) or pd.isna(reason) or pd.isna(confidence):
#                  raise ValueError("One or more required classification fields are NaN")

#             if isinstance(label, (list, dict)) or \
#                isinstance(classification, (list, dict)) or \
#                isinstance(reason, (list, dict)) or \
#                isinstance(confidence, (list, dict)):
#                  raise TypeError("One or more classification fields contain unhashable list/dict types")

#             human_value = human_template.format(domain=domain, content=content)
            
#             # --- Prepare and Validate Classification Dictionary ---
#             classification_dict = {
#                 "answer": int(label), 
#                 "classification": str(classification),
#                 "reason": str(reason), 
#                 "confidence": int(confidence)
#             }
#             validate(instance=classification_dict, schema=classification_schema) 
            
#             # --- Serialize and Format Output ---
#             final_json_str = json.dumps(classification_dict, ensure_ascii=False, indent=2) 
#             gpt_value = f"<think>\n{thought_text}\n</think>\n```json\n{final_json_str}\n```"
            
#             # This is the list for a single conversation
#             conversation = [
#                 {"from": "system", "value": system},
#                 {"from": "human", "value": human_value},
#                 {"from": "gpt", "value": gpt_value} 
#             ]
            
#             # Append the conversation list to our temporary list
#             conversation_data_list.append(conversation) 
#             processed_count += 1

#         # --- Error Handling ---
#         except (ValidationError, ValueError, TypeError) as e: 
#             error_count += 1
#             # Avoid printing excessive errors if many occur
#             if error_count < 20 or error_count % 100 == 0: 
#                  print(f"Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")
#             continue 
#         except Exception as e: 
#              error_count += 1
#              if error_count < 20 or error_count % 100 == 0:
#                  print(f"UNEXPECTED Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")
#              continue

#     print(f"\nConversion finished.")
#     print(f"Successfully processed: {processed_count} rows")
#     print(f"Errors encountered: {error_count} rows")

#     # --- Convert the list of conversation lists to a Hugging Face Dataset ---
#     if conversation_data_list:
#         # Create the dictionary format expected by from_dict
#         hf_dataset_dict = {"conversations": conversation_data_list} 
#         # Create and return the Dataset object
#         return Dataset.from_dict(hf_dataset_dict)
#     else:
#         print("No valid data processed. Returning empty Dataset.")
#         # Return an empty Dataset with the expected structure
#         return Dataset.from_dict({"conversations": []})
    

# JSON Schema for the classification payload embedded in each assistant reply.
# Each property gets its own rule; every listed property is required.
_classification_properties = {
    "answer": {
        "type": "integer",
        "minimum": 0,
        "maximum": 3,
    },
    "classification": {
        "type": "string",
    },
    "reason": {
        "type": "string",
    },
    "confidence": {
        "type": "integer",
        "minimum": 0,
        "maximum": 100,
    },
}

classification_schema = {
    "type": "object",
    "properties": _classification_properties,
    # All four properties are mandatory, in declaration order.
    "required": list(_classification_properties),
}

def to_sharegpt_with_thought(system, input_suffix, dataset) -> Dataset:
    """
    Convert a website classification DataFrame to ShareGPT-style conversations.

    Each input row becomes one conversation of three messages:
    a system message (``system``), a human message (instruction text plus the
    row's domain and content) and a gpt message holding the row's reasoning
    inside ``<think>`` tags followed by a fenced JSON classification. The
    classification JSON is validated against ``classification_schema`` before
    it is emitted. Rows that fail extraction or validation are skipped and
    counted; they never abort the conversion.

    Args:
        system (str): System prompt placed in every conversation.
        input_suffix (str): Instruction text placed before the domain/content
            line of the human message. It is inserted verbatim, so it may
            contain literal braces. May be empty.
        dataset (pd.DataFrame): Input DataFrame with columns:
            ['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']

    Returns:
        datasets.Dataset: Hugging Face Dataset with a 'conversations' column,
                          where each row contains a list representing one conversation.
                          An empty Dataset is returned when the input is not a
                          non-empty DataFrame, when required columns are missing,
                          or when every row fails.
    """
    # Validate input DataFrame
    if not isinstance(dataset, pd.DataFrame) or dataset.empty:
        print("Input is not a valid or non-empty DataFrame. Returning empty Dataset.")
        return Dataset.from_dict({"conversations": []})

    # Validate required columns
    required_columns = ['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']
    missing = [col for col in required_columns if col not in dataset.columns]
    if missing:
        print(f"Missing required columns: {missing}. Returning empty Dataset.")
        return Dataset.from_dict({"conversations": []})

    conversation_data_list = []
    error_count = 0
    processed_count = 0

    # Instruction prefix of the human message. It is concatenated per row instead of
    # being baked into a str.format template, so braces in input_suffix (e.g. a JSON
    # example) cannot be misread as format fields.
    human_prefix = f"{input_suffix}\n" if input_suffix else ""

    print(f"Starting conversion for {len(dataset)} rows...")

    for idx, row in dataset.iterrows():
        # Reset per row so the error handlers always have a bound value and never
        # report the previous row's domain.
        domain = "N/A"
        try:
            # Extract data from DataFrame row
            domain = str(row['Domain']) if pd.notna(row['Domain']) else "N/A"
            content = str(row['Content']) if pd.notna(row['Content']) else ""
            thought = str(row['Thought']) if pd.notna(row['Thought']) else "No thought provided."

            # Validate classification fields
            label = row['Label']
            classification = row['Classification']
            reason = row['Reason']
            confidence = row['Confidence']
            classification_fields = (label, classification, reason, confidence)

            # Container check comes first: pd.isna() on a list returns an array whose
            # truth value is ambiguous, which would mask this error with a ValueError.
            if any(isinstance(value, (list, dict)) for value in classification_fields):
                raise TypeError("One or more classification fields contain unhashable list/dict types")

            if any(pd.isna(value) for value in classification_fields):
                raise ValueError("One or more required classification fields are NaN")

            # Create human message
            human_value = f"{human_prefix}Domain: {domain}, Content: \"{content}\""

            # Prepare and validate classification dictionary
            classification_dict = {
                "answer": int(label),
                "classification": str(classification),
                "reason": str(reason),
                "confidence": int(confidence)
            }
            validate(instance=classification_dict, schema=classification_schema)

            # Create GPT response with thought and JSON (ensure_ascii=False keeps
            # non-ASCII text readable instead of \u-escaping it)
            final_json_str = json.dumps(classification_dict, ensure_ascii=False, indent=2)
            gpt_value = f"<think>\n{thought}\n</think>\n```json\n{final_json_str}\n```"

            # Create conversation structure
            conversation_data_list.append([
                {"from": "system", "value": system},
                {"from": "human", "value": human_value},
                {"from": "gpt", "value": gpt_value}
            ])
            processed_count += 1

        except (ValidationError, ValueError, TypeError) as e:
            error_count += 1
            # Throttle logging: the first 19 errors, then every 100th
            if error_count < 20 or error_count % 100 == 0:
                print(f"Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")

        except Exception as e:
            # Deliberate best-effort: an unexpected failure skips the row, not the whole run
            error_count += 1
            if error_count < 20 or error_count % 100 == 0:
                print(f"UNEXPECTED Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")

    # Print conversion summary
    print("\nConversion finished.")
    print(f"Successfully processed: {processed_count} rows")
    print(f"Errors encountered: {error_count} rows")

    # Return Dataset (empty when no row survived)
    if not conversation_data_list:
        print("No valid data processed. Returning empty Dataset.")
    return Dataset.from_dict({"conversations": conversation_data_list})

In [4]:
# Inputs: the balanced CSV and the system prompt text file
df = pd.read_csv('./dataset/balanced_dataset.csv')
system_prompt = Path('./prompt/labelling_promptv4.txt').read_text(encoding='utf-8')

# Convert to ShareGPT format with Unicode preservation
dataset = to_sharegpt_with_thought(
    system_prompt,
    "Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n",
    df,
)

# Confirm the returned type and show the Dataset summary
print("\nOutput Type:", type(dataset))
print("Dataset Info:", dataset, sep="\n")

Starting conversion for 33262 rows...

Conversion finished.
Successfully processed: 33262 rows
Errors encountered: 0 rows

Output Type: <class 'datasets.arrow_dataset.Dataset'>
Dataset Info:
Dataset({
    features: ['conversations'],
    num_rows: 33262
})


In [5]:
print(f"Saving {len(dataset)} conversations to JSONL...")

try:
    with open('./dataset/netpro_sharegpt_thought.jsonl', 'w', encoding='utf-8') as f:
        # One JSON object per line; ensure_ascii=False keeps non-ASCII text unescaped
        f.writelines(
            json.dumps({"conversations": record['conversations']}, ensure_ascii=False) + '\n'
            for record in dataset
        )
    print("Successfully saved data to netpro_sharegpt_thought.jsonl")
except Exception as e:
    print(f"Error saving JSONL file: {e}")

Saving 33262 conversations to JSONL...
Successfully saved data to netpro_sharegpt_thought.jsonl


In [6]:
# Re-read the saved file and report every line that is not valid JSON (1-based line numbers).
with open("./dataset/netpro_sharegpt_thought.jsonl", "r", encoding="utf-8") as jsonl_file:
    for line_no, raw_line in enumerate(jsonl_file, start=1):
        try:
            json.loads(raw_line)
        except json.JSONDecodeError as err:
            print(f"Error in line {line_no}: {err}")

### Standardize ShareGPT

In [7]:
from datasets import load_dataset

# Load the ShareGPT JSONL written above with the generic "json" loader.
# No split is requested here, so the result is a DatasetDict holding a single
# 'train' split (see the displayed output of the next cell).
dataset = load_dataset("json", data_files="./dataset/netpro_sharegpt_thought.jsonl")

Generating train split: 33262 examples [00:00, 40941.89 examples/s]


In [8]:
# Display the loaded object to confirm its splits, features and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 33262
    })
})

In [9]:
# Access the 'train' split. Guarded so the cell is safe to re-run: once `dataset`
# is already a plain Dataset, indexing it with "train" would be a column lookup and fail.
if isinstance(dataset, dict):  # DatasetDict is a dict subclass; Dataset is not
    dataset = dataset["train"]

In [10]:
# Confirm the unwrapped object is a plain Dataset and show its summary
print("\nOutput Type:", type(dataset))
print("Dataset Info:", dataset, sep="\n")


Output Type: <class 'datasets.arrow_dataset.Dataset'>
Dataset Info:
Dataset({
    features: ['conversations'],
    num_rows: 33262
})


In [11]:
from unsloth.chat_templates import standardize_sharegpt
# Normalize the conversation records with Unsloth's helper before exporting them.
# NOTE(review): presumably rewrites ShareGPT "from"/"value" messages into
# "role"/"content" form — confirm against the Unsloth documentation.
dataset = standardize_sharegpt(dataset)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.10 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!


Unsloth: Standardizing formats (num_proc=16): 100%|██████████| 33262/33262 [00:01<00:00, 27355.39 examples/s]


### Save ChatML JSONL

In [12]:
print(f"Saving {len(dataset)} standardized conversations to JSONL...")
# Write the standardized dataset as JSONL: one {"conversations": [...]} object per line
try:
    with open('./dataset/netpro_chatml_thought.jsonl', 'w', encoding='utf-8') as out_file:
        for record in dataset:
            serialized = json.dumps({"conversations": record['conversations']}, ensure_ascii=False)
            out_file.write(serialized + '\n')
    print("Successfully saved standardized data to netpro_chatml_thought.jsonl")
except Exception as e:
    print(f"Error saving JSONL file: {e}")

Saving 33262 standardized conversations to JSONL...
Successfully saved standardized data to netpro_chatml_thought.jsonl


### Upload to Hugging Face

In [2]:
from huggingface_hub import HfApi
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment
load_dotenv()

# Read the token once and fail fast when it is missing or empty
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found in .env file")

# Authenticated client used for the upload
api = HfApi(token=hf_token)

# Push the standardized JSONL into the dataset repository under data/
api.upload_file(
    path_or_fileobj="./dataset/netpro_chatml_thought.jsonl",
    path_in_repo="data/netpro_chatml_thought.jsonl",
    repo_id="jordinia/netpro-finetune",
    repo_type="dataset",
    commit_message="Initial dataset upload",
)

netpro_chatml_thought.jsonl: 100%|██████████| 710M/710M [01:22<00:00, 8.62MB/s]   


CommitInfo(commit_url='https://huggingface.co/datasets/jordinia/netpro-finetune/commit/e5a321ee51600a2778b55d5d33d8059d33dad9fb', commit_message='Initial dataset upload', commit_description='', oid='e5a321ee51600a2778b55d5d33d8059d33dad9fb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jordinia/netpro-finetune', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jordinia/netpro-finetune'), pr_revision=None, pr_num=None)