In [1]:
import json
import pandas as pd
from pathlib import Path
from jsonschema import validate, ValidationError
from datasets import Dataset, load_dataset
import os

def mergeCsv(output_file, *input_files):
    """
    Merges multiple CSV files into a single CSV file.

    Parameters:
        output_file (str): The name of the output CSV file.
        *input_files (str): Paths to the input CSV files to be merged.
    """
    # List to store DataFrames
    dataframes = []

    # Read each CSV file and append to the list
    for file in input_files:
        if os.path.exists(file):
            df = pd.read_csv(file)
            dataframes.append(df)
        else:
            print(f"File not found: {file}")

    # Concatenate all DataFrames
    if dataframes:
        merged_df = pd.concat(dataframes, ignore_index=True)
        # Save the merged DataFrame to the output file
        merged_df.to_csv(output_file, index=False)
        print(f"Merged CSV saved as: {output_file}")
    else:
        print("No valid files to merge.")


def countRow(input_file):
    """
    Counts the number of rows in a CSV file.

    Parameters:
        input_file (str): Path to the input CSV file.

    Returns:
        int: Number of rows in the CSV file.
    """
    if os.path.exists(input_file):
        df = pd.read_csv(input_file)
        row_count = len(df)
        print(f"Number of rows in {input_file}: {row_count}")
    else:
        print(f"File not found: {input_file}")

def checkDuplicate(file_path):
    """
    Checks for duplicate rows in a CSV file based on the 'Domain' column.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        None
    """
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)

        # Check for duplicates based on the 'Domain' column
        duplicates = df[df.duplicated(subset='Domain', keep=False)]

        # Print the duplicates if any
        if not duplicates.empty:
            print(f"Found {len(duplicates)} duplicate rows based on the 'Domain' column:")
        else:
            print("No duplicates found based on the 'Domain' column.")
    else:
        print(f"File not found: {file_path}")

def removeDuplicate(file_path, output_file):
    """
    Removes duplicate rows in a CSV file based on the 'Domain' column, keeping the first occurrence.

    Parameters:
        file_path (str): Path to the input CSV file.
        output_file (str): Path to save the deduplicated CSV file.

    Returns:
        None
    """
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)

        # Remove duplicates based on the 'Domain' column, keeping the first occurrence
        deduplicated_df = df.drop_duplicates(subset='Domain', keep='first')

        # Save the deduplicated DataFrame to a new file
        deduplicated_df.to_csv(output_file, index=False)
        print(f"Duplicates removed. Deduplicated file saved as: {output_file}")
    else:
        print(f"File not found: {file_path}")


def load_and_print_all_columns(file_path):
    """
    Load a CSV file and print all columns.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        None
    """
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        print(df.columns.tolist())
    else:
        print(f"File not found: {file_path}")


In [2]:
countRow("./dataset/merged_combined_dedup_final.csv")
checkDuplicate("./dataset/merged_combined_dedup_final.csv")
load_and_print_all_columns("./dataset/merged_combined_dedup_final.csv")

Number of rows in ./dataset/merged_combined_dedup_final.csv: 96926
Found 4491 duplicate rows based on the 'Domain' column:
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [3]:
countRow("./dataset/test.csv")
checkDuplicate("./dataset/test.csv")
load_and_print_all_columns("./dataset/test.csv")

Number of rows in ./dataset/test.csv: 19367
Found 324 duplicate rows based on the 'Domain' column:
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [4]:
countRow("./dataset/train.csv")
checkDuplicate("./dataset/train.csv")
load_and_print_all_columns("./dataset/train.csv")

Number of rows in ./dataset/train.csv: 77461
Found 3449 duplicate rows based on the 'Domain' column:
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [5]:
import pandas as pd

# File path
file_path = "./dataset/train.csv"

# Check if the file exists
if os.path.exists(file_path):
    # Load the CSV file
    df = pd.read_csv(file_path)
    
    # Count occurrences of each unique value in the 'Label' column
    label_counts = df['Label'].value_counts()
    
    # Print the occurrences
    for label, count in label_counts.items():
        print(f"Label {label}: {count} occurrences")
else:
    print(f"File not found: {file_path}")

Label 0: 63426 occurrences
Label 2: 7384 occurrences
Label 1: 5660 occurrences
Label 3: 991 occurrences


In [9]:
import pandas as pd

# File path
file_path = "./dataset/test.csv"

# Check if the file exists
if os.path.exists(file_path):
    # Load the CSV file
    df = pd.read_csv(file_path)
    
    # Count occurrences of each unique value in the 'Label' column
    label_counts = df['Label'].value_counts()
    
    # Print the occurrences
    for label, count in label_counts.items():
        print(f"Label {label}: {count} occurrences")
else:
    print(f"File not found: {file_path}")

Label 0: 15857 occurrences
Label 2: 1846 occurrences
Label 1: 1416 occurrences
Label 3: 248 occurrences


In [7]:
import pandas as pd
from sklearn.utils import shuffle

# Load dataset
df = pd.read_csv("./dataset/train.csv")

# Define target counts
target_counts = {0: 8000, 1: 5660, 2: 7384, 3: 3964}

# Process each label
processed_dfs = []

# Label 0 (Undersample)
df_label_0 = df[df['Label'] == 0].sample(target_counts[0], random_state=42)
processed_dfs.append(df_label_0)

# Label 1 (Keep original)
df_label_1 = df[df['Label'] == 1]
processed_dfs.append(df_label_1)

# Label 2 (Keep original)
df_label_2 = df[df['Label'] == 2]
processed_dfs.append(df_label_2)

# Label 3 (Oversample)
df_label_3 = df[df['Label'] == 3]
if not df_label_3.empty:
    num_original = len(df_label_3)
    factor = target_counts[3] // num_original
    remainder = target_counts[3] % num_original
    oversampled = pd.concat([df_label_3] * factor + [df_label_3.sample(remainder)])
    processed_dfs.append(oversampled)

# Combine and shuffle
df_balanced = shuffle(pd.concat(processed_dfs), random_state=42)

# Save to CSV
df_balanced.to_csv("train_balanced.csv", index=False)

In [8]:
countRow("./dataset/train_balanced.csv")
checkDuplicate("./dataset/train_balanced.csv")
load_and_print_all_columns("./dataset/train_balanced.csv")

Number of rows in ./dataset/train_balanced.csv: 25008
Found 5187 duplicate rows based on the 'Domain' column:
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [5]:
# File path
file_path = "./dataset/balanced_dataset.csv"

# Check if the file exists
if os.path.exists(file_path):
    # Load the CSV file
    df = pd.read_csv(file_path)
    
    # Count occurrences of each unique value in the 'Label' column
    label_counts = df['Label'].value_counts()
    
    # Print the occurrences
    for label, count in label_counts.items():
        print(f"Label {label}: {count} occurrences")
else:
    print(f"File not found: {file_path}")

Label 0: 2500 occurrences
Label 2: 1800 occurrences
Label 1: 1800 occurrences
Label 3: 1205 occurrences


In [10]:
import pandas as pd

# Load the deduplicated dataset
file_path = "./dataset/test.csv"
df = pd.read_csv(file_path)

# Configure the number of samples to take from each label
label_sample_sizes = {
    0: 18,  # Benign
    1: 18,  # Gambling
    2: 18,  # Pornography
    3: 6   # Harmful
}

# Process each label
validation_dataframes = []
remaining_dataframes = []

for label, sample_size in label_sample_sizes.items():
    label_df = df[df["Label"] == label]
    
    # Take 10 samples for validation
    validation_sample = label_df.sample(n=sample_size, random_state=42)
    validation_dataframes.append(validation_sample)
    
    # Remove the sampled rows from the original dataset
    remaining_data = label_df.drop(validation_sample.index)
    remaining_dataframes.append(remaining_data)

# Combine validation samples and save
validation_df = pd.concat(validation_dataframes).reset_index(drop=True)
validation_df.to_csv("./dataset/netpro_7k_val.csv", index=False)
print("Validation dataset saved as './dataset/netpro_7k_val.csv'")

# # Combine remaining data and save back to the original file
# remaining_df = pd.concat(remaining_dataframes).reset_index(drop=True)
# remaining_df.to_csv("./dataset/netpro_7k_train.csv", index=False)
# print("Updated dataset saved as './dataset/netpro_7k_train.csv'")

Validation dataset saved as './dataset/netpro_7k_val.csv'


In [7]:
countRow("./dataset/netpro_7k_train.csv")
checkDuplicate("./dataset/netpro_7k_train.csv")
load_and_print_all_columns("./dataset/netpro_7k_train.csv")
countRow("./dataset/netpro_7k_val.csv")
checkDuplicate("./dataset/netpro_7k_val.csv")
load_and_print_all_columns("./dataset/netpro_7k_val.csv")

Number of rows in ./dataset/netpro_7k_train.csv: 7245
No duplicates found based on the 'Domain' column.
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']
Number of rows in ./dataset/netpro_7k_val.csv: 60
No duplicates found based on the 'Domain' column.
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


## Data Preparation

In [11]:
# # Define the schema for the classification dictionary (outside the function is cleaner)
# classification_schema = {
#     "type": "object",
#     "properties": {
#         "answer": {"type": "integer", "minimum": 0, "maximum": 3},
#         "classification": {"type": "string"},
#         "reason": {"type": "string"},
#         "confidence": {"type": "integer", "minimum": 0, "maximum": 100}
#     },
#     "required": ["answer", "classification", "reason", "confidence"]
# }

# def to_sharegpt_with_thought(system, input_suffix, dataset) -> Dataset:
#     """
#     Convert website classification dataset to ShareGPT format including reasoning ('thought'),
#     with JSON validation and enhanced error handling, specifically checking for unhashable types.
#     Returns a Hugging Face Dataset object.
    
#     Args:
#         system (str): System prompt
#         input_suffix (str): Suffix to append to the human message (can be empty if not needed)
#         dataset (pd.DataFrame): Input DataFrame with columns:
#             ['Domain', 'Content', 'Label', 'classification', 'reason', 'confidence', 'thought']
            
#     Returns:
#         datasets.Dataset: Hugging Face Dataset with a 'conversations' column, 
#                           where each row contains a list representing one conversation.
#                           Returns an empty Dataset if input dataset is empty or all rows fail.
#     """
#     if not isinstance(dataset, pd.DataFrame) or dataset.empty:
#         print("Input is not a valid or non-empty DataFrame. Returning empty Dataset.")
#         # Return an empty Dataset with the expected structure
#         return Dataset.from_dict({"conversations": []}) 
        
#     # This list will temporarily hold the conversation lists
#     conversation_data_list = [] 
#     error_count = 0
#     processed_count = 0

#     human_template = f"{input_suffix}\nDomain: {{domain}}, Content: \"{{content}}\""
#     if not input_suffix:
#          human_template = f"Domain: {{domain}}, Content: \"{{content}}\""

#     print(f"Starting conversion for {len(dataset)} rows...")

#     for idx, row in dataset.iterrows():
#         try:
#             # --- Data Extraction and Basic Type Check ---
#             domain = row.get('Domain', 'N/A') 
#             content = str(row['Content']) if pd.notna(row['Content']) else "" 
#             thought_text = str(row['thought']).strip() if pd.notna(row['thought']) else "No thought provided."
            
#             label = row['Label']
#             classification = row['classification']
#             reason = row['reason']
#             confidence = row['confidence']

#             if pd.isna(label) or pd.isna(classification) or pd.isna(reason) or pd.isna(confidence):
#                  raise ValueError("One or more required classification fields are NaN")

#             if isinstance(label, (list, dict)) or \
#                isinstance(classification, (list, dict)) or \
#                isinstance(reason, (list, dict)) or \
#                isinstance(confidence, (list, dict)):
#                  raise TypeError("One or more classification fields contain unhashable list/dict types")

#             human_value = human_template.format(domain=domain, content=content)
            
#             # --- Prepare and Validate Classification Dictionary ---
#             classification_dict = {
#                 "answer": int(label), 
#                 "classification": str(classification),
#                 "reason": str(reason), 
#                 "confidence": int(confidence)
#             }
#             validate(instance=classification_dict, schema=classification_schema) 
            
#             # --- Serialize and Format Output ---
#             final_json_str = json.dumps(classification_dict, ensure_ascii=False, indent=2) 
#             gpt_value = f"<think>\n{thought_text}\n</think>\n```json\n{final_json_str}\n```"
            
#             # This is the list for a single conversation
#             conversation = [
#                 {"from": "system", "value": system},
#                 {"from": "human", "value": human_value},
#                 {"from": "gpt", "value": gpt_value} 
#             ]
            
#             # Append the conversation list to our temporary list
#             conversation_data_list.append(conversation) 
#             processed_count += 1

#         # --- Error Handling ---
#         except (ValidationError, ValueError, TypeError) as e: 
#             error_count += 1
#             # Avoid printing excessive errors if many occur
#             if error_count < 20 or error_count % 100 == 0: 
#                  print(f"Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")
#             continue 
#         except Exception as e: 
#              error_count += 1
#              if error_count < 20 or error_count % 100 == 0:
#                  print(f"UNEXPECTED Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")
#              continue

#     print(f"\nConversion finished.")
#     print(f"Successfully processed: {processed_count} rows")
#     print(f"Errors encountered: {error_count} rows")

#     # --- Convert the list of conversation lists to a Hugging Face Dataset ---
#     if conversation_data_list:
#         # Create the dictionary format expected by from_dict
#         hf_dataset_dict = {"conversations": conversation_data_list} 
#         # Create and return the Dataset object
#         return Dataset.from_dict(hf_dataset_dict)
#     else:
#         print("No valid data processed. Returning empty Dataset.")
#         # Return an empty Dataset with the expected structure
#         return Dataset.from_dict({"conversations": []})
    

# Define the schema for the classification dictionary
classification_schema = {
    "type": "object",
    "properties": {
        "answer": {"type": "integer", "minimum": 0, "maximum": 3},
        "classification": {"type": "string"},
        "reason": {"type": "string"},
        "confidence": {"type": "integer", "minimum": 0, "maximum": 100}
    },
    "required": ["answer", "classification", "reason", "confidence"]
}

def to_sharegpt_with_thought(system, input_suffix, dataset) -> Dataset:
    """
    Convert website classification dataset to ShareGPT format including reasoning ('Thought'),
    with JSON validation and enhanced error handling, specifically checking for unhashable types.
    Returns a Hugging Face Dataset object.
    
    Args:
        system (str): System prompt
        input_suffix (str): Suffix to append to the human message (can be empty if not needed)
        dataset (pd.DataFrame): Input DataFrame with columns:
            ['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']
            
    Returns:
        datasets.Dataset: Hugging Face Dataset with a 'conversations' column, 
                          where each row contains a list representing one conversation
    """
    # Validate input DataFrame
    if not isinstance(dataset, pd.DataFrame) or dataset.empty:
        print("Input is not a valid or non-empty DataFrame. Returning empty Dataset.")
        return Dataset.from_dict({"conversations": []}) 
        
    # Validate required columns
    required_columns = ['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']
    if not all(col in dataset.columns for col in required_columns):
        missing = [col for col in required_columns if col not in dataset.columns]
        print(f"Missing required columns: {missing}. Returning empty Dataset.")
        return Dataset.from_dict({"conversations": []})

    conversation_data_list = [] 
    error_count = 0
    processed_count = 0
    
    # Template for human message
    human_template = f"{input_suffix}\nDomain: {{domain}}, Content: \"{{content}}\""
    if not input_suffix:
        human_template = f"Domain: {{domain}}, Content: \"{{content}}\""
    
    print(f"Starting conversion for {len(dataset)} rows...")
    
    for idx, row in dataset.iterrows():
        try:
            # Extract data from DataFrame row
            domain = str(row['Domain']) if pd.notna(row['Domain']) else "N/A"
            content = str(row['Content']) if pd.notna(row['Content']) else ""
            thought = str(row['Thought']) if pd.notna(row['Thought']) else "No thought provided."
            
            # Validate classification fields
            label = row['Label']
            classification = row['Classification']
            reason = row['Reason']
            confidence = row['Confidence']
            
            if pd.isna(label) or pd.isna(classification) or pd.isna(reason) or pd.isna(confidence):
                raise ValueError("One or more required classification fields are NaN")
                
            if isinstance(label, (list, dict)) or isinstance(classification, (list, dict)) or \
               isinstance(reason, (list, dict)) or isinstance(confidence, (list, dict)):
                raise TypeError("One or more classification fields contain unhashable list/dict types")
            
            # Create human message
            human_value = human_template.format(domain=domain, content=content)
            
            # Prepare and validate classification dictionary
            classification_dict = {
                "answer": int(label),
                "classification": str(classification),
                "reason": str(reason),
                "confidence": int(confidence)
            }
            validate(instance=classification_dict, schema=classification_schema)
            
            # Create GPT response with thought and JSON
            final_json_str = json.dumps(classification_dict, ensure_ascii=False, indent=2)
            gpt_value = f"<think>\n{thought}\n</think>\n```json\n{final_json_str}\n```"
            # Create conversation structure
            conversation = [
                {"from": "system", "value": system},
                {"from": "human", "value": human_value},
                {"from": "gpt", "value": gpt_value}
            ]
            
            conversation_data_list.append(conversation)
            processed_count += 1
            
        except (ValidationError, ValueError, TypeError) as e:
            error_count += 1
            if error_count < 20 or error_count % 100 == 0:
                print(f"Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")
            continue
            
        except Exception as e:
            error_count += 1
            if error_count < 20 or error_count % 100 == 0:
                print(f"UNEXPECTED Error processing row {idx} (Domain: {domain}): {type(e).__name__} - {str(e)}")
            continue
    
    # Print conversion summary
    print(f"\nConversion finished.")
    print(f"Successfully processed: {processed_count} rows")
    print(f"Errors encountered: {error_count} rows")
    
    # Return Dataset
    if conversation_data_list:
        return Dataset.from_dict({"conversations": conversation_data_list})
    else:
        print("No valid data processed. Returning empty Dataset.")
        return Dataset.from_dict({"conversations": []})

In [12]:
with open('./prompt/labelling_promptv4.txt', 'r', encoding='utf-8') as f:
    system_prompt = f.read()
dftrain         = pd.read_csv('./dataset/netpro_25k_train.csv')
dfvalidation    = pd.read_csv('./dataset/netpro_25k_val.csv')

# Convert to ShareGPT format with Unicode preservation
train_dataset = to_sharegpt_with_thought(
    system=system_prompt,
    input_suffix="Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n",
    dataset=dftrain
)
validation_dataset = to_sharegpt_with_thought(
    system=system_prompt,
    input_suffix="Classify the given URL as 0 (benign), 1 (gambling), 2 (pornography), or 3 (harmful). Output MUST be JSON.\n",
    dataset=dfvalidation
)

# Now you can check the type and print the Dataset info
print("\nOutput Type:", type(train_dataset))
print("Dataset Info:")
print(train_dataset)
print("\nOutput Type:", type(validation_dataset))
print("Dataset Info:")
print(validation_dataset)
# Save the datasets to disk
# train_dataset.save_to_disk('./dataset/train_dataset')
# validation_dataset.save_to_disk('./dataset/validation_dataset')

Starting conversion for 25008 rows...

Conversion finished.
Successfully processed: 25008 rows
Errors encountered: 0 rows
Starting conversion for 60 rows...

Conversion finished.
Successfully processed: 60 rows
Errors encountered: 0 rows

Output Type: <class 'datasets.arrow_dataset.Dataset'>
Dataset Info:
Dataset({
    features: ['conversations'],
    num_rows: 25008
})

Output Type: <class 'datasets.arrow_dataset.Dataset'>
Dataset Info:
Dataset({
    features: ['conversations'],
    num_rows: 60
})


In [None]:
print(f"Saving {len(train_dataset)} conversations to JSONL...")

try:
    with open('./dataset/netpro_25k_sharegpt_thought_train.jsonl', 'w', encoding='utf-8') as f:
        for item in train_dataset:
            json_line = {"conversations": item['conversations']}
            f.write(json.dumps(json_line, ensure_ascii=False) + '\n')
    print("Successfully saved data to netpro_25k_sharegpt_thought_train.jsonl")
except Exception as e:
    print(f"Error saving JSONL file: {e}")

Saving 25008 conversations to JSONL...
Successfully saved data to netpro_7k_sharegpt_thought_train.jsonl


In [14]:
with open("./dataset/netpro_25k_sharegpt_thought_train.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error in line {i}: {e}")

In [15]:
print(f"Saving {len(validation_dataset)} conversations to JSONL...")

try:
    with open('./dataset/netpro_25k_sharegpt_thought_val.jsonl', 'w', encoding='utf-8') as f:
        for item in validation_dataset:
            json_line = {"conversations": item['conversations']}
            f.write(json.dumps(json_line, ensure_ascii=False) + '\n')
    print("Successfully saved data to netpro_25k_sharegpt_thought_val.jsonl")
except Exception as e:
    print(f"Error saving JSONL file: {e}")

Saving 60 conversations to JSONL...
Successfully saved data to netpro_25k_sharegpt_thought_val.jsonl


In [16]:
with open("./dataset/netpro_25k_sharegpt_thought_val.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error in line {i}: {e}")

### Standardize ShareGPT

In [18]:
from datasets import load_dataset

# Load the dataset from JSONL
train_dataset = load_dataset("json", data_files="./dataset/netpro_25k_sharegpt_thought_train.jsonl")
validation_dataset = load_dataset("json", data_files="./dataset/netpro_25k_sharegpt_thought_val.jsonl")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [19]:
validation_dataset

DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 60
    })
})

In [20]:
train_dataset = train_dataset["train"] 
validation_dataset = validation_dataset["train"]

In [21]:
from unsloth.chat_templates import standardize_sharegpt
train_dataset = standardize_sharegpt(train_dataset, "conversations")
validation_dataset = standardize_sharegpt(validation_dataset, "conversations")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Unsloth: Standardizing formats (num_proc=32):   0%|          | 0/25008 [00:00<?, ? examples/s]

Unsloth: Standardizing formats (num_proc=32):   0%|          | 0/60 [00:00<?, ? examples/s]

### Save ChatML JSONL

In [22]:
print(f"Saving {len(train_dataset)} standardized conversations to JSONL...")
# Save standardized dataset in JSONL format
try:
    with open('./dataset/netpro_25k_chatml_thought_train2.jsonl', 'w', encoding='utf-8') as f:
        for item in train_dataset:
            # Wrap each conversation in a dictionary with the key "conversations"
            json_line = {"conversations": item['conversations']}
            f.write(json.dumps(json_line, ensure_ascii=False) + '\n')
    print("Successfully saved standardized data to netpro_25k_chatml_thought_train2.jsonl")
except Exception as e:
    print(f"Error saving JSONL file: {e}")

Saving 25008 standardized conversations to JSONL...
Successfully saved standardized data to netpro_25k_chatml_thought_train2.jsonl


In [23]:
print(f"Saving {len(validation_dataset)} standardized conversations to JSONL...")
# Save standardized dataset in JSONL format
try:
    with open('./dataset/netpro_25k_chatml_thought_val.jsonl', 'w', encoding='utf-8') as f:
        for item in validation_dataset:
            # Wrap each conversation in a dictionary with the key "conversations"
            json_line = {"conversations": item['conversations']}
            f.write(json.dumps(json_line, ensure_ascii=False) + '\n')
    print("Successfully saved standardized data to netpro_25k_chatml_thought_val.jsonl")
except Exception as e:
    print(f"Error saving JSONL file: {e}")

Saving 60 standardized conversations to JSONL...
Successfully saved standardized data to netpro_25k_chatml_thought_val.jsonl


### Upload To Huggingface

In [24]:
from datasets import DatasetDict
from huggingface_hub import HfApi
from datasets import Dataset
import os

# Combine train and validation datasets into a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset
})
dataset_dict.save_to_disk("./dataset/chatml_7k")


Saving the dataset (0/2 shards):   0%|          | 0/25008 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/60 [00:00<?, ? examples/s]

In [25]:
# Push the DatasetDict to the Hugging Face Hub
from datasets import load_from_disk
from dotenv import load_dotenv

load_dotenv()  # Loads from .env file automatically

if not os.getenv("HF_TOKEN"):
    raise ValueError("HF_TOKEN not found in .env file")
hf_token = os.getenv("HF_TOKEN")

# Reload the dataset to ensure it's in the correct format
dataset_dict = load_from_disk("./dataset/chatml_thought_25k")

# Push to the Hugging Face Hub
dataset_dict.push_to_hub("jordinia/netpro-finetune", config_name="chatml_thought_25k")

print("Dataset successfully pushed to the Hugging Face Hub with train and validation splits.")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Dataset successfully pushed to the Hugging Face Hub with train and validation splits.


In [27]:
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import HfApi
from dotenv import load_dotenv
import os

# Step 1: Load environment variables and token
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found in .env file")

# Step 2: Load the raw CSVs into datasets
train_dataset = Dataset.from_csv("./dataset/train.csv")
val_dataset = Dataset.from_csv("./dataset/test.csv")

# Step 3: Create a DatasetDict with train and validation splits
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
})

# Step 4: Push to the Hub with config name "raw-7k"
dataset_dict.push_to_hub("jordinia/netpro-finetune", config_name="full", token=hf_token)

print("Successfully pushed raw-7k config with train and validation splits.")


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/39 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/39 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/a6/84/a6841de6caf821ca512f175ac20a94a238de85992149a4a1e9c31dc35b707563/6086db9d5d94a8531728633a7e81eafc3c12c1dbf6ffd073add1d25168e78404?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T161735Z&X-Amz-Expires=86400&X-Amz-Signature=b56d697e2d3bfdb75ecfcb2bb172192d5261e3377c8795ec352917ad1875abd0&X-Amz-SignedHeaders=host&partNumber=1&uploadId=FAQuc.F.RKlwC9kJnnK2HOw30z88hEnasfDbOMlESDnfCzTe0sXFR7Z7b4q5FesMNpm2tNuwged.T1ezfWnaV.ocUbNAUGsAPCd95dyuezeqkXDCvEtadJvRLVUHR7Ee&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2427)')))"), '(Request ID: a22e2905-7739-407d-acb7-841f662baabf)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/a6/84/a

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Successfully pushed raw-7k config with train and validation splits.


In [None]:
from huggingface_hub import HfApi
import os

# 1. Load environment variables
load_dotenv()  # Loads from .env file automatically

# 2. Verify token loading
if not os.getenv("HF_TOKEN"):
    raise ValueError("HF_TOKEN not found in .env file")

# 3. Initialize and upload
api = HfApi(token=os.getenv("HF_TOKEN"))

api.upload_file(
    path_or_fileobj="./dataset/netpro_chatml_thought.jsonl",
    path_in_repo="data/netpro_chatml_thought.jsonl",
    repo_id="jordinia/netpro-finetune",
    repo_type="dataset",
    commit_message="Initial dataset upload"
)

netpro_chatml_thought.jsonl: 100%|██████████| 710M/710M [01:22<00:00, 8.62MB/s]   


CommitInfo(commit_url='https://huggingface.co/datasets/jordinia/netpro-finetune/commit/e5a321ee51600a2778b55d5d33d8059d33dad9fb', commit_message='Initial dataset upload', commit_description='', oid='e5a321ee51600a2778b55d5d33d8059d33dad9fb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jordinia/netpro-finetune', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jordinia/netpro-finetune'), pr_revision=None, pr_num=None)

In [36]:
# dataset = load_dataset("json", data_files="./dataset/netpro_chatml_thought.jsonl")

dataset = load_dataset("jordinia/netpro-finetune", split = "train")

Generating train split: 100%|██████████| 33262/33262 [00:01<00:00, 19689.44 examples/s]


In [37]:
dataset.save_to_disk("./dataset/chatml_thought_33k")

Saving the dataset (2/2 shards): 100%|██████████| 33262/33262 [00:00<00:00, 75387.68 examples/s]


In [2]:
from datasets import load_from_disk
dataset = load_from_disk("./dataset/chatml_thought_33k")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset 

Dataset({
    features: ['conversations'],
    num_rows: 33262
})

In [4]:
dataset_dict = dataset.train_test_split(test_size=0.004)

In [5]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 33128
    })
    test: Dataset({
        features: ['conversations'],
        num_rows: 134
    })
})

In [7]:
from dotenv import load_dotenv
import os

load_dotenv()  # Loads from .env file automatically

if not os.getenv("HF_TOKEN"):
    raise ValueError("HF_TOKEN not found in .env file")
hf_token = os.getenv("HF_TOKEN")


# Step 4: Push to the Hub with config name "raw-7k"
dataset_dict.push_to_hub("jordinia/netpro-finetune", config_name="chatml_thought_33k", token=hf_token)

print("Successfully pushed raw-7k config with train and validation splits.")

Creating parquet from Arrow format: 100%|██████████| 17/17 [00:00<00:00, 31.86ba/s]
Creating parquet from Arrow format: 100%|██████████| 17/17 [00:00<00:00, 34.30ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [00:30<00:00, 15.36s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 265.26ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.55s/it]


Successfully pushed raw-7k config with train and validation splits.


In [2]:
import json

# Load the trainer state
with open("outputs/checkpoint-500/trainer_state.json", "r") as f:
    logs = json.load(f)

# Print organized logs
print(f"\n{'Step':<8} {'Train Loss':<12} {'grad_norm':<12} {'Learning Rate':<15}")
print("-" * 50)
for entry in logs["log_history"]:
    step = entry.get("step", "N/A")
    train_loss = entry.get("loss", "N/A")
    eval_loss = entry.get("grad_norm", "N/A")
    lr = entry.get("learning_rate", "N/A")
    
    print(f"{step:<8} {str(train_loss)[:6]:<12} {str(eval_loss)[:6]:<12} {str(lr)[:8]:<15}")


Step     Train Loss   grad_norm    Learning Rate  
--------------------------------------------------
1        0.8794       66.098       0.0            
2        0.1284       13.109       1e-05          
3        0.1405       14.897       2e-05          
4        0.0963       19.853       3e-05          
5        0.1028       8.9562       4e-05          
6        0.0356       4.6695       5e-05          
7        0.0134       0.9691       6e-05          
8        0.1371       34.132       7e-05          
9        2.0593       63.384       8e-05          
10       0.0004       0.0          9e-05          
11       2.2151       61.359       0.0001         
12       0.0007       0.0615       9.999999       
13       -0.002       0.0264       9.999997       
14       0.0035       1.2783       9.999994       
15       2.8499       69.349       9.999989       
16       -0.000       0.3443       9.999984       
17       1.1129       246.83       9.999977       
18       0.0023       0.0694  