# WildGuardMix Dataset Analysis

This notebook explores the `allenai/wildguardmix` dataset hosted on the Hugging Face Hub.

Dataset: https://huggingface.co/datasets/allenai/wildguardmix

## 1. Load Environment Variables and Import Libraries

In [None]:
# Import libraries
import intention_jailbreak  # This will load environment variables
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

# Apply seaborn's "whitegrid" style and make 12x6 the default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirm the imports succeeded and report the project package version
print("Libraries loaded successfully!")
print(f"intention_jailbreak version: {intention_jailbreak.__version__}")

## 2. Load the WildGuardMix Dataset

In [None]:
# Load the dataset from the Hugging Face Hub.
# WildGuardMix publishes two named configurations ("wildguardtrain" and
# "wildguardtest") and no default, so load_dataset() must be given the
# configuration name explicitly; passing only the repo id fails with a
# "Config name is missing" error.
# NOTE(review): the dataset is gated on the Hub — presumably the access token
# comes from the environment variables loaded by `intention_jailbreak`; confirm.
DATASET_CONFIG = "wildguardtrain"  # use "wildguardtest" for the evaluation set

print("Loading WildGuardMix dataset...")
dataset = load_dataset("allenai/wildguardmix", DATASET_CONFIG)
print("Dataset loaded successfully!")
print(f"\nDataset object: {dataset}")

## 3. Dataset Structure and Basic Information

In [None]:
# Report every split together with its number of examples
print("Dataset Splits:")
for split_name, split_rows in dataset.items():
    split_size = len(split_rows)
    print(f"  {split_name}: {split_size:,} examples")

# Pick the split to explore: 'train' when present, otherwise the first one listed
if 'train' in dataset:
    train_data = dataset['train']
else:
    train_data = dataset[list(dataset)[0]]

# Show the feature schema, then the bare column names
print(f"\nDataset Features/Columns:", train_data.features, sep="\n")

print(f"\nColumn Names: {train_data.column_names}")

## 4. Explore Dataset Features and Sample Data

In [None]:
# Materialise the selected split as a pandas DataFrame for tabular inspection
df = train_data.to_pandas()

# Print the (rows, columns) shape, then let the cell display the leading rows
print(f"DataFrame Shape: {df.shape}", f"\nFirst few rows:", sep="\n")
df.head()

In [None]:
# Summarise the DataFrame: df.info() prints the index range, each column's
# non-null count and dtype, and the frame's memory usage.
print("Data Types and Non-Null Counts:")
df.info()