# Dataset Preparation for Finetuning

This notebook handles:
- Loading raw data
- Formatting for different tasks (SFT, DPO, Persona, Story)
- Train/validation splitting
- Tokenization and saving

In [None]:
import os
import json
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---------------------------------------------------------
# Every value a reader might want to tune lives in this cell.

# Model whose tokenizer is used further down (or Qwen, Llama, etc.)
BASE_MODEL = "deepseek-ai/DeepSeek-V3-Base"

# Tokenization / splitting parameters
MAX_LENGTH = 2048          # maximum sequence length, in tokens
VALIDATION_SPLIT = 0.1     # fraction of samples held out for validation

# Input and output locations (relative to the notebook's working directory)
RAW_DATA_PATH = "../data/raw/conversations.jsonl"
OUTPUT_DIR = "../data/processed"

# Ensure the output directory exists before any later cell writes to it
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Load and Explore Raw Data

In [None]:
# Load raw data (JSON Lines: one JSON object per line)
data = []
with open(RAW_DATA_PATH, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        # Blank lines (e.g. a trailing newline at the end of the file) are not
        # valid JSON and would make json.loads raise, so skip them.
        if not line.strip():
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as exc:
            # Re-raise with the file position so a bad record is easy to find.
            # JSONDecodeError is itself a ValueError subclass.
            raise ValueError(
                f"{RAW_DATA_PATH}:{line_no}: invalid JSON ({exc})"
            ) from exc

df = pd.DataFrame(data)
print(f"Loaded {len(df)} samples")
print(f"Columns: {df.columns.tolist()}")
df.head()

In [None]:
# Character lengths of each text column, computed once and reused below
prompt_chars = df['prompt'].str.len()
response_chars = df['response'].str.len()

# Summary numbers
print("\nData Statistics:")
print(f"Total samples: {len(df)}")
print(f"Average prompt length: {prompt_chars.mean():.0f} chars")
print(f"Average response length: {response_chars.mean():.0f} chars")

# Side-by-side histograms of the two length distributions
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (('Prompt', prompt_chars), ('Response', response_chars))
for ax, (label, char_counts) in zip(axes, panels):
    ax.hist(char_counts, bins=50)
    ax.set_xlabel(f'{label} Length (chars)')
    ax.set_ylabel('Count')
    ax.set_title(f'{label} Length Distribution')

fig.tight_layout()
plt.show()

## Format Data for SFT (Supervised Fine-Tuning)

In [None]:
def format_for_sft(
    examples,
    system_prompt="You are a helpful AI assistant for the Elio/Pixar community.",
):
    """
    Format prompt/response pairs for supervised fine-tuning.

    Each pair is rendered into a ChatML-style conversation made of a system
    turn, a user turn and an assistant turn, delimited by ``<|im_start|>`` and
    ``<|im_end|>`` markers.

    Args:
        examples: Mapping with two equally long sequences under the keys
            ``'prompt'`` and ``'response'``.
        system_prompt: Text placed in the system turn of every sample.

    Returns:
        dict: ``{'text': [...]}`` with one formatted string per input pair,
        in input order.

    Raises:
        ValueError: If ``'prompt'`` and ``'response'`` differ in length.
    """
    prompts = examples['prompt']
    responses = examples['response']

    # Pairs are matched by position, so a length mismatch means the inputs are
    # misaligned; fail loudly instead of silently dropping or mispairing rows.
    if len(prompts) != len(responses):
        raise ValueError(
            f"'prompt' has {len(prompts)} items but 'response' has {len(responses)}"
        )

    formatted = []
    for prompt, response in zip(prompts, responses):
        # The assistant header is followed directly by the response; no other
        # characters may sit between them, or they end up in the training text.
        text = (
            f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
            f"<|im_start|>user\n{prompt}<|im_end|>\n"
            f"<|im_start|>assistant\n{response}<|im_end|>"
        )
        formatted.append(text)

    return {'text': formatted}

# Work on a copy so the raw frame `df` stays untouched
df_sft = df.copy()

# Render every (prompt, response) row into a single chat-formatted string
sft_inputs = {column: df_sft[column].tolist() for column in ('prompt', 'response')}
formatted_data = format_for_sft(sft_inputs)
df_sft['text'] = formatted_data['text']

# Report the sample count and show the first formatted sample as a sanity check
print(f"\nFormatted {len(df_sft)} samples for SFT")
print("\nExample:")
first_sample = df_sft['text'].iloc[0]
print(first_sample)

## Tokenization

In [None]:
# Load the tokenizer belonging to the configured base model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Padding needs a pad token; reuse the EOS token when none is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """
    Tokenize the 'text' field of a batch of examples.

    Sequences are truncated to MAX_LENGTH and padded with
    padding='max_length'.
    """
    encode_options = {
        'truncation': True,
        'max_length': MAX_LENGTH,
        'padding': 'max_length',
    }
    return tokenizer(examples['text'], **encode_options)


# Only the formatted text column goes into the HuggingFace Dataset
text_frame = df_sft[['text']]
dataset = Dataset.from_pandas(text_frame)

# Tokenize in batches and drop the raw text column from the result
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

print(f"Tokenized {len(tokenized_dataset)} samples")

## Train/Validation Split

In [None]:
# Hold out VALIDATION_SPLIT of the samples; the fixed seed keeps the split reproducible
split_dataset = tokenized_dataset.train_test_split(seed=42, test_size=VALIDATION_SPLIT)

# The held-out part is returned under the key 'test'; it is used as validation data here
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

for split_label, split_rows in (("Train", train_dataset), ("Validation", val_dataset)):
    print(f"{split_label} samples: {len(split_rows)}")

## Save Processed Dataset

In [None]:
# Combine the train and validation splits into a single DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

# Save to disk
output_path = os.path.join(OUTPUT_DIR, 'sft_dataset')
dataset_dict.save_to_disk(output_path)
print(f"\nDataset saved to: {output_path}")

# Also save metadata describing how this dataset was produced
metadata = {
    'base_model': BASE_MODEL,
    'max_length': MAX_LENGTH,
    'train_samples': len(train_dataset),
    'val_samples': len(val_dataset),
    'validation_split': VALIDATION_SPLIT,
    'format': 'sft',
    'created_at': pd.Timestamp.now().isoformat()
}

# Explicit UTF-8 so the file does not depend on the platform default encoding
# (the raw data is read with the same encoding).
with open(os.path.join(OUTPUT_DIR, 'dataset_metadata.json'), 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print("Metadata saved")

## Token Length Analysis

In [None]:
# Analyze token lengths.
# The tokenization cell pads with padding='max_length', so len(input_ids) is
# MAX_LENGTH for every sample. Count the real (non-padding) tokens through the
# attention mask instead; fall back to len(input_ids) if no mask was produced.
token_lengths = [
    sum(x['attention_mask']) if 'attention_mask' in x else len(x['input_ids'])
    for x in train_dataset
]
length_stats = pd.Series(token_lengths)

plt.figure(figsize=(10, 5))
plt.hist(token_lengths, bins=50)
plt.axvline(MAX_LENGTH, color='r', linestyle='--', label=f'Max Length ({MAX_LENGTH})')
plt.xlabel('Token Length')
plt.ylabel('Count')
plt.title('Distribution of Token Lengths in Training Set')
plt.legend()
plt.show()

print("\nToken Length Statistics:")
print(f"Mean: {length_stats.mean():.0f}")
print(f"Median: {length_stats.median():.0f}")
print(f"95th percentile: {length_stats.quantile(0.95):.0f}")
print(f"Max: {length_stats.max():.0f}")
# Samples that fill the whole window may have been cut off by truncation
print(f"Samples at max length: {(length_stats >= MAX_LENGTH).sum()} of {len(length_stats)}")