# Persuasion-RL: Exploratory Data Analysis

This notebook provides an exploratory analysis of the CMV and PersuasionForGood datasets, based on the processed SFT train, validation, and test splits.


In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter

# Apply seaborn's 'whitegrid' style to figures drawn in later cells.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


## Load Processed Data


In [None]:
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document
            per line (any value accepted by ``open``).

    Returns:
        list: One parsed object per non-blank line, in file order. An empty
        file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines (typically a trailing empty
            # line) carry no record; json.loads would raise on them.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

# Read the three processed SFT splits (train / validation / test).
train_data, val_data, test_data = (
    load_jsonl(split_path)
    for split_path in (
        '../data/processed/sft_train.jsonl',
        '../data/processed/sft_val.jsonl',
        '../data/processed/sft_test.jsonl',
    )
)

# Report the size of each split, then the combined total.
print(f"Train: {len(train_data)} examples")
print(f"Val: {len(val_data)} examples")
print(f"Test: {len(test_data)} examples")
print(f"Total: {len(train_data) + len(val_data) + len(test_data)} examples")


## Dataset Statistics


In [None]:
# Pool every split into a single list so statistics cover the whole corpus.
all_data = [*train_data, *val_data, *test_data]

# Pull the per-example metadata fields into parallel lists.
sources = [example['metadata']['source'] for example in all_data]
response_tokens = [example['metadata']['response_tokens'] for example in all_data]
context_tokens = [example['metadata']['context_tokens'] for example in all_data]

# Show how many examples each source contributes, with its share of the total.
print("Dataset Composition:")
source_counts = Counter(sources)
for source, count in source_counts.items():
    print(f"  {source}: {count} ({count/len(all_data)*100:.1f}%)")
