In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
output_dir = "../2026-01-27"
test_output_dir = output_dir + "_test"

In [2]:
import os
import sys
import json
from pathlib import Path
from datasets import load_dataset
from dotenv import load_dotenv

sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), '..'))

load_dotenv()

ds = load_dataset("SimpleStories/SimpleStories", split="train")
print("Dataset features:")
print(ds.features)
print(f"\nDataset size: {len(ds)}")
print(f"\nFirst example keys: {list(ds[0].keys())}")

  from .autonotebook import tqdm as notebook_tqdm


Dataset features:
{'story': Value('string'), 'topic': Value('string'), 'theme': Value('string'), 'style': Value('string'), 'feature': Value('string'), 'grammar': Value('string'), 'persona': Value('string'), 'initial_word_type': Value('string'), 'initial_letter': Value('string'), 'word_count': Value('int64'), 'character_count': Value('int64'), 'num_paragraphs': Value('int64'), 'avg_word_length': Value('float64'), 'avg_sentence_length': Value('float64'), 'flesch_reading_ease': Value('float64'), 'flesch_kincaid_grade': Value('float64'), 'dale_chall_readability_score': Value('float64'), 'num_stories_in_completion': Value('int64'), 'expected_num_stories_in_completion': Value('int64'), 'generation_id': Value('string'), 'model': Value('string')}

Dataset size: 2115696

First example keys: ['story', 'topic', 'theme', 'style', 'feature', 'grammar', 'persona', 'initial_word_type', 'initial_letter', 'word_count', 'character_count', 'num_paragraphs', 'avg_word_length', 'avg_sentence_length', 'fles

In [4]:
# Run the dataset exploration script
from explore_dataset import explore_dataset, propose_leaf_nodes

# Explore the dataset
categorical_features, numeric_features = explore_dataset(sample_size=50000)

# Propose leaf nodes
leaf_nodes = propose_leaf_nodes(categorical_features, numeric_features)

print(f"\n✓ Generated {len(leaf_nodes)} leaf nodes")
print("\nFirst 10 leaf nodes:")
for i, node in enumerate(leaf_nodes[:10], 1):
    print(f"{i:2d}. {node['id']}: {node['description']}")


Analyzing 50000 examples from 2115696 total examples...

Most common categorical values:

topic (48 unique):
  hidden treasures: 1123
  magical lands: 1105
  bygone eras: 1094
  the arts: 1092
  cultural traditions: 1086
  seasonal changes: 1080
  giant creatures: 1079
  mystical creatures: 1076
  time travel: 1073
  lost civilizations: 1073

theme (63 unique):
  Magic: 878
  Deception: 853
  Helping Others: 852
  Agency: 847
  Innovation: 840
  Kindness: 831
  Problem-Solving: 827
  Humor: 819
  Growth: 817
  Hardship: 814

style (23 unique):
  minimalist: 2325
  classic: 2255
  lighthearted: 2216
  playful: 2215
  modern: 2209
  surreal: 2208
  philosophical: 2199
  humorous: 2181
  tragic: 2180
  fable-like: 2180

feature (26 unique):
  a flashback: 2021
  circular narrative structure: 2013
  a cliffhanger: 1994
  a Red Herring: 1986
  juxtaposition: 1981
  a story within a story: 1961
  Checkhov's gun: 1958
  a moral lesson: 1950
  absence indicating a presence: 1949
  symbolism: 1

In [None]:
output_file = 'proposed_leaf_nodes.json'
with open(output_file, 'w') as f:
    json.dump(leaf_nodes, f, indent=2)

In [4]:
from generate_codebooks import CodebookGenerator
generator = CodebookGenerator(model="gpt-5-mini-2025-08-07")

# Load the proposed leaf nodes
leaf_nodes = generator.load_leaf_nodes(output_file)
print(f"Loaded {len(leaf_nodes)} leaf nodes")

NameError: name 'output_file' is not defined

In [None]:
# Generate a small test codebook
test_codebook = generator.generate_codebook(
    leaf_nodes[:8],  # Use first 8 leaf nodes
    size="small",
    difficulty="easy",
    use_all_formulas=False
)

print("Generated Codebook:")
print("=" * 60)
print(test_codebook)
print("=" * 60)


In [None]:
# Generate obfuscated version
obfuscated_codebook = generator.obfuscate_codebook(test_codebook)

print("Obfuscated Codebook:")
print("=" * 60)
print(obfuscated_codebook)
print("=" * 60)


In [None]:
generator.generate_all_codebooks(
    output_dir=output_dir,
    small_count=20,
    medium_count=15,
    large_count=10,
    insane_count=5
)

## Step 6: Generate Individual Codebooks

Generate specific codebooks with custom parameters.


In [None]:
medium_codebook = generator.generate_codebook(
    leaf_nodes,
    size="medium",
    difficulty="easy",
    use_all_formulas=True
)

print("Medium Codebook with All Formulas:")
print("=" * 60)
print(medium_codebook)
print("=" * 60)

# Save it
generator.save_codebook(medium_codebook, "example-medium.txt", output_dir=test_output_dir)

# Generate and save obfuscated version
obfuscated_medium = generator.obfuscate_codebook(medium_codebook)
generator.save_codebook(obfuscated_medium, "example-medium-obfc.txt", output_dir=test_output_dir)


## Step 7: Verify Generated Codebooks

Parse a generated codebook to verify it's valid.


In [None]:
test_codebook = medium_codebook

from parser import CodebookParser

parser = CodebookParser()

test_file = Path(test_output_dir) / "test-codebook.txt"
test_file.parent.mkdir(exist_ok=True)
with open(test_file, 'w') as f:
    f.write(test_codebook)

try:
    graph = parser.parse_codebook(str(test_file))
    print(f"✓ Successfully parsed codebook!")
    print(f"  Nodes: {len(graph.nodes)}")
    print(f"  Edges: {len(graph.edges)}")
    print(f"\nNodes:")
    for node in graph.nodes:
        formula_type = type(node.formula).__name__ if node.formula else "None"
        print(f"  - {node.id}: {formula_type}")
except Exception as e:
    print(f"✗ Error parsing codebook: {e}")


## Step 8: Rewrite Codebooks in Different Styles


In [12]:
from rewrite_codebooks import CodebookRewriter

rewriter = CodebookRewriter(model="gpt-5-mini-2025-08-07")

print("Available styles:")
for style in CodebookRewriter.STYLES:
    print(f"  - {style}: {CodebookRewriter.STYLE_DESCRIPTIONS[style]}")


Available styles:
  - free-flow: Natural, conversational, flowing text that reads smoothly without rigid structure
  - transcript: Dialogue-like, interview style with questions and answers, as if explaining to someone
  - technical: Precise, formal technical language with clear definitions and specifications
  - structured: Clear, organized format with bullet points, sections, and hierarchical organization
  - flowery: Extended, descriptive, elaborate language with rich vocabulary and detailed explanations
  - concise: Brief, to-the-point style with minimal words while maintaining clarity
  - narrative: Story-like, engaging narrative style that weaves concepts together like a story


In [None]:
test_codebook_file = Path(test_output_dir) / "test-codebook.txt"

if test_codebook_file.exists():
    style = "flowery"
    rewritten = rewriter.rewrite_codebook_file(str(test_codebook_file), style)
    print(f"✓ Rewritten in {style} style")
    print(f"  Saved to: {rewritten}")
    
    with open(rewritten, 'r') as f:
        rewritten_text = f.read()
    print(f"\nFirst 500 characters of {style} version:")
    print("=" * 60)
    print(rewritten_text[:500])
    print("=" * 60)
else:
    print(f"Test codebook not found at {test_codebook_file}")


# Step 9: Verify rewritten codebook

In [None]:
from parser import CodebookParser

parser = CodebookParser()

test_rewritten_file = Path(test_output_dir) / "test-codebook-flowery.txt"

if test_rewritten_file.exists():
    try:
        graph = parser.parse_codebook(str(test_rewritten_file))
        print(f"✓ Successfully parsed rewritten codebook!")
        print(f"  Nodes: {len(graph.nodes)}")
        print(f"  Edges: {len(graph.edges)}")
        print(f"\nNodes (first 10):")
        for node in graph.nodes[:10]:
            formula_type = type(node.formula).__name__ if node.formula else "None"
            print(f"  - {node.id}: {formula_type}")
        
        # Compare with original
        original_file = Path(test_output_dir) / "test-codebook.txt"
        if original_file.exists():
            original_graph = parser.parse_codebook(str(original_file))
            print(f"\nComparison:")
            print(f"  Original nodes: {len(original_graph.nodes)}")
            print(f"  Rewritten nodes: {len(graph.nodes)}")
            print(f"  Original edges: {len(original_graph.edges)}")
            print(f"  Rewritten edges: {len(graph.edges)}")
            
            # Check if node IDs match
            original_ids = {node.id for node in original_graph.nodes}
            rewritten_ids = {node.id for node in graph.nodes}
            if original_ids == rewritten_ids:
                print(f"  ✓ All node IDs match!")
            else:
                print(f"  ⚠ Node ID mismatch!")
                print(f"    Missing: {original_ids - rewritten_ids}")
                print(f"    Extra: {rewritten_ids - original_ids}")
    except Exception as e:
        print(f"✗ Error parsing rewritten codebook: {e}")
        import traceback
        traceback.print_exc()
else:
    print(f"Rewritten codebook not found at {test_rewritten_file}")
    print("Run the rewriting step above first.")


## Step 10: Rewrite All Codebooks in Directory

Rewrite all codebooks in a directory with all available styles. This will create multiple versions of each codebook.


## Complete Pipeline Script

Use the `pipeline.py` script to run the entire process automatically:
1. Generate codebooks
2. Obfuscate originals
3. Rewrite in different styles
4. Obfuscate rewritten versions
5. Parse and serialize all codebooks


In [4]:
# Run the complete pipeline
from pipeline import CodebookPipeline

# Initialize pipeline
pipeline = CodebookPipeline(
    model="gpt-5.2-2025-12-11", # "gpt-5-mini-2025-08-07",
    rewrite_styles=None  # None = all styles, or specify: ["flowery", "technical"]
)



pipeline.run_full_pipeline(
    output_dir=output_dir,
    small_count=20,
    medium_count=20,
    large_count=10,
    insane_count=5
)

CODEBOOK GENERATION PIPELINE
Output directory: /home/jjb/msc/axiom-guided-structured-reasoning/codebooks/generator/../2026-01-27
Rewriting styles: free-flow, transcript, technical, structured, flowery, concise, narrative

Step 1: Generating codebooks...
--------------------------------------------------------------------------------

✓ Processed 55 codebooks
  Skipped 55 existing original codebooks
  Skipped 55 existing obfuscated codebooks
  Output directory: /home/jjb/msc/axiom-guided-structured-reasoning/codebooks/generator/../2026-01-27

Step 2: Obfuscating original codebooks...
--------------------------------------------------------------------------------
Skipping 55 already obfuscated files.
All original codebooks already obfuscated.

Step 3: Rewriting codebooks in different styles...
--------------------------------------------------------------------------------
Skipping 385 already rewritten files.
All codebooks already rewritten in all styles.

Step 4: Obfuscating rewritten

Parsing codebooks: 100%|██████████| 29/29 [01:06<00:00,  2.30s/it]
Traceback (most recent call last):
  File "/home/jjb/msc/axiom-guided-structured-reasoning/codebooks/generator/api_utils.py", line 34, in run_async
    return asyncio.run(coro)
           ^^^^^^^^^^^^^^^^^
  File "/home/jjb/msc/axiom-guided-structured-reasoning/.venv/lib/python3.12/site-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jjb/msc/axiom-guided-structured-reasoning/.venv/lib/python3.12/site-packages/nest_asyncio.py", line 98, in run_until_complete
    return f.result()
           ^^^^^^^^^^
  File "/usr/lib/python3.12/asyncio/futures.py", line 203, in result
    raise self._exception.with_traceback(self._exception_tb)
  File "/usr/lib/python3.12/asyncio/tasks.py", line 314, in __step_run_and_handle_result
    result = coro.send(None)
             ^^^^^^^^^^^^^^^
  File "/home/jjb/msc/axiom-guided-structured-reasoning/cod


Error during parallel parsing: cannot reuse already awaited coroutine
Falling back to sequential parsing...


KeyboardInterrupt: 