# Extract Token Journeys

This notebook extracts individual token journeys from the routing data.

**Steps:**
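1. Create output folders
2. Define helper functions and configuration
3. Extract HumanEval token journeys from the routing JSONL file
4. Extract GSM8K token journeys from the routing JSONL file
5. Verify the extraction by printing per-dataset statistics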

## Step 1: Create Output Folders

In [None]:
import os
# Make sure both output directories exist; re-running this cell is harmless
# because exist_ok=True tolerates folders that are already there.
for output_folder in ("humaneval_tokens", "gsm8k_tokens"):
    os.makedirs(output_folder, exist_ok=True)
print("Created folders: humaneval_tokens/, gsm8k_tokens/")

## Step 2: Define Helper Functions

In [None]:
import json

# CONFIG: upper bound on how many problems each extraction step processes.
# It is used as a slice end on the sorted problem-id list, so None keeps ALL
# problems and an integer N keeps only the first N.
MAX_PROBLEMS = None  # e.g., 10 for the first 10 problems

def get_problem_ids(filepath):
    """Return the sorted distinct ``problem_id`` values found in a JSONL file.

    Blank lines, lines that are not valid JSON, and records that have no
    usable ``problem_id`` are skipped so that a single bad line does not
    abort the scan.

    Args:
        filepath: Path to a JSONL file (one JSON value per line).

    Returns:
        A sorted list of the unique ``problem_id`` values.
    """
    pids = set()
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                pids.add(json.loads(line)['problem_id'])
            except (ValueError, KeyError, TypeError):
                # ValueError: malformed JSON (JSONDecodeError is a subclass).
                # KeyError:   record has no 'problem_id'.
                # TypeError:  line is not a JSON object, or the id is unhashable.
                # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
                continue
    return sorted(pids)

def get_max_token(filepath, problem_id):
    """Return the token count for one problem: highest ``token_idx`` plus one.

    The file is scanned once from start to end. Blank lines, lines that are
    not valid JSON, and records missing ``problem_id`` or ``token_idx`` are
    skipped.

    Args:
        filepath: Path to a JSONL file (one JSON value per line).
        problem_id: The ``problem_id`` value whose records are considered.

    Returns:
        ``max(token_idx) + 1`` over the matching records, or ``0`` when no
        usable record matches ``problem_id``.
    """
    # Start below any valid index so that "no matching record" yields 0
    # instead of claiming that one token exists.
    max_tok = -1
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                d = json.loads(line)
                if d['problem_id'] == problem_id:
                    max_tok = max(max_tok, d['token_idx'])
            except (ValueError, KeyError, TypeError):
                # Malformed JSON, missing field, or a non-object / non-comparable
                # value: skip the line. Other exceptions propagate.
                continue
    return max_tok + 1

## Step 3: Extract HumanEval Token Journeys

In [None]:
# Input JSONL for this step; the relative path is resolved against the
# kernel's current working directory.
humaneval_file = "humaneval_full_routing.jsonl"
# Slice end MAX_PROBLEMS: None keeps every id, an integer N keeps the first N.
problem_ids = get_problem_ids(humaneval_file)[:MAX_PROBLEMS]
print(f"Extracting {len(problem_ids)} problems...")

for pid in problem_ids:
    # get_max_token re-reads the whole file on every call, so this loop does
    # one full scan per problem.
    max_tok = get_max_token(humaneval_file, pid)
    print(f"  Problem {pid}: {max_tok} tokens")
    # IPython shell escape: the brace expressions are replaced with the Python
    # values before the command is run.
    # NOTE(review): assumes extract_tokens.py reads "--tokens 0 max_tok" as a
    # start/end range covering every token index; confirm whether the end is exclusive.
    !python extract_tokens.py {humaneval_file} --problem {pid} --tokens 0 {max_tok} --output-dir humaneval_tokens

## Step 4: Extract GSM8K Token Journeys

In [None]:
# Input JSONL for this step; the relative path is resolved against the
# kernel's current working directory.
gsm8k_file = "gsm8k_full_routing.jsonl"
# Rebinds the notebook-level problem_ids name to the ids found in gsm8k_file.
# Slice end MAX_PROBLEMS: None keeps every id, an integer N keeps the first N.
problem_ids = get_problem_ids(gsm8k_file)[:MAX_PROBLEMS]
print(f"Extracting {len(problem_ids)} problems...")

for pid in problem_ids:
    # get_max_token re-reads the whole file on every call, so this loop does
    # one full scan per problem.
    max_tok = get_max_token(gsm8k_file, pid)
    print(f"  Problem {pid}: {max_tok} tokens")
    # IPython shell escape: the brace expressions are replaced with the Python
    # values before the command is run.
    # NOTE(review): assumes extract_tokens.py reads "--tokens 0 max_tok" as a
    # start/end range covering every token index; confirm whether the end is exclusive.
    !python extract_tokens.py {gsm8k_file} --problem {pid} --tokens 0 {max_tok} --output-dir gsm8k_tokens

## Step 5: Verify Extraction (Stats)

In [None]:
import glob
from collections import defaultdict

def print_folder_stats(dataset_name):
    """Print summary statistics for the token files of one dataset.

    Looks in ``{dataset_name}_tokens/`` for files matching
    ``p*_token_*.jsonl``, groups them by the integer problem id embedded in
    the file name, and prints the number of problems, the total number of
    token files, and the mean number of token files per problem.

    Files whose names match the glob but do not contain integer problem /
    token numbers are skipped and counted instead of aborting the report.

    Args:
        dataset_name: Folder prefix, e.g. ``"humaneval"`` for
            ``humaneval_tokens/``.

    Returns:
        None. All results are printed.
    """
    # Local import keeps this cell runnable on its own in a fresh kernel.
    import os

    folder = f"{dataset_name}_tokens"
    files = glob.glob(f"{folder}/p*_token_*.jsonl")

    if not files:
        print(f"Dataset: {dataset_name} -> NO FILES FOUND in {folder}/")
        return

    problems = defaultdict(list)
    skipped = 0
    for path in files:
        # glob returns paths with the OS-native separator, so splitting on
        # "/" fails on Windows; os.path.basename handles the native form.
        basename = os.path.basename(path)
        parts = basename.split('_token_')
        try:
            pid = int(parts[0][1:])              # drop the leading "p"
            tok = int(parts[1].split('.')[0])    # drop ".jsonl"
        except ValueError:
            skipped += 1
            continue
        problems[pid].append(tok)

    if not problems:
        # Every candidate had an unparseable name; avoid dividing by zero below.
        print(f"Dataset: {dataset_name} -> NO VALID TOKEN FILES in {folder}/ ({skipped} skipped)")
        return

    total_tokens = sum(len(toks) for toks in problems.values())
    avg_tokens = total_tokens / len(problems)

    print(f"Dataset: {dataset_name}")
    print(f"  Total Problems: {len(problems)}")
    print(f"  Total Tokens:   {total_tokens}")
    print(f"  Avg Tokens/Prob: {avg_tokens:.1f}")
    if skipped:
        print(f"  Skipped Files:  {skipped} (unparseable names)")
    print("-" * 40)

# Report on each dataset's output folder, in the order they were extracted.
for dataset_name in ("humaneval", "gsm8k"):
    print_folder_stats(dataset_name)