## Prerequisites: Config and Exceptions

In [1]:
# First, define the dependencies (exceptions and config)
import json
from collections import Counter
from dataclasses import dataclass, asdict
from typing import Any, Dict, List, Optional, Set, Tuple

# === Exceptions ===
class Json2ToonError(Exception):
    """Root of the json2toon exception hierarchy."""


class AnalysisError(Json2ToonError):
    """Signals that structure analysis of the input data failed."""

# === Config ===
@dataclass
class ToonConfig:
    """Configuration options for the json2toon encoder.

    Only ``uniformity_threshold`` and ``min_table_rows`` are read by the
    analyzer functions in this file. The notes on the remaining fields are
    inferred from their names and must be confirmed against the code that
    consumes them.
    """
    table_separator: str = "|"  # presumably the column delimiter in table output — TODO confirm
    header_separator: str = "-"  # presumably the character used for the header rule — TODO confirm
    max_inline_array_length: int = 10  # presumably a cap on arrays rendered inline — TODO confirm
    compress_primitive_arrays: bool = True  # not read in this file — TODO confirm semantics
    max_string_length: Optional[int] = None  # presumably None means "no limit" — TODO confirm
    quote_strings: bool = False  # not read in this file — TODO confirm semantics
    indent_size: int = 2  # presumably spaces per nesting level — TODO confirm
    max_nesting_depth: int = 10  # not read in this file — TODO confirm semantics
    uniformity_threshold: float = 0.8  # ratio (0.0-1.0) passed to is_uniform_array as its threshold
    min_table_rows: int = 2  # lists shorter than this are never given table format

# Progress marker: reached only if the definitions above executed without error.
print("✓ Dependencies loaded")

✓ Dependencies loaded


## Implementation

In [2]:
# analyzer.py - Structure analyzer for json2toon library

@dataclass
class StructureInfo:
    """Result of analysing the shape of a piece of JSON-like data.

    Attributes:
        type: 'object' for dicts, 'array' for lists, 'primitive' for
            anything else.
        is_uniform: For arrays, the uniformity verdict from
            is_uniform_array(); always False for objects and primitives.
        keys: For arrays, the sorted keys that met the uniformity
            threshold. This is populated even when ``is_uniform`` is
            False; it is None for objects and primitives.
        item_count: For arrays, the number of items; None otherwise.
    """
    type: str
    is_uniform: bool
    keys: Optional[List[str]] = None
    item_count: Optional[int] = None


def is_uniform_array(
    arr: List[Dict[str, Any]],
    threshold: float = 0.8
) -> Tuple[bool, List[str]]:
    """Check whether the dicts in an array share a uniform set of keys.

    An array is "uniform" if most items share the same keys.
    This is used to decide whether to use table format.

    Args:
        arr: List of dictionaries to analyze.
        threshold: Minimum ratio (0.0-1.0) applied twice: to the share of
            items that must contain a key for that key to count as
            uniform, and to the share of distinct keys that must be
            uniform for the whole array to count as uniform.

    Returns:
        Tuple of (is_uniform, common_keys). ``common_keys`` is the sorted
        list of keys that met the per-key threshold; it can be non-empty
        even when ``is_uniform`` is False. Returns ``(False, [])`` for an
        empty list, a list containing any non-dict item, or a list whose
        dicts are all empty.

    Algorithm:
        1. Count, for every distinct key, how many items contain it.
        2. Keys present in at least ``threshold`` of the items are
           "uniform" keys.
        3. The array is uniform when at least ``threshold`` of all
           distinct keys are uniform keys.
    """
    # Early return for empty arrays or arrays containing non-dict items.
    if not arr or not all(isinstance(item, dict) for item in arr):
        return False, []

    # One pass over every item's keys. A dict cannot repeat a key, so each
    # count is the number of items that contain the key.
    key_counts = Counter(key for item in arr for key in item)

    # Every item is an empty dict: there are no keys to compare.
    if not key_counts:
        return False, []

    total_items = len(arr)
    uniform_keys = sorted(
        key for key, count in key_counts.items()
        if count / total_items >= threshold
    )

    # key_counts is non-empty here, so the division is always defined.
    is_uniform = len(uniform_keys) / len(key_counts) >= threshold

    return is_uniform, uniform_keys


def should_use_table_format(data: Any, config: ToonConfig) -> bool:
    """Decide whether ``data`` qualifies for table format.

    Table format is meant for arrays of uniform objects, for example
    (illustrative rendering only):

        [{"id": 1, "name": "A"}, {"id": 2, "name": "B"}]

    becomes:

        | id | name |
        |----|------|
        | 1  | A    |
        | 2  | B    |

    Args:
        data: Data to analyze.
        config: TOON configuration; ``min_table_rows`` and
            ``uniformity_threshold`` are consulted.

    Returns:
        True only when ``data`` is a list with at least
        ``config.min_table_rows`` items that is_uniform_array() judges
        uniform at ``config.uniformity_threshold``.
    """
    # A table needs a list that is long enough; anything else is rejected
    # before the uniformity analysis runs.
    big_enough_list = isinstance(data, list) and len(data) >= config.min_table_rows
    if not big_enough_list:
        return False

    # Only the verdict matters here; the common keys are discarded.
    verdict, _common_keys = is_uniform_array(data, config.uniformity_threshold)
    return verdict


def analyze_structure(data: Any, config: ToonConfig) -> StructureInfo:
    """Classify ``data`` so an encoding strategy can be chosen.

    Args:
        data: Data to analyze.
        config: TOON configuration; ``uniformity_threshold`` is read when
            ``data`` is a list.

    Returns:
        StructureInfo with type 'object' for dicts, 'array' for lists
        (including the uniformity verdict, common keys and item count),
        or 'primitive' for everything else.

    Raises:
        AnalysisError: If any exception occurs during analysis; the
            original exception is chained as the cause.
    """
    try:
        # Dicts are reported as plain objects and are never "uniform".
        if isinstance(data, dict):
            return StructureInfo(type='object', is_uniform=False)

        # Anything that is neither a dict nor a list is a primitive.
        if not isinstance(data, list):
            return StructureInfo(type='primitive', is_uniform=False)

        # Lists carry the uniformity analysis plus their length.
        uniform, common_keys = is_uniform_array(
            data, config.uniformity_threshold
        )
        return StructureInfo(
            type='array',
            is_uniform=uniform,
            keys=common_keys,
            item_count=len(data),
        )
    except Exception as exc:
        raise AnalysisError(f"Failed to analyze structure: {exc}") from exc


# Progress marker: reached only if the analyzer definitions above executed without error.
print("✓ Analyzer functions defined successfully")

✓ Analyzer functions defined successfully


## Testing Analyzer

In [3]:
# Test 1: Uniform array detection
print("Test 1: Uniform Array Detection")
print("="*50)

# Perfectly uniform array: every item has the same three keys, so each key
# occurs in 100% of items and all distinct keys count as uniform.
# (uniform_data is reused by the structure-analysis cell further down.)
uniform_data = [
    {"id": 1, "name": "Alice", "age": 30},
    {"id": 2, "name": "Bob", "age": 25},
    {"id": 3, "name": "Charlie", "age": 35}
]

# Called without a threshold, so the default of 0.8 applies.
is_uniform, keys = is_uniform_array(uniform_data)
# NOTE: the check mark is part of the literal; it is printed whatever the
# result is and is not an assertion.
print(f"Uniform array: {is_uniform} ✓")
print(f"Common keys: {keys}")

Test 1: Uniform Array Detection
Uniform array: True ✓
Common keys: ['age', 'id', 'name']


In [4]:
# Test 2: Non-uniform array
print("\nTest 2: Non-Uniform Array Detection")
print("="*50)

# Items share almost no keys: "id" occurs in 2 of 3 items and every other
# key in 1 of 3, so no key reaches the default 0.8 threshold.
# (non_uniform_data is reused by the table-format cell further down.)
non_uniform_data = [
    {"id": 1, "name": "Alice"},
    {"id": 2, "email": "bob@test.com"},
    {"age": 35, "city": "NYC"}
]

is_uniform, keys = is_uniform_array(non_uniform_data)
print(f"Uniform array: {is_uniform}")
print(f"Common keys: {keys}")


Test 2: Non-Uniform Array Detection
Uniform array: False
Common keys: []


In [5]:
# Test 3: Partially uniform array
print("\nTest 3: Partially Uniform Array (80% threshold)")
print("="*50)

# 4 out of 5 items have the same keys, so "id" and "name" each occur in 80%
# of items and both qualify as uniform keys. The array as a whole is still
# reported as NOT uniform: only 2 of the 3 distinct keys are uniform
# (2/3 = 0.67), which is below the 0.8 threshold.
partial_data = [
    {"id": 1, "name": "A"},
    {"id": 2, "name": "B"},
    {"id": 3, "name": "C"},
    {"id": 4, "name": "D"},
    {"different": "key"}  # Different keys
]

is_uniform, keys = is_uniform_array(partial_data, threshold=0.8)
print(f"Uniform at 80% threshold: {is_uniform}")
print(f"Common keys: {keys}")


Test 3: Partially Uniform Array (80% threshold)
Uniform at 80% threshold: False
Common keys: ['id', 'name']


In [6]:
# Test 4: Edge cases
print("\nTest 4: Edge Cases")
print("="*50)

# Empty array: rejected by the early return, giving (False, [])
is_uniform, keys = is_uniform_array([])
print(f"Empty array - uniform: {is_uniform}, keys: {keys}")

# Array of non-dicts: also rejected by the early return
is_uniform, keys = is_uniform_array([1, 2, 3])
print(f"Array of ints - uniform: {is_uniform}, keys: {keys}")

# Single item: its only key occurs in 1/1 items, so the array is uniform
is_uniform, keys = is_uniform_array([{"id": 1}])
print(f"Single item - uniform: {is_uniform}, keys: {keys}")


Test 4: Edge Cases
Empty array - uniform: False, keys: []
Array of ints - uniform: False, keys: []
Single item - uniform: True, keys: ['id']


In [8]:
# Test 5: Table format decision
print("\nTest 5: Table Format Decision")
print("="*50)

# These values match the ToonConfig defaults; they are spelled out for clarity.
config = ToonConfig(min_table_rows=2, uniformity_threshold=0.8)

# Should use table format
table_data = [
    {"id": 1, "name": "A"},
    {"id": 2, "name": "B"},
    {"id": 3, "name": "C"}
]
# NOTE: the check mark is part of the literal, not the result of a check.
print(f"Uniform array with 3 items: {should_use_table_format(table_data, config)} ✓")

# Too few rows: rejected by the min_table_rows check before uniformity is examined
single_item = [{"id": 1, "name": "A"}]
print(f"Single item (min_table_rows=2): {should_use_table_format(single_item, config)}")

# Not a list
print(f"Dictionary: {should_use_table_format({'id': 1}, config)}")

# Non-uniform (non_uniform_data comes from the Test 2 cell)
print(f"Non-uniform array: {should_use_table_format(non_uniform_data, config)}")


Test 5: Table Format Decision
Uniform array with 3 items: True ✓
Single item (min_table_rows=2): False
Dictionary: False
Non-uniform array: False


In [9]:
# Test 6: Structure analysis
print("\nTest 6: Structure Analysis")
print("="*50)

# Default configuration (uniformity_threshold=0.8).
config = ToonConfig()

# Analyze object: dicts are reported as type 'object' and never uniform
obj_info = analyze_structure({"name": "test"}, config)
print("Object analysis:")
print(f"  Type: {obj_info.type}")
print(f"  Is uniform: {obj_info.is_uniform}")

# Analyze uniform array (uniform_data comes from the Test 1 cell)
arr_info = analyze_structure(uniform_data, config)
print("\nUniform array analysis:")
print(f"  Type: {arr_info.type}")
print(f"  Is uniform: {arr_info.is_uniform}")
print(f"  Keys: {arr_info.keys}")
print(f"  Item count: {arr_info.item_count}")

# Analyze primitive: anything that is neither a dict nor a list
prim_info = analyze_structure("hello", config)
print("\nPrimitive analysis:")
print(f"  Type: {prim_info.type}")
print(f"  Is uniform: {prim_info.is_uniform}")


Test 6: Structure Analysis
Object analysis:
  Type: object
  Is uniform: False

Uniform array analysis:
  Type: array
  Is uniform: True
  Keys: ['age', 'id', 'name']
  Item count: 3

Primitive analysis:
  Type: primitive
  Is uniform: False


In [10]:
# Test 7: Threshold sensitivity
print("\nTest 7: Threshold Sensitivity")
print("="*50)

# Array where 3/5 items have 'name' key (60%); 'id' is present in all 5.
# At thresholds up to 0.6 both keys qualify and the array is uniform. Above
# that only 'id' qualifies, so 1 of 2 distinct keys (0.5) is uniform and
# the array is not.
mixed_data = [
    {"id": 1, "name": "A"},
    {"id": 2, "name": "B"},
    {"id": 3, "name": "C"},
    {"id": 4},
    {"id": 5}
]

# NOTE: `threshold` stays bound as a notebook-level name after this loop.
for threshold in [0.5, 0.6, 0.7, 0.8]:
    is_uniform, keys = is_uniform_array(mixed_data, threshold)
    print(f"Threshold {threshold}: uniform={is_uniform}, keys={keys}")


Test 7: Threshold Sensitivity
Threshold 0.5: uniform=True, keys=['id', 'name']
Threshold 0.6: uniform=True, keys=['id', 'name']
Threshold 0.7: uniform=False, keys=['id']
Threshold 0.8: uniform=False, keys=['id']


## Algorithm Visualization

In [11]:
# Visualize the uniformity analysis algorithm
# This cell repeats the steps of is_uniform_array() inline so that each
# intermediate value can be printed. It does not call the function itself.
print("Algorithm Visualization")
print("="*50)

# 'id' and 'name' occur in every item; 'email' occurs in only 2 of 4.
sample_data = [
    {"id": 1, "name": "Alice", "email": "a@test.com"},
    {"id": 2, "name": "Bob"},
    {"id": 3, "name": "Charlie", "email": "c@test.com"},
    {"id": 4, "name": "Diana"}
]

print("\nInput Data:")
for i, item in enumerate(sample_data):
    print(f"  Item {i+1}: {item}")

# Step 1: Collect all keys
all_keys = set()
for item in sample_data:
    all_keys.update(item.keys())
print(f"\nStep 1 - All unique keys: {sorted(all_keys)}")

# Step 2: Count occurrences
# key_counts is seeded by iterating the set, so the key order printed below
# (and in Steps 3 and 4) follows set iteration order, which is arbitrary.
key_counts = {key: 0 for key in all_keys}
for item in sample_data:
    for key in item.keys():
        key_counts[key] += 1
print(f"Step 2 - Key counts: {key_counts}")

# Step 3: Calculate percentages
total = len(sample_data)
print(f"Step 3 - Key percentages (total items = {total}):")
for key, count in key_counts.items():
    pct = count / total * 100
    print(f"  {key}: {count}/{total} = {pct:.0f}%")

# Step 4: Apply threshold
# Unlike is_uniform_array(), the resulting list is not sorted here.
threshold = 0.8
uniform_keys = [k for k, v in key_counts.items() if v/total >= threshold]
print(f"\nStep 4 - Keys with >= {threshold*100}% occurrence: {uniform_keys}")

# Step 5: Final decision
# The same threshold is applied a second time, now to the share of distinct
# keys that are uniform.
uniformity_ratio = len(uniform_keys) / len(all_keys)
print(f"Step 5 - Uniformity ratio: {len(uniform_keys)}/{len(all_keys)} = {uniformity_ratio:.2f}")
print(f"Is uniform (>= {threshold}): {uniformity_ratio >= threshold}")

Algorithm Visualization

Input Data:
  Item 1: {'id': 1, 'name': 'Alice', 'email': 'a@test.com'}
  Item 2: {'id': 2, 'name': 'Bob'}
  Item 3: {'id': 3, 'name': 'Charlie', 'email': 'c@test.com'}
  Item 4: {'id': 4, 'name': 'Diana'}

Step 1 - All unique keys: ['email', 'id', 'name']
Step 2 - Key counts: {'name': 4, 'email': 2, 'id': 4}
Step 3 - Key percentages (total items = 4):
  name: 4/4 = 100%
  email: 2/4 = 50%
  id: 4/4 = 100%

Step 4 - Keys with >= 80.0% occurrence: ['name', 'id']
Step 5 - Uniformity ratio: 2/3 = 0.67
Is uniform (>= 0.8): False


## Summary

The analyzer module provides:

1. **StructureInfo dataclass**: Stores analysis results
2. **is_uniform_array()**: Checks if array items share common structure
3. **should_use_table_format()**: Decides if table format is appropriate
4. **analyze_structure()**: Complete structure analysis

### Key Design Decisions:
- Configurable uniformity threshold (default 80%)
- Handles edge cases (empty arrays, non-dict items)
- Returns sorted key lists for consistent output
- Proper exception handling with custom AnalysisError