## Prerequisites: Dependencies

In [1]:
# First, define the dependencies
from dataclasses import dataclass
from typing import Any, Dict, List, Set, Optional
import json
import re

# === Exceptions ===
class Json2ToonError(Exception):
    """Root of the json2toon exception hierarchy."""


class EncodingError(Json2ToonError):
    """Raised by the encoder when a value cannot be encoded."""


class AnalysisError(Json2ToonError):
    """Json2ToonError subclass; nothing in this notebook raises it."""

# === Config ===
@dataclass
class ToonConfig:
    """Options read by the analyzer helpers and by ToonEncoder."""

    # Column delimiter placed between and around table cells.
    table_separator: str = "|"
    # Character repeated to draw the rule under the table header row.
    header_separator: str = "-"
    # Primitive-only arrays up to this length are written inline as [a, b, c].
    max_inline_array_length: int = 10
    # Enables the inline form for primitive-only arrays.
    compress_primitive_arrays: bool = True
    # NOTE(review): not read by any code in this notebook — confirm intended use.
    max_string_length: Optional[int] = None
    # When True, every string value is wrapped in double quotes.
    quote_strings: bool = False
    # Number of spaces emitted per nesting level.
    indent_size: int = 2
    # Encoding a value deeper than this raises EncodingError.
    max_nesting_depth: int = 10
    # Passed to is_uniform_array as its threshold by should_use_table_format.
    uniformity_threshold: float = 0.8
    # Lists shorter than this are never rendered as tables.
    min_table_rows: int = 2

# === Analyzer ===
def is_uniform_array(arr: List[Dict[str, Any]], threshold: float = 0.8) -> tuple:
    """Decide whether a list of dicts shares enough keys to count as uniform.

    A key is "common" when the fraction of items containing it is at least
    ``threshold``. The array is uniform when the fraction of common keys
    among all distinct keys is also at least ``threshold``.

    Args:
        arr: Candidate list; anything other than a non-empty list made up
            entirely of dicts is reported as non-uniform.
        threshold: Minimum fraction used for both checks.

    Returns:
        ``(is_uniform, common_keys)`` with ``common_keys`` sorted. The key
        list is returned even when ``is_uniform`` is False.
    """
    if not arr:
        return False, []
    if any(not isinstance(entry, dict) for entry in arr):
        return False, []

    # Count, per key, how many items carry it.
    occurrences: Dict[str, int] = {}
    for entry in arr:
        for name in entry:
            occurrences[name] = occurrences.get(name, 0) + 1

    # Only empty dicts were supplied: there is nothing to tabulate.
    if not occurrences:
        return False, []

    row_count = len(arr)
    common = sorted(
        name for name, seen in occurrences.items()
        if seen / row_count >= threshold
    )
    return len(common) / len(occurrences) >= threshold, common

def should_use_table_format(data: Any, config: ToonConfig) -> bool:
    """Report whether ``data`` should be rendered as a table.

    Args:
        data: Value under consideration; only lists can qualify.
        config: Supplies ``min_table_rows`` and ``uniformity_threshold``.

    Returns:
        True when ``data`` is a list with at least ``min_table_rows`` items
        that ``is_uniform_array`` judges uniform at the configured threshold.
    """
    if isinstance(data, list) and len(data) >= config.min_table_rows:
        uniform, _common_keys = is_uniform_array(data, config.uniformity_threshold)
        return uniform
    return False

# Visible confirmation that every definition in this cell executed.
print("✓ Dependencies loaded")

✓ Dependencies loaded


## Implementation: ToonEncoder Class

In [17]:
# encoder.py - ToonEncoder class for json2toon library

class ToonEncoder:
    """Encoder to convert JSON data to TOON format.

    The encoder handles:
    - Objects: key-value pairs with indentation
    - Arrays: table format (uniform objects), inline format (short
      primitive arrays), JSON fallback (arrays containing arrays) or
      list format with a ``-`` prefix
    - Primitives: strings, numbers, booleans, null
    - Escaping: backslashes and newlines in strings, plus double quotes
      inside quoted strings
    - Nesting: a depth limit checked for every nested value
    """

    # Integers and floats with optional sign and exponent:
    # 123, -456, +789, 1.5, .5, 1e10, 2.5e-3. Compiled once for all calls.
    _NUMBER_PATTERN = re.compile(r'[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?')

    # Bare words emitted for None/True/False. A string with the same text
    # is quoted, otherwise its output would be identical to the literal's.
    _RESERVED_LITERALS = frozenset({"null", "true", "false"})

    def __init__(self, config: Optional[ToonConfig] = None):
        """Initialize encoder with configuration.

        Args:
            config: ToonConfig instance (uses defaults if None)
        """
        self.config = ToonConfig() if config is None else config

    def encode(self, data: Any) -> str:
        """Encode JSON data to TOON format.

        Args:
            data: JSON-serializable data

        Returns:
            TOON formatted string

        Raises:
            EncodingError: If encoding fails. Errors raised by the encoder
                itself propagate unchanged; any other exception is wrapped
                with the original attached as ``__cause__``.
        """
        try:
            return self._encode_value(data, depth=0)
        except EncodingError:
            raise
        except Exception as e:
            raise EncodingError(f"Failed to encode data: {e}") from e

    def _looks_like_number(self, s: str) -> bool:
        """Check if string looks like a number.

        Strings like "123" must be quoted to distinguish them from
        actual numbers.

        Args:
            s: String to check

        Returns:
            True if the whole string has the shape of an integer or float
        """
        if not s:
            return False
        # fullmatch: the entire string must be numeric, not just a prefix
        # followed by a trailing newline as ``match`` + ``$`` would allow.
        return ToonEncoder._NUMBER_PATTERN.fullmatch(s) is not None

    def _encode_string(self, value: str) -> str:
        """Escape a string and quote it when it would otherwise be ambiguous.

        Args:
            value: Raw string value

        Returns:
            Escaped string; wrapped in double quotes when ``quote_strings``
            is set or when the text equals ``null``/``true``/``false`` or
            looks like a number.
        """
        # Backslashes first, so the backslash added for a newline is not
        # escaped a second time.
        escaped = value.replace("\\", "\\\\").replace("\n", "\\n")

        needs_quotes = (
            self.config.quote_strings
            or escaped in self._RESERVED_LITERALS
            or self._looks_like_number(escaped)
        )
        if not needs_quotes:
            return escaped
        # An unescaped quote inside a quoted string would end it early.
        return '"' + escaped.replace('"', '\\"') + '"'

    def _encode_value(self, value: Any, depth: int = 0) -> str:
        """Encode a single value based on its type.

        This is the main dispatch method that routes to specific handlers.

        Args:
            value: Value to encode
            depth: Current nesting depth

        Returns:
            TOON encoded string

        Raises:
            EncodingError: If ``depth`` exceeds ``max_nesting_depth`` or the
                value's type is not supported.
        """
        if depth > self.config.max_nesting_depth:
            raise EncodingError(
                f"Maximum nesting depth ({self.config.max_nesting_depth}) exceeded"
            )

        if value is None:
            return "null"
        # bool is tested before int because bool is a subclass of int.
        if isinstance(value, bool):
            return "true" if value else "false"
        if isinstance(value, (int, float)):
            return str(value)
        if isinstance(value, str):
            return self._encode_string(value)
        if isinstance(value, list):
            return self._encode_array(value, depth)
        if isinstance(value, dict):
            return self._encode_object(value, depth)
        raise EncodingError(f"Unsupported type: {type(value)}")

    def _encode_single_line(self, value: Any, depth: int) -> str:
        """Encode a value for a position that must stay on one line.

        Table cells and the values of list-item objects share a line with
        other content. When the regular encoding spans several lines
        (nested objects, tables, list blocks), compact JSON is used
        instead, mirroring the fallback already used for nested arrays.

        Args:
            value: Value to encode
            depth: Nesting depth of the value

        Returns:
            A single-line encoding of ``value``
        """
        encoded = self._encode_value(value, depth)
        if "\n" in encoded:
            return json.dumps(value)
        return encoded

    def _encode_array(self, arr: List[Any], depth: int) -> str:
        """Encode an array to TOON format.

        Decision tree:
        1. Empty array → []
        2. Uniform objects → table format
        3. Primitive array (numbers/strings) → inline [1, 2, 3]
        4. Array containing arrays → JSON fallback
        5. Other arrays → list format with - prefix

        Args:
            arr: Array to encode
            depth: Current nesting depth

        Returns:
            TOON encoded array
        """
        if not arr:
            return "[]"

        if should_use_table_format(arr, self.config):
            return self._encode_table(arr, depth)

        all_primitives = all(
            item is None or isinstance(item, (str, int, float, bool))
            for item in arr
        )
        if (all_primitives
                and self.config.compress_primitive_arrays
                and len(arr) <= self.config.max_inline_array_length):
            items = [self._encode_value(item, depth + 1) for item in arr]
            return "[" + ", ".join(items) + "]"

        # Arrays containing arrays have no dedicated syntax here; emit JSON.
        if any(isinstance(item, list) for item in arr):
            return json.dumps(arr)

        indent = " " * (depth * self.config.indent_size)
        lines = []
        for item in arr:
            if isinstance(item, dict):
                # The object shares the "- " line, so each value must be
                # rendered on a single line.
                pairs = ", ".join(
                    f"{key}: {self._encode_single_line(val, depth + 1)}"
                    for key, val in item.items()
                )
                lines.append(f"{indent}- {pairs}")
            else:
                lines.append(f"{indent}- {self._encode_value(item, depth + 1)}")

        return "\n".join(lines)

    def _encode_table(self, arr: List[Dict[str, Any]], depth: int) -> str:
        """Encode uniform array as markdown-style table.

        Example:
            [{"id": 1, "name": "A"}, {"id": 2, "name": "B"}]

        Becomes:
            | id | name |
            |----|------|
            | 1  | A    |
            | 2  | B    |

        Columns are the keys reported by ``is_uniform_array`` followed by
        any remaining keys in sorted order, so no value is left out. A row
        lacking a column's key gets the encoding of an empty string there.

        Args:
            arr: Uniform array of objects
            depth: Current nesting depth

        Returns:
            Table formatted string
        """
        _, common_keys = is_uniform_array(arr, self.config.uniformity_threshold)

        # is_uniform_array returns only keys that clear the threshold; keys
        # present in fewer rows would otherwise vanish from the output.
        present = set()
        for item in arr:
            present.update(item)
        keys = list(common_keys) + sorted(present.difference(common_keys))

        sep = self.config.table_separator
        indent = " " * (depth * self.config.indent_size)

        header = sep + " " + f" {sep} ".join(keys) + " " + sep
        rule_parts = [self.config.header_separator * (len(key) + 2) for key in keys]
        lines = [f"{indent}{header}", f"{indent}{sep}{sep.join(rule_parts)}{sep}"]

        for item in arr:
            # Cells are padded to the header width; wider values are not
            # truncated.
            cells = [
                self._encode_single_line(item.get(key, ""), depth + 1).ljust(len(key))
                for key in keys
            ]
            row = sep + " " + f" {sep} ".join(cells) + " " + sep
            lines.append(f"{indent}{row}")

        return "\n".join(lines)

    def _encode_object(self, obj: Dict[str, Any], depth: int) -> str:
        """Encode an object to TOON format.

        Example:
            {"name": "Alice", "age": 30}

        Becomes:
            name: Alice
            age: 30

        Args:
            obj: Dictionary to encode
            depth: Current nesting depth

        Returns:
            Indented key-value pairs

        Raises:
            EncodingError: If any nested value exceeds ``max_nesting_depth``.
        """
        if not obj:
            return "{}"

        indent = " " * (depth * self.config.indent_size)
        lines = []

        for key, value in obj.items():
            # Every child, nested objects included, goes through
            # _encode_value so the depth limit is applied at each level.
            encoded = self._encode_value(value, depth + 1)

            if isinstance(value, dict) and value:
                as_block = True
            elif isinstance(value, list) and value:
                # Inline arrays and the JSON fallback are single-line and
                # start with "["; tables and list blocks carry their own
                # indentation and belong under the key, even when they
                # consist of a single line.
                as_block = "\n" in encoded or not encoded.startswith("[")
            else:
                as_block = False

            if as_block:
                lines.append(f"{indent}{key}:")
                lines.append(encoded)
            else:
                lines.append(f"{indent}{key}: {encoded}")

        return "\n".join(lines)


# Visible confirmation that the class definition cell executed.
print("✓ ToonEncoder class defined successfully")

✓ ToonEncoder class defined successfully


## Testing Encoder

In [4]:
# Shared encoder with the default configuration; later cells reuse it.
encoder = ToonEncoder()

# Test 1: each primitive is printed next to its encoded form.
print("Test 1: Primitive Values")
print("=" * 50)
for label, sample in (
    ("null:   ", None),
    ("true:   ", True),
    ("false:  ", False),
    ("integer:", 42),
    ("float:  ", 3.14),
    ("string: ", "hello"),
):
    print(f"{label} {encoder.encode(sample)}")

Test 1: Primitive Values
null:    null
true:    true
false:   false
integer: 42
float:   3.14
string:  hello


In [5]:
# Test 2: strings that look like numbers must come back quoted.
print("\nTest 2: Numeric Strings (Must Be Quoted)")
print("=" * 50)

numeric_cases = (
    ("123", "Numeric strings must be quoted!", "✓ Numeric strings correctly quoted"),
    ("3.14", "Decimal strings must be quoted!", "✓ Decimal strings correctly quoted"),
)
for text, failure, success in numeric_cases:
    result = encoder.encode(text)
    print(f"String '{text}' encodes to: {result}")
    assert result == f'"{text}"', failure
    print(success)


Test 2: Numeric Strings (Must Be Quoted)
String '123' encodes to: "123"
✓ Numeric strings correctly quoted
String '3.14' encodes to: "3.14"
✓ Decimal strings correctly quoted


In [6]:
# Test 3: a flat object is printed as one "key: value" line per entry.
print("\nTest 3: Simple Object")
print("=" * 50)

person = dict(name="Alice", age=30, active=True)

result = encoder.encode(person)
print(result)


Test 3: Simple Object
name: Alice
age: 30
active: true


In [7]:
# Test 4: nested objects are indented one level per depth.
print("\nTest 4: Nested Object")
print("=" * 50)

contact = {"email": "alice@test.com", "phone": "123-456"}
nested = {
    "user": {"name": "Alice", "contact": contact},
    "active": True,
}

result = encoder.encode(nested)
print(result)


Test 4: Nested Object
user:
  name: Alice
  contact:
    email: alice@test.com
    phone: 123-456
active: true


In [8]:
# Test 5: short arrays of primitives are written inline.
print("\nTest 5: Inline Arrays")
print("=" * 50)

numbers = list(range(1, 6))     # [1, 2, 3, 4, 5]
strings = ["a", "b", "c"]
mixed = [1, "two", True, None]  # one of each primitive kind

print(f"Numbers: {encoder.encode(numbers)}")
print(f"Strings: {encoder.encode(strings)}")
print(f"Mixed:   {encoder.encode(mixed)}")


Test 5: Inline Arrays
Numbers: [1, 2, 3, 4, 5]
Strings: [a, b, c]
Mixed:   [1, two, true, null]


In [9]:
# Test 6: a list of objects with identical keys is rendered as a table.
print("\nTest 6: Table Format")
print("=" * 50)

roster = [("Alice", "admin"), ("Bob", "user"), ("Charlie", "user")]
users = [
    {"id": number, "name": name, "role": role}
    for number, (name, role) in enumerate(roster, start=1)
]

result = encoder.encode(users)
print(result)


Test 6: Table Format
| id | name | role |
|----|------|------|
| 1  | Alice | admin |
| 2  | Bob  | user |
| 3  | Charlie | user |


In [10]:
# Test 7: arrays that contain arrays are emitted as plain JSON.
print("\nTest 7: Nested Arrays (JSON Fallback)")
print("=" * 50)

matrix = [list(range(start, start + 3)) for start in (1, 4, 7)]
result = encoder.encode(matrix)
print(f"Matrix: {result}")

# The fallback output has to parse back to the same structure.
parsed = json.loads(result)
assert parsed == matrix, "JSON fallback must roundtrip correctly"
print("✓ Nested arrays correctly use JSON fallback")


Test 7: Nested Arrays (JSON Fallback)
Matrix: [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
✓ Nested arrays correctly use JSON fallback


In [11]:
# Test 8: Escape sequences
print("\nTest 8: Escape Sequences")
print("="*50)

# Newline in string: the output must carry the two-character escape and
# no raw newline.
text_with_newline = "line1\nline2"
result = encoder.encode(text_with_newline)
print(f"Original: {repr(text_with_newline)}")
print(f"Encoded:  {result}")
assert "\\n" in result, "Newlines must be escaped"
assert "\n" not in result, "Encoded string must stay on one line"
print("✓ Newlines correctly escaped")

# Backslash in string: every backslash must be doubled. Asserted here so
# the success message below is only printed when it is actually true.
text_with_backslash = "path\\to\\file"
result = encoder.encode(text_with_backslash)
print(f"\nOriginal: {repr(text_with_backslash)}")
print(f"Encoded:  {result}")
assert result == "path\\\\to\\\\file", "Backslashes must be doubled"
print("✓ Backslashes correctly escaped")


Test 8: Escape Sequences
Original: 'line1\nline2'
Encoded:  line1\nline2
✓ Newlines correctly escaped

Original: 'path\\to\\file'
Encoded:  path\\to\\file
✓ Backslashes correctly escaped


In [12]:
# Test 9: Max nesting depth
print("\nTest 9: Max Nesting Depth")
print("="*50)

# Create deeply nested structure: 15 levels, past the default limit of 10.
deep = {"level": 0}
current = deep
for i in range(1, 15):
    current["nested"] = {"level": i}
    current = current["nested"]

# Should raise error with default max_nesting_depth=10. A missing error
# fails the cell instead of only printing a message.
try:
    encoder.encode(deep)
except EncodingError as e:
    print(f"✓ Correctly raised: {e}")
else:
    raise AssertionError("ERROR: Should have raised EncodingError")


Test 9: Max Nesting Depth
✓ Correctly raised: Maximum nesting depth (10) exceeded


In [13]:
# Test 10: short primitive arrays stay on the same line as their key.
print("\nTest 10: Object with Array Property")
print("=" * 50)

user = dict(
    name="Alice",
    tags=["admin", "active"],
    scores=[95, 87, 92],
)

result = encoder.encode(user)
print(result)


Test 10: Object with Array Property
name: Alice
tags: [admin, active]
scores: [95, 87, 92]


In [14]:
# Test 11: a uniform list inside an object is rendered as an indented table.
print("\nTest 11: Object with Nested Table")
print("=" * 50)

employees = [
    {"id": 1, "name": "Alice", "dept": "Engineering"},
    {"id": 2, "name": "Bob", "dept": "Sales"},
    {"id": 3, "name": "Charlie", "dept": "Engineering"},
]
company = {"name": "Acme Corp", "employees": employees}

result = encoder.encode(company)
print(result)


Test 11: Object with Nested Table
name: Acme Corp
employees:
  | dept | id | name |
  |------|----|------|
  | Engineering | 1  | Alice |
  | Sales | 2  | Bob  |
  | Engineering | 3  | Charlie |


In [15]:
# Test 12: the quote_strings option controls quoting of ordinary strings.
print("\nTest 12: Quote Strings Configuration")
print("=" * 50)

default_encoder = ToonEncoder(ToonConfig(quote_strings=False))  # bare strings
quoting_encoder = ToonEncoder(ToonConfig(quote_strings=True))   # always quoted

print(f"Default (no quotes): {default_encoder.encode('hello')}")
print(f"With quotes:         {quoting_encoder.encode('hello')}")


Test 12: Quote Strings Configuration
Default (no quotes): hello
With quotes:         "hello"


In [16]:
# Test 13: empty containers keep their bracket form.
print("\nTest 13: Empty Structures")
print("=" * 50)

for label, empty in (("Empty array: ", []), ("Empty object:", {})):
    print(f"{label} {encoder.encode(empty)}")


Test 13: Empty Structures
Empty array:  []
Empty object: {}


## Bug Fixes Explained

### Bug 1: Numeric String Detection
**Problem**: String values like `"123"` would be encoded without quotes, making them indistinguishable from the number `123`.

**Solution**: The `_looks_like_number()` method uses regex to detect strings that look like numbers and automatically quotes them.

### Bug 2: Nested Arrays
**Problem**: Arrays containing arrays (like `[[1,2],[3,4]]`) don't have a clean TOON representation.

**Solution**: Fall back to standard JSON for these structures.

### Bug 3: Escape Sequences
**Problem**: Newlines and backslashes in strings would break the TOON format.

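**Solution**: Escape backslashes first (`\` becomes `\\`), then replace each newline character with the two-character sequence `\n`. Handling backslashes first keeps the backslash added for a newline from being doubled.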

## Summary

The encoder module provides:

1. **ToonEncoder class**: Main encoder with configurable options
2. **_encode_value()**: Dispatch method for type-specific encoding
3. **_encode_array()**: Handles table format, inline, and list formats
4. **_encode_table()**: Markdown-style table for uniform objects
5. **_encode_object()**: Key-value pairs with nested indentation
6. **_looks_like_number()**: Detects numeric strings for quoting

### Key Design Decisions:
- Smart format selection based on data structure
- Proper escape handling for special characters
- JSON fallback for unsupported structures
- Configurable depth limits to prevent infinite recursion