## Prerequisites: Dependencies

In [1]:
# First, define the dependencies
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import json
import re

# === Exceptions ===
class Json2ToonError(Exception):
    pass

class DecodingError(Json2ToonError):
    pass

# === Config ===
@dataclass
class ToonConfig:
    table_separator: str = "|"
    header_separator: str = "-"
    max_inline_array_length: int = 10
    compress_primitive_arrays: bool = True
    max_string_length: Optional[int] = None
    quote_strings: bool = False
    indent_size: int = 2
    max_nesting_depth: int = 10
    uniformity_threshold: float = 0.8
    min_table_rows: int = 2

print("✓ Dependencies loaded")

✓ Dependencies loaded


## Implementation: ToonDecoder Class

In [None]:
# decoder.py - ToonDecoder class for json2toon library

class ToonDecoder:
    """Decoder to convert TOON format back to JSON data.
    
    The decoder handles:
    - Key-value pairs: name: value
    - Tables: markdown-style | col1 | col2 |
    - Lists: - item format
    - Inline arrays: [1, 2, 3]
    - Nested objects via indentation
    - Escape sequences: \\n, \\\\
    """
    
    def __init__(self, config: Optional[ToonConfig] = None):
        """Initialize decoder with configuration.
        
        Args:
            config: ToonConfig instance (uses defaults if None)
        """
        self.config = config or ToonConfig()
    
    def decode(self, toon_str: str) -> Any:
        """Decode TOON string to JSON data.
        
        Args:
            toon_str: TOON formatted string
            
        Returns:
            Python data structure (dict, list, or primitive)
            
        Raises:
            DecodingError: If decoding fails
        """
        try:
            stripped = toon_str.strip()
            if not stripped:
                return None
            
            lines = stripped.split('\n')
            result, _ = self._parse_value(lines, 0)
            return result
        except DecodingError:
            raise
        except Exception as e:
            raise DecodingError(f"Failed to decode TOON: {e}") from e
    
    def _parse_value(self, lines: List[str], index: int) -> Tuple[Any, int]:
        """Parse a value starting at the given line index.
        
        This is the main dispatch method that determines the value type
        based on the line content.
        
        Args:
            lines: All lines of the TOON string
            index: Current line index
            
        Returns:
            Tuple of (parsed value, next line index)
        """
        if index >= len(lines):
            return None, index
        
        line = lines[index].strip()
        
        # Empty line
        if not line :
            return None, index + 1
        
        # Table format (starts with |)
        if line.startswith('|'):
            return self._parse_table(lines, index)
        
        # List format (starts with -)
        if line.startswith('- '):
            return self._parse_list(lines, index)
        
        # Inline array [1, 2, 3]
        if line.startswith('['):
            # Try JSON parsing first (handles nested arrays)
            try:
                return json.loads(line), index + 1
            except json.JSONDecodeError:
                pass
        
        # Object format (key: value)
        if ':' in line:
            return self._parse_object(lines, index)
        
        # Primitive value
        return self._parse_primitive(line), index + 1
    
    def _parse_primitive(self, value: str) -> Any:
        """Parse a primitive value (string, number, boolean, null).
        
        Args:
            value: String representation of the value
            
        Returns:
            Python primitive (str, int, float, bool, None)
        """
        value = value.strip()
        
        # Null
        if value == 'null':
            return None
        
        # Booleans
        if value == 'true':
            return True
        if value == 'false':
            return False
        
        # Empty object/array
        if value == '{}':
            return {}
        if value == '[]':
            return []
        
        # Quoted string - preserve as string
        if value.startswith('"') and value.endswith('"') and len(value) >= 2:
            inner = value[1:-1]
            # Handle escape sequences
            inner = inner.replace('\\n', '\n')
            inner = inner.replace('\\\\', '\\')
            return inner
        
        # Try integer
        try:
            return int(value)
        except ValueError:
            pass
        
        # Try float
        try:
            return float(value)
        except ValueError:
            pass
        
        # Try JSON array (inline format)
        if value.startswith('[') and value.endswith(']'):
            try:
                return json.loads(value)
            except json.JSONDecodeError:
                pass
        
        # Handle escape sequences in unquoted strings
        value = value.replace('\\n', '\n')
        value = value.replace('\\\\', '\\')
        
        # Return as string
        return value
    
    def _parse_list(self, lines: List[str], start_index: int) -> Tuple[List, int]:
        """Parse a list (- item format).
        
        Example:
            - item1
            - item2
            - key: value, other: 123
        
        Args:
            lines: All lines of the TOON string
            start_index: Starting line index
            
        Returns:
            Tuple of (list, next line index)
        """
        result = []
        index = start_index
        
        # Get base indentation
        base_indent = len(lines[start_index]) - len(lines[start_index].lstrip())
        
        while index < len(lines):
            line = lines[index]
            stripped = line.strip()
            
            # Empty line
            if not stripped:
                index += 1
                continue
            
            # Check indentation
            current_indent = len(line) - len(line.lstrip())
            if current_indent < base_indent and index > start_index:
                break
            
            # Not a list item
            if not stripped.startswith('- '):
                break
            
            # Extract content after "- "
            content = stripped[2:].strip()
            
            # Check for inline object: - key: value, other: 123
            if ':' in content and not content.startswith('['):
                # Parse as inline object
                obj = {}
                # Split by comma, but be careful with values containing commas
                parts = self._split_inline_object(content)
                for part in parts:
                    if ':' in part:
                        key, val = part.split(':', 1)
                        obj[key.strip()] = self._parse_primitive(val.strip())
                result.append(obj)
            else:
                # Parse as primitive or array
                result.append(self._parse_primitive(content))
            
            index += 1
        
        return result, index
    
    def _split_inline_object(self, content: str) -> List[str]:
        """Split inline object by commas, respecting brackets.
        
        Example:
            "name: Alice, tags: [a, b], age: 30"
            → ["name: Alice", "tags: [a, b]", "age: 30"]
        
        Args:
            content: Inline object string
            
        Returns:
            List of key:value parts
        """
        parts = []
        current = ""
        depth = 0
        
        for char in content:
            if char == '[':
                depth += 1
                current += char
            elif char == ']':
                depth -= 1
                current += char
            elif char == ',' and depth == 0:
                parts.append(current.strip())
                current = ""
            else:
                current += char
        
        if current.strip():
            parts.append(current.strip())
        
        return parts
    
    def _parse_table(self, lines: List[str], start_index: int) -> Tuple[List[Dict], int]:
        """Parse a markdown-style table.
        
        Example:
            | id | name    |
            |----|-------- |
            | 1  | Alice   |
            | 2  | Bob     |
        
        Args:
            lines: All lines of the TOON string
            start_index: Starting line index
            
        Returns:
            Tuple of (list of dicts, next line index)
        """
        result = []
        index = start_index
        
        # Parse header row
        header_line = lines[index].strip()
        headers = self._parse_table_row(header_line)
        index += 1
        
        # Skip separator row (|----|----|)
        if index < len(lines):
            sep_line = lines[index].strip()
            # Check if it's a separator row (contains mostly dashes)
            if sep_line.startswith('|') and '-' in sep_line:
                # Count dashes vs other characters
                non_sep_chars = sep_line.replace('|', '').replace('-', '').replace(' ', '')
                if len(non_sep_chars) < len(sep_line) // 4:
                    index += 1
        
        # Parse data rows
        while index < len(lines):
            line = lines[index].strip()
            
            # Empty line or non-table line
            if not line or not line.startswith('|'):
                break
            
            # Skip separator rows in data
            if self._is_separator_row(line):
                index += 1
                continue
            
            values = self._parse_table_row(line)
            
            # Create object from headers and values
            obj = {}
            for i, header in enumerate(headers):
                if i < len(values):
                    obj[header] = self._parse_primitive(values[i])
                else:
                    obj[header] = None
            
            result.append(obj)
            index += 1
        
        return result, index
    
    def _parse_table_row(self, line: str) -> List[str]:
        """Parse a table row into cell values.
        
        Args:
            line: Table row string like "| a | b | c |"
            
        Returns:
            List of cell values
        """
        # Remove leading/trailing pipes and split
        if line.startswith('|'):
            line = line[1:]
        if line.endswith('|'):
            line = line[:-1]
        
        cells = line.split('|')
        return [cell.strip() for cell in cells]
    
    def _is_separator_row(self, line: str) -> bool:
        """Check if line is a table separator row.
        
        Args:
            line: Table row string
            
        Returns:
            True if it's a separator row (mostly dashes)
        """
        # Remove pipes and spaces
        content = line.replace('|', '').replace(' ', '')
        # Should be mostly dashes
        if not content:
            return False
        dash_count = content.count('-')
        return dash_count > len(content) * 0.5
    
    def _parse_object(self, lines: List[str], start_index: int) -> Tuple[Dict, int]:
        """Parse an object (key: value format).
        
        Example:
            name: Alice
            age: 30
            address:
              city: NYC
              zip: 10001
        
        Args:
            lines: All lines of the TOON string
            start_index: Starting line index
            
        Returns:
            Tuple of (dict, next line index)
        """
        result = {}
        index = start_index
        
        # Get base indentation
        base_indent = len(lines[start_index]) - len(lines[start_index].lstrip())
        
        while index < len(lines):
            line = lines[index]
            stripped = line.strip()
            
            # Empty line
            if not stripped:
                index += 1
                continue
            
            # Check indentation
            current_indent = len(line) - len(line.lstrip())
            if current_indent < base_indent and index > start_index:
                break
            if current_indent > base_indent:
                index += 1
                continue
            
            # Not a key: value line
            if ':' not in stripped:
                break
            
            # Split key: value
            colon_pos = stripped.find(':')
            key = stripped[:colon_pos].strip()
            value_str = stripped[colon_pos + 1:].strip()
            
            if value_str:
                # Value on same line
                result[key] = self._parse_primitive(value_str)
                index += 1
            else:
                # Value on next lines (nested object, table, or list)
                index += 1
                if index < len(lines):
                    next_line = lines[index].strip()
                    
                    if next_line.startswith('|'):
                        # Table
                        value, index = self._parse_table(lines, index)
                        result[key] = value
                    elif next_line.startswith('- '):
                        # List
                        value, index = self._parse_list(lines, index)
                        result[key] = value
                    else:
                        # Nested object
                        value, index = self._parse_object(lines, index)
                        result[key] = value
                else:
                    result[key] = None
        
        return result, index


print("✓ ToonDecoder class defined successfully")

✓ ToonDecoder class defined successfully


## Testing Decoder

In [3]:
# Create decoder instance
decoder = ToonDecoder()

# Test 1: Primitive values
print("Test 1: Primitive Values")
print("="*50)

print(f"null:    {decoder.decode('null')}")
print(f"true:    {decoder.decode('true')}")
print(f"false:   {decoder.decode('false')}")
print(f"integer: {decoder.decode('42')}")
print(f"float:   {decoder.decode('3.14')}")
print(f"string:  {decoder.decode('hello')}")

Test 1: Primitive Values
null:    None
true:    True
false:   False
integer: 42
float:   3.14
string:  hello


In [4]:
# Test 2: Quoted strings (preserves type)
print("\nTest 2: Quoted Strings")
print("="*50)

result = decoder.decode('"123"')
print(f"'\"123\"' → {repr(result)} (type: {type(result).__name__})")
assert isinstance(result, str), "Quoted numeric string must remain string!"
print("✓ Quoted strings preserve type")

result = decoder.decode('"3.14"')
print(f"'\"3.14\"' → {repr(result)} (type: {type(result).__name__})")


Test 2: Quoted Strings
'"123"' → '123' (type: str)
✓ Quoted strings preserve type
'"3.14"' → '3.14' (type: str)


In [5]:
# Test 3: Simple object
print("\nTest 3: Simple Object")
print("="*50)

toon = """name: Alice
age: 30
active: true"""

result = decoder.decode(toon)
print(f"Result: {result}")
assert result == {"name": "Alice", "age": 30, "active": True}


Test 3: Simple Object
Result: {'name': 'Alice', 'age': 30, 'active': True}


In [6]:
# Test 4: Nested object
print("\nTest 4: Nested Object")
print("="*50)

toon = """user:
  name: Alice
  contact:
    email: alice@test.com
    phone: 123-456
active: true"""

result = decoder.decode(toon)
print(f"Result: {json.dumps(result, indent=2)}")


Test 4: Nested Object
Result: {
  "user": {
    "name": "Alice",
    "contact": {
      "email": "alice@test.com",
      "phone": "123-456"
    }
  },
  "active": true
}


In [7]:
# Test 5: Inline array
print("\nTest 5: Inline Array")
print("="*50)

# Simple inline array
result = decoder.decode("[1, 2, 3, 4, 5]")
print(f"[1, 2, 3, 4, 5] → {result}")
assert result == [1, 2, 3, 4, 5]

# String array
result = decoder.decode('["a", "b", "c"]')
print(f'["a", "b", "c"] → {result}')
assert result == ["a", "b", "c"]


Test 5: Inline Array
[1, 2, 3, 4, 5] → [1, 2, 3, 4, 5]
["a", "b", "c"] → ['a', 'b', 'c']


In [8]:
# Test 6: List format
print("\nTest 6: List Format")
print("="*50)

toon = """- apple
- banana
- cherry"""

result = decoder.decode(toon)
print(f"Result: {result}")
assert result == ["apple", "banana", "cherry"]


Test 6: List Format
Result: ['apple', 'banana', 'cherry']


In [9]:
# Test 7: List with inline objects
print("\nTest 7: List with Inline Objects")
print("="*50)

toon = """- id: 1, name: Alice
- id: 2, name: Bob
- id: 3, name: Charlie"""

result = decoder.decode(toon)
print(f"Result: {json.dumps(result, indent=2)}")


Test 7: List with Inline Objects
Result: [
  {
    "id": 1,
    "name": "Alice"
  },
  {
    "id": 2,
    "name": "Bob"
  },
  {
    "id": 3,
    "name": "Charlie"
  }
]


In [10]:
# Test 8: Table format
print("\nTest 8: Table Format")
print("="*50)

toon = """| id | name    | role  |
|----|---------|-------|
| 1  | Alice   | admin |
| 2  | Bob     | user  |
| 3  | Charlie | user  |"""

result = decoder.decode(toon)
print(f"Result: {json.dumps(result, indent=2)}")

assert len(result) == 3
assert result[0]["name"] == "Alice"


Test 8: Table Format
Result: [
  {
    "id": 1,
    "name": "Alice",
    "role": "admin"
  },
  {
    "id": 2,
    "name": "Bob",
    "role": "user"
  },
  {
    "id": 3,
    "name": "Charlie",
    "role": "user"
  }
]


In [11]:
# Test 9: Object with table property
print("\nTest 9: Object with Table Property")
print("="*50)

toon = """name: Acme Corp
employees:
| id | name  | dept        |
|----|-------|-------------|
| 1  | Alice | Engineering |
| 2  | Bob   | Sales       |"""

result = decoder.decode(toon)
print(f"Result: {json.dumps(result, indent=2)}")

assert result["name"] == "Acme Corp"
assert len(result["employees"]) == 2


Test 9: Object with Table Property
Result: {
  "name": "Acme Corp",
  "employees": [
    {
      "id": 1,
      "name": "Alice",
      "dept": "Engineering"
    },
    {
      "id": 2,
      "name": "Bob",
      "dept": "Sales"
    }
  ]
}


In [12]:
# Test 10: Escape sequences
print("\nTest 10: Escape Sequences")
print("="*50)

# Newline escape
result = decoder.decode('"line1\\nline2"')
print(f"Decoded newline: {repr(result)}")
assert result == "line1\nline2", "Newlines must be unescaped"
print("✓ Newlines correctly unescaped")

# Backslash escape
result = decoder.decode('"path\\\\to\\\\file"')
print(f"Decoded backslash: {repr(result)}")
print("✓ Backslashes correctly unescaped")


Test 10: Escape Sequences
Decoded newline: 'line1\nline2'
✓ Newlines correctly unescaped
Decoded backslash: 'path\\to\\file'
✓ Backslashes correctly unescaped


In [13]:
# Test 11: Nested arrays (JSON format)
print("\nTest 11: Nested Arrays (JSON Format)")
print("="*50)

toon = "[[1, 2, 3], [4, 5, 6], [7, 8, 9]]"
result = decoder.decode(toon)
print(f"Result: {result}")
assert result == [[1, 2, 3], [4, 5, 6], [7, 8, 9]]


Test 11: Nested Arrays (JSON Format)
Result: [[1, 2, 3], [4, 5, 6], [7, 8, 9]]


In [14]:
# Test 12: Empty structures
print("\nTest 12: Empty Structures")
print("="*50)

print(f"Empty array '[]':  {decoder.decode('[]')}")
print(f"Empty object '{{}}': {decoder.decode('{}')}")
print(f"Empty string '':   {decoder.decode('')}")


Test 12: Empty Structures
Empty array '[]':  []
Empty object '{}': {}
Empty string '':   None


In [15]:
# Test 13: Object with array property
print("\nTest 13: Object with Array Property")
print("="*50)

toon = """name: Alice
tags: [admin, active]
scores: [95, 87, 92]"""

result = decoder.decode(toon)
print(f"Result: {result}")


Test 13: Object with Array Property
Result: {'name': 'Alice', 'tags': '[admin, active]', 'scores': [95, 87, 92]}


## Roundtrip Testing

In [16]:
# Test roundtrip: JSON → TOON → JSON
print("\nRoundtrip Testing")
print("="*50)

# Import encoder from previous notebook
# (We'll define a minimal version here for testing)

class ToonEncoder:
    def __init__(self, config=None):
        self.config = config or ToonConfig()
    
    def encode(self, data):
        return self._encode_value(data, 0)
    
    def _encode_value(self, value, depth):
        if value is None:
            return "null"
        elif isinstance(value, bool):
            return "true" if value else "false"
        elif isinstance(value, (int, float)):
            return str(value)
        elif isinstance(value, str):
            escaped = value.replace("\\", "\\\\").replace("\n", "\\n")
            if re.match(r'^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$', escaped):
                return f'"{escaped}"'
            return escaped
        elif isinstance(value, list):
            if not value:
                return "[]"
            if any(isinstance(item, list) for item in value):
                return json.dumps(value)
            items = [self._encode_value(item, depth + 1) for item in value]
            return "[" + ", ".join(items) + "]"
        elif isinstance(value, dict):
            if not value:
                return "{}"
            lines = []
            indent = " " * (depth * 2)
            for k, v in value.items():
                lines.append(f"{indent}{k}: {self._encode_value(v, depth + 1)}")
            return "\n".join(lines)
        return str(value)

encoder = ToonEncoder()
decoder = ToonDecoder()

# Test data
test_cases = [
    {"name": "Alice", "age": 30},
    [1, 2, 3, 4, 5],
    {"users": [{"id": 1}, {"id": 2}]},
    "hello",
    42,
    True,
    None
]

for original in test_cases:
    encoded = encoder.encode(original)
    decoded = decoder.decode(encoded)
    match = original == decoded
    status = "✓" if match else "✗"
    print(f"{status} {type(original).__name__}: {original} → {decoded}")


Roundtrip Testing
✓ dict: {'name': 'Alice', 'age': 30} → {'name': 'Alice', 'age': 30}
✓ list: [1, 2, 3, 4, 5] → [1, 2, 3, 4, 5]
✗ dict: {'users': [{'id': 1}, {'id': 2}]} → {'users': '[    id: 1,     id: 2]'}
✓ str: hello → hello
✓ int: 42 → 42
✓ bool: True → True
✓ NoneType: None → None


## Key Design Decisions

### 1. Indentation-Based Parsing
The decoder uses indentation to determine object nesting levels, similar to YAML.

### 2. Type Inference
- Unquoted numbers become int/float
- Quoted values stay as strings
- Keywords (true, false, null) become Python equivalents

### 3. Escape Sequence Handling
- `\\n` → newline character
- `\\\\` → backslash character

### 4. Table Detection
Lines starting with `|` trigger table parsing mode.

## Summary

The decoder module provides:

1. **ToonDecoder class**: Main decoder with configurable options
2. **_parse_value()**: Dispatch method for type detection
3. **_parse_primitive()**: Handles numbers, strings, booleans, null
4. **_parse_list()**: Parses - item format
5. **_parse_table()**: Parses markdown-style tables
6. **_parse_object()**: Parses key: value format with nesting

### Key Features:
- Indentation-aware object parsing
- Type-preserving quoted string handling
- Proper escape sequence processing
- JSON fallback for nested arrays