In [7]:
# !pip install lark

from lark import Lark

In [8]:
GRAMMAR = r"""
    value: dict
         | list
         | ESCAPED_STRING
         | SIGNED_NUMBER
         | "true" | "false" | "null"

    list : "[" [value ("," value)*] "]"

    dict : "{" [pair ("," pair)*] "}"
    pair : ESCAPED_STRING ":" value

    %import common.ESCAPED_STRING
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS

    """

In [9]:
json_parser = Lark(GRAMMAR, start='value')

In [10]:
text = '{"key": ["item0", "item1", 3.14]}'

In [12]:
mytree = json_parser.parse(text)
mytree

Tree('value', [Tree('dict', [Tree('pair', [Token('ESCAPED_STRING', '"key"'), Tree('value', [Tree('list', [Tree('value', [Token('ESCAPED_STRING', '"item0"')]), Tree('value', [Token('ESCAPED_STRING', '"item1"')]), Tree('value', [Token('SIGNED_NUMBER', '3.14')])])])])])])

In [14]:
print(mytree.pretty())

value
  dict
    pair
      "key"
      value
        list
          value	"item0"
          value	"item1"
          value	3.14



### Shaping the tree

In [80]:
GRAMMAR = r"""
    ?value: dict
          | list
          | string
          | SIGNED_NUMBER      -> number
          | "true"             -> true
          | "false"            -> false
          | "null"             -> null

    list : "[" [value ("," value)*] "]"

    dict : "{" [pair ("," pair)*] "}"
    pair : string ":" value

    string : ESCAPED_STRING

    %import common.ESCAPED_STRING
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS

    """

In [81]:
json_parser = Lark(GRAMMAR, start='value')

In [82]:
text = '{"key": ["item0", "item1", 3.14, true]}'

In [83]:
tree = json_parser.parse(text)

In [84]:
tree

Tree('dict', [Tree('pair', [Tree('string', [Token('ESCAPED_STRING', '"key"')]), Tree('list', [Tree('string', [Token('ESCAPED_STRING', '"item0"')]), Tree('string', [Token('ESCAPED_STRING', '"item1"')]), Tree('number', [Token('SIGNED_NUMBER', '3.14')]), Tree('true', [])])])])

In [85]:
print(tree.pretty())

dict
  pair
    string	"key"
    list
      string	"item0"
      string	"item1"
      number	3.14
      true



In [78]:
tree

Tree('value', [Tree('dict', [Tree('pair', [Tree('string', [Token('ESCAPED_STRING', '"key"')]), Tree('value', [Tree('list', [Tree('value', [Tree('string', [Token('ESCAPED_STRING', '"item0"')])]), Tree('value', [Tree('string', [Token('ESCAPED_STRING', '"item1"')])]), Tree('number', [Token('SIGNED_NUMBER', '3.14')]), Tree('true', [])])])])])])

In [79]:
print(tree.pretty())

value
  dict
    pair
      string	"key"
      value
        list
          value
            string	"item0"
          value
            string	"item1"
          number	3.14
          true



In [88]:
from lark import common

In [90]:
dir(common)

['LexerConf',
 'ParserConf',
 'Serialize',
 'TerminalDef',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'warn']

In [91]:
common.NUMBER

AttributeError: module 'lark.common' has no attribute 'NUMBER'

### Evalute Tree (to json)

In [22]:
from lark import Transformer

class MyTransformer(Transformer):
    
    def list(self, items):
        return list(items)
    
    def pair(self, key_value):
        k, v = key_value
        return k, v
    
    def dict(self, items):
        return dict(items)

In [30]:
text

'{"key": ["item0", "item1", 3.14, true]}'

In [24]:
tree = json_parser.parse(text)

In [25]:
MyTransformer().transform(tree)

{Tree('string', [Token('ESCAPED_STRING', '"key"')]): [Tree('string', [Token('ESCAPED_STRING', '"item0"')]),
  Tree('string', [Token('ESCAPED_STRING', '"item1"')]),
  Tree('number', [Token('SIGNED_NUMBER', '3.14')]),
  Tree('true', [])]}

In [61]:
class TreeToJson(Transformer):
    
    def string(self, s):
        (s,) = s
        return s[1:-1]
    
    def number(self, n):
        (n,) = n
        return float(n)

#     list = list
    def list(self, items):
        breakpoint()
        return list(items)
    
    def dict(self, items):
        breakpoint()
        return dict(items)
    
    def pair(self, key_value):
        breakpoint()
        k, v = key_value
        return k, v
#     pair = tuple
#     dict = dict

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

In [62]:
text

'{"key": ["item0", "item1", 3.14, true]}'

In [63]:
tree

Tree('dict', [Tree('pair', [Tree('string', [Token('ESCAPED_STRING', '"key"')]), Tree('list', [Tree('string', [Token('ESCAPED_STRING', '"item0"')]), Tree('string', [Token('ESCAPED_STRING', '"item1"')]), Tree('number', [Token('SIGNED_NUMBER', '3.14')]), Tree('true', [])])])])

In [64]:
TreeToJson().transform(tree)

> [0;32m<ipython-input-61-02bf1411a236>[0m(14)[0;36mlist[0;34m()[0m
[0;32m     12 [0;31m    [0;32mdef[0m [0mlist[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mitems[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     13 [0;31m        [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 14 [0;31m        [0;32mreturn[0m [0mlist[0m[0;34m([0m[0mitems[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     15 [0;31m[0;34m[0m[0m
[0m[0;32m     16 [0;31m    [0;32mdef[0m [0mdict[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mitems[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  items


['item0', 'item1', 3.14, True]


ipdb>  c


> [0;32m<ipython-input-61-02bf1411a236>[0m(22)[0;36mpair[0;34m()[0m
[0;32m     20 [0;31m    [0;32mdef[0m [0mpair[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mkey_value[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     21 [0;31m        [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 22 [0;31m        [0mk[0m[0;34m,[0m [0mv[0m [0;34m=[0m [0mkey_value[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     23 [0;31m        [0;32mreturn[0m [0mk[0m[0;34m,[0m [0mv[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     24 [0;31m[0;31m#     pair = tuple[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  key_value


['key', ['item0', 'item1', 3.14, True]]


ipdb>  n


> [0;32m<ipython-input-61-02bf1411a236>[0m(23)[0;36mpair[0;34m()[0m
[0;32m     21 [0;31m        [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;31m        [0mk[0m[0;34m,[0m [0mv[0m [0;34m=[0m [0mkey_value[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 23 [0;31m        [0;32mreturn[0m [0mk[0m[0;34m,[0m [0mv[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     24 [0;31m[0;31m#     pair = tuple[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     25 [0;31m[0;31m#     dict = dict[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  k


'key'


ipdb>  v


['item0', 'item1', 3.14, True]


ipdb>  c


> [0;32m<ipython-input-61-02bf1411a236>[0m(18)[0;36mdict[0;34m()[0m
[0;32m     16 [0;31m    [0;32mdef[0m [0mdict[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mitems[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     17 [0;31m        [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 18 [0;31m        [0;32mreturn[0m [0mdict[0m[0;34m([0m[0mitems[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m[0;34m[0m[0m
[0m[0;32m     20 [0;31m    [0;32mdef[0m [0mpair[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mkey_value[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  items


[('key', ['item0', 'item1', 3.14, True])]


ipdb>  c


{'key': ['item0', 'item1', 3.14, True]}

### Regex overview

In [92]:
import re

Regex were so widely adopted because they are incredibly _compact_. 

Given an input string, match the string if it contains: an `a` followed by any number of `b`s (represented via the `*`) followed by a `c`.

In [123]:
result = re.match(r"ab*c", "abbbbbbbbbc")

In [124]:
result.group()

'abbbbbbbbbc'

In [125]:
result = re.findall(r"ab*c", "abbbbbbbbbc")

In [127]:
result

['abbbbbbbbbc']

In [128]:
result = re.search(r"ab*c", "abbbbbbbbbc")

In [129]:
result.group()

'abbbbbbbbbc'

In [130]:
result = re.search(r"ab*c|nate", "abbbbbbbbbc")

In [131]:
result.group()

'abbbbbbbbbc'

In [134]:
result = re.match(r"ab*c|nate", "this is nate")

In [135]:
result.group()

AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
# https://regex101.com/ This is a great source to learn from