In [1]:
!pip install parsimonious

Defaulting to user installation because normal site-packages is not writeable
Collecting parsimonious
  Downloading parsimonious-0.10.0-py3-none-any.whl (48 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: parsimonious
Successfully installed parsimonious-0.10.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [1]:
import json

In [10]:
from parsimonious.grammar import Grammar

# Define the grammar for the custom prompt-description language.
#
# Conventions used by the rules below:
#   - Punctuation tokens (LCB, RCB, LSB, RSB, EQUAL, QMARK, PERIOD, COMMA, SC) and
#     WS are named rules, so a visitor can recognise and drop them by rule name.
#     Square brackets go through LSB/RSB for the same reason.
#   - Every statement accepts optional whitespace before its terminating SC.
#   - Regex terminals use the r"..." literal form so backslash sequences such as
#     \s and \b reach the regex engine untouched instead of being processed as
#     string escapes.
#   - "INF" is only an int_literal when it is a whole word; without the word
#     boundary an identifier such as INFO would be consumed as INF and the
#     enclosing statement would fail to parse.
#   - The rule name `int_infinty` (sic) is kept unchanged because rule names
#     surface as node names in the parse tree.
grammar = Grammar(r"""
program          = (WS (def_stmt / arg_stmt / format_decl / struct_decl / prompt_decl / flow_block / annot_block ))* WS

def_stmt         = "define"   WS identifier WS initializer  WS SC
arg_stmt         = "argument" WS identifier WS initializer? WS SC
initializer      = EQUAL WS value_expr

format_decl      = "format" WS identifier WS LCB (var_decl WS)* is_format_field WS annot_expr?  WS RCB
struct_decl      = "struct" WS identifier WS LCB (var_decl WS)* is_record_field WS annot_block? WS RCB

var_decl         = WS ( def_stmt / arg_stmt )

prompt_decl      = "prompt" WS identifier WS LCB
                       var_decl* WS
                       is_record_field WS
                       channel_block? WS
                       flow_block? WS
                       return_stmt? WS
                       annot_block? WS
                   RCB

channel_block    = "channel" WS LCB WS ( channel_stmt WS )+ RCB
channel_stmt     = "to" WS path_expr WS ( from_stmt / call_block )
from_stmt        = "from" WS path_expr WS SC
call_block       = "call" WS LCB WS
                       ( extern_stmt WS )?
                       ( entry_stmt WS )?
                       ( kwarg_stmt WS )*
                       ( bind_stmt  WS )*
                   RCB
extern_stmt      = "extern" WS identifier WS SC
entry_stmt       = "entry"  WS identifier WS SC
kwarg_stmt       = "kwarg"  WS identifier WS ( kwarg_from_stmt / kwarg_map_stmt ) WS SC
kwarg_from_stmt  = "from" WS path_expr
kwarg_map_stmt   = "map"  WS path_expr
bind_stmt        = "bind" WS path_expr WS bind_as_stmt WS SC
bind_as_stmt     = "as" WS path_expr

flow_block       = "flow" WS LCB WS ( flow_stmt WS )+ RCB
flow_stmt        = "to" WS identifier flow_limit? WS flow_as_stmt? WS SC
flow_limit       = LSB WS int_literal WS RSB
flow_as_stmt     = "as" WS string_literal

return_stmt      = "return" WS ( return_block / return_expr )
return_block     = LCB WS ( return_as_stmt WS )? ( return_from_stmt WS )+ RCB
return_as_stmt   = "as" WS string_literal WS SC
return_from_stmt = "from" WS path_expr WS SC
return_expr      = path_expr WS SC

annot_block      = "annotate" WS LCB WS ( annot_stmt WS )+ RCB
annot_stmt       = path_expr WS "as" WS string_expr WS SC
annot_expr       = "annotate" WS string_expr WS SC

field_decl       = identifier array_slice? WS ( is_format_field / is_record_field / field_detail ) WS
field_detail     = LCB WS ( is_format_field / is_record_field ) ( WS (annot_expr/annot_block) )? WS RCB
is_format_field  = "is" WS (repeat_def / select_def / enum_def / regex_string / type_ref) WS SC
is_record_field  = "is" WS LCB ( WS field_decl )* WS RCB

array_slice      = LSB WS int_literal ( WS ":" WS int_literal )? WS RSB

repeat_def       = "repeat" WS "(" path_expr ")"
select_def       = "select" WS "(" path_expr ")"
enum_def         = "enum"   WS "(" string_expr_list ")"

type_ref         = identifier WS ( "<" WS param_list WS ">" )?

param_list       = param_expr ( WS COMMA WS param_expr )*
param_expr       = ( identifier WS EQUAL  WS )? value_expr

path_expr        = ( PERIOD / QMARK )? identifier array_slice? (PERIOD identifier array_slice? )*

value_expr       = string_literal / int_literal / identifier

string_expr_list = string_expr ( WS COMMA WS string_expr )*
string_expr      = string_literal / identifier
string_literal   = val_string / fmt_string
val_string       = '"' ~r'[^"]*' '"'
fmt_string       = 'f"' ~r'[^"]*' '"'

regex_string     = 'r"' ~r'[^"]*' '"'

int_literal      = ~r'\d+' / int_infinty
int_infinty      = ~r"INF\b"

identifier       = ~r"[a-zA-Z_][a-zA-Z0-9_]*"

LCB = "{"
RCB = "}"
LSB = "["
RSB = "]"
EQUAL = "="
QMARK = "?"
PERIOD = "."
COMMA = ","
SC = ";"
WS = ~r"\s*"
""")

In [32]:
from parsimonious.nodes import NodeVisitor

class StaVisitor(NodeVisitor):
    """Convert a parse tree into nested dicts and strings.

    Punctuation and whitespace rules are mapped to None; every other node is
    handled by generic_visit, which filters those None values out of its
    children before building the result.
    """

    def _discard(self, node, visited_children):
        """Shared handler for rules whose matches should not appear in the result."""
        return None

    # Each of these rule names gets a handler that returns None; generic_visit
    # on the parent then filters the None out of its children.
    visit_LCB = _discard
    visit_RCB = _discard
    visit_LSB = _discard
    visit_RSB = _discard
    visit_EQUAL = _discard
    visit_QMARK = _discard
    visit_PERIOD = _discard
    visit_COMMA = _discard
    visit_SC = _discard
    visit_WS = _discard

    def generic_visit(self, node, visited_children):
        """Fallback handler for every rule without a dedicated visit_* method.

        Returns:
            - for a node without visited children: None when it matched no
              text, otherwise str(node);
            - for an unnamed node: None when no child survives filtering, or
              the single surviving child itself;
            - otherwise a dict with the rule name ('kind'), the number of
              surviving children ('N') and the children themselves.
        """
        if not visited_children:
            # Leaf: an empty match carries nothing, anything else is stringified.
            if len(node.text) == 0:
                return None
            return str(node)

        kept = [child for child in visited_children if child is not None]
        unnamed = len(node.expr_name) == 0

        # Unnamed wrapper nodes are flattened away when they hold 0 or 1 results.
        if unnamed and len(kept) == 0:
            return None
        if unnamed and len(kept) == 1:
            return kept[0]

        return { 'kind' : node.expr_name, 'N' : len(kept), 'children' : kept }

visitor = StaVisitor()

## Single prompt and _compact_

**Note**:
 - `text` is implicitly `text<INF>` (meaning no limit on the number of tokens)
   - **but** it can only be used when the field is the target of a channel

In [33]:
# Parse a sample that keeps everything inside a single `prompt` declaration:
# the record layout (`is { ... }`), the `channel` bindings, a block-form
# `return`, and one `annotate` block that covers every field.
tree = grammar.parse("""
prompt mmlu_main {
    is {
        topic is text;
        question is text;
        choices[4] is {
            value is text;
            correct is bool;
        }
        answer is repeat(.choices.value);
    }
    channel {
        to .topic         from ?topic;
        to .question      from ?question;
        to .choices.value from ?choices;
    }
    return {
        from .answer;
    }
    annotate {
        .topic           as "the general category from which the question was taken";
        .question        as "the question that you have to answer";
        .choices         as "you judge whether each choice is correct or not";
        .choices.value   as "the value of the choice";
        .choices.correct as "you decide whether this choice is correct or not";
        .answer          as "you repeat the value of the choice that best answer the question";
    }
}
""")
# Run the visitor over the parse tree and pretty-print its result as JSON.
print(json.dumps(visitor.visit(tree), indent=4))

{
    "kind": "program",
    "N": 1,
    "children": [
        {
            "kind": "prompt_decl",
            "N": 6,
            "children": [
                "<Node matching \"prompt\">",
                "<RegexNode called \"identifier\" matching \"mmlu_main\">",
                {
                    "kind": "is_record_field",
                    "N": 2,
                    "children": [
                        "<Node matching \"is\">",
                        {
                            "kind": "",
                            "N": 4,
                            "children": [
                                {
                                    "kind": "field_decl",
                                    "N": 2,
                                    "children": [
                                        "<RegexNode called \"identifier\" matching \"topic\">",
                                        {
                                            "kind": "is_format_field",
                

## Spreading the annotations

**Note**:
 - the outermost `annotate` takes precedence over inner `annotate` (including an override from the compile command)
 - showing shorthand for single element `return`

In [3]:
# Parse a sample where annotations are attached at several levels: inline on
# individual fields (`topic`, `question`), in a nested `annotate` block on
# `choices`, and in a top-level `annotate` block after the prompt.
# It also uses the single-expression form `return .answer;`.
# The result is assigned to `tree` but not displayed in this cell.
tree = grammar.parse("""
prompt mmlu_main {
    is {
        topic {
            is text;
            annotate "the general category from which the question was taken";
        }
        question {
            is text;
            annotate "the question that you have to answer";
        }
        choices[4] {
            is {
                value   is text;
                correct is bool;
            }
            annotate {
                .value   as "the value of the choice";
                .correct as "you decide whether this choice is correct or not";
            }
        }
        answer is repeat(.choices.value);
    }
    channel {
        to .topic         from ?topic;
        to .question      from ?question;
        to .choices.value from ?choices;
    }
    return .answer;
}

annotate {
    mmlu_main.choices as "you judge whether each choice is correct or not";
    mmlu_main.answer  as "you repeat the value of the choice that best answer the question";
}
""")

## Reusable data-structure

In [4]:
# Parse a sample that declares a reusable `struct a_choice` (with its own
# annotations) and references it by name as the type of `choices[4]` in the prompt.
# The result is assigned to `tree` but not displayed in this cell.
tree = grammar.parse("""
struct a_choice {
    is {
        value   is text;
        correct is bool;
    }
    annotate {
        .value   as "the value of the choice";
        .correct as "you decide whether this choice is correct or not";
    }
}

prompt mmlu_main {
    is {
        topic      is text;
        question   is text;
        choices[4] is a_choice;
        answer     is repeat(.choices.value);
    }
    channel {
        to .topic         from ?topic;
        to .question      from ?question;
        to .choices.value from ?choices;
    }
    return .answer;
    annotate {
        .topic    as "the general category from which the question was taken";
        .question as "the question that you have to answer";
        .choices  as "you judge whether each choice is correct or not";
        .answer   as "you repeat the value of the choice that best answer the question";
    }
}
""")

## Flow between prompts and formats

**Note**:
 - annotation with f-exp `f"{var}"`
 - global `flow` declares possible entry point
   - entry point defaults to `main`
   - if not specified, the first prompt defaults to `main`

In [5]:
# Parse a sample that combines:
#   - a parameterised `format` (`argument N=30`, used as `text<N>` and inside
#     an f-string annotation),
#   - two prompts, the first of which declares a `flow` to the second,
#   - a channel in the second prompt that reads from a field of the first
#     (`mmlu_main.hyphothesis`),
#   - a top-level `flow` block labelling `mmlu_main` as "main".
# The result is assigned to `tree` but not displayed in this cell.
tree = grammar.parse("""
format sentence {
    argument N=30;
    is text<N>;
    annotate f"A grammatically correct sentence made of at most {N} tokens.";
}

prompt mmlu_main {
    define K=50;
    is {
        topic       is text;
        question    is text;
        hyphothesis is sentence<K>;
    }
    channel {
        to .topic    from ?topic;
        to .question from ?question;
    }
    flow {
        to mmlu_choose;
    }
}

prompt mmlu_choose {
    is {
        topic       is text;
        question    is text;
        hyphothesis is sentence;
        choices[4]  is text;
        answer      is repeat(.choices.value);
    }
    channel {
        to .topic         from ?topic;
        to .question      from ?question;
        to .choices.value from ?choices;
        to .hyphothesis   from mmlu_main.hyphothesis;
    }
    return .answer;
}

flow {
    to mmlu_main as "main";
}
""")

## Iterations

**Note**:
 - `as` is used in both `flow` and `return`: when the prompt ends, the LLM has to choose between `retry` and `ready`
   - without `as`, the choices would have been `mmlu_main` or `return`
 - `[5]` in the flow section limits the trip count on that branch ensuring termination

In [6]:
# Parse a sample in which the prompt flows back to itself with a `[5]` limit,
# labels the flow and return branches with `as "retry"` / `as "ready"`, and
# feeds the sliced field `previous[0:1]` from `.reflect` through a channel.
# The result is assigned to `tree` but not displayed in this cell.
tree = grammar.parse("""
prompt mmlu_main {
    is {
        topic         is text;
        question      is text;
        choices[4]    is text;
        previous[0:1] is text<20>;
        answer        is repeat(.choices.value);
        reflect       is text<20>;
    }
    channel {
        to .topic         from ?topic;
        to .question      from ?question;
        to .choices.value from ?choices;
        to .previous      from .reflect;
    }
    flow {
        to mmlu_main[5] as "retry";
    }
    return {
        as "ready";
        from .answer;
    }
}
""")

## Calls

**Note**:
 - 

In [7]:
# Parse a sample where a channel target is fed by a `call` block instead of a
# `from` clause: the block names an `entry` (`mmlu_annot`), passes `kwarg`
# values using both the `from` and `map` forms, and uses `bind ... as ...`.
# The second prompt, `mmlu_annot`, returns two fields from a block-form `return`.
# The result is assigned to `tree` but not displayed in this cell.
tree = grammar.parse("""
prompt mmlu_main {
    is {
        topic is text;
        question is text;
        choices[4] is {
            value is text;
            correct is bool;
        }
        answer is repeat(.choices.value);
    }
    channel {
        to .topic    from ?topic;
        to .question from ?question;
        to .choices call {
            entry mmlu_annot;
            kwarg topic    from ?topic;
            kwarg question from ?question;
            kwarg choice   map  ?choices;
            bind choice.subfld as value;
        }
    }
    return {
        from .answer;
    }
}

prompt mmlu_annot {
    is {
        topic    is text;
        question is text;
        choice   is text;
        correct  is bool;
    }
    channel {
        to .topic    from ?topic;
        to .question from ?question;
        to .choice   from ?choice;
    }
    return {
        from .choice;
        from .correct;
    }
}
""")