
Enhance filter parser #41

Open · wants to merge 24 commits into base: main

Changes from all commits (24 commits):
9cfc06a
Introduce filter `Chain` helper methods
yhabteab Oct 23, 2023
7847585
tests: Use `~` as an operator for wildcard matching
yhabteab Oct 26, 2023
a45bec4
Introduce `NewChain()` & `NewCondition()` funcs
yhabteab May 2, 2023
d166796
`spew.Sdump()` filter & add some assertions in `FuzzParser()`
julianbrost May 2, 2023
61d1b6d
Introduce `goyacc` filter parsing grammar rules
yhabteab Oct 23, 2023
8909111
Introduce `Lexer` type and other parsing utils
yhabteab Oct 23, 2023
b0d5cae
Add extended filter/parser tests & adjust expected error messages
yhabteab May 2, 2023
89320eb
Add auto-generated goyacc filter `Parser`
yhabteab Nov 20, 2023
5107934
Set `yylex.rule` only once
yhabteab Oct 27, 2023
d6768fe
Reset `Lexer#rule` on error & use `init()` for initializing global config
yhabteab Oct 27, 2023
ab4c51d
Name regex variable after its usage
yhabteab Oct 27, 2023
8f74a9a
Fix access to non-existent logical operators
yhabteab Oct 27, 2023
f0444f0
Set the scanner mode to `ScanIdents`
yhabteab Nov 6, 2023
94f92c6
Drop superfluous `T_STRING` token
yhabteab Nov 6, 2023
0cae943
Add test cases that trigger scan errors
yhabteab Nov 6, 2023
5d6c645
URL-decode `identifier`s in place & clean up some type/token defs
yhabteab Nov 6, 2023
0ca652d
Resolve `filter_rule` in brackets instead of `filter_chain`
yhabteab Nov 6, 2023
c627cd7
Provide friendly token names in error messages
yhabteab Nov 6, 2023
1b70c78
Fully & correctly resolve complex filter strings
yhabteab Nov 6, 2023
271703f
Ignore auto generated parser actions file
yhabteab Nov 6, 2023
cef2478
Add test case for all the filter evaluation methods
yhabteab Nov 20, 2023
82d3141
Fix mixed tab & space indentation
yhabteab Apr 10, 2024
755b6ae
Fix confusing yacc rule names
yhabteab Apr 17, 2024
7142584
Disable `//line` directives for `goyacc` generated code
yhabteab Aug 14, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -1 +1,4 @@
/build/

# Exclude the autogenerated parser.output file
parser.output
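
For context: parser.output is the state/conflict report that goyacc writes because of the -v flag in the go:generate directive in lexer.go below; only the generated parser.go is meant to be committed. Assuming the package path shown in this diff, it is regenerated with:

go generate ./internal/filter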
140 changes: 140 additions & 0 deletions internal/filter/filter_test.go
@@ -0,0 +1,140 @@
package filter

import (
"errors"
"github.com/stretchr/testify/assert"
"regexp"
"strings"
"testing"
)

const unknown string = "unknown"

var evalError = errors.New("evaluation error")

func TestFilter(t *testing.T) {
t.Parallel()

filterable := &filterableType{
key: "domain",
value: "example.com",
}

t.Run("InvalidOperator", func(t *testing.T) {
chain, err := NewChain("unknown", nil)
assert.Nil(t, chain)
assert.EqualError(t, err, "invalid logical operator provided: \"unknown\"")

condition, err := NewCondition("column", "unknown", "value")
assert.Nil(t, condition)
assert.EqualError(t, err, "invalid comparison operator provided: \"unknown\"")
})

t.Run("EvaluationError", func(t *testing.T) {
t.Parallel()

testInvalidData := []struct {
Expression string
}{
{"domain=" + unknown},
{"domain!=" + unknown},
{"domain<" + unknown},
{"domain<=" + unknown},
{"domain>" + unknown},
{"domain>=" + unknown},
{"domain~" + unknown},
{"domain!~" + unknown},
{"!(domain!=" + unknown + ")"},
{"domain=" + unknown + "&domain<=test.example.com"},
{"domain<=" + unknown + "|domain<=test.example.com"},
}

for _, td := range testInvalidData {
f, err := Parse(td.Expression)
assert.NoError(t, err)

matched, err := f.Eval(filterable)
assert.EqualError(t, err, evalError.Error())
assert.Equal(t, false, matched, "unexpected filter result for %q", td.Expression)
}
})

t.Run("EvaluateFilter", func(t *testing.T) {
t.Parallel()

testdata := []struct {
Expression string
Expected bool
}{
{"domain=example.com", true},
{"domain!=example.com", false},
{"domain=test.example.com", false},
{"name!=example.com", false},
{"domain", true},
{"name", false},
{"display_name", false},
{"!name", true},
{"domain~example*", true},
{"domain!~example*", false},
{"domain~example*&!domain", false},
{"domain>a", true},
{"domain<a", false},
{"domain>z", false},
{"domain<z", true},
{"domain>=example&domain<=test.example.com", true},
{"domain<=example|domain<=test.example.com", true},
{"domain<=example|domain>=test.example.com", false},
}

for _, td := range testdata {
f, err := Parse(td.Expression)
if assert.NoError(t, err, "parsing %q should not return an error", td.Expression) {
matched, err := f.Eval(filterable)
assert.NoError(t, err)
assert.Equal(t, td.Expected, matched, "unexpected filter result for %q", td.Expression)
}
}
})
}

type filterableType struct {
key string
value string
}

func (f *filterableType) EvalEqual(_ string, value string) (bool, error) {
if value == unknown {
return false, evalError
}

return strings.EqualFold(f.value, value), nil
}

func (f *filterableType) EvalLess(_ string, value string) (bool, error) {
if value == unknown {
return false, evalError
}

return f.value < value, nil
}

func (f *filterableType) EvalLike(_ string, value string) (bool, error) {
if value == unknown {
return false, evalError
}

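// Test stub: the table-driven cases above only ever use the wildcard
// pattern "example*", so a fixed regex stands in for a real translation
// of the value argument into a regular expression.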
regex := regexp.MustCompile("^example.*$")
return regex.MatchString(f.value), nil
}

func (f *filterableType) EvalLessOrEqual(_ string, value string) (bool, error) {
if value == unknown {
return false, evalError
}

return f.value <= value, nil
}

func (f *filterableType) EvalExists(key string) bool {
return f.key == key
}
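
A minimal usage sketch (same package as the tests, since filterableType is unexported) of how a parsed filter drives the Filterable callbacks above:

f, err := Parse("domain~example*&!name")
// err == nil; the expression parses into, roughly, an AND chain of a
// wildcard condition and a negated existence check.

matched, _ := f.Eval(&filterableType{key: "domain", value: "example.com"})
// matched == true: EvalExists("domain") and EvalLike both hold, while
// EvalExists("name") is false, so "!name" is satisfied.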
177 changes: 177 additions & 0 deletions internal/filter/lexer.go
@@ -0,0 +1,177 @@
//go:generate goyacc -l -v parser.output -o parser.go parser.y

package filter

import (
"errors"
"fmt"
"regexp"
"strings"
"text/scanner"
)

// identifiersMatcher contains a compiled regexp and is used by the Lexer to match filter identifiers.
// Currently, it matches any character that is not part of a LogicalOp or CompOperator.
var identifiersMatcher = regexp.MustCompile("[^!&|~<>=()]")
Comment on lines +13 to +15

Collaborator:

I think it would make sense to work with a positive list here. Doing it like this, there is only very limited room for expansion. For example, if the regex had been [^!&|<>=()] before (missing the ~), adding ~ there would have made previously valid identifiers invalid. Yes, this means that more characters have to be escaped/urlencoded, but I don't think that would be bad; it would just have to be consistent with what the web module writes into the database.

Member Author:

I don't really see why we shouldn't do it the way it is, and I don't want to list all the valid characters, because I don't know them all.

// tokenDisplayNames maps each defined parser token to the friendly
// name used for it in error messages.
var tokenDisplayNames = map[string]string{
"$unk": `"unknown"`,
"T_EQUAL": `"="`,
"T_UNEQUAL": `"!="`,
"T_LIKE": `"~"`,
"T_UNLIKE": `"!~"`,
"T_LESS_THAN": `"<"`,
"T_GREATER_THAN": `">"`,
"T_LESS_THAN_OR_EQUAL": `"<="`,
"T_GREATER_THAN_OR_EQUAL": `">="`,
"T_IDENTIFIER": `"column or value"`,
}

// init just sets the global yyErrorVerbose variable to true.
func init() {
// Enable the parser's verbose errors to get more context on parsing failures.
yyErrorVerbose = true
}

// Parse wraps the auto generated yyParse function.
// It parses the given filter string and, on success, returns a Filter instance.
func Parse(expr string) (rule Filter, err error) {
lex := new(Lexer)
lex.IsIdentRune = isIdentRune
lex.Init(strings.NewReader(expr))

// Set the scanner mode to recognize only identifiers. This way, all unrecognized tokens are returned
// as-is, and our Lexer#Lex() method then recognizes whatever valid input remains.
// Note: this is not strictly necessary, since our custom `isIdentRune` function accepts any rune that
// matches the `identifiersMatcher` pattern, so the scanner would never match any of the other
// scanner.GoTokens anyway.
lex.Mode = scanner.ScanIdents

// scanner.Init sets the error function to nil, therefore, we have to register
// our error function after the scanner initialization.
lex.Scanner.Error = lex.ScanError

defer func() {
// All the grammar rules panics when encountering any errors while reducing the filter rules, so try
// to recover from it and return an error instead. Since we're using a named return values, we can set
// the err value even in deferred function. See https://go.dev/blog/defer-panic-and-recover
if r := recover(); r != nil {
err = errors.New(fmt.Sprint(r))
}

if err != nil {
// The lexer may contain an incomplete filter rule constructed before the parser panicked, so reset it.
rule = nil
}
}()

yyParse(lex)

rule, err = lex.rule, lex.err
return
}

// Lexer is used to tokenize the filter input into a set of literals.
// This is just a wrapper around the Scanner type and implements the yyLexer interface used by the parser.
type Lexer struct {
scanner.Scanner

rule Filter
err error
}

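// Lex scans the next token, stores its literal in yyval and returns its parser token ID.
// It implements the yyLexer interface consumed by the generated yyParse function.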
func (l *Lexer) Lex(yyval *yySymType) int {
token := l.Scan()
lit := l.TokenText()
yyval.text = lit
if token == scanner.Ident {
return T_IDENTIFIER
}

switch lit {
case "&":
return '&'
case "|":
return '|'
case "~":
return T_LIKE
case "=":
return T_EQUAL
case "(":
return '('
case ")":
return ')'
case "!":
next := l.Peek()
switch next {
case '=', '~':
yyval.text = "!" + string(next)
// Since we acted on the peeked rune, advance the internal scanner state by calling Scan.
// Otherwise, the same rune would be scanned again on the next Lex call.
l.Scan()

if next == '~' {
return T_UNLIKE
} else {
return T_UNEQUAL
}
default:
return '!'
}
case "<":
if next := l.Peek(); next == '=' {
yyval.text = "<="
// Since we acted on the peeked rune, advance the internal scanner state by calling Scan.
// Otherwise, the same rune would be scanned again on the next Lex call.
l.Scan()

return T_LESS_THAN_OR_EQUAL
}

return T_LESS_THAN
case ">":
if next := l.Peek(); next == '=' {
yyval.text = ">="
// Since we acted on the peeked rune, advance the internal scanner state by calling Scan.
// Otherwise, the same rune would be scanned again on the next Lex call.
l.Scan()

return T_GREATER_THAN_OR_EQUAL
}

return T_GREATER_THAN
}

// No more input that we are interested in.
// Scan also returns EOF once there are no more tokens to stream, but we want to be explicit.
return scanner.EOF
}

// Error receives any syntax/semantic errors produced by the parser.
//
// The parser itself never returns an error when it fails to parse; instead, it forwards the error to our
// lexer. This function wraps the provided message and adds the line, column number, and offset to the
// error string. Error is the equivalent of "yyerror" in the original yacc.
func (l *Lexer) Error(s string) {
// Replace all parser token names by their corresponding friendly names.
for token, name := range tokenDisplayNames {
s = strings.ReplaceAll(s, token, name)
}

l.err = fmt.Errorf("%d:%d (%d): %s", l.Line, l.Column, l.Offset, s)

// Always reset the current filter rule when encountering an error.
l.rule = nil
}

// ScanError is used to capture all errors the Scanner encounters.
//
// It is rare for the scanner to actually fail to scan the input string, but when it does, it just
// prints to os.Stderr and we would never notice. Hence, filter.Parse registers this function
// after the Lexer initialisation.
func (l *Lexer) ScanError(_ *scanner.Scanner, msg string) { l.Error(msg) }

// isIdentRune provides custom implementation of scanner.IsIdentRune.
// This function determines whether a given character is allowed to be part of an identifier.
func isIdentRune(ch rune, _ int) bool { return identifiersMatcher.MatchString(string(ch)) }
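
Putting the pieces together, a sketch of the error path; the snippet assumes the filter package itself, and the exact goyacc wording is illustrative, only the line:column (offset) prefix and the friendly token names are guaranteed by Error above:

f, err := Parse("domain=&")
// f == nil; err reads roughly like:
//   1:8 (7): syntax error: unexpected "&", expecting "column or value"
// where "column or value" is the tokenDisplayNames substitution for T_IDENTIFIER.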