diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index eb8cb50..f50c1f8 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -23,3 +23,5 @@ jobs: - name: Test run: go clean -testcache && go test -v -race ./... + - name: Multiple run + run: bash multiple_test.sh diff --git a/.gitignore b/.gitignore index 485dee6..4496edc 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .idea +testdata/statistics.csv diff --git a/LICENCE b/LICENCE new file mode 100644 index 0000000..4c4bdbf --- /dev/null +++ b/LICENCE @@ -0,0 +1,16 @@ +Copyright 2024 Mario Škrlec + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the “Software”), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index cc7207f..5eeb8c1 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,186 @@ > [!CAUTION] -> This package is still in development +> This package is still a work in progress. You can try it out, +> but the API might change in future versions, though not drastically. 
-# Introduction +With **cig**, you can query a .csv file with SQL syntax. -With **cig**, you can query a .csv file with sql syntax. It is still in development, -but as time progresses, you would be able to filter data in a csv file with SQL syntax. -For example +- [Installation](#installation) +- [Usage](#usage) +- [Why this exists](#why-this-exists) +- [Tasks until finished](#development-tasks-until-the-project-is-finished) + +**Important considerations:** + +1. Columns to return, columns in WHERE conditions, columns in the ORDER BY clause, +and values must be enclosed in single quotes. For example: ````sql -SELECT * FROM path:my_data.csv AS e WHERE e.column = 'value' +SELECT 's.ColumnOne', 's.ColumnTwo' +FROM path:path_to_csv.csv AS s WHERE 's.ColumnThree' = 'value' +ORDER BY 's.columnFour', 's.ColumnFive' DESC ```` +2. An alias is required. Without the `AS s` part of the above query, the query +would not be able to run. + +3. The path to a file must be either relative to the executing binary or absolute. +Consider always giving an absolute path for better portability. -For now, you can test it only with the above example, or without the **where** clause what -will return all the rows. The return data type will be `map[string]string` +4. This project does not and will not implement the entire SQL syntax. Other than +tasks outlined in the [Tasks section](#development-tasks-until-the-project-is-finished), +nothing else will be developed except making it faster and more maintainable. + +5. This is not a project that should be used in production. Its only use is for simple +lookups and nothing else. In most situations, it is better to import a csv file into +a database of your choice. This project is intended as "something interesting to do" for +me, so do not take it too seriously. + +6. This package will be concurrency safe. This means that the `Run()` method +can be used inside your own concurrency primitives. 
Although +I will try to make it faster using concurrency for very large files, +that will not affect how you use the public API in your code. # Installation -`go get github.com/MarioLegenda/cig` +`go get github.com/MarioLegenda/cig@v0.1.1` + +# Usage + +The SQL snippet below demonstrates almost all current features of this package: + +````sql +SELECT * FROM path:path_to_file.csv AS g WHERE 'g.columnOne' = 'string_value' +AND 'g.columnTwo'::int != '65' OR 'g.columnThree'::float = '56.3' +OFFSET 34 +LIMIT 56 +ORDER BY 'g.columnFour', 'g.columnFive' DESC +```` + +Instead of `*`, you can specify the columns to return like this: + +````sql +SELECT 'g.columnOne', 'g.columnTwo' /** rest of query goes here */ +```` + +If you don't specify `DESC` or `ASC`, `ASC` is assumed. + +In code, you use it like this: + +````go +package main + +import ( + "fmt" + "github.com/MarioLegenda/cig" + "log" +) + +func main() { + c := cig.New() + + result := c.Run(` +SELECT * FROM path:path_to_file.csv AS g WHERE 'g.columnOne' = 'string_value' +AND 'g.columnTwo'::int != '65' OR 'g.columnThree'::float = '56.3' +OFFSET 34 +LIMIT 56 +ORDER BY 'g.columnFour', 'g.columnFive' DESC +`) + + if result.Error != nil { + log.Fatalln(result.Error) + } + + fmt.Println(result.SelectedColumns) + fmt.Println(result.AllColumns) + fmt.Println(result.Data) +} +```` + +The signature of the result is: + +````go +type Data struct { + SelectedColumns []string + AllColumns []string + Error error + Data []map[string]string +} +```` + +You can handle errors with the `errors.Is` function if you need fine-grained +control over exactly which error happened. 
+ +````go +package main + +import ( + "errors" + "fmt" + "github.com/MarioLegenda/cig" + cigError "github.com/MarioLegenda/cig/pkg" + "log" +) + +func main() { + c := cig.New() + + result := c.Run(` +SELECT * FROM path:path_to_file.csv AS g WHERE 'g.columnOne' = 'string_value' +AND 'g.columnTwo'::int != '65' OR 'g.columnThree'::float = '56.3' +OFFSET 34 +LIMIT 56 +ORDER BY 'g.columnFour', 'g.columnFive' DESC +`) + + if errors.Is(result.Error, cigError.InvalidAlias) { + log.Fatalln(result.Error) + } + + fmt.Println(result.SelectedColumns) + fmt.Println(result.AllColumns) + fmt.Println(result.Data) +} +```` + +This is the full list of errors you can use: + +````go + +var InvalidToken = errors.New("Expected WHERE or LIMIT, OFFSET, ORDER BY, got something else.") +var InvalidSelectToken = errors.New("Expected 'select', got something else.") +var InvalidSelectableColumns = errors.New("Expected selectable column") +var InvalidDuplicatedColumn = errors.New("Duplicated selectable column") +var InvalidFromToken = errors.New("Expected 'FROM', got something else.") +var InvalidFilePathToken = errors.New("Expected 'path:path_to_file' but did not get the path part") +var InvalidAsToken = errors.New("Expected 'as', got something else.") +var InvalidAlias = errors.New("Invalid alias.") +var InvalidColumnAlias = errors.New("Column alias not recognized.") +var InvalidWhereClause = errors.New("Expected WHERE clause, got something else.") +var InvalidConditionColumn = errors.New("Expected condition column.") +var InvalidComparisonOperator = errors.New("Invalid comparison operator") +var InvalidLogicalOperator = errors.New("Invalid logical operator") +var InvalidValueToken = errors.New("Invalid value token.") +var InvalidDataType = errors.New("Invalid data type.") +var InvalidConditionAlias = errors.New("Invalid condition alias.") +var InvalidOrderBy = errors.New("Invalid ORDER BY") + +```` + +# Why this exists + +One use could be in an environment where it is not possible to install 
a database +just to look up some values in a .csv file. This package will provide a command-line +utility to do so. Other than that, it would be better to import a .csv file into +a database of your choice and use it like that. -# Future development tasks (for now) +# Development tasks until the project is finished -- [ ] Implement logical operators -- [ ] Implement all comparison operators (now, only equality works) -- [ ] Implement picking columns to return -- [ ] Implement OFFSET and LIMIT to implement pagination -- [ ] Implement sorting -- [ ] Implement options (cache?, timeout?) -- [ ] Implement goroutine worker balancer (if needed) +- [x] Implement logical operators +- [x] Implement all comparison operators (now, only equality works) +- [x] Implement picking columns to return +- [x] Implement OFFSET and LIMIT to implement pagination +- [x] Implement sorting +- [ ] Create a command line utility to use it on the command line +- [ ] Implement JOIN with multiple files +- [ ] Implement options (cache, timeout with context, extremely simple optional indexing on first query execution) +- [ ] Implement splitting work into multiple goroutines +- [ ] Implement solutions from one billion rows challenge diff --git a/go.mod b/go.mod index 930db7f..95c8956 100644 --- a/go.mod +++ b/go.mod @@ -2,16 +2,19 @@ module github.com/MarioLegenda/cig go 1.22.1 +require ( + github.com/jedib0t/go-pretty/v6 v6.5.8 + github.com/spf13/cobra v1.8.0 + github.com/stretchr/testify v1.9.0 +) + require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/jedib0t/go-pretty/v6 v6.5.8 // indirect github.com/mattn/go-runewidth v0.0.15 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rivo/uniseg v0.2.0 // indirect - github.com/spf13/cobra v1.8.0 // indirect github.com/spf13/pflag v1.0.5 // indirect - github.com/stretchr/testify v1.9.0 // indirect golang.org/x/sys v0.17.0 // indirect gopkg.in/yaml.v3 v3.0.1 // 
indirect ) diff --git a/go.sum b/go.sum index a10ecff..91d4084 100644 --- a/go.sum +++ b/go.sum @@ -20,6 +20,7 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/db/conditionResolver/resolveCondition.go b/internal/db/conditionResolver/resolveCondition.go index 78dcf9a..af7d749 100644 --- a/internal/db/conditionResolver/resolveCondition.go +++ b/internal/db/conditionResolver/resolveCondition.go @@ -23,6 +23,9 @@ func ResolveCondition(condition syntaxStructure.Condition, metadata ColumnMetada head := condition var prevOp string + if head == nil { + return false, fmt.Errorf("Invalid condition head. 
This is internal error and a bug.") + } // setup for head != nil { next := head.Next() diff --git a/internal/db/selectedColumnMetadata/columnMetadata.go b/internal/db/selectedColumnMetadata/columnMetadata.go index 6b4160e..8181c07 100644 --- a/internal/db/selectedColumnMetadata/columnMetadata.go +++ b/internal/db/selectedColumnMetadata/columnMetadata.go @@ -7,6 +7,7 @@ type columnMetadata struct { type ColumnMetadata interface { Column(pos int) string + Position(name string) int Names() []string HasPosition(pos int) bool } @@ -25,6 +26,16 @@ func (cm columnMetadata) Column(pos int) string { return "" } +func (cm columnMetadata) Position(name string) int { + for p, s := range cm.names { + if s == name { + return cm.positions[p] + } + } + + return -1 +} + func (cm columnMetadata) HasPosition(pos int) bool { for _, s := range cm.positions { if s == pos { diff --git a/internal/job/searchFn.go b/internal/job/searchFn.go index a3adba2..f35d843 100644 --- a/internal/job/searchFn.go +++ b/internal/job/searchFn.go @@ -26,12 +26,6 @@ func SearchFactory( if err != nil { return nil, fmt.Errorf("Error in job %d while reading file. 
Trying to skip the first row but failed: %w", id, err) } - limit := constraints.Limit() - offset := constraints.Offset() - orderBy := constraints.OrderBy() - - var currentCollectedLimit int64 - var currentCollectedOffset int64 collectionFinished := false @@ -56,41 +50,46 @@ func SearchFactory( break } - if offset != nil && currentCollectedOffset < offset.Value() { - currentCollectedOffset++ - - continue - } - - if limit != nil && currentCollectedLimit == limit.Value() { - collectionFinished = true - break - } - if condition != nil { ok, err := conditionResolver.ResolveCondition(condition, metadata, lines) if err != nil { return nil, fmt.Errorf("Error in job %d while reading from the file: %w", id, err) } - if ok { - if limit != nil { - currentCollectedLimit++ - } + /* v, _ := strconv.ParseInt(lines[2], 10, 64) + if v < 2023 { + fmt.Println(v, ok) + }*/ + if ok { collectedLines = append(collectedLines, lines) } } else { - if limit != nil { - currentCollectedLimit++ - } - collectedLines = append(collectedLines, lines) } } } + limit := constraints.Limit() + offset := constraints.Offset() + orderBy := constraints.OrderBy() + + if orderBy != nil { + sortResults(collectedLines, orderBy, metadata) + } + + var currentCollectedOffset int64 + for _, line := range collectedLines { + if offset != nil && offset.Value() != currentCollectedOffset { + currentCollectedOffset++ + continue + } + + if limit != nil && int64(len(results)) == limit.Value() { + break + } + res, err := createResult(line, selectedColumns) if err != nil { return nil, fmt.Errorf("Error in job %d while reading from the file: %w", id, err) @@ -99,10 +98,6 @@ func SearchFactory( results = append(results, res) } - if orderBy != nil { - return sortResults(results, orderBy), nil - } - return results, nil } } diff --git a/internal/job/sort.go b/internal/job/sort.go index 63cd90a..163b74e 100644 --- a/internal/job/sort.go +++ b/internal/job/sort.go @@ -1,53 +1,103 @@ package job import ( + 
"github.com/MarioLegenda/cig/internal/db/conditionResolver" "github.com/MarioLegenda/cig/internal/syntax/operators" "github.com/MarioLegenda/cig/internal/syntax/syntaxStructure" "sort" + "strconv" ) -type MapResult map[string]string +type comparableConstraint interface { + int64 | float64 | string +} -var currentColumn int -var columns []string -var direction string +type By func(p1, p2 []string) bool -func (s SearchResult) Len() int { - return len(s) +type resultSorter struct { + results [][]string + by By } -func (s SearchResult) Swap(i, j int) { - s[i], s[j] = s[j], s[i] +func (s *resultSorter) Len() int { + return len(s.results) } -func (s SearchResult) Less(i, j int) bool { - if direction == operators.Asc { - return s[i][columns[currentColumn]] < s[j][columns[currentColumn]] - } +func (s *resultSorter) Swap(i, j int) { + s.results[i], s.results[j] = s.results[j], s.results[i] +} - return s[i][columns[currentColumn]] > s[j][columns[currentColumn]] +// Less is part of sort.Interface. It is implemented by calling the "by" closure in the sorter. 
+func (s *resultSorter) Less(i, j int) bool { + return s.by(s.results[i], s.results[j]) } -func sortResults(result SearchResult, orderBy syntaxStructure.OrderBy) SearchResult { - currentColumn = 0 - columns = make([]string, 0) - direction = orderBy.Direction() - if direction == "" { - direction = operators.Asc +func (by By) Sort(results [][]string) { + ps := &resultSorter{ + results: results, + by: by, } - ssColumns := orderBy.Columns() - for _, c := range ssColumns { - columns = append(columns, c.Column()) + sort.Sort(ps) +} + +func sortResults(result [][]string, orderBy syntaxStructure.OrderBy, metadata conditionResolver.ColumnMetadata) [][]string { + orderByColumns := orderBy.Columns() + direction := orderBy.Direction() + for _, c := range orderByColumns { + currentPosition := metadata.Position(c.Column()) + + fn := func(p1, p2 []string) bool { + v1int, p1IntErr := strconv.ParseInt(p1[currentPosition], 10, 64) + v2int, p2IntErr := strconv.ParseInt(p2[currentPosition], 10, 64) + + if p1IntErr != nil && p2IntErr != nil { + if direction == operators.Desc { + return v1int > v2int + } + + return v1int < v2int + } + + v1float, p1FloatErr := strconv.ParseFloat(p1[currentPosition], 64) + v2float, p2FloatErr := strconv.ParseFloat(p2[currentPosition], 64) + + if p1FloatErr != nil && p2FloatErr != nil { + if direction == operators.Desc { + return v1float > v2float + } + + return v1float < v2float + } + + if direction == operators.Desc { + return p1[currentPosition] > p2[currentPosition] + } + + return p1[currentPosition] < p2[currentPosition] + } + + By(fn).Sort(result) } - for i, _ := range columns { - currentColumn = i - sort.Sort(result) + return result +} + +/*func getValue[T int64 | float64 | string](p1 string, p2 string) (T, T) { + v1i, errV1i := strconv.ParseInt(p1, 10, 64) + v2i, errV2i := strconv.ParseInt(p2, 10, 64) + + if errV1i != nil && errV2i != nil { + return T(v1i), T(v2i) } - currentColumn = 0 - columns = make([]string, 0) + v1f, errV1f := 
strconv.ParseFloat(p1, 64) + v2f, errV2f := strconv.ParseFloat(p2, 64) - return result + if errV1f != nil && errV2f != nil { + return T(v1f), T(v2f) + } + + return T(p1), T(p2) } +*/ diff --git a/internal/syntax/tokenizer/tokenize.go b/internal/syntax/tokenizer/tokenize.go index 88d5705..982bc83 100644 --- a/internal/syntax/tokenizer/tokenize.go +++ b/internal/syntax/tokenizer/tokenize.go @@ -54,8 +54,6 @@ func Tokenize(sql string) []string { tokens = append(tokens, string(buf)) } - buf = make([]byte, 0) - return append(tokens, Tokenize(sql[i:])...) } diff --git a/internal/syntax/validation/validateAlias.go b/internal/syntax/validation/validateAlias.go index 3df9d47..c3afcdd 100644 --- a/internal/syntax/validation/validateAlias.go +++ b/internal/syntax/validation/validateAlias.go @@ -1,6 +1,8 @@ package validation -import "github.com/MarioLegenda/cig/pkg" +import ( + "github.com/MarioLegenda/cig/pkg" +) func validateAlias(token string) (string, error) { if token == "" { diff --git a/internal/syntax/validation/validateSelectableColumns.go b/internal/syntax/validation/validateSelectableColumns.go index 55b0ac2..d0edb96 100644 --- a/internal/syntax/validation/validateSelectableColumns.go +++ b/internal/syntax/validation/validateSelectableColumns.go @@ -46,7 +46,6 @@ func validSelectableColumns(tokens []string) (int, []SelectableColumn, error) { } columnNamesToValidate = append(columnNamesToValidate, splitted[1]) - columnMode = false selectableColumns = append(selectableColumns, SelectableColumn{ Alias: splitted[0], diff --git a/multiple_test.sh b/multiple_test.sh new file mode 100644 index 0000000..d06f8b2 --- /dev/null +++ b/multiple_test.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Run this after every feature to ensure to false positives +# or race conditions that are not caught on a single run + +num_runs=5 +for ((i = 1; i <= $num_runs; i++)) +do + if go clean -testcache ; then + # Run tests + if go test -race -v ./... 
; then + echo "" + else + echo "Test run $i: One of the tests failed. Exiting!" + exit 1 + fi + else + echo "Error cleaning test cache. Exiting!" + exit 1 + fi + +done \ No newline at end of file diff --git a/sort_test.go b/sort_test.go index d0f9a19..c7383e4 100644 --- a/sort_test.go +++ b/sort_test.go @@ -1,20 +1,14 @@ package cig import ( - "encoding/csv" - "errors" "github.com/stretchr/testify/assert" - "io" - "os" - "sort" "testing" ) func TestSingleColumnStringSort(t *testing.T) { - t.Skip("") c := New() - sql := "SELECT 'e.Industry_aggregation_NZSIOC' FROM path:testdata/example.csv AS e ORDER BY 'e.Industry_aggregation_NZSIOC' LIMIT 10 " + sql := "SELECT 'e.Year' FROM path:testdata/example.csv AS e ORDER BY 'e.Year' LIMIT 10 " res := c.Run(sql) assert.Nil(t, res.Error) @@ -23,43 +17,20 @@ func TestSingleColumnStringSort(t *testing.T) { assert.Equal(t, 10, len(foundResults)) - cls, err := collectColumn(1) - assert.Nil(t, err) - - sort.Strings(cls) - - cigCls := make([]string, len(foundResults)) - for i, c := range foundResults { - cigCls[i] = c["Industry_aggregation_NZSIOC"] + for _, res := range foundResults { + assert.Equal(t, res["Year"], "2013") } - assert.Equal(t, len(cigCls), len(cls)) - - for i, fileColumn := range cls { - assert.Equal(t, cigCls[i], fileColumn) - } -} + sql = "SELECT 'e.Year' FROM path:testdata/example.csv AS e ORDER BY 'e.Year' DESC LIMIT 10 " -func collectColumn(pos int) ([]string, error) { - f, err := os.Open("testdata/example.csv") - if err != nil { - return nil, err - } - - columns := make([]string, 0) - r := csv.NewReader(f) - defer f.Close() + res = c.Run(sql) + assert.Nil(t, res.Error) - for { - b, err := r.Read() - if err != nil && !errors.Is(err, io.EOF) { - return nil, err - } + foundResults = res.Data - if errors.Is(err, io.EOF) { - return columns, nil - } + assert.Equal(t, 10, len(foundResults)) - columns = append(columns, b[1]) + for _, res := range foundResults { + assert.Equal(t, res["Year"], "2021") } }