Skip to content

Commit

Permalink
Optimization CSV validation for aggregation cases. Added CSV generator (
Browse files Browse the repository at this point in the history
#79)

This commit introduces an application performance benchmarking feature
geared towards handling large datasets, and adds the dependencies needed
to generate data for these benchmarks. In addition, it optimizes the
CSV validation process by skipping columns that have neither cell rules
nor aggregate rules, which improves performance.
  • Loading branch information
SmetDenis committed Mar 24, 2024
1 parent ab5b241 commit c06a6d0
Show file tree
Hide file tree
Showing 88 changed files with 605 additions and 212 deletions.
123 changes: 64 additions & 59 deletions Makefile
Expand Up @@ -12,91 +12,96 @@

.PHONY: build

REPORT ?= table
COLUMNS ?= 300

ifneq (, $(wildcard ./vendor/jbzoo/codestyle/src/init.Makefile))
include ./vendor/jbzoo/codestyle/src/init.Makefile
endif

# Shared command fragments. Every value uses `?=`, so it can be overridden
# from the command line or the environment (e.g. `make demo DOCKER_IMAGE=...`).
# PHP_BIN is not defined in this file; presumably it comes from the included
# init.Makefile - confirm.

# Sub-command and flags passed to csv-blueprint by both the PHP and Docker runners.
CMD_VALIDATE ?= validate:csv --ansi -vvv
DOCKER_IMAGE ?= jbzoo/csv-blueprint:local

# Local runner. The terminal width is taken from $(COLUMNS) (declared above with
# a default of 300) instead of a hard-coded literal, so the knob actually works.
BLUEPRINT ?= COLUMNS=$(COLUMNS) time $(PHP_BIN) ./csv-blueprint $(CMD_VALIDATE)

# Docker runner. The project directory is mounted by absolute path: $(CURDIR) is
# always accepted by `docker run -v`, whereas a bare `.` is only understood by
# newer Docker CLIs. The quotes keep a path containing spaces in one argument.
BLUEPRINT_DOCKER ?= docker run --rm --workdir=/parent-host -v "$(CURDIR)":/parent-host $(DOCKER_IMAGE) $(CMD_VALIDATE)

# CSV generator script used by the benchmark targets.
BENCH_BIN ?= time $(PHP_BIN) ./tests/Benchmarks/bench.php

# Fixture/schema pairs used by the demo targets.
VALID_CSV ?= --csv='./tests/fixtures/demo.csv'
VALID_SCHEMA ?= --schema='./tests/schemas/demo_valid.yml'
INVALID_CSV ?= --csv='./tests/fixtures/batch/*.csv'
INVALID_SCHEMA ?= --schema='./tests/schemas/demo_invalid.yml'

build:
# Build/install ########################################################################################################
# Install all Composer dependencies (including dev ones) with an optimized autoloader.
# The trailing `rm -f` deletes a `ci-report-converter` file from the project root if
# one exists (presumably an artifact left by a dependency - confirm).
# $(CURDIR) is quoted so a checkout path containing spaces stays a single argument.
build: ##@Project Build project in development mode
	@composer install --optimize-autoloader
	@rm -f "$(CURDIR)/ci-report-converter"


build-prod:
# Install production dependencies only (`--no-dev`) with an authoritative classmap.
# The trailing `rm -f` deletes a `ci-report-converter` file from the project root if
# one exists (presumably an artifact left by a dependency - confirm).
# $(CURDIR) is quoted so a checkout path containing spaces stays a single argument.
build-prod: ##@Project Build project in production mode
	@composer install --no-dev --classmap-authoritative
	@rm -f "$(CURDIR)/ci-report-converter"


build-phar-file:
# Download the Box 4.5.1 PHAR compiler and use it to compile the project PHAR.
# `--fail` makes curl exit non-zero on an HTTP error instead of saving the error
# page as box.phar, so a broken download stops the target right away.
build-phar-file: ##@Project Build PHAR file
	@mkdir -p ./build
	curl --fail -L "https://github.com/box-project/box/releases/download/4.5.1/box.phar" -o ./build/box.phar
	@php ./build/box.phar --version
	@php ./build/box.phar compile -vv
	@ls -lh ./build/csv-blueprint.phar


update:
# Print the Composer flags in effect, then run `composer update` with them.
# JBZOO_COMPOSER_UPDATE_FLAGS is not set in this file; it is expected from the
# environment or an included makefile.
update: ##@Project Update dependencies
	@echo "Composer flags: ${JBZOO_COMPOSER_UPDATE_FLAGS}"
	@composer update ${JBZOO_COMPOSER_UPDATE_FLAGS}


# Demo #################################################################################################################
# Run the validator through the local PHP binary twice: first on the valid
# fixture/schema pair, then on the invalid one.
# `title` is a helper that is not defined in this file (presumably provided by
# the included init.Makefile).
demo: ##@Demo Run demo via PHP binary
	$(call title,"Demo - Valid CSV \(PHP binary\)")
	@$(BLUEPRINT) $(VALID_CSV) $(VALID_SCHEMA)
	$(call title,"Demo - Invalid CSV \(PHP binary\)")
	@$(BLUEPRINT) $(INVALID_CSV) $(INVALID_SCHEMA)

demo: ##@Project Run all demo commands
@make demo-valid
@make demo-invalid


demo-valid: ##@Project Run demo valid CSV
$(call title,"Demo - Valid CSV")
@${PHP_BIN} ./csv-blueprint validate:csv \
--csv=./tests/fixtures/demo.csv \
--schema=./tests/schemas/demo_valid.yml \
--skip-schema -v

demo-invalid: ##@Project Run demo invalid CSV
$(call title,"Demo - Invalid CSV")
@${PHP_BIN} ./csv-blueprint validate:csv \
--csv=./tests/fixtures/demo.csv \
--schema=./tests/schemas/invalid_schema.yml \
--report=$(REPORT) -v


demo-github: ##@Project Run demo invalid CSV
@${PHP_BIN} ./csv-blueprint validate:csv \
--csv=./tests/fixtures/batch/*.csv \
--schema=./tests/schemas/demo_invalid.yml \
--report=$(REPORT) \
--ansi
REPORT ?= table
# Validate the invalid fixture set and render the result in the format selected
# by $(REPORT), which can be overridden on the command line.
demo-github: ##@Demo Run demo invalid CSV for GitHub Actions
	@$(BLUEPRINT) $(INVALID_CSV) $(INVALID_SCHEMA) --report=${REPORT}


# Docker ###############################################################################################################

build-docker:
docker-build: ##@Docker (Re-)build Docker image
$(call title,"Building Docker Image")
@docker build -t jbzoo/csv-blueprint:local .


docker-in:
@docker run -it --entrypoint /bin/sh jbzoo/csv-blueprint:local

@docker build -t $(DOCKER_IMAGE) .

demo-docker: ##@Project Run demo via Docker
docker-demo: ##@Docker Run demo via Docker
$(call title,"Demo - Valid CSV \(via Docker\)")
@docker run --rm \
-v `pwd`:/parent-host \
jbzoo/csv-blueprint:local \
validate:csv \
--csv=/parent-host/tests/fixtures/demo.csv \
--schema=/parent-host/tests/schemas/demo_valid.yml \
--ansi -vvv
@${BLUEPRINT_DOCKER} ${VALID_CSV} ${VALID_SCHEMA}
$(call title,"Demo - Invalid CSV \(via Docker\)")
@docker run --rm \
-v `pwd`:/parent-host \
jbzoo/csv-blueprint:local \
validate:csv \
--csv=/parent-host/tests/fixtures/demo.csv \
--schema=/parent-host/tests/schemas/demo_invalid.yml \
--ansi -vvv
@${BLUEPRINT_DOCKER} ${INVALID_CSV} ${INVALID_SCHEMA}

# Start an interactive /bin/sh inside $(DOCKER_IMAGE), replacing the image's
# own entrypoint.
docker-in: ##@Docker Enter into Docker container
	@docker run --interactive --tty --entrypoint /bin/sh ${DOCKER_IMAGE}


# Benchmarks ###########################################################################################################
# Inputs shared by both benchmark targets: the CSV file to validate and the schema to
# validate it against. Both can be overridden from the command line.
# NOTE(review): the schema path uses lowercase `tests/benchmarks/` while BENCH_BIN points
# at `tests/Benchmarks/`; on a case-sensitive filesystem only one spelling can exist -
# confirm the real directory name.
# NOTE(review): the CSV is presumably one of the files produced by `bench-prepare` - confirm.
BENCH_CSV ?= --csv=./build/bench/20_1000000_header.csv
BENCH_SCHEMA ?= --schema=./tests/benchmarks/benchmark.yml

# Validate the benchmark CSV with the local PHP binary, passing --profile to validate:csv.
# Timing comes from the `time` that is already part of BLUEPRINT.
bench-php: ##@Benchmarks Run PHP binary benchmarks
$(call title,"PHP Benchmarks - PHP binary")
${BLUEPRINT} $(BENCH_CSV) $(BENCH_SCHEMA) --profile

# Same validation through the Docker image. `time` is prepended here because
# BLUEPRINT_DOCKER, unlike BLUEPRINT, does not include it.
bench-docker: ##@Benchmarks Run Docker benchmarks
$(call title,"PHP Benchmarks - Docker")
@time ${BLUEPRINT_DOCKER} $(BENCH_CSV) $(BENCH_SCHEMA) --profile


# Row counts to generate; the foreach in the recipe below emits one batch per value.
BENCH_ROWS := 1000 100000 1000000

# Regenerate the benchmark CSV files under ./build/bench/.
# The target is intentionally disabled: the `exit 1` line fails the recipe, so by default
# make stops there and none of the following lines run until that line is removed.
# When enabled, the recipe clears old CSV files and then, for every row count, starts five
# generator processes (1, 3, 5, 10 and 20 columns) in the background and waits for the
# batch to finish before moving on to the next row count.
# NOTE(review): `wait` without arguments returns 0 even when a background job failed, so a
# failing generator would not fail this target - confirm that is acceptable.
# NOTE(review): bench.php presumably writes its output into ./build/bench/ (that is where
# the recipe cleans and lists files), but the script is not visible here - confirm.
bench-prepare: ##@Benchmarks Create CSV files
$(call title,"PHP Benchmarks - Prepare CSV files")
exit 1; # Disabled for now. Enable if you need to generate CSV files.
@echo "Remove old CSV files"
@mkdir -pv ./build/bench/
@rm -fv ./build/bench/*.csv
@$(foreach rows,$(BENCH_ROWS), \
echo "Generate CSV: rows=$(rows)"; \
${BENCH_BIN} -H --columns=1 --rows=$(rows) -q & \
${BENCH_BIN} -H --columns=3 --rows=$(rows) -q & \
${BENCH_BIN} -H --columns=5 --rows=$(rows) -q & \
${BENCH_BIN} -H --columns=10 --rows=$(rows) -q & \
${BENCH_BIN} -H --columns=20 --rows=$(rows) -q & \
wait; \
echo "Generate CSV: rows=$(rows) - done"; \
)
@ls -lh ./build/bench/*.csv;
5 changes: 3 additions & 2 deletions README.md
Expand Up @@ -435,13 +435,14 @@ docker run --rm \
jbzoo/csv-blueprint:latest \
validate:csv \
--csv=./tests/fixtures/demo.csv \
--schema=./tests/schemas/demo_invalid.yml
--schema=./tests/schemas/demo_invalid.yml \
--ansi -vvv


# OR build it from source.
git clone git@github.com:JBZoo/Csv-Blueprint.git csv-blueprint
cd csv-blueprint
make build-docker # local tag is "jbzoo/csv-blueprint:local"
make docker-build # local tag is "jbzoo/csv-blueprint:local"
```


Expand Down
27 changes: 14 additions & 13 deletions composer.json
Expand Up @@ -27,25 +27,26 @@
"prefer-stable" : true,

"require" : {
"php" : "^8.1",
"ext-mbstring" : "*",
"php" : "^8.1",
"ext-mbstring" : "*",

"league/csv" : "^9.15.0",
"jbzoo/data" : "^7.1.1",
"jbzoo/cli" : "^7.1.8",
"jbzoo/utils" : "^7.2.0",
"jbzoo/ci-report-converter" : "^7.2.1",
"league/csv" : "^9.15.0",
"jbzoo/data" : "^7.1.1",
"jbzoo/cli" : "^7.1.8",
"jbzoo/utils" : "^7.2.0",
"jbzoo/ci-report-converter" : "^7.2.1",

"symfony/yaml" : ">=6.4.3",
"symfony/filesystem" : ">=6.4.3",
"symfony/finder" : ">=6.4.0",
"markrogoyski/math-php" : "^2.9.0",
"respect/validation" : "^2.3.5"
"symfony/yaml" : ">=6.4.3",
"symfony/filesystem" : ">=6.4.3",
"symfony/finder" : ">=6.4.0",
"markrogoyski/math-php" : "^2.9.0",
"respect/validation" : "^2.3.5"
},

"require-dev" : {
"roave/security-advisories" : "dev-latest",
"jbzoo/toolbox-dev" : "^7.1.0"
"jbzoo/toolbox-dev" : "^7.1.0",
"fakerphp/faker" : "^1.23.1"
},

"bin" : ["csv-blueprint"],
Expand Down
2 changes: 1 addition & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions src/Commands/ValidateCsv.php
Expand Up @@ -107,6 +107,10 @@ protected function configure(): void

protected function executeAction(): int
{
if ($this->getOptBool('profile')) {
\define('PROFILE_MODE', true);
}

$csvFilenames = $this->getCsvFilepaths();
$schemaFilenames = $this->getSchemaFilepaths();
$matchedFiles = Utils::matchSchemaAndCsvFiles($csvFilenames, $schemaFilenames);
Expand Down
13 changes: 9 additions & 4 deletions src/Csv/Column.php
Expand Up @@ -16,9 +16,9 @@

namespace JBZoo\CsvBlueprint\Csv;

use JBZoo\CsvBlueprint\Validators\ColumnValidator;
use JBZoo\CsvBlueprint\Validators\Error;
use JBZoo\CsvBlueprint\Validators\ErrorSuite;
use JBZoo\CsvBlueprint\Validators\ValidatorColumn;
use JBZoo\Data\Data;

final class Column
Expand Down Expand Up @@ -97,14 +97,19 @@ public function getInherit(): string
return $this->column->getString('inherit', self::FALLBACK_VALUES['inherit']);
}

/**
 * Creates a validator bound to this column.
 *
 * A new ValidatorColumn instance is constructed on every call; nothing is cached here.
 *
 * @return ValidatorColumn Validator constructed with this column as its argument.
 */
public function getValidator(): ValidatorColumn
{
return new ValidatorColumn($this);
}

public function validateCell(string $cellValue, int $line = Error::UNDEFINED_LINE): ErrorSuite
{
return (new ColumnValidator($this))->validateCell($cellValue, $line);
return $this->getValidator()->validateCell($cellValue, $line);
}

public function validateList(array $cellValue): ErrorSuite
public function validateList(array &$cellValue): ErrorSuite
{
return (new ColumnValidator($this))->validateList($cellValue);
return $this->getValidator()->validateList($cellValue);
}

private function prepareRuleSet(string $schemaKey): array
Expand Down
29 changes: 21 additions & 8 deletions src/Csv/CsvFile.php
Expand Up @@ -17,8 +17,9 @@
namespace JBZoo\CsvBlueprint\Csv;

use JBZoo\CsvBlueprint\Schema;
use JBZoo\CsvBlueprint\Validators\CsvValidator;
use JBZoo\CsvBlueprint\Utils;
use JBZoo\CsvBlueprint\Validators\ErrorSuite;
use JBZoo\CsvBlueprint\Validators\ValidatorCsv;
use League\Csv\Reader as LeagueReader;
use League\Csv\Statement;
use League\Csv\TabularDataReader;
Expand All @@ -30,6 +31,7 @@ final class CsvFile
private LeagueReader $reader;
private Schema $schema;
private bool $isEmpty;
private ?array $header = null;

public function __construct(string $csvFilename, null|array|string $csvSchemaFilenameOrArray = null)
{
Expand Down Expand Up @@ -59,18 +61,29 @@ public function getCsvStructure(): ParseConfig
*/
public function getHeader(): array
{
if ($this->structure->isHeader() && !$this->isEmpty) {
// TODO: add handler for empty file
// League\Csv\SyntaxError : The header record does not exist or is empty at offset: `0
return $this->reader->getHeader();
if ($this->header === null) {
Utils::debug('Start getHeader() from CSV');
$this->header = [];

if ($this->structure->isHeader() && !$this->isEmpty) {
// TODO: add handler for empty file
// League\Csv\SyntaxError : The header record does not exist or is empty at offset: `0
$this->header = $this->reader->getHeader();
}

Utils::debug('End getHeader()');
}

return [];
return $this->header;
}

public function getRecords(): \Iterator
{
return $this->reader->getRecords($this->getHeader());
Utils::debug('Start getRecords() from CSV');
$records = $this->reader->getRecords($this->getHeader());
Utils::debug('End getRecords()');

return $records;
}

public function getRecordsChunk(int $offset = 0, int $limit = -1): TabularDataReader
Expand All @@ -80,7 +93,7 @@ public function getRecordsChunk(int $offset = 0, int $limit = -1): TabularDataRe

public function validate(bool $quickStop = false): ErrorSuite
{
return (new CsvValidator($this, $this->schema))->validate($quickStop);
return (new ValidatorCsv($this, $this->schema))->validate($quickStop);
}

private function prepareReader(): LeagueReader
Expand Down
20 changes: 18 additions & 2 deletions src/Rules/AbstarctRule.php
Expand Up @@ -17,13 +17,21 @@
namespace JBZoo\CsvBlueprint\Rules;

use JBZoo\CsvBlueprint\Utils;
use JBZoo\CsvBlueprint\Validators\ColumnValidator;
use JBZoo\CsvBlueprint\Validators\Error;
use JBZoo\CsvBlueprint\Validators\ValidatorColumn;

use function JBZoo\Utils\bool;

abstract class AbstarctRule
{
public const INPUT_TYPE = self::INPUT_TYPE_UNDEF;

public const INPUT_TYPE_UNDEF = self::INPUT_TYPE_STRINGS;
public const INPUT_TYPE_BOOL = 0;
public const INPUT_TYPE_INTS = 1;
public const INPUT_TYPE_FLOATS = 2;
public const INPUT_TYPE_STRINGS = 3;

// Modes
public const DEFAULT = 'default';
public const EQ = '';
Expand Down Expand Up @@ -62,7 +70,7 @@ public function __construct(
// TODO: Move resolving and validating expected value on this stage to make it only once (before validation).
}

public function validate(array|string $cellValue, int $line = ColumnValidator::FALLBACK_LINE): ?Error
public function validate(array|string $cellValue, int $line = ValidatorColumn::FALLBACK_LINE): ?Error
{
// TODO: Extract to abstract boolean cell/agregate rule
if ($this->isEnabled($cellValue) === false) {
Expand Down Expand Up @@ -140,6 +148,14 @@ public function getRuleCode(?string $mode = null): string
return Utils::camelToKebabCase((new \ReflectionClass($this))->getShortName()) . $postfix;
}

/**
 * Returns the input-type code declared by the concrete rule class.
 *
 * The value is read through late static binding (static::INPUT_TYPE), so a subclass that
 * overrides the INPUT_TYPE constant changes the result. The value declared in this class
 * is INPUT_TYPE_UNDEF, which is an alias of INPUT_TYPE_STRINGS.
 *
 * @return int The INPUT_TYPE constant of the called class (expected to be one of INPUT_TYPE_*).
 *
 * @phan-suppress PhanPluginPossiblyStaticPublicMethod
 */
public function getInputType(): int
{
return static::INPUT_TYPE;
}

protected function getOptionAsBool(): bool
{
// TODO: Replace to warning message
Expand Down
4 changes: 2 additions & 2 deletions src/Rules/AbstarctRuleCombo.php
Expand Up @@ -18,8 +18,8 @@

use JBZoo\CsvBlueprint\Rules\Aggregate\AbstarctAggregateRuleCombo;
use JBZoo\CsvBlueprint\Rules\Cell\AbstractCellRuleCombo;
use JBZoo\CsvBlueprint\Validators\ColumnValidator;
use JBZoo\CsvBlueprint\Validators\Error;
use JBZoo\CsvBlueprint\Validators\ValidatorColumn;

abstract class AbstarctRuleCombo extends AbstarctRule
{
Expand All @@ -36,7 +36,7 @@ abstract protected function getExpected(): float;

abstract protected function getActual(array|string $value): float;

public function validate(array|string $cellValue, int $line = ColumnValidator::FALLBACK_LINE): ?Error
public function validate(array|string $cellValue, int $line = ValidatorColumn::FALLBACK_LINE): ?Error
{
$error = $this->validateCombo($cellValue);

Expand Down

0 comments on commit c06a6d0

Please sign in to comment.