Skip to content

Commit

Permalink
Improve CSV schema validation with filename patterns
Browse files Browse the repository at this point in the history
This commit introduces filename pattern validation to the CSV schema, enabling stricter data-consistency checks. It also extends the schema to allow additional columns, providing a more flexible structure. Error handling was improved as well, with a quick-stop option that speeds up error discovery.
  • Loading branch information
Denis Smet committed Mar 13, 2024
1 parent ad046c0 commit 778859c
Show file tree
Hide file tree
Showing 10 changed files with 95 additions and 13 deletions.
21 changes: 18 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,11 @@ This gives you great flexibility when validating CSV files.
```yml
# It's a full example of the CSV schema file in YAML format.

# Regular expression to match the file name. If not set, no pattern check is performed.
# This lets you validate the file name before the file contents are validated.
# The pattern is applied to the file path, so you can check parent directories as well.
filename_pattern: /demo(-\d+)?\.csv$/

csv: # Here are default values. You can skip this section if you don't need to override the default values
header: true # If the first row is a header. If true, name of each column is required
delimiter: , # Delimiter character in CSV file
Expand Down Expand Up @@ -362,6 +367,8 @@ columns:
cardinal_direction: true # Valid cardinal direction. Examples: "N", "S", "NE", "SE", "none", ""
usa_market_name: true # Check if the value is a valid USA market name. Example: "New York, NY"

- name: "another_column"

```


Expand All @@ -370,15 +377,16 @@ columns:

```json
{
"csv" : {
"filename_pattern" : "/demo(-\\d+)?\\.csv$/",
"csv" : {
"header" : true,
"delimiter" : ",",
"quote_char" : "\\",
"enclosure" : "\"",
"encoding" : "utf-8",
"bom" : false
},
"columns" : [
"columns" : [
{
"name" : "csv_header_name",
"description" : "Lorem ipsum",
Expand Down Expand Up @@ -412,7 +420,8 @@ columns:
"cardinal_direction" : true,
"usa_market_name" : true
}
}
},
{"name" : "another_column"}
]
}

Expand All @@ -422,6 +431,7 @@ columns:




<details>
<summary>Click to see: PHP Format</summary>

Expand All @@ -430,6 +440,8 @@ columns:
declare(strict_types=1);

return [
'filename_pattern' => '/demo(-\\d+)?\\.csv$/',

'csv' => [
'header' => true,
'delimiter' => ',',
Expand All @@ -438,6 +450,7 @@ return [
'encoding' => 'utf-8',
'bom' => false,
],

'columns' => [
[
'name' => 'csv_header_name',
Expand Down Expand Up @@ -473,6 +486,7 @@ return [
'usa_market_name' => true,
],
],
['name' => 'another_column'],
],
];

Expand All @@ -481,6 +495,7 @@ return [
</details>



## Coming soon

These are random ideas and plans, in no particular order and with no deadlines. <u>But batch processing is priority #1</u>.
Expand Down
8 changes: 5 additions & 3 deletions schema-examples/full.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
{
"csv" : {
"filename_pattern" : "/demo(-\\d+)?\\.csv$/",
"csv" : {
"header" : true,
"delimiter" : ",",
"quote_char" : "\\",
"enclosure" : "\"",
"encoding" : "utf-8",
"bom" : false
},
"columns" : [
"columns" : [
{
"name" : "csv_header_name",
"description" : "Lorem ipsum",
Expand Down Expand Up @@ -41,6 +42,7 @@
"cardinal_direction" : true,
"usa_market_name" : true
}
}
},
{"name" : "another_column"}
]
}
4 changes: 4 additions & 0 deletions schema-examples/full.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
declare(strict_types=1);

return [
'filename_pattern' => '/demo(-\\d+)?\\.csv$/',

'csv' => [
'header' => true,
'delimiter' => ',',
Expand All @@ -23,6 +25,7 @@
'encoding' => 'utf-8',
'bom' => false,
],

'columns' => [
[
'name' => 'csv_header_name',
Expand Down Expand Up @@ -58,5 +61,6 @@
'usa_market_name' => true,
],
],
['name' => 'another_column'],
],
];
7 changes: 7 additions & 0 deletions schema-examples/full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@

# It's a full example of the CSV schema file in YAML format.

# Regular expression to match the file name. If not set, no pattern check is performed.
# This lets you validate the file name before the file contents are validated.
# The pattern is applied to the file path, so you can check parent directories as well.
filename_pattern: /demo(-\d+)?\.csv$/

csv: # Here are default values. You can skip this section if you don't need to override the default values
header: true # If the first row is a header. If true, name of each column is required
delimiter: , # Delimiter character in CSV file
Expand Down Expand Up @@ -66,3 +71,5 @@ columns:
is_longitude: true # Can be integer or float. Example: -89.123456
cardinal_direction: true # Valid cardinal direction. Examples: "N", "S", "NE", "SE", "none", ""
usa_market_name: true # Check if the value is a valid USA market name. Example: "New York, NY"

- name: "another_column"
37 changes: 35 additions & 2 deletions src/Csv/CsvFile.php
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ public function validate(bool $quickStop = false): ErrorSuite
{
$errors = new ErrorSuite($this->getCsvFilename());

$errors->addErrorSuit($this->validateHeader())
$errors
->addErrorSuit($this->validateFile($quickStop))
->addErrorSuit($this->validateHeader($quickStop))
->addErrorSuit($this->validateEachCell($quickStop))
->addErrorSuit(self::validateAggregateRules($quickStop));

Expand All @@ -106,7 +108,7 @@ private function prepareReader(): LeagueReader
return $reader;
}

private function validateHeader(): ErrorSuite
private function validateHeader(bool $quickStop = false): ErrorSuite
{
$errors = new ErrorSuite();

Expand All @@ -125,6 +127,10 @@ private function validateHeader(): ErrorSuite

$errors->addError($error);
}

if ($quickStop && $errors->count() > 0) {
return $errors;
}
}

return $errors;
Expand Down Expand Up @@ -152,6 +158,33 @@ private function validateEachCell(bool $quickStop = false): ErrorSuite
return $errors;
}

/**
 * Validates the CSV file name against the schema's optional "filename_pattern".
 *
 * No check is performed when the schema returns null or an empty string as the pattern.
 * Otherwise the pattern is applied to the file name as stored in $this->csvFilename,
 * and a single "filename_pattern" error is reported unless the pattern matches.
 *
 * @param bool $quickStop Return as soon as an error has been recorded. At most one error
 *                        can be produced here, so the result is the same either way.
 *
 * @return ErrorSuite Empty suite, or a suite holding one "filename_pattern" error.
 */
private function validateFile(bool $quickStop = false): ErrorSuite
{
    $errors = new ErrorSuite();

    $filenamePattern = $this->schema->getFilenamePattern();
    if (
        $filenamePattern !== null
        && $filenamePattern !== ''
        // preg_match() returns 1 on a match, 0 on no match and false on failure
        // (for example a malformed pattern). Anything other than 1 is reported,
        // so a broken pattern cannot silently pass the check.
        && \preg_match($filenamePattern, $this->csvFilename) !== 1
    ) {
        // The file name is passed in the column-name slot and 0 as the line number.
        $error = new Error(
            'filename_pattern',
            "Filename \"<c>{$this->csvFilename}</c>\" does not match pattern: \"<c>{$filenamePattern}</c>\"",
            $this->csvFilename,
            0,
        );

        $errors->addError($error);

        // Equivalent to falling through; kept to mirror the other validate*() steps.
        if ($quickStop && $errors->count() > 0) {
            return $errors;
        }
    }

    return $errors;
}

private static function validateAggregateRules(bool $quickStop = false): ErrorSuite
{
$errors = new ErrorSuite();
Expand Down
4 changes: 2 additions & 2 deletions src/Schema.php
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,9 @@ public function getColumn(int|string $columNameOrId): ?Column
return $column;
}

public function getFinenamePattern(): ?string
public function getFilenamePattern(): ?string
{
return $this->data->getStringNull('finename_pattern');
return Utils::prepareRegex($this->data->getStringNull('filename_pattern'));
}

public function getIncludes(): array
Expand Down
4 changes: 2 additions & 2 deletions tests/Blueprint/SchemaTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ public function testFilename(): void
public function testGetFinenamePattern(): void
{
$schemaEmpty = new Schema(self::SCHEMA_EXAMPLE_EMPTY);
isSame(null, $schemaEmpty->getFinenamePattern());
isSame(null, $schemaEmpty->getFilenamePattern());

$schemaFull = new Schema(self::SCHEMA_EXAMPLE_FULL);
isSame('^example\.csv$', $schemaFull->getFinenamePattern());
isSame('/^example\.csv$/u', $schemaFull->getFilenamePattern());
}

public function testScvStruture(): void
Expand Down
19 changes: 19 additions & 0 deletions tests/Blueprint/ValidatorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,25 @@ public function testGetAvaiableRenderFormats(): void
], ErrorSuite::getAvaiableRenderFormats());
}

/**
 * Covers the "filename_pattern" schema option: a non-matching pattern must yield
 * a "filename_pattern" error, while an empty, null, or matching pattern must not.
 */
public function testFilenamePattern(): void
{
    // The fixture file name does not satisfy this pattern, so the first error must be the mismatch.
    $mismatched = new CsvFile(self::CSV_COMPLEX, ['filename_pattern' => '/demo(-\\d+)?\\.csv$/']);
    $firstError = \strip_tags((string)$mismatched->validate()->get(0));
    isSame(
        '"filename_pattern" at line 0, column "./tests/fixtures/complex_header.csv". ' .
        'Filename "./tests/fixtures/complex_header.csv" does not match pattern: "/demo(-\d+)?\.csv$/".',
        $firstError,
    );

    // Disabled check (empty string / null) and a matching pattern must all validate cleanly.
    foreach (['', null, '/.*\.csv$/'] as $pattern) {
        $csv = new CsvFile(self::CSV_COMPLEX, ['filename_pattern' => $pattern]);
        isSame('', (string)$csv->validate());
    }
}

private function getRule(?string $columnName, ?string $ruleName, array|bool|float|int|string $options): array
{
return ['columns' => [['name' => $columnName, 'rules' => [$ruleName => $options]]]];
Expand Down
2 changes: 2 additions & 0 deletions tests/schemas/demo_valid.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

# This schema is valid because it matches the CSV file (tests/fixtures/demo.csv) perfectly.

filename_pattern: /demo(-\d+)?\.csv$/

columns:
- name: Name
rules:
Expand Down
2 changes: 1 addition & 1 deletion tests/schemas/example_full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# Created: 2023-03-09

# File name pattern to match. If not set, no pattern check is performed.
finename_pattern: ^example\.csv$
filename_pattern: ^example\.csv$


# Include another schemas
Expand Down

0 comments on commit 778859c

Please sign in to comment.