Skip to content

Commit

Permalink
Add midhinge rules for data validation
Browse files Browse the repository at this point in the history
This commit introduces the new 'midhinge' rules for data validation. These rules calculate the average of the first and third quartiles of a data set, providing a useful measure of its central location. The new rules make it possible to identify columns whose midhinge does not meet the expected value, ensuring better data consistency and quality. Corresponding adjustments have been made in schema files and unit tests.
  • Loading branch information
SmetDenis committed Mar 25, 2024
1 parent fd94042 commit 1c0bad2
Show file tree
Hide file tree
Showing 10 changed files with 137 additions and 5 deletions.
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![Stable Version](https://poser.pugx.org/jbzoo/csv-blueprint/version)](https://packagist.org/packages/jbzoo/csv-blueprint/) [![Total Downloads](https://poser.pugx.org/jbzoo/csv-blueprint/downloads)](https://packagist.org/packages/jbzoo/csv-blueprint/stats) [![Docker Pulls](https://img.shields.io/docker/pulls/jbzoo/csv-blueprint.svg)](https://hub.docker.com/r/jbzoo/csv-blueprint/tags) [![GitHub License](https://img.shields.io/github/license/jbzoo/csv-blueprint)](https://github.com/JBZoo/Csv-Blueprint/blob/master/LICENSE)

<!-- rules-counter -->
[![Static Badge](https://img.shields.io/badge/Rules-186-green?label=Total%20Number%20of%20Rules&labelColor=darkgreen&color=gray)](schema-examples/full.yml) [![Static Badge](https://img.shields.io/badge/Rules-66-green?label=Cell%20Value&labelColor=blue&color=gray)](src/Rules/Cell) [![Static Badge](https://img.shields.io/badge/Rules-115-green?label=Aggregate%20Column&labelColor=blue&color=gray)](src/Rules/Aggregate) [![Static Badge](https://img.shields.io/badge/Rules-5-green?label=Extra%20Checks&labelColor=blue&color=gray)](#extra-checks) [![Static Badge](https://img.shields.io/badge/Rules-273-green?label=Plan%20to%20add&labelColor=gray&color=gray)](tests/schemas/todo.yml)
[![Static Badge](https://img.shields.io/badge/Rules-192-green?label=Total%20Number%20of%20Rules&labelColor=darkgreen&color=gray)](schema-examples/full.yml) [![Static Badge](https://img.shields.io/badge/Rules-66-green?label=Cell%20Value&labelColor=blue&color=gray)](src/Rules/Cell) [![Static Badge](https://img.shields.io/badge/Rules-121-green?label=Aggregate%20Column&labelColor=blue&color=gray)](src/Rules/Aggregate) [![Static Badge](https://img.shields.io/badge/Rules-5-green?label=Extra%20Checks&labelColor=blue&color=gray)](#extra-checks) [![Static Badge](https://img.shields.io/badge/Rules-323-green?label=Plan%20to%20add&labelColor=gray&color=gray)](tests/schemas/todo.yml)
<!-- /rules-counter -->

## Introduction
Expand Down Expand Up @@ -381,6 +381,17 @@ columns:
quartiles_less: [ 'exclusive', '100%', 8.0 ] # x < 8.0
quartiles_max: [ 'inclusive', 'IQR', 9.0 ] # x <= 9.0

# Midhinge. The average of the first and third quartiles and is thus a measure of location.
# Equivalently, it is the 25% trimmed mid-range or 25% midsummary; it is an L-estimator.
# See: https://en.wikipedia.org/wiki/Midhinge
# Midhinge = (first quartile + third quartile) / 2
midhinge_min: 1.0 # x >= 1.0
midhinge_greater: 2.0 # x > 2.0
midhinge_not: 5.0 # x != 5.0
midhinge: 7.0 # x == 7.0
midhinge_less: 8.0 # x < 8.0
midhinge_max: 9.0 # x <= 9.0

# MAD - mean absolute deviation. The average of the absolute deviations from a central point.
# It is a summary statistic of statistical dispersion or variability.
# See: https://en.wikipedia.org/wiki/Average_absolute_deviation
Expand Down
7 changes: 7 additions & 0 deletions schema-examples/full.json
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,13 @@
"quartiles_less" : ["exclusive", "100%", 8],
"quartiles_max" : ["inclusive", "IQR", 9],

"midhinge_min" : 1,
"midhinge_greater" : 2,
"midhinge_not" : 5,
"midhinge" : 7,
"midhinge_less" : 8,
"midhinge_max" : 9,

"mean_abs_dev_min" : 1,
"mean_abs_dev_greater" : 2,
"mean_abs_dev_not" : 5,
Expand Down
7 changes: 7 additions & 0 deletions schema-examples/full.php
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,13 @@
'quartiles_less' => ['exclusive', '100%', 8.0],
'quartiles_max' => ['inclusive', 'IQR', 9.0],

'midhinge_min' => 1.0,
'midhinge_greater' => 2.0,
'midhinge_not' => 5.0,
'midhinge' => 7.0,
'midhinge_less' => 8.0,
'midhinge_max' => 9.0,

'mean_abs_dev_min' => 1.0,
'mean_abs_dev_greater' => 2.0,
'mean_abs_dev_not' => 5.0,
Expand Down
11 changes: 11 additions & 0 deletions schema-examples/full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,17 @@ columns:
quartiles_less: [ 'exclusive', '100%', 8.0 ] # x < 8.0
quartiles_max: [ 'inclusive', 'IQR', 9.0 ] # x <= 9.0

# Midhinge. The average of the first and third quartiles and is thus a measure of location.
# Equivalently, it is the 25% trimmed mid-range or 25% midsummary; it is an L-estimator.
# See: https://en.wikipedia.org/wiki/Midhinge
# Midhinge = (first quartile + third quartile) / 2
midhinge_min: 1.0 # x >= 1.0
midhinge_greater: 2.0 # x > 2.0
midhinge_not: 5.0 # x != 5.0
midhinge: 7.0 # x == 7.0
midhinge_less: 8.0 # x < 8.0
midhinge_max: 9.0 # x <= 9.0

# MAD - mean absolute deviation. The average of the absolute deviations from a central point.
# It is a summary statistic of statistical dispersion or variability.
# See: https://en.wikipedia.org/wiki/Average_absolute_deviation
Expand Down
7 changes: 7 additions & 0 deletions schema-examples/full_clean.yml
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,13 @@ columns:
- IQR
- 9.0

midhinge_min: 1.0
midhinge_greater: 2.0
midhinge_not: 5.0
midhinge: 7.0
midhinge_less: 8.0
midhinge_max: 9.0

mean_abs_dev_min: 1.0
mean_abs_dev_greater: 2.0
mean_abs_dev_not: 5.0
Expand Down
49 changes: 49 additions & 0 deletions src/Rules/Aggregate/ComboMidhinge.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<?php

/**
* JBZoo Toolbox - Csv-Blueprint.
*
* This file is part of the JBZoo Toolbox project.
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*
* @license MIT
* @copyright Copyright (C) JBZoo.com, All rights reserved.
* @see https://github.com/JBZoo/Csv-Blueprint
*/

declare(strict_types=1);

namespace JBZoo\CsvBlueprint\Rules\Aggregate;

use JBZoo\CsvBlueprint\Rules\AbstarctRule;
use MathPHP\Statistics\Descriptive;

/**
 * Aggregate rule that checks the midhinge of a column.
 *
 * The midhinge is the mean of the first and third quartiles, i.e. (Q1 + Q3) / 2.
 * The actual value is computed by MathPHP's Descriptive::midhinge().
 */
final class ComboMidhinge extends AbstarctAggregateRuleCombo
{
    public const INPUT_TYPE = AbstarctRule::INPUT_TYPE_FLOATS;

    protected const NAME = 'midhinge';

    /**
     * Returns the help metadata for this rule.
     *
     * The first element is the list of human-readable description lines;
     * the second element is left empty for this rule.
     *
     * @return array{0: list<string>, 1: array}
     */
    public function getHelpMeta(): array
    {
        return [
            [
                'Midhinge. The average of the first and third quartiles and is thus a measure of location.',
                'Equivalently, it is the 25% trimmed mid-range or 25% midsummary; it is an L-estimator.',
                'See: https://en.wikipedia.org/wiki/Midhinge',
                // The quartiles are summed, not listed: midhinge = (Q1 + Q3) / 2.
                'Midhinge = (first quartile + third quartile) / 2',
            ],
            [],
        ];
    }

    /**
     * Computes the midhinge of the given column values.
     *
     * @param array $colValues raw column values; converted with self::stringsToFloat() before the calculation
     *
     * @return null|float the midhinge, or null when the column has no values
     */
    protected function getActualAggregate(array $colValues): ?float
    {
        // An empty column has no quartiles, so there is nothing to aggregate.
        if (\count($colValues) === 0) {
            return null;
        }

        return Descriptive::midhinge(self::stringsToFloat($colValues));
    }
}
2 changes: 1 addition & 1 deletion tests/ReadmeTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public function testBadgeOfRules(): void

$todoYml = yml(Tools::SCHEMA_TODO);
$planToAdd = \count($todoYml->findArray('columns.0.rules')) +
(\count($todoYml->findArray('columns.0.aggregate_rules')) * 4)
(\count($todoYml->findArray('columns.0.aggregate_rules')) * 6)
+ \count([
'required',
'null_values',
Expand Down
41 changes: 41 additions & 0 deletions tests/Rules/Aggregate/ComboMidhingeTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<?php

/**
* JBZoo Toolbox - Csv-Blueprint.
*
* This file is part of the JBZoo Toolbox project.
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*
* @license MIT
* @copyright Copyright (C) JBZoo.com, All rights reserved.
* @see https://github.com/JBZoo/Csv-Blueprint
*/

declare(strict_types=1);

namespace JBZoo\PHPUnit\Rules\Aggregate;

use JBZoo\CsvBlueprint\Rules\AbstarctRule as Combo;
use JBZoo\CsvBlueprint\Rules\Aggregate\ComboMidhinge;
use JBZoo\PHPUnit\Rules\TestAbstractAggregateRuleCombo;

use function JBZoo\PHPUnit\isSame;

/**
 * Checks the "equal" variant of the midhinge aggregate rule.
 */
class ComboMidhingeTest extends TestAbstractAggregateRuleCombo
{
    protected string $ruleClass = ComboMidhinge::class;

    public function testEqual(): void
    {
        // Integers 1..35: the rule is expected to report a midhinge of 18 for this column.
        $column = \range(1, 35);

        // Expected value matches: no error for an empty column nor for the sample column.
        $matchingRule = $this->create(18, Combo::EQ);
        isSame('', $matchingRule->test([]));
        isSame('', $matchingRule->test($column));

        // Expected value differs: the rule must return its error message.
        $mismatchingRule = $this->create(3, Combo::EQ);
        $actualMessage   = $mismatchingRule->test($column);
        isSame(
            'The midhinge in the column is "18", which is not equal than the expected "3"',
            $actualMessage,
        );
    }
}
2 changes: 2 additions & 0 deletions tests/Rules/Aggregate/ComboQuartilesTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ public function testEqual(): void
$rule = $this->create(['inclusive', 'IQR', 199], Combo::EQ);
isSame('', $rule->test($range));

isSame('', $rule->test([]));

isSame(
'The quartile in the column is "100", which is not equal than the expected "199"',
$rule->test(\range(1, 200)),
Expand Down
3 changes: 0 additions & 3 deletions tests/schemas/todo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -236,9 +236,6 @@ columns:
count_true: 1
count_false: 1

# https://github.com/markrogoyski/math-php#statistics---descriptive
midhinge: 1

# https://github.com/markrogoyski/math-php#statistics---averages
harmonic_mean: 1
geometric_mean: 1
Expand Down

0 comments on commit 1c0bad2

Please sign in to comment.