Skip to content

Commit

Permalink
test: Add unicode character tests (#1173)
Browse files Browse the repository at this point in the history
* test: Add row hash integration tests for a wider set of characters than standard test data
  • Loading branch information
nj1973 committed Jun 21, 2024
1 parent d2c2f47 commit 1879beb
Show file tree
Hide file tree
Showing 15 changed files with 364 additions and 1 deletion.
20 changes: 20 additions & 0 deletions tests/resources/bigquery_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,23 @@ INSERT INTO `pso_data_validator`.`dvt_char_id` VALUES
('DVT3 ', 'Row 3'),
('DVT4 ', 'Row 4'),
('DVT5 ', 'Row 5');

-- dvt_pangrams: integration test table holding pangrams in several scripts,
-- used to test unicode character handling.
-- IF EXISTS keeps this script re-runnable when the table has not been created yet.
DROP TABLE IF EXISTS `pso_data_validator`.`dvt_pangrams`;
CREATE TABLE `pso_data_validator`.`dvt_pangrams`
(   id          INT64
,   lang        STRING(100)
,   words       STRING(1000)
,   words_en    STRING(1000)
) OPTIONS (description='Integration test table used to test unicode characters.');
-- Text taken from Wikipedia; we cannot guarantee the translations :-)
-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO `pso_data_validator`.`dvt_pangrams` (id, lang, words, words_en) VALUES
(1,'Hebrew', 'שפן אכל קצת גזר בטעם חסה, ודי',
 'A bunny ate some lettuce-flavored carrots, and he had enough'),
(2,'Polish', 'Pchnąć w tę łódź jeża lub ośm skrzyń fig',
 'Push a hedgehog or eight crates of figs in this boat'),
(3,'Russian', 'Съешь ещё этих мягких французских булок, да выпей же чаю',
 'Eat more of these soft French loaves and drink a tea'),
(4,'Swedish', 'Schweiz för lyxfjäder på qvist bakom ugn',
 'Switzerland brings luxury feather on branch behind oven'),
(5,'Turkish', 'Pijamalı hasta yağız şoföre çabucak güvendi',
 'The sick person in pyjamas quickly trusted the swarthy driver');
22 changes: 21 additions & 1 deletion tests/resources/hive_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,24 @@ INSERT INTO pso_data_validator.dvt_binary VALUES
('DVT-key-2', 2, 'Row 2'),
('DVT-key-3', 3, 'Row 3'),
('DVT-key-4', 4, 'Row 4'),
('DVT-key-5', 5, 'Row 5');
('DVT-key-5', 5, 'Row 5');

-- dvt_pangrams: integration test table holding pangrams in several scripts,
-- used to test unicode character handling.
-- IF EXISTS keeps this script re-runnable when the table has not been created yet.
DROP TABLE IF EXISTS pso_data_validator.dvt_pangrams;
CREATE TABLE pso_data_validator.dvt_pangrams
(   id          int NOT NULL
,   lang        varchar(100)
,   words       varchar(1000)
,   words_en    varchar(1000)
) COMMENT 'Integration test table used to test unicode characters.';
-- Text taken from Wikipedia; we cannot guarantee the translations :-)
-- Values are positional: id, lang, words, words_en.
INSERT INTO pso_data_validator.dvt_pangrams VALUES
(1,'Hebrew', 'שפן אכל קצת גזר בטעם חסה, ודי',
 'A bunny ate some lettuce-flavored carrots, and he had enough'),
(2,'Polish', 'Pchnąć w tę łódź jeża lub ośm skrzyń fig',
 'Push a hedgehog or eight crates of figs in this boat'),
(3,'Russian', 'Съешь ещё этих мягких французских булок, да выпей же чаю',
 'Eat more of these soft French loaves and drink a tea'),
(4,'Swedish', 'Schweiz för lyxfjäder på qvist bakom ugn',
 'Switzerland brings luxury feather on branch behind oven'),
(5,'Turkish', 'Pijamalı hasta yağız şoföre çabucak güvendi',
 'The sick person in pyjamas quickly trusted the swarthy driver');
20 changes: 20 additions & 0 deletions tests/resources/mysql_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,23 @@ INSERT INTO `pso_data_validator`.`dvt_char_id` VALUES
('DVT3', 'Row 3'),
('DVT4', 'Row 4'),
('DVT5', 'Row 5');

-- dvt_pangrams: integration test table holding pangrams in several scripts,
-- used to test unicode character handling.
-- IF EXISTS keeps this script re-runnable when the table has not been created yet.
DROP TABLE IF EXISTS `pso_data_validator`.`dvt_pangrams`;
CREATE TABLE `pso_data_validator`.`dvt_pangrams`
(   id          int NOT NULL PRIMARY KEY
,   lang        varchar(100)
    -- utf8mb4 rather than utf8: in MySQL "utf8" is the deprecated 3-byte utf8mb3
    -- alias and cannot hold characters outside the Basic Multilingual Plane.
,   words       varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci
,   words_en    varchar(1000)
) COMMENT 'Integration test table used to test unicode characters.';
-- Text taken from Wikipedia; we cannot guarantee the translations :-)
-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO `pso_data_validator`.`dvt_pangrams` (id, lang, words, words_en) VALUES
(1,'Hebrew', 'שפן אכל קצת גזר בטעם חסה, ודי',
 'A bunny ate some lettuce-flavored carrots, and he had enough'),
(2,'Polish', 'Pchnąć w tę łódź jeża lub ośm skrzyń fig',
 'Push a hedgehog or eight crates of figs in this boat'),
(3,'Russian', 'Съешь ещё этих мягких французских булок, да выпей же чаю',
 'Eat more of these soft French loaves and drink a tea'),
(4,'Swedish', 'Schweiz för lyxfjäder på qvist bakom ugn',
 'Switzerland brings luxury feather on branch behind oven'),
(5,'Turkish', 'Pijamalı hasta yağız şoföre çabucak güvendi',
 'The sick person in pyjamas quickly trusted the swarthy driver');
33 changes: 33 additions & 0 deletions tests/resources/oracle_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -220,3 +220,36 @@ INSERT INTO pso_data_validator.dvt_char_id VALUES ('DVT3', 'Row 3');
INSERT INTO pso_data_validator.dvt_char_id VALUES ('DVT4', 'Row 4');
INSERT INTO pso_data_validator.dvt_char_id VALUES ('DVT5', 'Row 5');
COMMIT;

-- dvt_pangrams: integration test table holding pangrams in several scripts,
-- used to test unicode character handling.
DROP TABLE pso_data_validator.dvt_pangrams;
CREATE TABLE pso_data_validator.dvt_pangrams (
    id          NUMBER(5),
    lang        VARCHAR2(100),
    words       VARCHAR2(1000 CHAR),
    words_en    VARCHAR2(1000),
    CONSTRAINT dvt_pangrams_pk PRIMARY KEY (id)
);
COMMENT ON TABLE pso_data_validator.dvt_pangrams IS 'Integration test table used to test unicode characters.';
-- Text taken from Wikipedia; we cannot guarantee the translations :-)
-- When loading through SQL*Plus, run "export NLS_LANG=.AL32UTF8" first.
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en)
VALUES (
    1,
    'Hebrew',
    'שפן אכל קצת גזר בטעם חסה, ודי',
    'A bunny ate some lettuce-flavored carrots, and he had enough'
);
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en)
VALUES (
    2,
    'Polish',
    'Pchnąć w tę łódź jeża lub ośm skrzyń fig',
    'Push a hedgehog or eight crates of figs in this boat'
);
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en)
VALUES (
    3,
    'Russian',
    'Съешь ещё этих мягких французских булок, да выпей же чаю',
    'Eat more of these soft French loaves and drink a tea'
);
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en)
VALUES (
    4,
    'Swedish',
    'Schweiz för lyxfjäder på qvist bakom ugn',
    'Switzerland brings luxury feather on branch behind oven'
);
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en)
VALUES (
    5,
    'Turkish',
    'Pijamalı hasta yağız şoföre çabucak güvendi',
    'The sick person in pyjamas quickly trusted the swarthy driver'
);
COMMIT;
22 changes: 22 additions & 0 deletions tests/resources/postgresql_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -289,3 +289,25 @@ INSERT INTO pso_data_validator.dvt_char_id VALUES
('DVT3', 'Row 3'),
('DVT4', 'Row 4'),
('DVT5', 'Row 5');

-- dvt_pangrams: integration test table holding pangrams in several scripts,
-- used to test unicode character handling.
-- IF EXISTS keeps this script re-runnable when the table has not been created yet.
DROP TABLE IF EXISTS pso_data_validator.dvt_pangrams;
CREATE TABLE pso_data_validator.dvt_pangrams
(   id          int
,   lang        varchar(100)
,   words       varchar(1000)
,   words_en    varchar(1000)
,   CONSTRAINT dvt_pangrams_pk PRIMARY KEY (id)
);
COMMENT ON TABLE pso_data_validator.dvt_pangrams IS 'Integration test table used to test unicode characters.';
-- Text taken from Wikipedia; we cannot guarantee the translations :-)
-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en) VALUES
(1,'Hebrew', 'שפן אכל קצת גזר בטעם חסה, ודי',
 'A bunny ate some lettuce-flavored carrots, and he had enough'),
(2,'Polish', 'Pchnąć w tę łódź jeża lub ośm skrzyń fig',
 'Push a hedgehog or eight crates of figs in this boat'),
(3,'Russian', 'Съешь ещё этих мягких французских булок, да выпей же чаю',
 'Eat more of these soft French loaves and drink a tea'),
(4,'Swedish', 'Schweiz för lyxfjäder på qvist bakom ugn',
 'Switzerland brings luxury feather on branch behind oven'),
(5,'Turkish', 'Pijamalı hasta yağız şoföre çabucak güvendi',
 'The sick person in pyjamas quickly trusted the swarthy driver');
22 changes: 22 additions & 0 deletions tests/resources/snowflake_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,25 @@ INSERT INTO PSO_DATA_VALIDATOR.PUBLIC.DVT_CHAR_ID VALUES
('DVT3 ', 'Row 3'),
('DVT4 ', 'Row 4'),
('DVT5 ', 'Row 5');

-- DVT_PANGRAMS: integration test table holding pangrams in several scripts,
-- used to test unicode character handling.
-- IF EXISTS keeps this script re-runnable when the table has not been created yet.
DROP TABLE IF EXISTS PSO_DATA_VALIDATOR.PUBLIC.DVT_PANGRAMS;
CREATE TABLE PSO_DATA_VALIDATOR.PUBLIC.DVT_PANGRAMS
(   id          NUMBER(5)
,   lang        VARCHAR(100)
,   words       VARCHAR(1000)
,   words_en    VARCHAR(1000)
,   CONSTRAINT dvt_pangrams_pk PRIMARY KEY (id)
);
COMMENT ON TABLE PSO_DATA_VALIDATOR.PUBLIC.DVT_PANGRAMS IS 'Integration test table used to test unicode characters.';
-- Text taken from Wikipedia; we cannot guarantee the translations :-)
-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO PSO_DATA_VALIDATOR.PUBLIC.DVT_PANGRAMS (id, lang, words, words_en) VALUES
(1,'Hebrew', 'שפן אכל קצת גזר בטעם חסה, ודי',
 'A bunny ate some lettuce-flavored carrots, and he had enough'),
(2,'Polish', 'Pchnąć w tę łódź jeża lub ośm skrzyń fig',
 'Push a hedgehog or eight crates of figs in this boat'),
(3,'Russian', 'Съешь ещё этих мягких французских булок, да выпей же чаю',
 'Eat more of these soft French loaves and drink a tea'),
(4,'Swedish', 'Schweiz för lyxfjäder på qvist bakom ugn',
 'Switzerland brings luxury feather on branch behind oven'),
(5,'Turkish', 'Pijamalı hasta yağız şoföre çabucak güvendi',
 'The sick person in pyjamas quickly trusted the swarthy driver');
14 changes: 14 additions & 0 deletions tests/resources/sqlserver_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,17 @@ INSERT INTO pso_data_validator.dvt_binary VALUES (CAST('DVT-key-2' AS binary), 2
INSERT INTO pso_data_validator.dvt_binary VALUES (CAST('DVT-key-3' AS binary), 3, 'Row 3');
INSERT INTO pso_data_validator.dvt_binary VALUES (CAST('DVT-key-4' AS binary), 4, 'Row 4');
INSERT INTO pso_data_validator.dvt_binary VALUES (CAST('DVT-key-5' AS binary), 5, 'Row 5');

-- dvt_pangrams: integration test table holding pangrams in several scripts,
-- used to test unicode character handling.
DROP TABLE pso_data_validator.dvt_pangrams;
CREATE TABLE pso_data_validator.dvt_pangrams
(   id          int NOT NULL PRIMARY KEY
,   lang        varchar(100)
,   words       nvarchar(1000)
,   words_en    varchar(1000)
);
-- Text taken from Wikipedia; we cannot guarantee the translations :-)
-- The "words" literals carry the N prefix so they are treated as Unicode (nvarchar)
-- constants. Without it the literal is converted through the database code page
-- first, and characters that code page cannot represent are lost before they
-- reach the nvarchar column.
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en) VALUES (1,'Hebrew', N'שפן אכל קצת גזר בטעם חסה, ודי', 'A bunny ate some lettuce-flavored carrots, and he had enough');
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en) VALUES (2,'Polish', N'Pchnąć w tę łódź jeża lub ośm skrzyń fig', 'Push a hedgehog or eight crates of figs in this boat');
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en) VALUES (3,'Russian', N'Съешь ещё этих мягких французских булок, да выпей же чаю', 'Eat more of these soft French loaves and drink a tea');
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en) VALUES (4,'Swedish', N'Schweiz för lyxfjäder på qvist bakom ugn', 'Switzerland brings luxury feather on branch behind oven');
INSERT INTO pso_data_validator.dvt_pangrams (id, lang, words, words_en) VALUES (5,'Turkish', N'Pijamalı hasta yağız şoföre çabucak güvendi', 'The sick person in pyjamas quickly trusted the swarthy driver');
31 changes: 31 additions & 0 deletions tests/resources/teradata_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,34 @@ INSERT INTO udf.dvt_char_id VALUES ('DVT2', 'Row 2');
INSERT INTO udf.dvt_char_id VALUES ('DVT3', 'Row 3');
INSERT INTO udf.dvt_char_id VALUES ('DVT4', 'Row 4');
INSERT INTO udf.dvt_char_id VALUES ('DVT5', 'Row 5');
-- dvt_pangrams: integration test table holding pangrams in several scripts,
-- used to test unicode character handling.
DROP TABLE udf.dvt_pangrams;
CREATE TABLE udf.dvt_pangrams (
    id          NUMBER(5) NOT NULL PRIMARY KEY,
    lang        VARCHAR(100),
    words       VARCHAR(1000) CHARACTER SET UNICODE,
    words_en    VARCHAR(1000)
);
COMMENT ON TABLE udf.dvt_pangrams IS 'Integration test table used to test unicode characters.';
-- Text taken from Wikipedia; we cannot guarantee the translations :-)
-- Be sure to load this data in utf8 mode: bteq -c utf8
INSERT INTO udf.dvt_pangrams (id, lang, words, words_en)
VALUES (
    1,
    'Hebrew',
    'שפן אכל קצת גזר בטעם חסה, ודי',
    'A bunny ate some lettuce-flavored carrots, and he had enough'
);
INSERT INTO udf.dvt_pangrams (id, lang, words, words_en)
VALUES (
    2,
    'Polish',
    'Pchnąć w tę łódź jeża lub ośm skrzyń fig',
    'Push a hedgehog or eight crates of figs in this boat'
);
INSERT INTO udf.dvt_pangrams (id, lang, words, words_en)
VALUES (
    3,
    'Russian',
    'Съешь ещё этих мягких французских булок, да выпей же чаю',
    'Eat more of these soft French loaves and drink a tea'
);
INSERT INTO udf.dvt_pangrams (id, lang, words, words_en)
VALUES (
    4,
    'Swedish',
    'Schweiz för lyxfjäder på qvist bakom ugn',
    'Switzerland brings luxury feather on branch behind oven'
);
INSERT INTO udf.dvt_pangrams (id, lang, words, words_en)
VALUES (
    5,
    'Turkish',
    'Pijamalı hasta yağız şoföre çabucak güvendi',
    'The sick person in pyjamas quickly trusted the swarthy driver'
);
25 changes: 25 additions & 0 deletions tests/system/data_sources/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from data_validation.partition_builder import PartitionBuilder
from tests.system.data_sources.common_functions import (
binary_key_assertions,
id_type_test_assertions,
null_not_null_assertions,
run_test_from_cli_args,
)
Expand Down Expand Up @@ -341,6 +342,30 @@ def test_row_validation_binary_pk_to_bigquery():
binary_key_assertions(df)


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_row_validation_pangrams_to_bigquery():
    """Row-validate dvt_pangrams from Hive against BigQuery.

    The table holds a wider set of characters than the standard test data,
    so this exercises comparisons over those characters.
    """
    # CLI arguments for a row validation keyed on "id" with --hash=*.
    cli_args = [
        "validate",
        "row",
        "-sc=hive-conn",
        "-tc=bq-conn",
        "-tbls=pso_data_validator.dvt_pangrams",
        "--primary-keys=id",
        "--hash=*",
    ]
    parsed_args = cli_tools.configure_arg_parser().parse_args(cli_args)
    result_df = run_test_from_cli_args(parsed_args)
    id_type_test_assertions(result_df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
29 changes: 29 additions & 0 deletions tests/system/data_sources/test_mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,35 @@ def test_row_validation_char_pk_to_bigquery():
id_type_test_assertions(df)


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_row_validation_pangrams_to_bigquery():
    """MySQL to BigQuery dvt_pangrams row validation.

    This is testing comparisons across a wider set of characters than standard test data.
    Note, we are skipping this test because unicode characters are being converted to standard ascii.
    """
    pytest.skip(
        "Skipping test_row_validation_pangrams_to_bigquery because failing on MySQL."
    )
    # Not reached while the skip above is in place; kept so the test is ready
    # to run once the skip is removed.
    parser = cli_tools.configure_arg_parser()
    args = parser.parse_args(
        [
            "validate",
            "row",
            "-sc=mysql-conn",
            "-tc=bq-conn",
            # MySQL tables are addressed as database.table; the previous
            # three-part PSO_DATA_VALIDATOR.PUBLIC.DVT_PANGRAMS source name
            # does not match a MySQL table name.
            "-tbls=pso_data_validator.dvt_pangrams",
            "--primary-keys=id",
            "--hash=*",
        ]
    )
    df = run_test_from_cli_args(args)
    id_type_test_assertions(df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
24 changes: 24 additions & 0 deletions tests/system/data_sources/test_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,30 @@ def test_row_validation_char_pk_to_bigquery():
id_type_test_assertions(df)


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_row_validation_pangrams_to_bigquery():
    """Row-validate dvt_pangrams from Oracle against BigQuery.

    The table holds a wider set of characters than the standard test data,
    so this exercises comparisons over those characters.
    """
    # CLI arguments for a row validation keyed on "id" with --hash=*.
    cli_args = [
        "validate",
        "row",
        "-sc=ora-conn",
        "-tc=bq-conn",
        "-tbls=pso_data_validator.dvt_pangrams",
        "--primary-keys=id",
        "--hash=*",
    ]
    parsed_args = cli_tools.configure_arg_parser().parse_args(cli_args)
    result_df = run_test_from_cli_args(parsed_args)
    id_type_test_assertions(result_df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
24 changes: 24 additions & 0 deletions tests/system/data_sources/test_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,6 +813,30 @@ def test_row_validation_char_pk_to_bigquery():
id_type_test_assertions(df)


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_row_validation_pangrams_to_bigquery():
    """Row-validate dvt_pangrams from PostgreSQL against BigQuery.

    The table holds a wider set of characters than the standard test data,
    so this exercises comparisons over those characters.
    """
    # CLI arguments for a row validation keyed on "id" with --hash=*.
    cli_args = [
        "validate",
        "row",
        "-sc=pg-conn",
        "-tc=bq-conn",
        "-tbls=pso_data_validator.dvt_pangrams",
        "--primary-keys=id",
        "--hash=*",
    ]
    parsed_args = cli_tools.configure_arg_parser().parse_args(cli_args)
    result_df = run_test_from_cli_args(parsed_args)
    id_type_test_assertions(result_df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
24 changes: 24 additions & 0 deletions tests/system/data_sources/test_snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,30 @@ def test_row_validation_char_pk_to_bigquery():
id_type_test_assertions(df)


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_row_validation_pangrams_to_bigquery():
    """Row-validate DVT_PANGRAMS from Snowflake against BigQuery.

    The table holds a wider set of characters than the standard test data,
    so this exercises comparisons over those characters.
    """
    # CLI arguments for a row validation keyed on "id" with --hash=*.
    # The -tbls value maps the Snowflake source name to the BigQuery target name.
    cli_args = [
        "validate",
        "row",
        "-sc=snowflake-conn",
        "-tc=bq-conn",
        "-tbls=PSO_DATA_VALIDATOR.PUBLIC.DVT_PANGRAMS=pso_data_validator.dvt_pangrams",
        "--primary-keys=id",
        "--hash=*",
    ]
    parsed_args = cli_tools.configure_arg_parser().parse_args(cli_args)
    result_df = run_test_from_cli_args(parsed_args)
    id_type_test_assertions(result_df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
Loading

0 comments on commit 1879beb

Please sign in to comment.